2 /* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 #if defined(__FreeBSD__) || defined(__NetBSD__)
67 #include "opt_inet6.h"
69 #endif /* __FreeBSD__ || __NetBSD__ */
70 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
72 #include <sys/param.h>
73 #include <sys/malloc.h>
75 #include <sys/socket.h>
76 #include <sys/systm.h>
77 #include <sys/errno.h>
78 #if 1 /* ALTQ3_COMPAT */
79 #include <sys/sockio.h>
81 #include <sys/kernel.h>
83 #include <sys/queue.h>
86 #endif /* ALTQ3_COMPAT */
89 #include <net/if_var.h>
91 #include <netinet/in.h>
92 #include <netinet/in_systm.h>
93 #include <netinet/ip.h>
95 #include <netinet/ip6.h>
98 #include <netpfil/pf/pf.h>
99 #include <netpfil/pf/pf_altq.h>
100 #include <netpfil/pf/pf_mtag.h>
101 #include <altq/altq.h>
102 #include <altq/altq_red.h>
104 #include <altq/altq_conf.h>
105 #ifdef ALTQ_FLOWVALVE
106 #include <altq/altq_flowvalve.h>
111 * ALTQ/RED (Random Early Detection) implementation using 32-bit
112 * fixed-point calculation.
114 * written by kjc using the ns code as a reference.
115 * you can learn more about red and ns from Sally's home page at
116 * http://www-nrg.ee.lbl.gov/floyd/
118 * most of the red parameter values are fixed in this implementation
119 * to prevent fixed-point overflow/underflow.
120 * if you change the parameters, watch out for overflow/underflow!
122 * the parameters used are recommended values by Sally.
123 * the corresponding ns config looks:
125 * minthresh=5 maxthresh=15 queue-size=60
128 * bytes=false (can't be handled by 32-bit fixed-point)
129 * doubleq=false dqthresh=false
133 * alternative red parameters for a slow link.
135 * assume the queue length goes from zero to L and stays at L; it takes
136 * N packets for q_avg to reach 63% of L.
137 * when q_weight is 0.002, N is about 500 packets.
138 * for a slow link like dial-up, 500 packets takes more than 1 minute!
139 * when q_weight is 0.008, N is about 127 packets.
140 * when q_weight is 0.016, N is about 63 packets.
141 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
142 * are allowed for 0.016.
143 * see Sally's paper for more details.
145 /* normal red parameters */
146 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
147 /* q_weight = 0.00195 */
149 /* red parameters for a slow link */
150 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
151 /* q_weight = 0.0078125 */
153 /* red parameters for a very slow link (e.g., dialup) */
154 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
155 /* q_weight = 0.015625 */
157 /* fixed-point uses 12-bit decimal places */
158 #define FP_SHIFT 12 /* fixed-point shift */
160 /* red parameters for drop probability */
161 #define INV_P_MAX 10 /* inverse of max drop probability */
162 #define TH_MIN 5 /* min threshold */
163 #define TH_MAX 15 /* max threshold */
165 #define RED_LIMIT 60 /* default max queue length */
166 #define RED_STATS /* collect statistics */
169 * our default policy for forced-drop is drop-tail.
170 * (in altq-1.1.2 or earlier, the default was random-drop.
171 * but it makes more sense to punish the cause of the surge.)
172 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
176 #ifdef ALTQ_FLOWVALVE
178 * flow-valve is an extension to protect red from unresponsive flows
179 * and to promote end-to-end congestion control.
180 * flow-valve observes the average drop rates of the flows that have
181 * experienced packet drops in the recent past.
182 * when the average drop rate exceeds the threshold, the flow is
183 * blocked by the flow-valve. the trapped flow should back off
184 * exponentially to escape from the flow-valve.
186 #ifdef RED_RANDOM_DROP
187 #error "random-drop can't be used with flow-valve!"
189 #endif /* ALTQ_FLOWVALVE */
191 /* red_list keeps all red_queue_t's allocated. */
192 static red_queue_t *red_list = NULL;
194 #endif /* ALTQ3_COMPAT */
196 /* default red parameter values */
197 static int default_th_min = TH_MIN;
198 static int default_th_max = TH_MAX;
199 static int default_inv_pmax = INV_P_MAX;
202 /* internal function prototypes */
203 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
204 static struct mbuf *red_dequeue(struct ifaltq *, int);
205 static int red_request(struct ifaltq *, int, void *);
206 static void red_purgeq(red_queue_t *);
207 static int red_detach(red_queue_t *);
208 #ifdef ALTQ_FLOWVALVE
209 static __inline struct fve *flowlist_lookup(struct flowvalve *,
210 struct altq_pktattr *, struct timeval *);
211 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
212 struct altq_pktattr *);
213 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
214 static __inline int fv_p2f(struct flowvalve *, int);
215 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
216 static struct flowvalve *fv_alloc(struct red *);
218 static void fv_destroy(struct flowvalve *);
219 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
221 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
224 #endif /* ALTQ3_COMPAT */
227 * red support routines
230 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
237 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
242 rp->red_weight = W_WEIGHT;
244 rp->red_weight = weight;
246 /* allocate weight table */
247 rp->red_wtab = wtab_alloc(rp->red_weight);
248 if (rp->red_wtab == NULL) {
257 rp->red_inv_pmax = default_inv_pmax;
259 rp->red_inv_pmax = inv_pmax;
261 rp->red_thmin = default_th_min;
263 rp->red_thmin = th_min;
265 rp->red_thmax = default_th_max;
267 rp->red_thmax = th_max;
269 rp->red_flags = flags;
272 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
273 rp->red_pkttime = 800;
275 rp->red_pkttime = pkttime;
278 /* when the link is very slow, adjust red parameters */
279 npkts_per_sec = 1000000 / rp->red_pkttime;
280 if (npkts_per_sec < 50) {
281 /* up to about 400Kbps */
282 rp->red_weight = W_WEIGHT_2;
283 } else if (npkts_per_sec < 300) {
284 /* up to about 2.4Mbps */
285 rp->red_weight = W_WEIGHT_1;
289 /* calculate wshift. weight must be power of 2 */
291 for (i = 0; w > 1; i++)
294 w = 1 << rp->red_wshift;
295 if (w != rp->red_weight) {
296 printf("invalid weight value %d for red! use %d\n",
302 * thmin_s and thmax_s are scaled versions of th_min and th_max
303 * to be compared with avg.
305 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
306 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
309 * precompute probability denominator
310 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
312 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
313 * rp->red_inv_pmax) << FP_SHIFT;
315 microtime(&rp->red_last);
320 red_destroy(red_t *rp)
323 #ifdef ALTQ_FLOWVALVE
324 if (rp->red_flowvalve != NULL)
325 fv_destroy(rp->red_flowvalve);
327 #endif /* ALTQ3_COMPAT */
328 wtab_destroy(rp->red_wtab);
333 red_getstats(red_t *rp, struct redstats *sp)
335 sp->q_avg = rp->red_avg >> rp->red_wshift;
336 sp->xmit_cnt = rp->red_stats.xmit_cnt;
337 sp->drop_cnt = rp->red_stats.drop_cnt;
338 sp->drop_forced = rp->red_stats.drop_forced;
339 sp->drop_unforced = rp->red_stats.drop_unforced;
340 sp->marked_packets = rp->red_stats.marked_packets;
344 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
345 struct altq_pktattr *pktattr)
350 #ifdef ALTQ_FLOWVALVE
351 struct fve *fve = NULL;
353 if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
354 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
359 #endif /* ALTQ3_COMPAT */
364 * if we were idle, we pretend that n packets arrived during
373 t = (now.tv_sec - rp->red_last.tv_sec);
376 * being idle for more than 1 minute, set avg to zero.
377 * this prevents t from overflow.
381 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
382 n = t / rp->red_pkttime - 1;
384 /* the following line does (avg = (1 - Wq)^n * avg) */
386 avg = (avg >> FP_SHIFT) *
387 pow_w(rp->red_wtab, n);
391 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
392 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
393 rp->red_avg = avg; /* save the new value */
396 * red_count keeps a tally of arriving traffic that has not
401 /* see if we drop early */
402 droptype = DTYPE_NODROP;
403 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
404 if (avg >= rp->red_thmax_s) {
405 /* avg >= th_max: forced drop */
406 droptype = DTYPE_FORCED;
407 } else if (rp->red_old == 0) {
408 /* first exceeds th_min */
411 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
412 rp->red_probd, rp->red_count)) {
413 /* mark or drop by red */
414 if ((rp->red_flags & REDF_ECN) &&
415 mark_ecn(m, pktattr, rp->red_flags)) {
416 /* successfully marked. do not drop. */
419 rp->red_stats.marked_packets++;
422 /* unforced drop by red */
423 droptype = DTYPE_EARLY;
432 * if the queue length hits the hard limit, it's a forced drop.
434 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
435 droptype = DTYPE_FORCED;
437 #ifdef RED_RANDOM_DROP
438 /* if successful or forced drop, enqueue this packet. */
439 if (droptype != DTYPE_EARLY)
442 /* if successful, enqueue this packet. */
443 if (droptype == DTYPE_NODROP)
446 if (droptype != DTYPE_NODROP) {
447 if (droptype == DTYPE_EARLY) {
448 /* drop the incoming packet */
450 rp->red_stats.drop_unforced++;
453 /* forced drop, select a victim packet in the queue. */
454 #ifdef RED_RANDOM_DROP
458 rp->red_stats.drop_forced++;
462 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
466 #ifdef ALTQ_FLOWVALVE
467 if (rp->red_flowvalve != NULL)
468 fv_dropbyred(rp->red_flowvalve, pktattr, fve);
470 #endif /* ALTQ3_COMPAT */
474 /* successfully queued */
476 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
482 * early-drop probability is calculated as follows:
483 * prob = p_max * (avg - th_min) / (th_max - th_min)
484 * prob_a = prob / (2 - count*prob)
485 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
486 * here prob_a increases as successive undrop count increases.
487 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
488 * becomes 1 when (count >= (2 / prob))).
491 drop_early(int fp_len, int fp_probd, int count)
493 int d; /* denominator of drop-probability */
495 d = fp_probd - count * fp_len;
497 /* count exceeds the hard limit: drop or mark */
501 * now the range of d is [1..600] in fixed-point. (when
502 * th_max-th_min=10 and p_max=1/30)
503 * drop probability = (avg - TH_MIN) / d
506 if ((arc4random() % d) < fp_len) {
515 * try to mark CE bit to the packet.
516 * returns 1 if successfully marked, 0 otherwise.
519 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
525 at = pf_find_mtag(m);
529 } else if (pktattr != NULL) {
530 af = pktattr->pattr_af;
531 hdr = pktattr->pattr_hdr;
532 #endif /* ALTQ3_COMPAT */
536 /* verify that pattr_hdr is within the mbuf data */
537 for (m0 = m; m0 != NULL; m0 = m0->m_next)
538 if (((caddr_t)hdr >= m0->m_data) &&
539 ((caddr_t)hdr < m0->m_data + m0->m_len))
542 /* ick, tag info is stale */
546 switch (((struct ip *)hdr)->ip_v) {
548 if (flags & REDF_ECN4) {
554 return (0); /* version mismatch! */
556 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
557 return (0); /* not-ECT */
558 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
559 return (1); /* already marked */
562 * ecn-capable but not marked,
563 * mark CE and update checksum
566 ip->ip_tos |= IPTOS_ECN_CE;
568 * update checksum (from RFC1624)
569 * HC' = ~(~HC + ~m + m')
571 sum = ~ntohs(ip->ip_sum) & 0xffff;
572 sum += (~otos & 0xffff) + ip->ip_tos;
573 sum = (sum >> 16) + (sum & 0xffff);
574 sum += (sum >> 16); /* add carry */
575 ip->ip_sum = htons(~sum & 0xffff);
580 case (IPV6_VERSION >> 4):
581 if (flags & REDF_ECN6) {
582 struct ip6_hdr *ip6 = hdr;
585 flowlabel = ntohl(ip6->ip6_flow);
586 if ((flowlabel >> 28) != 6)
587 return (0); /* version mismatch! */
588 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
589 (IPTOS_ECN_NOTECT << 20))
590 return (0); /* not-ECT */
591 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
592 (IPTOS_ECN_CE << 20))
593 return (1); /* already marked */
595 * ecn-capable but not marked, mark CE
597 flowlabel |= (IPTOS_ECN_CE << 20);
598 ip6->ip6_flow = htonl(flowlabel);
616 if ((m = _getq(q)) == NULL) {
617 if (rp->red_idle == 0) {
619 microtime(&rp->red_last);
629 * helper routine to calibrate avg during idle.
630 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
631 * here Wq = 1/weight and the code assumes Wq is close to zero.
633 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
635 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
638 wtab_alloc(int weight)
643 for (w = wtab_list; w != NULL; w = w->w_next)
644 if (w->w_weight == weight) {
649 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
652 w->w_weight = weight;
654 w->w_next = wtab_list;
657 /* initialize the weight table */
658 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
659 for (i = 1; i < 32; i++) {
660 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
661 if (w->w_tab[i] == 0 && w->w_param_max == 0)
662 w->w_param_max = 1 << i;
669 wtab_destroy(struct wtab *w)
673 if (--w->w_refcount > 0)
677 wtab_list = w->w_next;
678 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
679 if (prev->w_next == w) {
680 prev->w_next = w->w_next;
689 pow_w(struct wtab *w, int n)
694 if (n >= w->w_param_max)
705 val = (val * w->w_tab[i]) >> FP_SHIFT;
716 * red device interface
721 redopen(dev, flag, fmt, p)
724 #if (__FreeBSD_version > 500000)
730 /* everything will be done when the queueing scheme is attached. */
735 redclose(dev, flag, fmt, p)
738 #if (__FreeBSD_version > 500000)
747 while ((rqp = red_list) != NULL) {
749 err = red_detach(rqp);
750 if (err != 0 && error == 0)
758 redioctl(dev, cmd, addr, flag, p)
763 #if (__FreeBSD_version > 500000)
770 struct red_interface *ifacep;
774 /* check super-user privilege */
779 #if (__FreeBSD_version > 700000)
780 if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
781 #elsif (__FreeBSD_version > 400000)
782 if ((error = suser(p)) != 0)
784 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
793 ifacep = (struct red_interface *)addr;
794 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
798 error = altq_enable(rqp->rq_ifq);
802 ifacep = (struct red_interface *)addr;
803 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
807 error = altq_disable(rqp->rq_ifq);
811 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
817 /* allocate and initialize red_queue_t */
818 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
823 bzero(rqp, sizeof(red_queue_t));
825 rqp->rq_q = malloc(sizeof(class_queue_t),
827 if (rqp->rq_q == NULL) {
832 bzero(rqp->rq_q, sizeof(class_queue_t));
834 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
835 if (rqp->rq_red == NULL) {
836 free(rqp->rq_q, M_DEVBUF);
842 rqp->rq_ifq = &ifp->if_snd;
843 qtail(rqp->rq_q) = NULL;
845 qlimit(rqp->rq_q) = RED_LIMIT;
846 qtype(rqp->rq_q) = Q_RED;
849 * set RED to this ifnet structure.
851 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
852 red_enqueue, red_dequeue, red_request,
855 red_destroy(rqp->rq_red);
856 free(rqp->rq_q, M_DEVBUF);
861 /* add this state to the red list */
862 rqp->rq_next = red_list;
867 ifacep = (struct red_interface *)addr;
868 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
872 error = red_detach(rqp);
877 struct red_stats *q_stats;
880 q_stats = (struct red_stats *)addr;
881 if ((rqp = altq_lookup(q_stats->iface.red_ifname,
882 ALTQT_RED)) == NULL) {
887 q_stats->q_len = qlen(rqp->rq_q);
888 q_stats->q_limit = qlimit(rqp->rq_q);
891 q_stats->q_avg = rp->red_avg >> rp->red_wshift;
892 q_stats->xmit_cnt = rp->red_stats.xmit_cnt;
893 q_stats->drop_cnt = rp->red_stats.drop_cnt;
894 q_stats->drop_forced = rp->red_stats.drop_forced;
895 q_stats->drop_unforced = rp->red_stats.drop_unforced;
896 q_stats->marked_packets = rp->red_stats.marked_packets;
898 q_stats->weight = rp->red_weight;
899 q_stats->inv_pmax = rp->red_inv_pmax;
900 q_stats->th_min = rp->red_thmin;
901 q_stats->th_max = rp->red_thmax;
903 #ifdef ALTQ_FLOWVALVE
904 if (rp->red_flowvalve != NULL) {
905 struct flowvalve *fv = rp->red_flowvalve;
906 q_stats->fv_flows = fv->fv_flows;
907 q_stats->fv_pass = fv->fv_stats.pass;
908 q_stats->fv_predrop = fv->fv_stats.predrop;
909 q_stats->fv_alloc = fv->fv_stats.alloc;
910 q_stats->fv_escape = fv->fv_stats.escape;
912 #endif /* ALTQ_FLOWVALVE */
913 q_stats->fv_flows = 0;
914 q_stats->fv_pass = 0;
915 q_stats->fv_predrop = 0;
916 q_stats->fv_alloc = 0;
917 q_stats->fv_escape = 0;
918 #ifdef ALTQ_FLOWVALVE
920 #endif /* ALTQ_FLOWVALVE */
921 } while (/*CONSTCOND*/ 0);
930 fc = (struct red_conf *)addr;
931 if ((rqp = altq_lookup(fc->iface.red_ifname,
932 ALTQT_RED)) == NULL) {
936 new = red_alloc(fc->red_weight,
953 limit = fc->red_limit;
954 if (limit < fc->red_thmax)
955 limit = fc->red_thmax;
956 qlimit(rqp->rq_q) = limit;
957 fc->red_limit = limit; /* write back the new value */
959 red_destroy(rqp->rq_red);
964 /* write back new values */
965 fc->red_limit = limit;
966 fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
967 fc->red_thmin = rqp->rq_red->red_thmin;
968 fc->red_thmax = rqp->rq_red->red_thmax;
970 } while (/*CONSTCOND*/ 0);
973 case RED_SETDEFAULTS:
975 struct redparams *rp;
977 rp = (struct redparams *)addr;
979 default_th_min = rp->th_min;
980 default_th_max = rp->th_max;
981 default_inv_pmax = rp->inv_pmax;
982 } while (/*CONSTCOND*/ 0);
999 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1000 altq_disable(rqp->rq_ifq);
1002 if ((error = altq_detach(rqp->rq_ifq)))
1005 if (red_list == rqp)
1006 red_list = rqp->rq_next;
1008 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1009 if (tmp->rq_next == rqp) {
1010 tmp->rq_next = rqp->rq_next;
1014 printf("red_detach: no state found in red_list!\n");
1017 red_destroy(rqp->rq_red);
1018 free(rqp->rq_q, M_DEVBUF);
1019 free(rqp, M_DEVBUF);
1026 * returns: 0 when successfully queued.
1027 * ENOBUFS when drop occurs.
1030 red_enqueue(ifq, m, pktattr)
1033 struct altq_pktattr *pktattr;
1035 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1037 IFQ_LOCK_ASSERT(ifq);
1039 if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1047 * must be called in splimp.
1049 * returns: mbuf dequeued.
1050 * NULL when no packet is available in the queue.
1053 static struct mbuf *
1054 red_dequeue(ifq, op)
1058 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1061 IFQ_LOCK_ASSERT(ifq);
1063 if (op == ALTDQ_POLL)
1064 return qhead(rqp->rq_q);
1066 /* op == ALTDQ_REMOVE */
1067 m = red_getq(rqp->rq_red, rqp->rq_q);
1074 red_request(ifq, req, arg)
1079 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1081 IFQ_LOCK_ASSERT(ifq);
1096 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1097 rqp->rq_ifq->ifq_len = 0;
1100 #ifdef ALTQ_FLOWVALVE
1102 #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */
1103 #define FV_PSCALE(x) ((x) << FV_PSHIFT)
1104 #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT)
1105 #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */
1106 #define FV_FSCALE(x) ((x) << FV_FSHIFT)
1107 #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT)
1109 #define FV_TIMER (3 * hz) /* timer value for garbage collector */
1110 #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */
1112 #define FV_N 10 /* update fve_f every FV_N packets */
1114 #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */
1115 #define FV_TTHRESH 3 /* time threshold to delete fve */
1116 #define FV_ALPHA 5 /* extra packet count */
1120 #if (__FreeBSD_version > 300000)
1121 #define FV_TIMESTAMP(tp) getmicrotime(tp)
1123 #define FV_TIMESTAMP(tp) { (*(tp)) = time; }
1127 * Brtt table: 127 entry table to convert drop rate (p) to
1128 * the corresponding bandwidth fraction (f)
1129 * the following equation is implemented to use scaled values,
1130 * fve_p and fve_f, in the fixed point format.
1132 * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1133 * f = Brtt(p) / (max_th + alpha)
1135 #define BRTT_SIZE 128
1136 #define BRTT_SHIFT 12
1137 #define BRTT_MASK 0x0007f000
1138 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT))
1140 const int brtt_tab[BRTT_SIZE] = {
1141 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1142 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1143 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1144 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1145 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1146 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1147 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1148 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1149 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1150 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1151 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1152 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1153 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1154 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1155 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1156 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1159 static __inline struct fve *
1160 flowlist_lookup(fv, pktattr, now)
1161 struct flowvalve *fv;
1162 struct altq_pktattr *pktattr;
1163 struct timeval *now;
1169 struct ip6_hdr *ip6;
1171 struct timeval tthresh;
1173 if (pktattr == NULL)
1176 tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1179 * search the flow list
1181 switch (pktattr->pattr_af) {
1183 ip = (struct ip *)pktattr->pattr_hdr;
1184 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1185 if (fve->fve_lastdrop.tv_sec == 0)
1187 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1188 fve->fve_lastdrop.tv_sec = 0;
1191 if (fve->fve_flow.flow_af == AF_INET &&
1192 fve->fve_flow.flow_ip.ip_src.s_addr ==
1193 ip->ip_src.s_addr &&
1194 fve->fve_flow.flow_ip.ip_dst.s_addr ==
1202 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1203 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1204 if (fve->fve_lastdrop.tv_sec == 0)
1206 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1207 fve->fve_lastdrop.tv_sec = 0;
1210 if (fve->fve_flow.flow_af == AF_INET6 &&
1211 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1213 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1222 /* unknown protocol. no drop. */
1225 fv->fv_flows = flows; /* save the number of active fve's */
1229 static __inline struct fve *
1230 flowlist_reclaim(fv, pktattr)
1231 struct flowvalve *fv;
1232 struct altq_pktattr *pktattr;
1237 struct ip6_hdr *ip6;
1241 * get an entry from the tail of the LRU list.
1243 fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1245 switch (pktattr->pattr_af) {
1247 ip = (struct ip *)pktattr->pattr_hdr;
1248 fve->fve_flow.flow_af = AF_INET;
1249 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1250 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1254 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1255 fve->fve_flow.flow_af = AF_INET6;
1256 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1257 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1262 fve->fve_state = Green;
1265 fve->fve_ifseq = fv->fv_ifseq - 1;
1270 fv->fv_stats.alloc++;
1275 static __inline void
1276 flowlist_move_to_head(fv, fve)
1277 struct flowvalve *fv;
1280 if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1281 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1282 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1286 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1288 * allocate flowvalve structure
1290 static struct flowvalve *
1294 struct flowvalve *fv;
1298 num = FV_FLOWLISTSIZE;
1299 fv = malloc(sizeof(struct flowvalve),
1300 M_DEVBUF, M_WAITOK);
1303 bzero(fv, sizeof(struct flowvalve));
1305 fv->fv_fves = malloc(sizeof(struct fve) * num,
1306 M_DEVBUF, M_WAITOK);
1307 if (fv->fv_fves == NULL) {
1311 bzero(fv->fv_fves, sizeof(struct fve) * num);
1314 TAILQ_INIT(&fv->fv_flowlist);
1315 for (i = 0; i < num; i++) {
1316 fve = &fv->fv_fves[i];
1317 fve->fve_lastdrop.tv_sec = 0;
1318 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1321 /* initialize drop rate threshold in scaled fixed-point */
1322 fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1324 /* initialize drop rate to fraction table */
1325 fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1326 M_DEVBUF, M_WAITOK);
1327 if (fv->fv_p2ftab == NULL) {
1328 free(fv->fv_fves, M_DEVBUF);
1333 * create the p2f table.
1334 * (shift is used to keep the precision)
1336 for (i = 1; i < BRTT_SIZE; i++) {
1339 f = brtt_tab[i] << 8;
1340 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1347 static void fv_destroy(fv)
1348 struct flowvalve *fv;
1350 free(fv->fv_p2ftab, M_DEVBUF);
1351 free(fv->fv_fves, M_DEVBUF);
1357 struct flowvalve *fv;
1363 f = fv->fv_p2ftab[BRTT_SIZE-1];
1364 else if ((val = (p & BRTT_MASK)))
1365 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1367 f = fv->fv_p2ftab[1];
1372 * check if an arriving packet should be pre-dropped.
1373 * called from red_addq() when a packet arrives.
1374 * returns 1 when the packet should be pre-dropped.
1375 * should be called in splimp.
1378 fv_checkflow(fv, pktattr, fcache)
1379 struct flowvalve *fv;
1380 struct altq_pktattr *pktattr;
1381 struct fve **fcache;
1389 if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1390 /* no matching entry in the flowlist */
1395 /* update fraction f for every FV_N packets */
1396 if (++fve->fve_count == FV_N) {
1398 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1401 (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1402 + fve->fve_f - FV_FUNSCALE(fve->fve_f);
1403 fve->fve_ifseq = fv->fv_ifseq;
1410 if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1413 /* calculate a threshold */
1414 fthresh = fv_p2f(fv, fve->fve_p);
1415 if (fve->fve_f > fthresh)
1416 fve->fve_state = Red;
1419 if (fve->fve_state == Red) {
1423 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1424 /* no drop for at least FV_BACKOFFTHRESH sec */
1426 fve->fve_state = Green;
1428 fv->fv_stats.escape++;
1431 /* block this flow */
1432 flowlist_move_to_head(fv, fve);
1433 fve->fve_lastdrop = now;
1435 fv->fv_stats.predrop++;
1444 fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1448 fv->fv_stats.pass++;
1454 * called from red_addq when a packet is dropped by red.
1455 * should be called in splimp.
1457 static void fv_dropbyred(fv, pktattr, fcache)
1458 struct flowvalve *fv;
1459 struct altq_pktattr *pktattr;
1465 if (pktattr == NULL)
1470 /* the fve of this packet is already cached */
1472 else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1473 fve = flowlist_reclaim(fv, pktattr);
1475 flowlist_move_to_head(fv, fve);
1478 * update p: the following line cancels the update
1479 * in fv_checkflow() and calculate
1480 * p = Wp + (1 - Wp) * p
1482 fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1484 fve->fve_lastdrop = now;
1487 #endif /* ALTQ_FLOWVALVE */
1491 static struct altqsw red_sw =
1492 {"red", redopen, redclose, redioctl};
1494 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1495 MODULE_VERSION(altq_red, 1);
1497 #endif /* KLD_MODULE */
1498 #endif /* ALTQ3_COMPAT */
1500 #endif /* ALTQ_RED */