2 /* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 #if defined(__FreeBSD__) || defined(__NetBSD__)
67 #include "opt_inet6.h"
69 #endif /* __FreeBSD__ || __NetBSD__ */
70 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
72 #include <sys/param.h>
73 #include <sys/malloc.h>
75 #include <sys/socket.h>
76 #include <sys/systm.h>
77 #include <sys/errno.h>
78 #if 1 /* ALTQ3_COMPAT */
79 #include <sys/sockio.h>
81 #include <sys/kernel.h>
83 #include <sys/queue.h>
86 #endif /* ALTQ3_COMPAT */
90 #include <netinet/in.h>
91 #include <netinet/in_systm.h>
92 #include <netinet/ip.h>
94 #include <netinet/ip6.h>
97 #include <net/pfvar.h>
98 #include <altq/altq.h>
99 #include <altq/altq_red.h>
101 #include <altq/altq_conf.h>
102 #ifdef ALTQ_FLOWVALVE
103 #include <altq/altq_flowvalve.h>
108 * ALTQ/RED (Random Early Detection) implementation using 32-bit
109 * fixed-point calculation.
111 * written by kjc using the ns code as a reference.
112 * you can learn more about red and ns from Sally's home page at
113 * http://www-nrg.ee.lbl.gov/floyd/
115 * most of the red parameter values are fixed in this implementation
116 * to prevent fixed-point overflow/underflow.
117 * if you change the parameters, watch out for overflow/underflow!
119 * the parameters used are recommended values by Sally.
120 * the corresponding ns config looks:
122 * minthresh=5 maxthresh=15 queue-size=60
125 * bytes=false (can't be handled by 32-bit fixed-point)
126 * doubleq=false dqthresh=false
130 * alternative red parameters for a slow link.
132 * assume the queue length becomes from zero to L and keeps L, it takes
133 * N packets for q_avg to reach 63% of L.
134 * when q_weight is 0.002, N is about 500 packets.
135 * for a slow link like dial-up, 500 packets takes more than 1 minute!
136 * when q_weight is 0.008, N is about 127 packets.
137 * when q_weight is 0.016, N is about 63 packets.
138 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
139 * are allowed for 0.016.
140 * see Sally's paper for more details.
142 /* normal red parameters */
143 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
144 /* q_weight = 0.00195 */
146 /* red parameters for a slow link */
147 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
148 /* q_weight = 0.0078125 */
150 /* red parameters for a very slow link (e.g., dialup) */
151 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
152 /* q_weight = 0.015625 */
154 /* fixed-point uses 12-bit decimal places */
155 #define FP_SHIFT 12 /* fixed-point shift */
157 /* red parameters for drop probability */
158 #define INV_P_MAX 10 /* inverse of max drop probability */
159 #define TH_MIN 5 /* min threshold */
160 #define TH_MAX 15 /* max threshold */
162 #define RED_LIMIT 60 /* default max queue length */
163 #define RED_STATS /* collect statistics */
166 * our default policy for forced-drop is drop-tail.
167 * (in altq-1.1.2 or earlier, the default was random-drop.
168 * but it makes more sense to punish the cause of the surge.)
169 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
173 #ifdef ALTQ_FLOWVALVE
175 * flow-valve is an extension to protect red from unresponsive flows
176 * and to promote end-to-end congestion control.
177 * flow-valve observes the average drop rates of the flows that have
178 * experienced packet drops in the recent past.
179 * when the average drop rate exceeds the threshold, the flow is
180 * blocked by the flow-valve. the trapped flow should back off
181 * exponentially to escape from the flow-valve.
183 #ifdef RED_RANDOM_DROP
184 #error "random-drop can't be used with flow-valve!"
186 #endif /* ALTQ_FLOWVALVE */
188 /* red_list keeps all red_queue_t's allocated. */
189 static red_queue_t *red_list = NULL;
191 #endif /* ALTQ3_COMPAT */
193 /* default red parameter values */
194 static int default_th_min = TH_MIN;
195 static int default_th_max = TH_MAX;
196 static int default_inv_pmax = INV_P_MAX;
199 /* internal function prototypes */
200 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
201 static struct mbuf *red_dequeue(struct ifaltq *, int);
202 static int red_request(struct ifaltq *, int, void *);
203 static void red_purgeq(red_queue_t *);
204 static int red_detach(red_queue_t *);
205 #ifdef ALTQ_FLOWVALVE
206 static __inline struct fve *flowlist_lookup(struct flowvalve *,
207 struct altq_pktattr *, struct timeval *);
208 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
209 struct altq_pktattr *);
210 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
211 static __inline int fv_p2f(struct flowvalve *, int);
212 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
213 static struct flowvalve *fv_alloc(struct red *);
215 static void fv_destroy(struct flowvalve *);
216 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
218 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
221 #endif /* ALTQ3_COMPAT */
224 * red support routines
227 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
234 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
239 rp->red_weight = W_WEIGHT;
241 rp->red_weight = weight;
243 /* allocate weight table */
244 rp->red_wtab = wtab_alloc(rp->red_weight);
245 if (rp->red_wtab == NULL) {
254 rp->red_inv_pmax = default_inv_pmax;
256 rp->red_inv_pmax = inv_pmax;
258 rp->red_thmin = default_th_min;
260 rp->red_thmin = th_min;
262 rp->red_thmax = default_th_max;
264 rp->red_thmax = th_max;
266 rp->red_flags = flags;
269 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
270 rp->red_pkttime = 800;
272 rp->red_pkttime = pkttime;
275 /* when the link is very slow, adjust red parameters */
276 npkts_per_sec = 1000000 / rp->red_pkttime;
277 if (npkts_per_sec < 50) {
278 /* up to about 400Kbps */
279 rp->red_weight = W_WEIGHT_2;
280 } else if (npkts_per_sec < 300) {
281 /* up to about 2.4Mbps */
282 rp->red_weight = W_WEIGHT_1;
286 /* calculate wshift. weight must be power of 2 */
288 for (i = 0; w > 1; i++)
291 w = 1 << rp->red_wshift;
292 if (w != rp->red_weight) {
293 printf("invalid weight value %d for red! use %d\n",
299 * thmin_s and thmax_s are scaled versions of th_min and th_max
300 * to be compared with avg.
302 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
303 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
306 * precompute probability denominator
307 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
309 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
310 * rp->red_inv_pmax) << FP_SHIFT;
312 microtime(&rp->red_last);
317 red_destroy(red_t *rp)
320 #ifdef ALTQ_FLOWVALVE
321 if (rp->red_flowvalve != NULL)
322 fv_destroy(rp->red_flowvalve);
324 #endif /* ALTQ3_COMPAT */
325 wtab_destroy(rp->red_wtab);
330 red_getstats(red_t *rp, struct redstats *sp)
332 sp->q_avg = rp->red_avg >> rp->red_wshift;
333 sp->xmit_cnt = rp->red_stats.xmit_cnt;
334 sp->drop_cnt = rp->red_stats.drop_cnt;
335 sp->drop_forced = rp->red_stats.drop_forced;
336 sp->drop_unforced = rp->red_stats.drop_unforced;
337 sp->marked_packets = rp->red_stats.marked_packets;
341 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
342 struct altq_pktattr *pktattr)
347 #ifdef ALTQ_FLOWVALVE
348 struct fve *fve = NULL;
350 if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
351 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
356 #endif /* ALTQ3_COMPAT */
361 * if we were idle, we pretend that n packets arrived during
370 t = (now.tv_sec - rp->red_last.tv_sec);
373 * being idle for more than 1 minute, set avg to zero.
374 * this prevents t from overflow.
378 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
379 n = t / rp->red_pkttime - 1;
381 /* the following line does (avg = (1 - Wq)^n * avg) */
383 avg = (avg >> FP_SHIFT) *
384 pow_w(rp->red_wtab, n);
388 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
389 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
390 rp->red_avg = avg; /* save the new value */
393 * red_count keeps a tally of arriving traffic that has not
398 /* see if we drop early */
399 droptype = DTYPE_NODROP;
400 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
401 if (avg >= rp->red_thmax_s) {
402 /* avg >= th_max: forced drop */
403 droptype = DTYPE_FORCED;
404 } else if (rp->red_old == 0) {
405 /* first exceeds th_min */
408 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
409 rp->red_probd, rp->red_count)) {
410 /* mark or drop by red */
411 if ((rp->red_flags & REDF_ECN) &&
412 mark_ecn(m, pktattr, rp->red_flags)) {
413 /* successfully marked. do not drop. */
416 rp->red_stats.marked_packets++;
419 /* unforced drop by red */
420 droptype = DTYPE_EARLY;
429 * if the queue length hits the hard limit, it's a forced drop.
431 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
432 droptype = DTYPE_FORCED;
434 #ifdef RED_RANDOM_DROP
435 /* if successful or forced drop, enqueue this packet. */
436 if (droptype != DTYPE_EARLY)
439 /* if successful, enqueue this packet. */
440 if (droptype == DTYPE_NODROP)
443 if (droptype != DTYPE_NODROP) {
444 if (droptype == DTYPE_EARLY) {
445 /* drop the incoming packet */
447 rp->red_stats.drop_unforced++;
450 /* forced drop, select a victim packet in the queue. */
451 #ifdef RED_RANDOM_DROP
455 rp->red_stats.drop_forced++;
459 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
463 #ifdef ALTQ_FLOWVALVE
464 if (rp->red_flowvalve != NULL)
465 fv_dropbyred(rp->red_flowvalve, pktattr, fve);
467 #endif /* ALTQ3_COMPAT */
471 /* successfully queued */
473 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
479 * early-drop probability is calculated as follows:
480 * prob = p_max * (avg - th_min) / (th_max - th_min)
481 * prob_a = prob / (2 - count*prob)
482 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
483 * here prob_a increases as successive undrop count increases.
484 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
485 * becomes 1 when (count >= (2 / prob))).
488 drop_early(int fp_len, int fp_probd, int count)
490 int d; /* denominator of drop-probability */
492 d = fp_probd - count * fp_len;
494 /* count exceeds the hard limit: drop or mark */
498 * now the range of d is [1..600] in fixed-point. (when
499 * th_max-th_min=10 and p_max=1/30)
500 * drop probability = (avg - TH_MIN) / d
503 if ((arc4random() % d) < fp_len) {
512 * try to mark CE bit to the packet.
513 * returns 1 if successfully marked, 0 otherwise.
516 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
522 at = pf_find_mtag(m);
526 } else if (pktattr != NULL) {
527 af = pktattr->pattr_af;
528 hdr = pktattr->pattr_hdr;
529 #endif /* ALTQ3_COMPAT */
533 /* verify that pattr_hdr is within the mbuf data */
534 for (m0 = m; m0 != NULL; m0 = m0->m_next)
535 if (((caddr_t)hdr >= m0->m_data) &&
536 ((caddr_t)hdr < m0->m_data + m0->m_len))
539 /* ick, tag info is stale */
543 switch (((struct ip *)hdr)->ip_v) {
545 if (flags & REDF_ECN4) {
551 return (0); /* version mismatch! */
553 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
554 return (0); /* not-ECT */
555 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
556 return (1); /* already marked */
559 * ecn-capable but not marked,
560 * mark CE and update checksum
563 ip->ip_tos |= IPTOS_ECN_CE;
565 * update checksum (from RFC1624)
566 * HC' = ~(~HC + ~m + m')
568 sum = ~ntohs(ip->ip_sum) & 0xffff;
569 sum += (~otos & 0xffff) + ip->ip_tos;
570 sum = (sum >> 16) + (sum & 0xffff);
571 sum += (sum >> 16); /* add carry */
572 ip->ip_sum = htons(~sum & 0xffff);
577 case (IPV6_VERSION >> 4):
578 if (flags & REDF_ECN6) {
579 struct ip6_hdr *ip6 = hdr;
582 flowlabel = ntohl(ip6->ip6_flow);
583 if ((flowlabel >> 28) != 6)
584 return (0); /* version mismatch! */
585 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
586 (IPTOS_ECN_NOTECT << 20))
587 return (0); /* not-ECT */
588 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
589 (IPTOS_ECN_CE << 20))
590 return (1); /* already marked */
592 * ecn-capable but not marked, mark CE
594 flowlabel |= (IPTOS_ECN_CE << 20);
595 ip6->ip6_flow = htonl(flowlabel);
613 if ((m = _getq(q)) == NULL) {
614 if (rp->red_idle == 0) {
616 microtime(&rp->red_last);
626 * helper routine to calibrate avg during idle.
627 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
628 * here Wq = 1/weight and the code assumes Wq is close to zero.
630 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
632 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
635 wtab_alloc(int weight)
640 for (w = wtab_list; w != NULL; w = w->w_next)
641 if (w->w_weight == weight) {
646 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
649 w->w_weight = weight;
651 w->w_next = wtab_list;
654 /* initialize the weight table */
655 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
656 for (i = 1; i < 32; i++) {
657 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
658 if (w->w_tab[i] == 0 && w->w_param_max == 0)
659 w->w_param_max = 1 << i;
666 wtab_destroy(struct wtab *w)
670 if (--w->w_refcount > 0)
674 wtab_list = w->w_next;
675 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
676 if (prev->w_next == w) {
677 prev->w_next = w->w_next;
686 pow_w(struct wtab *w, int n)
691 if (n >= w->w_param_max)
702 val = (val * w->w_tab[i]) >> FP_SHIFT;
713 * red device interface
718 redopen(dev, flag, fmt, p)
721 #if (__FreeBSD_version > 500000)
727 /* everything will be done when the queueing scheme is attached. */
732 redclose(dev, flag, fmt, p)
735 #if (__FreeBSD_version > 500000)
744 while ((rqp = red_list) != NULL) {
746 err = red_detach(rqp);
747 if (err != 0 && error == 0)
755 redioctl(dev, cmd, addr, flag, p)
760 #if (__FreeBSD_version > 500000)
767 struct red_interface *ifacep;
771 /* check super-user privilege */
776 #if (__FreeBSD_version > 700000)
777 if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
778 #elsif (__FreeBSD_version > 400000)
779 if ((error = suser(p)) != 0)
781 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
790 ifacep = (struct red_interface *)addr;
791 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
795 error = altq_enable(rqp->rq_ifq);
799 ifacep = (struct red_interface *)addr;
800 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
804 error = altq_disable(rqp->rq_ifq);
808 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
814 /* allocate and initialize red_queue_t */
815 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
820 bzero(rqp, sizeof(red_queue_t));
822 rqp->rq_q = malloc(sizeof(class_queue_t),
824 if (rqp->rq_q == NULL) {
829 bzero(rqp->rq_q, sizeof(class_queue_t));
831 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
832 if (rqp->rq_red == NULL) {
833 free(rqp->rq_q, M_DEVBUF);
839 rqp->rq_ifq = &ifp->if_snd;
840 qtail(rqp->rq_q) = NULL;
842 qlimit(rqp->rq_q) = RED_LIMIT;
843 qtype(rqp->rq_q) = Q_RED;
846 * set RED to this ifnet structure.
848 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
849 red_enqueue, red_dequeue, red_request,
852 red_destroy(rqp->rq_red);
853 free(rqp->rq_q, M_DEVBUF);
858 /* add this state to the red list */
859 rqp->rq_next = red_list;
864 ifacep = (struct red_interface *)addr;
865 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
869 error = red_detach(rqp);
874 struct red_stats *q_stats;
877 q_stats = (struct red_stats *)addr;
878 if ((rqp = altq_lookup(q_stats->iface.red_ifname,
879 ALTQT_RED)) == NULL) {
884 q_stats->q_len = qlen(rqp->rq_q);
885 q_stats->q_limit = qlimit(rqp->rq_q);
888 q_stats->q_avg = rp->red_avg >> rp->red_wshift;
889 q_stats->xmit_cnt = rp->red_stats.xmit_cnt;
890 q_stats->drop_cnt = rp->red_stats.drop_cnt;
891 q_stats->drop_forced = rp->red_stats.drop_forced;
892 q_stats->drop_unforced = rp->red_stats.drop_unforced;
893 q_stats->marked_packets = rp->red_stats.marked_packets;
895 q_stats->weight = rp->red_weight;
896 q_stats->inv_pmax = rp->red_inv_pmax;
897 q_stats->th_min = rp->red_thmin;
898 q_stats->th_max = rp->red_thmax;
900 #ifdef ALTQ_FLOWVALVE
901 if (rp->red_flowvalve != NULL) {
902 struct flowvalve *fv = rp->red_flowvalve;
903 q_stats->fv_flows = fv->fv_flows;
904 q_stats->fv_pass = fv->fv_stats.pass;
905 q_stats->fv_predrop = fv->fv_stats.predrop;
906 q_stats->fv_alloc = fv->fv_stats.alloc;
907 q_stats->fv_escape = fv->fv_stats.escape;
909 #endif /* ALTQ_FLOWVALVE */
910 q_stats->fv_flows = 0;
911 q_stats->fv_pass = 0;
912 q_stats->fv_predrop = 0;
913 q_stats->fv_alloc = 0;
914 q_stats->fv_escape = 0;
915 #ifdef ALTQ_FLOWVALVE
917 #endif /* ALTQ_FLOWVALVE */
918 } while (/*CONSTCOND*/ 0);
927 fc = (struct red_conf *)addr;
928 if ((rqp = altq_lookup(fc->iface.red_ifname,
929 ALTQT_RED)) == NULL) {
933 new = red_alloc(fc->red_weight,
950 limit = fc->red_limit;
951 if (limit < fc->red_thmax)
952 limit = fc->red_thmax;
953 qlimit(rqp->rq_q) = limit;
954 fc->red_limit = limit; /* write back the new value */
956 red_destroy(rqp->rq_red);
961 /* write back new values */
962 fc->red_limit = limit;
963 fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
964 fc->red_thmin = rqp->rq_red->red_thmin;
965 fc->red_thmax = rqp->rq_red->red_thmax;
967 } while (/*CONSTCOND*/ 0);
970 case RED_SETDEFAULTS:
972 struct redparams *rp;
974 rp = (struct redparams *)addr;
976 default_th_min = rp->th_min;
977 default_th_max = rp->th_max;
978 default_inv_pmax = rp->inv_pmax;
979 } while (/*CONSTCOND*/ 0);
996 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
997 altq_disable(rqp->rq_ifq);
999 if ((error = altq_detach(rqp->rq_ifq)))
1002 if (red_list == rqp)
1003 red_list = rqp->rq_next;
1005 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1006 if (tmp->rq_next == rqp) {
1007 tmp->rq_next = rqp->rq_next;
1011 printf("red_detach: no state found in red_list!\n");
1014 red_destroy(rqp->rq_red);
1015 free(rqp->rq_q, M_DEVBUF);
1016 free(rqp, M_DEVBUF);
1023 * returns: 0 when successfully queued.
1024 * ENOBUFS when drop occurs.
1027 red_enqueue(ifq, m, pktattr)
1030 struct altq_pktattr *pktattr;
1032 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1034 IFQ_LOCK_ASSERT(ifq);
1036 if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1044 * must be called in splimp.
1046 * returns: mbuf dequeued.
1047 * NULL when no packet is available in the queue.
1050 static struct mbuf *
1051 red_dequeue(ifq, op)
1055 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1058 IFQ_LOCK_ASSERT(ifq);
1060 if (op == ALTDQ_POLL)
1061 return qhead(rqp->rq_q);
1063 /* op == ALTDQ_REMOVE */
1064 m = red_getq(rqp->rq_red, rqp->rq_q);
1071 red_request(ifq, req, arg)
1076 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1078 IFQ_LOCK_ASSERT(ifq);
1093 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1094 rqp->rq_ifq->ifq_len = 0;
1097 #ifdef ALTQ_FLOWVALVE
1099 #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */
1100 #define FV_PSCALE(x) ((x) << FV_PSHIFT)
1101 #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT)
1102 #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */
1103 #define FV_FSCALE(x) ((x) << FV_FSHIFT)
1104 #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT)
1106 #define FV_TIMER (3 * hz) /* timer value for garbage collector */
1107 #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */
1109 #define FV_N 10 /* update fve_f every FV_N packets */
1111 #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */
1112 #define FV_TTHRESH 3 /* time threshold to delete fve */
1113 #define FV_ALPHA 5 /* extra packet count */
1117 #if (__FreeBSD_version > 300000)
1118 #define FV_TIMESTAMP(tp) getmicrotime(tp)
1120 #define FV_TIMESTAMP(tp) { (*(tp)) = time; }
1124 * Brtt table: 127 entry table to convert drop rate (p) to
1125 * the corresponding bandwidth fraction (f)
1126 * the following equation is implemented to use scaled values,
1127 * fve_p and fve_f, in the fixed point format.
1129 * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1130 * f = Brtt(p) / (max_th + alpha)
1132 #define BRTT_SIZE 128
1133 #define BRTT_SHIFT 12
1134 #define BRTT_MASK 0x0007f000
1135 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT))
1137 const int brtt_tab[BRTT_SIZE] = {
1138 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1139 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1140 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1141 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1142 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1143 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1144 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1145 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1146 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1147 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1148 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1149 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1150 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1151 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1152 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1153 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1156 static __inline struct fve *
1157 flowlist_lookup(fv, pktattr, now)
1158 struct flowvalve *fv;
1159 struct altq_pktattr *pktattr;
1160 struct timeval *now;
1166 struct ip6_hdr *ip6;
1168 struct timeval tthresh;
1170 if (pktattr == NULL)
1173 tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1176 * search the flow list
1178 switch (pktattr->pattr_af) {
1180 ip = (struct ip *)pktattr->pattr_hdr;
1181 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1182 if (fve->fve_lastdrop.tv_sec == 0)
1184 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1185 fve->fve_lastdrop.tv_sec = 0;
1188 if (fve->fve_flow.flow_af == AF_INET &&
1189 fve->fve_flow.flow_ip.ip_src.s_addr ==
1190 ip->ip_src.s_addr &&
1191 fve->fve_flow.flow_ip.ip_dst.s_addr ==
1199 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1200 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1201 if (fve->fve_lastdrop.tv_sec == 0)
1203 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1204 fve->fve_lastdrop.tv_sec = 0;
1207 if (fve->fve_flow.flow_af == AF_INET6 &&
1208 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1210 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1219 /* unknown protocol. no drop. */
1222 fv->fv_flows = flows; /* save the number of active fve's */
1226 static __inline struct fve *
1227 flowlist_reclaim(fv, pktattr)
1228 struct flowvalve *fv;
1229 struct altq_pktattr *pktattr;
1234 struct ip6_hdr *ip6;
1238 * get an entry from the tail of the LRU list.
1240 fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1242 switch (pktattr->pattr_af) {
1244 ip = (struct ip *)pktattr->pattr_hdr;
1245 fve->fve_flow.flow_af = AF_INET;
1246 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1247 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1251 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1252 fve->fve_flow.flow_af = AF_INET6;
1253 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1254 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1259 fve->fve_state = Green;
1262 fve->fve_ifseq = fv->fv_ifseq - 1;
1267 fv->fv_stats.alloc++;
1272 static __inline void
1273 flowlist_move_to_head(fv, fve)
1274 struct flowvalve *fv;
1277 if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1278 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1279 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1283 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1285 * allocate flowvalve structure
1287 static struct flowvalve *
1291 struct flowvalve *fv;
1295 num = FV_FLOWLISTSIZE;
1296 fv = malloc(sizeof(struct flowvalve),
1297 M_DEVBUF, M_WAITOK);
1300 bzero(fv, sizeof(struct flowvalve));
1302 fv->fv_fves = malloc(sizeof(struct fve) * num,
1303 M_DEVBUF, M_WAITOK);
1304 if (fv->fv_fves == NULL) {
1308 bzero(fv->fv_fves, sizeof(struct fve) * num);
1311 TAILQ_INIT(&fv->fv_flowlist);
1312 for (i = 0; i < num; i++) {
1313 fve = &fv->fv_fves[i];
1314 fve->fve_lastdrop.tv_sec = 0;
1315 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1318 /* initialize drop rate threshold in scaled fixed-point */
1319 fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1321 /* initialize drop rate to fraction table */
1322 fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1323 M_DEVBUF, M_WAITOK);
1324 if (fv->fv_p2ftab == NULL) {
1325 free(fv->fv_fves, M_DEVBUF);
1330 * create the p2f table.
1331 * (shift is used to keep the precision)
1333 for (i = 1; i < BRTT_SIZE; i++) {
1336 f = brtt_tab[i] << 8;
1337 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1344 static void fv_destroy(fv)
1345 struct flowvalve *fv;
1347 free(fv->fv_p2ftab, M_DEVBUF);
1348 free(fv->fv_fves, M_DEVBUF);
1354 struct flowvalve *fv;
1360 f = fv->fv_p2ftab[BRTT_SIZE-1];
1361 else if ((val = (p & BRTT_MASK)))
1362 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1364 f = fv->fv_p2ftab[1];
1369 * check if an arriving packet should be pre-dropped.
1370 * called from red_addq() when a packet arrives.
1371 * returns 1 when the packet should be pre-dropped.
1372 * should be called in splimp.
1375 fv_checkflow(fv, pktattr, fcache)
1376 struct flowvalve *fv;
1377 struct altq_pktattr *pktattr;
1378 struct fve **fcache;
1386 if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1387 /* no matching entry in the flowlist */
1392 /* update fraction f for every FV_N packets */
1393 if (++fve->fve_count == FV_N) {
1395 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1398 (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1399 + fve->fve_f - FV_FUNSCALE(fve->fve_f);
1400 fve->fve_ifseq = fv->fv_ifseq;
1407 if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1410 /* calculate a threshold */
1411 fthresh = fv_p2f(fv, fve->fve_p);
1412 if (fve->fve_f > fthresh)
1413 fve->fve_state = Red;
1416 if (fve->fve_state == Red) {
1420 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1421 /* no drop for at least FV_BACKOFFTHRESH sec */
1423 fve->fve_state = Green;
1425 fv->fv_stats.escape++;
1428 /* block this flow */
1429 flowlist_move_to_head(fv, fve);
1430 fve->fve_lastdrop = now;
1432 fv->fv_stats.predrop++;
1441 fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1445 fv->fv_stats.pass++;
1451 * called from red_addq when a packet is dropped by red.
1452 * should be called in splimp.
1454 static void fv_dropbyred(fv, pktattr, fcache)
1455 struct flowvalve *fv;
1456 struct altq_pktattr *pktattr;
1462 if (pktattr == NULL)
1467 /* the fve of this packet is already cached */
1469 else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1470 fve = flowlist_reclaim(fv, pktattr);
1472 flowlist_move_to_head(fv, fve);
1475 * update p: the following line cancels the update
1476 * in fv_checkflow() and calculate
1477 * p = Wp + (1 - Wp) * p
1479 fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1481 fve->fve_lastdrop = now;
1484 #endif /* ALTQ_FLOWVALVE */
1488 static struct altqsw red_sw =
1489 {"red", redopen, redclose, redioctl};
1491 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1492 MODULE_VERSION(altq_red, 1);
1494 #endif /* KLD_MODULE */
1495 #endif /* ALTQ3_COMPAT */
1497 #endif /* ALTQ_RED */