2 /* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 #if defined(__FreeBSD__) || defined(__NetBSD__)
67 #include "opt_inet6.h"
69 #endif /* __FreeBSD__ || __NetBSD__ */
70 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
72 #include <sys/param.h>
73 #include <sys/malloc.h>
75 #include <sys/socket.h>
76 #include <sys/systm.h>
77 #include <sys/errno.h>
78 #if 1 /* ALTQ3_COMPAT */
79 #include <sys/sockio.h>
81 #include <sys/kernel.h>
83 #include <sys/queue.h>
86 #endif /* ALTQ3_COMPAT */
90 #include <netinet/in.h>
91 #include <netinet/in_systm.h>
92 #include <netinet/ip.h>
94 #include <netinet/ip6.h>
97 #include <net/pfvar.h>
98 #include <altq/altq.h>
99 #include <altq/altq_red.h>
101 #include <altq/altq_conf.h>
102 #ifdef ALTQ_FLOWVALVE
103 #include <altq/altq_flowvalve.h>
108 * ALTQ/RED (Random Early Detection) implementation using 32-bit
109 * fixed-point calculation.
111 * written by kjc using the ns code as a reference.
112 * you can learn more about red and ns from Sally's home page at
113 * http://www-nrg.ee.lbl.gov/floyd/
115 * most of the red parameter values are fixed in this implementation
116 * to prevent fixed-point overflow/underflow.
117 * if you change the parameters, watch out for overflow/underflow!
119 * the parameters used are recommended values by Sally.
120 * the corresponding ns config looks:
122 * minthresh=5 maxthresh=15 queue-size=60
125 * bytes=false (can't be handled by 32-bit fixed-point)
126 * doubleq=false dqthresh=false
130 * alternative red parameters for a slow link.
 * assume the queue length grows from zero to L and then stays at L; it takes
133 * N packets for q_avg to reach 63% of L.
134 * when q_weight is 0.002, N is about 500 packets.
135 * for a slow link like dial-up, 500 packets takes more than 1 minute!
136 * when q_weight is 0.008, N is about 127 packets.
137 * when q_weight is 0.016, N is about 63 packets.
138 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
139 * are allowed for 0.016.
140 * see Sally's paper for more details.
142 /* normal red parameters */
143 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
144 /* q_weight = 0.00195 */
146 /* red parameters for a slow link */
147 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
148 /* q_weight = 0.0078125 */
150 /* red parameters for a very slow link (e.g., dialup) */
151 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
152 /* q_weight = 0.015625 */
/* fixed-point uses 12-bit decimal places */
#define	FP_SHIFT	12	/* fixed-point shift (avg is scaled by 1<<12) */
/* red parameters for drop probability */
#define	INV_P_MAX	10	/* inverse of max drop probability (p_max = 1/10) */
#define	TH_MIN		 5	/* min threshold (packets) */
#define	TH_MAX		15	/* max threshold (packets) */
#define	RED_LIMIT	60	/* default max queue length (hard drop limit) */
#define	RED_STATS	/* collect statistics */
166 * our default policy for forced-drop is drop-tail.
167 * (in altq-1.1.2 or earlier, the default was random-drop.
168 * but it makes more sense to punish the cause of the surge.)
169 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
173 #ifdef ALTQ_FLOWVALVE
 * flow-valve is an extension to protect red from unresponsive flows
176 * and to promote end-to-end congestion control.
177 * flow-valve observes the average drop rates of the flows that have
178 * experienced packet drops in the recent past.
179 * when the average drop rate exceeds the threshold, the flow is
180 * blocked by the flow-valve. the trapped flow should back off
181 * exponentially to escape from the flow-valve.
183 #ifdef RED_RANDOM_DROP
184 #error "random-drop can't be used with flow-valve!"
186 #endif /* ALTQ_FLOWVALVE */
188 /* red_list keeps all red_queue_t's allocated. */
189 static red_queue_t *red_list = NULL;
191 #endif /* ALTQ3_COMPAT */
193 /* default red parameter values */
194 static int default_th_min = TH_MIN;
195 static int default_th_max = TH_MAX;
196 static int default_inv_pmax = INV_P_MAX;
199 /* internal function prototypes */
200 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
201 static struct mbuf *red_dequeue(struct ifaltq *, int);
202 static int red_request(struct ifaltq *, int, void *);
203 static void red_purgeq(red_queue_t *);
204 static int red_detach(red_queue_t *);
205 #ifdef ALTQ_FLOWVALVE
206 static __inline struct fve *flowlist_lookup(struct flowvalve *,
207 struct altq_pktattr *, struct timeval *);
208 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
209 struct altq_pktattr *);
210 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
211 static __inline int fv_p2f(struct flowvalve *, int);
212 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
213 static struct flowvalve *fv_alloc(struct red *);
215 static void fv_destroy(struct flowvalve *);
216 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
218 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
221 #endif /* ALTQ3_COMPAT */
224 * red support routines
227 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
234 rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK);
237 bzero(rp, sizeof(red_t));
243 rp->red_weight = W_WEIGHT;
245 rp->red_weight = weight;
247 rp->red_inv_pmax = default_inv_pmax;
249 rp->red_inv_pmax = inv_pmax;
251 rp->red_thmin = default_th_min;
253 rp->red_thmin = th_min;
255 rp->red_thmax = default_th_max;
257 rp->red_thmax = th_max;
259 rp->red_flags = flags;
262 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
263 rp->red_pkttime = 800;
265 rp->red_pkttime = pkttime;
268 /* when the link is very slow, adjust red parameters */
269 npkts_per_sec = 1000000 / rp->red_pkttime;
270 if (npkts_per_sec < 50) {
271 /* up to about 400Kbps */
272 rp->red_weight = W_WEIGHT_2;
273 } else if (npkts_per_sec < 300) {
274 /* up to about 2.4Mbps */
275 rp->red_weight = W_WEIGHT_1;
279 /* calculate wshift. weight must be power of 2 */
281 for (i = 0; w > 1; i++)
284 w = 1 << rp->red_wshift;
285 if (w != rp->red_weight) {
286 printf("invalid weight value %d for red! use %d\n",
292 * thmin_s and thmax_s are scaled versions of th_min and th_max
293 * to be compared with avg.
295 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
296 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
299 * precompute probability denominator
300 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
302 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
303 * rp->red_inv_pmax) << FP_SHIFT;
305 /* allocate weight table */
306 rp->red_wtab = wtab_alloc(rp->red_weight);
308 microtime(&rp->red_last);
313 red_destroy(red_t *rp)
316 #ifdef ALTQ_FLOWVALVE
317 if (rp->red_flowvalve != NULL)
318 fv_destroy(rp->red_flowvalve);
320 #endif /* ALTQ3_COMPAT */
321 wtab_destroy(rp->red_wtab);
326 red_getstats(red_t *rp, struct redstats *sp)
328 sp->q_avg = rp->red_avg >> rp->red_wshift;
329 sp->xmit_cnt = rp->red_stats.xmit_cnt;
330 sp->drop_cnt = rp->red_stats.drop_cnt;
331 sp->drop_forced = rp->red_stats.drop_forced;
332 sp->drop_unforced = rp->red_stats.drop_unforced;
333 sp->marked_packets = rp->red_stats.marked_packets;
337 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
338 struct altq_pktattr *pktattr)
343 #ifdef ALTQ_FLOWVALVE
344 struct fve *fve = NULL;
346 if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
347 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
352 #endif /* ALTQ3_COMPAT */
357 * if we were idle, we pretend that n packets arrived during
366 t = (now.tv_sec - rp->red_last.tv_sec);
369 * being idle for more than 1 minute, set avg to zero.
370 * this prevents t from overflow.
374 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
375 n = t / rp->red_pkttime - 1;
377 /* the following line does (avg = (1 - Wq)^n * avg) */
379 avg = (avg >> FP_SHIFT) *
380 pow_w(rp->red_wtab, n);
384 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
385 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
386 rp->red_avg = avg; /* save the new value */
389 * red_count keeps a tally of arriving traffic that has not
394 /* see if we drop early */
395 droptype = DTYPE_NODROP;
396 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
397 if (avg >= rp->red_thmax_s) {
398 /* avg >= th_max: forced drop */
399 droptype = DTYPE_FORCED;
400 } else if (rp->red_old == 0) {
401 /* first exceeds th_min */
404 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
405 rp->red_probd, rp->red_count)) {
406 /* mark or drop by red */
407 if ((rp->red_flags & REDF_ECN) &&
408 mark_ecn(m, pktattr, rp->red_flags)) {
409 /* successfully marked. do not drop. */
412 rp->red_stats.marked_packets++;
415 /* unforced drop by red */
416 droptype = DTYPE_EARLY;
425 * if the queue length hits the hard limit, it's a forced drop.
427 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
428 droptype = DTYPE_FORCED;
430 #ifdef RED_RANDOM_DROP
431 /* if successful or forced drop, enqueue this packet. */
432 if (droptype != DTYPE_EARLY)
435 /* if successful, enqueue this packet. */
436 if (droptype == DTYPE_NODROP)
439 if (droptype != DTYPE_NODROP) {
440 if (droptype == DTYPE_EARLY) {
441 /* drop the incoming packet */
443 rp->red_stats.drop_unforced++;
446 /* forced drop, select a victim packet in the queue. */
447 #ifdef RED_RANDOM_DROP
451 rp->red_stats.drop_forced++;
455 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
459 #ifdef ALTQ_FLOWVALVE
460 if (rp->red_flowvalve != NULL)
461 fv_dropbyred(rp->red_flowvalve, pktattr, fve);
463 #endif /* ALTQ3_COMPAT */
467 /* successfully queued */
469 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
475 * early-drop probability is calculated as follows:
476 * prob = p_max * (avg - th_min) / (th_max - th_min)
477 * prob_a = prob / (2 - count*prob)
478 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
479 * here prob_a increases as successive undrop count increases.
480 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
481 * becomes 1 when (count >= (2 / prob))).
484 drop_early(int fp_len, int fp_probd, int count)
486 int d; /* denominator of drop-probability */
488 d = fp_probd - count * fp_len;
490 /* count exceeds the hard limit: drop or mark */
494 * now the range of d is [1..600] in fixed-point. (when
495 * th_max-th_min=10 and p_max=1/30)
496 * drop probability = (avg - TH_MIN) / d
499 if ((arc4random() % d) < fp_len) {
508 * try to mark CE bit to the packet.
509 * returns 1 if successfully marked, 0 otherwise.
512 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
518 at = pf_find_mtag(m);
522 } else if (pktattr != NULL) {
523 af = pktattr->pattr_af;
524 hdr = pktattr->pattr_hdr;
525 #endif /* ALTQ3_COMPAT */
529 /* verify that pattr_hdr is within the mbuf data */
530 for (m0 = m; m0 != NULL; m0 = m0->m_next)
531 if (((caddr_t)hdr >= m0->m_data) &&
532 ((caddr_t)hdr < m0->m_data + m0->m_len))
535 /* ick, tag info is stale */
539 switch (((struct ip *)hdr)->ip_v) {
541 if (flags & REDF_ECN4) {
547 return (0); /* version mismatch! */
549 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
550 return (0); /* not-ECT */
551 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
552 return (1); /* already marked */
555 * ecn-capable but not marked,
556 * mark CE and update checksum
559 ip->ip_tos |= IPTOS_ECN_CE;
561 * update checksum (from RFC1624)
562 * HC' = ~(~HC + ~m + m')
564 sum = ~ntohs(ip->ip_sum) & 0xffff;
565 sum += (~otos & 0xffff) + ip->ip_tos;
566 sum = (sum >> 16) + (sum & 0xffff);
567 sum += (sum >> 16); /* add carry */
568 ip->ip_sum = htons(~sum & 0xffff);
573 case (IPV6_VERSION >> 4):
574 if (flags & REDF_ECN6) {
575 struct ip6_hdr *ip6 = hdr;
578 flowlabel = ntohl(ip6->ip6_flow);
579 if ((flowlabel >> 28) != 6)
580 return (0); /* version mismatch! */
581 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
582 (IPTOS_ECN_NOTECT << 20))
583 return (0); /* not-ECT */
584 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
585 (IPTOS_ECN_CE << 20))
586 return (1); /* already marked */
588 * ecn-capable but not marked, mark CE
590 flowlabel |= (IPTOS_ECN_CE << 20);
591 ip6->ip6_flow = htonl(flowlabel);
609 if ((m = _getq(q)) == NULL) {
610 if (rp->red_idle == 0) {
612 microtime(&rp->red_last);
622 * helper routine to calibrate avg during idle.
623 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
624 * here Wq = 1/weight and the code assumes Wq is close to zero.
626 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
628 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
631 wtab_alloc(int weight)
636 for (w = wtab_list; w != NULL; w = w->w_next)
637 if (w->w_weight == weight) {
642 w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK);
644 panic("wtab_alloc: malloc failed!");
645 bzero(w, sizeof(struct wtab));
646 w->w_weight = weight;
648 w->w_next = wtab_list;
651 /* initialize the weight table */
652 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
653 for (i = 1; i < 32; i++) {
654 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
655 if (w->w_tab[i] == 0 && w->w_param_max == 0)
656 w->w_param_max = 1 << i;
663 wtab_destroy(struct wtab *w)
667 if (--w->w_refcount > 0)
671 wtab_list = w->w_next;
672 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
673 if (prev->w_next == w) {
674 prev->w_next = w->w_next;
683 pow_w(struct wtab *w, int n)
688 if (n >= w->w_param_max)
699 val = (val * w->w_tab[i]) >> FP_SHIFT;
710 * red device interface
715 redopen(dev, flag, fmt, p)
718 #if (__FreeBSD_version > 500000)
724 /* everything will be done when the queueing scheme is attached. */
729 redclose(dev, flag, fmt, p)
732 #if (__FreeBSD_version > 500000)
741 while ((rqp = red_list) != NULL) {
743 err = red_detach(rqp);
744 if (err != 0 && error == 0)
752 redioctl(dev, cmd, addr, flag, p)
757 #if (__FreeBSD_version > 500000)
764 struct red_interface *ifacep;
768 /* check super-user privilege */
773 #if (__FreeBSD_version > 700000)
774 if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
775 #elsif (__FreeBSD_version > 400000)
776 if ((error = suser(p)) != 0)
778 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
787 ifacep = (struct red_interface *)addr;
788 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
792 error = altq_enable(rqp->rq_ifq);
796 ifacep = (struct red_interface *)addr;
797 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
801 error = altq_disable(rqp->rq_ifq);
805 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
811 /* allocate and initialize red_queue_t */
812 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
817 bzero(rqp, sizeof(red_queue_t));
819 rqp->rq_q = malloc(sizeof(class_queue_t),
821 if (rqp->rq_q == NULL) {
826 bzero(rqp->rq_q, sizeof(class_queue_t));
828 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
829 if (rqp->rq_red == NULL) {
830 free(rqp->rq_q, M_DEVBUF);
836 rqp->rq_ifq = &ifp->if_snd;
837 qtail(rqp->rq_q) = NULL;
839 qlimit(rqp->rq_q) = RED_LIMIT;
840 qtype(rqp->rq_q) = Q_RED;
843 * set RED to this ifnet structure.
845 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
846 red_enqueue, red_dequeue, red_request,
849 red_destroy(rqp->rq_red);
850 free(rqp->rq_q, M_DEVBUF);
855 /* add this state to the red list */
856 rqp->rq_next = red_list;
861 ifacep = (struct red_interface *)addr;
862 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
866 error = red_detach(rqp);
871 struct red_stats *q_stats;
874 q_stats = (struct red_stats *)addr;
875 if ((rqp = altq_lookup(q_stats->iface.red_ifname,
876 ALTQT_RED)) == NULL) {
881 q_stats->q_len = qlen(rqp->rq_q);
882 q_stats->q_limit = qlimit(rqp->rq_q);
885 q_stats->q_avg = rp->red_avg >> rp->red_wshift;
886 q_stats->xmit_cnt = rp->red_stats.xmit_cnt;
887 q_stats->drop_cnt = rp->red_stats.drop_cnt;
888 q_stats->drop_forced = rp->red_stats.drop_forced;
889 q_stats->drop_unforced = rp->red_stats.drop_unforced;
890 q_stats->marked_packets = rp->red_stats.marked_packets;
892 q_stats->weight = rp->red_weight;
893 q_stats->inv_pmax = rp->red_inv_pmax;
894 q_stats->th_min = rp->red_thmin;
895 q_stats->th_max = rp->red_thmax;
897 #ifdef ALTQ_FLOWVALVE
898 if (rp->red_flowvalve != NULL) {
899 struct flowvalve *fv = rp->red_flowvalve;
900 q_stats->fv_flows = fv->fv_flows;
901 q_stats->fv_pass = fv->fv_stats.pass;
902 q_stats->fv_predrop = fv->fv_stats.predrop;
903 q_stats->fv_alloc = fv->fv_stats.alloc;
904 q_stats->fv_escape = fv->fv_stats.escape;
906 #endif /* ALTQ_FLOWVALVE */
907 q_stats->fv_flows = 0;
908 q_stats->fv_pass = 0;
909 q_stats->fv_predrop = 0;
910 q_stats->fv_alloc = 0;
911 q_stats->fv_escape = 0;
912 #ifdef ALTQ_FLOWVALVE
914 #endif /* ALTQ_FLOWVALVE */
915 } while (/*CONSTCOND*/ 0);
924 fc = (struct red_conf *)addr;
925 if ((rqp = altq_lookup(fc->iface.red_ifname,
926 ALTQT_RED)) == NULL) {
930 new = red_alloc(fc->red_weight,
947 limit = fc->red_limit;
948 if (limit < fc->red_thmax)
949 limit = fc->red_thmax;
950 qlimit(rqp->rq_q) = limit;
951 fc->red_limit = limit; /* write back the new value */
953 red_destroy(rqp->rq_red);
958 /* write back new values */
959 fc->red_limit = limit;
960 fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
961 fc->red_thmin = rqp->rq_red->red_thmin;
962 fc->red_thmax = rqp->rq_red->red_thmax;
964 } while (/*CONSTCOND*/ 0);
967 case RED_SETDEFAULTS:
969 struct redparams *rp;
971 rp = (struct redparams *)addr;
973 default_th_min = rp->th_min;
974 default_th_max = rp->th_max;
975 default_inv_pmax = rp->inv_pmax;
976 } while (/*CONSTCOND*/ 0);
993 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
994 altq_disable(rqp->rq_ifq);
996 if ((error = altq_detach(rqp->rq_ifq)))
1000 red_list = rqp->rq_next;
1002 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1003 if (tmp->rq_next == rqp) {
1004 tmp->rq_next = rqp->rq_next;
1008 printf("red_detach: no state found in red_list!\n");
1011 red_destroy(rqp->rq_red);
1012 free(rqp->rq_q, M_DEVBUF);
1013 free(rqp, M_DEVBUF);
1020 * returns: 0 when successfully queued.
1021 * ENOBUFS when drop occurs.
1024 red_enqueue(ifq, m, pktattr)
1027 struct altq_pktattr *pktattr;
1029 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1031 IFQ_LOCK_ASSERT(ifq);
1033 if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1041 * must be called in splimp.
1043 * returns: mbuf dequeued.
1044 * NULL when no packet is available in the queue.
1047 static struct mbuf *
1048 red_dequeue(ifq, op)
1052 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1055 IFQ_LOCK_ASSERT(ifq);
1057 if (op == ALTDQ_POLL)
1058 return qhead(rqp->rq_q);
1060 /* op == ALTDQ_REMOVE */
1061 m = red_getq(rqp->rq_red, rqp->rq_q);
1068 red_request(ifq, req, arg)
1073 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1075 IFQ_LOCK_ASSERT(ifq);
1090 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1091 rqp->rq_ifq->ifq_len = 0;
1094 #ifdef ALTQ_FLOWVALVE
1096 #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */
1097 #define FV_PSCALE(x) ((x) << FV_PSHIFT)
1098 #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT)
1099 #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */
1100 #define FV_FSCALE(x) ((x) << FV_FSHIFT)
1101 #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT)
1103 #define FV_TIMER (3 * hz) /* timer value for garbage collector */
1104 #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */
1106 #define FV_N 10 /* update fve_f every FV_N packets */
1108 #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */
1109 #define FV_TTHRESH 3 /* time threshold to delete fve */
1110 #define FV_ALPHA 5 /* extra packet count */
1114 #if (__FreeBSD_version > 300000)
1115 #define FV_TIMESTAMP(tp) getmicrotime(tp)
1117 #define FV_TIMESTAMP(tp) { (*(tp)) = time; }
1121 * Brtt table: 127 entry table to convert drop rate (p) to
1122 * the corresponding bandwidth fraction (f)
1123 * the following equation is implemented to use scaled values,
1124 * fve_p and fve_f, in the fixed point format.
1126 * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1127 * f = Brtt(p) / (max_th + alpha)
1129 #define BRTT_SIZE 128
1130 #define BRTT_SHIFT 12
1131 #define BRTT_MASK 0x0007f000
1132 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT))
1134 const int brtt_tab[BRTT_SIZE] = {
1135 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1136 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1137 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1138 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1139 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1140 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1141 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1142 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1143 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1144 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1145 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1146 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1147 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1148 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1149 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1150 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1153 static __inline struct fve *
1154 flowlist_lookup(fv, pktattr, now)
1155 struct flowvalve *fv;
1156 struct altq_pktattr *pktattr;
1157 struct timeval *now;
1163 struct ip6_hdr *ip6;
1165 struct timeval tthresh;
1167 if (pktattr == NULL)
1170 tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1173 * search the flow list
1175 switch (pktattr->pattr_af) {
1177 ip = (struct ip *)pktattr->pattr_hdr;
1178 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1179 if (fve->fve_lastdrop.tv_sec == 0)
1181 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1182 fve->fve_lastdrop.tv_sec = 0;
1185 if (fve->fve_flow.flow_af == AF_INET &&
1186 fve->fve_flow.flow_ip.ip_src.s_addr ==
1187 ip->ip_src.s_addr &&
1188 fve->fve_flow.flow_ip.ip_dst.s_addr ==
1196 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1197 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1198 if (fve->fve_lastdrop.tv_sec == 0)
1200 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1201 fve->fve_lastdrop.tv_sec = 0;
1204 if (fve->fve_flow.flow_af == AF_INET6 &&
1205 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1207 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1216 /* unknown protocol. no drop. */
1219 fv->fv_flows = flows; /* save the number of active fve's */
1223 static __inline struct fve *
1224 flowlist_reclaim(fv, pktattr)
1225 struct flowvalve *fv;
1226 struct altq_pktattr *pktattr;
1231 struct ip6_hdr *ip6;
1235 * get an entry from the tail of the LRU list.
1237 fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1239 switch (pktattr->pattr_af) {
1241 ip = (struct ip *)pktattr->pattr_hdr;
1242 fve->fve_flow.flow_af = AF_INET;
1243 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1244 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1248 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1249 fve->fve_flow.flow_af = AF_INET6;
1250 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1251 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1256 fve->fve_state = Green;
1259 fve->fve_ifseq = fv->fv_ifseq - 1;
1264 fv->fv_stats.alloc++;
1269 static __inline void
1270 flowlist_move_to_head(fv, fve)
1271 struct flowvalve *fv;
1274 if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1275 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1276 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1280 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1282 * allocate flowvalve structure
1284 static struct flowvalve *
1288 struct flowvalve *fv;
1292 num = FV_FLOWLISTSIZE;
1293 fv = malloc(sizeof(struct flowvalve),
1294 M_DEVBUF, M_WAITOK);
1297 bzero(fv, sizeof(struct flowvalve));
1299 fv->fv_fves = malloc(sizeof(struct fve) * num,
1300 M_DEVBUF, M_WAITOK);
1301 if (fv->fv_fves == NULL) {
1305 bzero(fv->fv_fves, sizeof(struct fve) * num);
1308 TAILQ_INIT(&fv->fv_flowlist);
1309 for (i = 0; i < num; i++) {
1310 fve = &fv->fv_fves[i];
1311 fve->fve_lastdrop.tv_sec = 0;
1312 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1315 /* initialize drop rate threshold in scaled fixed-point */
1316 fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1318 /* initialize drop rate to fraction table */
1319 fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1320 M_DEVBUF, M_WAITOK);
1321 if (fv->fv_p2ftab == NULL) {
1322 free(fv->fv_fves, M_DEVBUF);
1327 * create the p2f table.
1328 * (shift is used to keep the precision)
1330 for (i = 1; i < BRTT_SIZE; i++) {
1333 f = brtt_tab[i] << 8;
1334 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1341 static void fv_destroy(fv)
1342 struct flowvalve *fv;
1344 free(fv->fv_p2ftab, M_DEVBUF);
1345 free(fv->fv_fves, M_DEVBUF);
1351 struct flowvalve *fv;
1357 f = fv->fv_p2ftab[BRTT_SIZE-1];
1358 else if ((val = (p & BRTT_MASK)))
1359 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1361 f = fv->fv_p2ftab[1];
1366 * check if an arriving packet should be pre-dropped.
1367 * called from red_addq() when a packet arrives.
1368 * returns 1 when the packet should be pre-dropped.
1369 * should be called in splimp.
1372 fv_checkflow(fv, pktattr, fcache)
1373 struct flowvalve *fv;
1374 struct altq_pktattr *pktattr;
1375 struct fve **fcache;
1383 if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1384 /* no matching entry in the flowlist */
1389 /* update fraction f for every FV_N packets */
1390 if (++fve->fve_count == FV_N) {
1392 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1395 (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1396 + fve->fve_f - FV_FUNSCALE(fve->fve_f);
1397 fve->fve_ifseq = fv->fv_ifseq;
1404 if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1407 /* calculate a threshold */
1408 fthresh = fv_p2f(fv, fve->fve_p);
1409 if (fve->fve_f > fthresh)
1410 fve->fve_state = Red;
1413 if (fve->fve_state == Red) {
1417 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1418 /* no drop for at least FV_BACKOFFTHRESH sec */
1420 fve->fve_state = Green;
1422 fv->fv_stats.escape++;
1425 /* block this flow */
1426 flowlist_move_to_head(fv, fve);
1427 fve->fve_lastdrop = now;
1429 fv->fv_stats.predrop++;
1438 fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1442 fv->fv_stats.pass++;
1448 * called from red_addq when a packet is dropped by red.
1449 * should be called in splimp.
1451 static void fv_dropbyred(fv, pktattr, fcache)
1452 struct flowvalve *fv;
1453 struct altq_pktattr *pktattr;
1459 if (pktattr == NULL)
1464 /* the fve of this packet is already cached */
1466 else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1467 fve = flowlist_reclaim(fv, pktattr);
1469 flowlist_move_to_head(fv, fve);
1472 * update p: the following line cancels the update
1473 * in fv_checkflow() and calculate
1474 * p = Wp + (1 - Wp) * p
1476 fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1478 fve->fve_lastdrop = now;
1481 #endif /* ALTQ_FLOWVALVE */
1485 static struct altqsw red_sw =
1486 {"red", redopen, redclose, redioctl};
1488 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1489 MODULE_VERSION(altq_red, 1);
1491 #endif /* KLD_MODULE */
1492 #endif /* ALTQ3_COMPAT */
1494 #endif /* ALTQ_RED */