2 /* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */
5 * Copyright (C) 1997-2003
6 * Sony Computer Science Laboratories Inc. All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * Copyright (c) 1990-1994 Regents of the University of California.
32 * All rights reserved.
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 * 3. All advertising materials mentioning features or use of this software
43 * must display the following acknowledgement:
44 * This product includes software developed by the Computer Systems
45 * Engineering Group at Lawrence Berkeley Laboratory.
46 * 4. Neither the name of the University nor of the Laboratory may be used
47 * to endorse or promote products derived from this software without
48 * specific prior written permission.
50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 #if defined(__FreeBSD__) || defined(__NetBSD__)
65 #if (__FreeBSD__ != 2)
68 #include "opt_inet6.h"
71 #endif /* __FreeBSD__ || __NetBSD__ */
72 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
74 #include <sys/param.h>
75 #include <sys/malloc.h>
77 #include <sys/socket.h>
78 #include <sys/systm.h>
79 #include <sys/errno.h>
80 #if 1 /* ALTQ3_COMPAT */
81 #include <sys/sockio.h>
83 #include <sys/kernel.h>
85 #include <sys/queue.h>
88 #endif /* ALTQ3_COMPAT */
92 #include <netinet/in.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/ip.h>
96 #include <netinet/ip6.h>
99 #include <net/pfvar.h>
100 #include <altq/altq.h>
101 #include <altq/altq_red.h>
103 #include <altq/altq_conf.h>
104 #ifdef ALTQ_FLOWVALVE
105 #include <altq/altq_flowvalve.h>
110 * ALTQ/RED (Random Early Detection) implementation using 32-bit
111 * fixed-point calculation.
113 * written by kjc using the ns code as a reference.
114 * you can learn more about red and ns from Sally's home page at
115 * http://www-nrg.ee.lbl.gov/floyd/
117 * most of the red parameter values are fixed in this implementation
118 * to prevent fixed-point overflow/underflow.
119 * if you change the parameters, watch out for overflow/underflow!
121 * the parameters used are recommended values by Sally.
122 * the corresponding ns config looks:
124 * minthresh=5 maxthresh=15 queue-size=60
127 * bytes=false (can't be handled by 32-bit fixed-point)
128 * doubleq=false dqthresh=false
132 * alternative red parameters for a slow link.
134 * assume the queue length grows from zero to L and stays at L; it takes
135 * N packets for q_avg to reach 63% of L.
136 * when q_weight is 0.002, N is about 500 packets.
137 * for a slow link like dial-up, 500 packets takes more than 1 minute!
138 * when q_weight is 0.008, N is about 127 packets.
139 * when q_weight is 0.016, N is about 63 packets.
140 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
141 * are allowed for 0.016.
142 * see Sally's paper for more details.
144 /* normal red parameters */
145 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
146 /* q_weight = 0.00195 */
148 /* red parameters for a slow link */
149 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
150 /* q_weight = 0.0078125 */
152 /* red parameters for a very slow link (e.g., dialup) */
153 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
154 /* q_weight = 0.015625 */
156 /* fixed-point uses 12-bit decimal places */
157 #define FP_SHIFT 12 /* fixed-point shift */
159 /* red parameters for drop probability */
160 #define INV_P_MAX 10 /* inverse of max drop probability */
161 #define TH_MIN 5 /* min threshold */
162 #define TH_MAX 15 /* max threshold */
164 #define RED_LIMIT 60 /* default max queue length */
165 #define RED_STATS /* collect statistics */
168 * our default policy for forced-drop is drop-tail.
169 * (in altq-1.1.2 or earlier, the default was random-drop.
170 * but it makes more sense to punish the cause of the surge.)
171 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
175 #ifdef ALTQ_FLOWVALVE
177 * flow-valve is an extension to protect red from unresponsive flows
178 * and to promote end-to-end congestion control.
179 * flow-valve observes the average drop rates of the flows that have
180 * experienced packet drops in the recent past.
181 * when the average drop rate exceeds the threshold, the flow is
182 * blocked by the flow-valve. the trapped flow should back off
183 * exponentially to escape from the flow-valve.
185 #ifdef RED_RANDOM_DROP
186 #error "random-drop can't be used with flow-valve!"
188 #endif /* ALTQ_FLOWVALVE */
190 /* red_list keeps all red_queue_t's allocated. */
191 static red_queue_t *red_list = NULL;
193 #endif /* ALTQ3_COMPAT */
195 /* default red parameter values */
196 static int default_th_min = TH_MIN;
197 static int default_th_max = TH_MAX;
198 static int default_inv_pmax = INV_P_MAX;
201 /* internal function prototypes */
202 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
203 static struct mbuf *red_dequeue(struct ifaltq *, int);
204 static int red_request(struct ifaltq *, int, void *);
205 static void red_purgeq(red_queue_t *);
206 static int red_detach(red_queue_t *);
207 #ifdef ALTQ_FLOWVALVE
208 static __inline struct fve *flowlist_lookup(struct flowvalve *,
209 struct altq_pktattr *, struct timeval *);
210 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
211 struct altq_pktattr *);
212 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
213 static __inline int fv_p2f(struct flowvalve *, int);
214 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
215 static struct flowvalve *fv_alloc(struct red *);
217 static void fv_destroy(struct flowvalve *);
218 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
220 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
223 #endif /* ALTQ3_COMPAT */
226 * red support routines
229 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
236 rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK);
239 bzero(rp, sizeof(red_t));
245 rp->red_weight = W_WEIGHT;
247 rp->red_weight = weight;
249 rp->red_inv_pmax = default_inv_pmax;
251 rp->red_inv_pmax = inv_pmax;
253 rp->red_thmin = default_th_min;
255 rp->red_thmin = th_min;
257 rp->red_thmax = default_th_max;
259 rp->red_thmax = th_max;
261 rp->red_flags = flags;
264 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
265 rp->red_pkttime = 800;
267 rp->red_pkttime = pkttime;
270 /* when the link is very slow, adjust red parameters */
271 npkts_per_sec = 1000000 / rp->red_pkttime;
272 if (npkts_per_sec < 50) {
273 /* up to about 400Kbps */
274 rp->red_weight = W_WEIGHT_2;
275 } else if (npkts_per_sec < 300) {
276 /* up to about 2.4Mbps */
277 rp->red_weight = W_WEIGHT_1;
281 /* calculate wshift. weight must be power of 2 */
283 for (i = 0; w > 1; i++)
286 w = 1 << rp->red_wshift;
287 if (w != rp->red_weight) {
288 printf("invalid weight value %d for red! use %d\n",
294 * thmin_s and thmax_s are scaled versions of th_min and th_max
295 * to be compared with avg.
297 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
298 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
301 * precompute probability denominator
302 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
304 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
305 * rp->red_inv_pmax) << FP_SHIFT;
307 /* allocate weight table */
308 rp->red_wtab = wtab_alloc(rp->red_weight);
310 microtime(&rp->red_last);
315 red_destroy(red_t *rp)
318 #ifdef ALTQ_FLOWVALVE
319 if (rp->red_flowvalve != NULL)
320 fv_destroy(rp->red_flowvalve);
322 #endif /* ALTQ3_COMPAT */
323 wtab_destroy(rp->red_wtab);
328 red_getstats(red_t *rp, struct redstats *sp)
330 sp->q_avg = rp->red_avg >> rp->red_wshift;
331 sp->xmit_cnt = rp->red_stats.xmit_cnt;
332 sp->drop_cnt = rp->red_stats.drop_cnt;
333 sp->drop_forced = rp->red_stats.drop_forced;
334 sp->drop_unforced = rp->red_stats.drop_unforced;
335 sp->marked_packets = rp->red_stats.marked_packets;
339 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
340 struct altq_pktattr *pktattr)
345 #ifdef ALTQ_FLOWVALVE
346 struct fve *fve = NULL;
348 if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
349 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
354 #endif /* ALTQ3_COMPAT */
359 * if we were idle, we pretend that n packets arrived during
368 t = (now.tv_sec - rp->red_last.tv_sec);
371 * being idle for more than 1 minute, set avg to zero.
372 * this prevents t from overflow.
376 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
377 n = t / rp->red_pkttime - 1;
379 /* the following line does (avg = (1 - Wq)^n * avg) */
381 avg = (avg >> FP_SHIFT) *
382 pow_w(rp->red_wtab, n);
386 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
387 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
388 rp->red_avg = avg; /* save the new value */
391 * red_count keeps a tally of arriving traffic that has not
396 /* see if we drop early */
397 droptype = DTYPE_NODROP;
398 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
399 if (avg >= rp->red_thmax_s) {
400 /* avg >= th_max: forced drop */
401 droptype = DTYPE_FORCED;
402 } else if (rp->red_old == 0) {
403 /* first exceeds th_min */
406 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
407 rp->red_probd, rp->red_count)) {
408 /* mark or drop by red */
409 if ((rp->red_flags & REDF_ECN) &&
410 mark_ecn(m, pktattr, rp->red_flags)) {
411 /* successfully marked. do not drop. */
414 rp->red_stats.marked_packets++;
417 /* unforced drop by red */
418 droptype = DTYPE_EARLY;
427 * if the queue length hits the hard limit, it's a forced drop.
429 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
430 droptype = DTYPE_FORCED;
432 #ifdef RED_RANDOM_DROP
433 /* if successful or forced drop, enqueue this packet. */
434 if (droptype != DTYPE_EARLY)
437 /* if successful, enqueue this packet. */
438 if (droptype == DTYPE_NODROP)
441 if (droptype != DTYPE_NODROP) {
442 if (droptype == DTYPE_EARLY) {
443 /* drop the incoming packet */
445 rp->red_stats.drop_unforced++;
448 /* forced drop, select a victim packet in the queue. */
449 #ifdef RED_RANDOM_DROP
453 rp->red_stats.drop_forced++;
457 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
461 #ifdef ALTQ_FLOWVALVE
462 if (rp->red_flowvalve != NULL)
463 fv_dropbyred(rp->red_flowvalve, pktattr, fve);
465 #endif /* ALTQ3_COMPAT */
469 /* successfully queued */
471 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
477 * early-drop probability is calculated as follows:
478 * prob = p_max * (avg - th_min) / (th_max - th_min)
479 * prob_a = prob / (2 - count*prob)
480 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
481 * here prob_a increases as successive undrop count increases.
482 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
483 * becomes 1 when (count >= (2 / prob))).
486 drop_early(int fp_len, int fp_probd, int count)
488 int d; /* denominator of drop-probability */
490 d = fp_probd - count * fp_len;
492 /* count exceeds the hard limit: drop or mark */
496 * now the range of d is [1..600] in fixed-point. (when
497 * th_max-th_min=10 and p_max=1/30)
498 * drop probability = (avg - TH_MIN) / d
501 if ((arc4random() % d) < fp_len) {
510 * try to mark CE bit to the packet.
511 * returns 1 if successfully marked, 0 otherwise.
514 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
521 at = pf_find_mtag(m);
526 } else if (pktattr != NULL) {
527 af = pktattr->pattr_af;
528 hdr = pktattr->pattr_hdr;
529 #endif /* ALTQ3_COMPAT */
533 if (af != AF_INET && af != AF_INET6)
536 /* verify that pattr_hdr is within the mbuf data */
537 for (m0 = m; m0 != NULL; m0 = m0->m_next)
538 if (((caddr_t)hdr >= m0->m_data) &&
539 ((caddr_t)hdr < m0->m_data + m0->m_len))
542 /* ick, tag info is stale */
548 if (flags & REDF_ECN4) {
554 return (0); /* version mismatch! */
556 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
557 return (0); /* not-ECT */
558 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
559 return (1); /* already marked */
562 * ecn-capable but not marked,
563 * mark CE and update checksum
566 ip->ip_tos |= IPTOS_ECN_CE;
568 * update checksum (from RFC1624)
569 * HC' = ~(~HC + ~m + m')
571 sum = ~ntohs(ip->ip_sum) & 0xffff;
572 sum += (~otos & 0xffff) + ip->ip_tos;
573 sum = (sum >> 16) + (sum & 0xffff);
574 sum += (sum >> 16); /* add carry */
575 ip->ip_sum = htons(~sum & 0xffff);
581 if (flags & REDF_ECN6) {
582 struct ip6_hdr *ip6 = hdr;
585 flowlabel = ntohl(ip6->ip6_flow);
586 if ((flowlabel >> 28) != 6)
587 return (0); /* version mismatch! */
588 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
589 (IPTOS_ECN_NOTECT << 20))
590 return (0); /* not-ECT */
591 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
592 (IPTOS_ECN_CE << 20))
593 return (1); /* already marked */
595 * ecn-capable but not marked, mark CE
597 flowlabel |= (IPTOS_ECN_CE << 20);
598 ip6->ip6_flow = htonl(flowlabel);
616 if ((m = _getq(q)) == NULL) {
617 if (rp->red_idle == 0) {
619 microtime(&rp->red_last);
629 * helper routine to calibrate avg during idle.
630 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
631 * here Wq = 1/weight and the code assumes Wq is close to zero.
633 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
635 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
638 wtab_alloc(int weight)
643 for (w = wtab_list; w != NULL; w = w->w_next)
644 if (w->w_weight == weight) {
649 w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK);
651 panic("wtab_alloc: malloc failed!");
652 bzero(w, sizeof(struct wtab));
653 w->w_weight = weight;
655 w->w_next = wtab_list;
658 /* initialize the weight table */
659 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
660 for (i = 1; i < 32; i++) {
661 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
662 if (w->w_tab[i] == 0 && w->w_param_max == 0)
663 w->w_param_max = 1 << i;
670 wtab_destroy(struct wtab *w)
674 if (--w->w_refcount > 0)
678 wtab_list = w->w_next;
679 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
680 if (prev->w_next == w) {
681 prev->w_next = w->w_next;
690 pow_w(struct wtab *w, int n)
695 if (n >= w->w_param_max)
706 val = (val * w->w_tab[i]) >> FP_SHIFT;
717 * red device interface
722 redopen(dev, flag, fmt, p)
725 #if (__FreeBSD_version > 500000)
731 /* everything will be done when the queueing scheme is attached. */
736 redclose(dev, flag, fmt, p)
739 #if (__FreeBSD_version > 500000)
748 while ((rqp = red_list) != NULL) {
750 err = red_detach(rqp);
751 if (err != 0 && error == 0)
759 redioctl(dev, cmd, addr, flag, p)
764 #if (__FreeBSD_version > 500000)
771 struct red_interface *ifacep;
775 /* check super-user privilege */
780 #if (__FreeBSD_version > 700000)
781 if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
782 #elsif (__FreeBSD_version > 400000)
783 if ((error = suser(p)) != 0)
785 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
794 ifacep = (struct red_interface *)addr;
795 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
799 error = altq_enable(rqp->rq_ifq);
803 ifacep = (struct red_interface *)addr;
804 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
808 error = altq_disable(rqp->rq_ifq);
812 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
818 /* allocate and initialize red_queue_t */
819 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
824 bzero(rqp, sizeof(red_queue_t));
826 rqp->rq_q = malloc(sizeof(class_queue_t),
828 if (rqp->rq_q == NULL) {
833 bzero(rqp->rq_q, sizeof(class_queue_t));
835 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
836 if (rqp->rq_red == NULL) {
837 free(rqp->rq_q, M_DEVBUF);
843 rqp->rq_ifq = &ifp->if_snd;
844 qtail(rqp->rq_q) = NULL;
846 qlimit(rqp->rq_q) = RED_LIMIT;
847 qtype(rqp->rq_q) = Q_RED;
850 * set RED to this ifnet structure.
852 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
853 red_enqueue, red_dequeue, red_request,
856 red_destroy(rqp->rq_red);
857 free(rqp->rq_q, M_DEVBUF);
862 /* add this state to the red list */
863 rqp->rq_next = red_list;
868 ifacep = (struct red_interface *)addr;
869 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
873 error = red_detach(rqp);
878 struct red_stats *q_stats;
881 q_stats = (struct red_stats *)addr;
882 if ((rqp = altq_lookup(q_stats->iface.red_ifname,
883 ALTQT_RED)) == NULL) {
888 q_stats->q_len = qlen(rqp->rq_q);
889 q_stats->q_limit = qlimit(rqp->rq_q);
892 q_stats->q_avg = rp->red_avg >> rp->red_wshift;
893 q_stats->xmit_cnt = rp->red_stats.xmit_cnt;
894 q_stats->drop_cnt = rp->red_stats.drop_cnt;
895 q_stats->drop_forced = rp->red_stats.drop_forced;
896 q_stats->drop_unforced = rp->red_stats.drop_unforced;
897 q_stats->marked_packets = rp->red_stats.marked_packets;
899 q_stats->weight = rp->red_weight;
900 q_stats->inv_pmax = rp->red_inv_pmax;
901 q_stats->th_min = rp->red_thmin;
902 q_stats->th_max = rp->red_thmax;
904 #ifdef ALTQ_FLOWVALVE
905 if (rp->red_flowvalve != NULL) {
906 struct flowvalve *fv = rp->red_flowvalve;
907 q_stats->fv_flows = fv->fv_flows;
908 q_stats->fv_pass = fv->fv_stats.pass;
909 q_stats->fv_predrop = fv->fv_stats.predrop;
910 q_stats->fv_alloc = fv->fv_stats.alloc;
911 q_stats->fv_escape = fv->fv_stats.escape;
913 #endif /* ALTQ_FLOWVALVE */
914 q_stats->fv_flows = 0;
915 q_stats->fv_pass = 0;
916 q_stats->fv_predrop = 0;
917 q_stats->fv_alloc = 0;
918 q_stats->fv_escape = 0;
919 #ifdef ALTQ_FLOWVALVE
921 #endif /* ALTQ_FLOWVALVE */
922 } while (/*CONSTCOND*/ 0);
931 fc = (struct red_conf *)addr;
932 if ((rqp = altq_lookup(fc->iface.red_ifname,
933 ALTQT_RED)) == NULL) {
937 new = red_alloc(fc->red_weight,
954 limit = fc->red_limit;
955 if (limit < fc->red_thmax)
956 limit = fc->red_thmax;
957 qlimit(rqp->rq_q) = limit;
958 fc->red_limit = limit; /* write back the new value */
960 red_destroy(rqp->rq_red);
965 /* write back new values */
966 fc->red_limit = limit;
967 fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
968 fc->red_thmin = rqp->rq_red->red_thmin;
969 fc->red_thmax = rqp->rq_red->red_thmax;
971 } while (/*CONSTCOND*/ 0);
974 case RED_SETDEFAULTS:
976 struct redparams *rp;
978 rp = (struct redparams *)addr;
980 default_th_min = rp->th_min;
981 default_th_max = rp->th_max;
982 default_inv_pmax = rp->inv_pmax;
983 } while (/*CONSTCOND*/ 0);
1000 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1001 altq_disable(rqp->rq_ifq);
1003 if ((error = altq_detach(rqp->rq_ifq)))
1006 if (red_list == rqp)
1007 red_list = rqp->rq_next;
1009 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1010 if (tmp->rq_next == rqp) {
1011 tmp->rq_next = rqp->rq_next;
1015 printf("red_detach: no state found in red_list!\n");
1018 red_destroy(rqp->rq_red);
1019 free(rqp->rq_q, M_DEVBUF);
1020 free(rqp, M_DEVBUF);
1027 * returns: 0 when successfully queued.
1028 * ENOBUFS when drop occurs.
1031 red_enqueue(ifq, m, pktattr)
1034 struct altq_pktattr *pktattr;
1036 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1038 IFQ_LOCK_ASSERT(ifq);
1040 if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1048 * must be called in splimp.
1050 * returns: mbuf dequeued.
1051 * NULL when no packet is available in the queue.
1054 static struct mbuf *
1055 red_dequeue(ifq, op)
1059 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1062 IFQ_LOCK_ASSERT(ifq);
1064 if (op == ALTDQ_POLL)
1065 return qhead(rqp->rq_q);
1067 /* op == ALTDQ_REMOVE */
1068 m = red_getq(rqp->rq_red, rqp->rq_q);
1075 red_request(ifq, req, arg)
1080 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1082 IFQ_LOCK_ASSERT(ifq);
1097 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1098 rqp->rq_ifq->ifq_len = 0;
1101 #ifdef ALTQ_FLOWVALVE
1103 #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */
1104 #define FV_PSCALE(x) ((x) << FV_PSHIFT)
1105 #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT)
1106 #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */
1107 #define FV_FSCALE(x) ((x) << FV_FSHIFT)
1108 #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT)
1110 #define FV_TIMER (3 * hz) /* timer value for garbage collector */
1111 #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */
1113 #define FV_N 10 /* update fve_f every FV_N packets */
1115 #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */
1116 #define FV_TTHRESH 3 /* time threshold to delete fve */
1117 #define FV_ALPHA 5 /* extra packet count */
1121 #if (__FreeBSD_version > 300000)
1122 #define FV_TIMESTAMP(tp) getmicrotime(tp)
1124 #define FV_TIMESTAMP(tp) { (*(tp)) = time; }
1128 * Brtt table: 127 entry table to convert drop rate (p) to
1129 * the corresponding bandwidth fraction (f)
1130 * the following equation is implemented to use scaled values,
1131 * fve_p and fve_f, in the fixed point format.
1133 * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1134 * f = Brtt(p) / (max_th + alpha)
1136 #define BRTT_SIZE 128
1137 #define BRTT_SHIFT 12
1138 #define BRTT_MASK 0x0007f000
1139 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT))
1141 const int brtt_tab[BRTT_SIZE] = {
1142 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1143 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1144 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1145 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1146 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1147 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1148 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1149 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1150 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1151 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1152 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1153 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1154 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1155 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1156 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1157 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1160 static __inline struct fve *
1161 flowlist_lookup(fv, pktattr, now)
1162 struct flowvalve *fv;
1163 struct altq_pktattr *pktattr;
1164 struct timeval *now;
1170 struct ip6_hdr *ip6;
1172 struct timeval tthresh;
1174 if (pktattr == NULL)
1177 tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1180 * search the flow list
1182 switch (pktattr->pattr_af) {
1184 ip = (struct ip *)pktattr->pattr_hdr;
1185 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1186 if (fve->fve_lastdrop.tv_sec == 0)
1188 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1189 fve->fve_lastdrop.tv_sec = 0;
1192 if (fve->fve_flow.flow_af == AF_INET &&
1193 fve->fve_flow.flow_ip.ip_src.s_addr ==
1194 ip->ip_src.s_addr &&
1195 fve->fve_flow.flow_ip.ip_dst.s_addr ==
1203 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1204 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1205 if (fve->fve_lastdrop.tv_sec == 0)
1207 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1208 fve->fve_lastdrop.tv_sec = 0;
1211 if (fve->fve_flow.flow_af == AF_INET6 &&
1212 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1214 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1223 /* unknown protocol. no drop. */
1226 fv->fv_flows = flows; /* save the number of active fve's */
1230 static __inline struct fve *
1231 flowlist_reclaim(fv, pktattr)
1232 struct flowvalve *fv;
1233 struct altq_pktattr *pktattr;
1238 struct ip6_hdr *ip6;
1242 * get an entry from the tail of the LRU list.
1244 fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1246 switch (pktattr->pattr_af) {
1248 ip = (struct ip *)pktattr->pattr_hdr;
1249 fve->fve_flow.flow_af = AF_INET;
1250 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1251 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1255 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1256 fve->fve_flow.flow_af = AF_INET6;
1257 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1258 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1263 fve->fve_state = Green;
1266 fve->fve_ifseq = fv->fv_ifseq - 1;
1271 fv->fv_stats.alloc++;
1276 static __inline void
1277 flowlist_move_to_head(fv, fve)
1278 struct flowvalve *fv;
1281 if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1282 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1283 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1287 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1289 * allocate flowvalve structure
1291 static struct flowvalve *
1295 struct flowvalve *fv;
1299 num = FV_FLOWLISTSIZE;
1300 fv = malloc(sizeof(struct flowvalve),
1301 M_DEVBUF, M_WAITOK);
1304 bzero(fv, sizeof(struct flowvalve));
1306 fv->fv_fves = malloc(sizeof(struct fve) * num,
1307 M_DEVBUF, M_WAITOK);
1308 if (fv->fv_fves == NULL) {
1312 bzero(fv->fv_fves, sizeof(struct fve) * num);
1315 TAILQ_INIT(&fv->fv_flowlist);
1316 for (i = 0; i < num; i++) {
1317 fve = &fv->fv_fves[i];
1318 fve->fve_lastdrop.tv_sec = 0;
1319 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1322 /* initialize drop rate threshold in scaled fixed-point */
1323 fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1325 /* initialize drop rate to fraction table */
1326 fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1327 M_DEVBUF, M_WAITOK);
1328 if (fv->fv_p2ftab == NULL) {
1329 free(fv->fv_fves, M_DEVBUF);
1334 * create the p2f table.
1335 * (shift is used to keep the precision)
1337 for (i = 1; i < BRTT_SIZE; i++) {
1340 f = brtt_tab[i] << 8;
1341 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1348 static void fv_destroy(fv)
1349 struct flowvalve *fv;
1351 free(fv->fv_p2ftab, M_DEVBUF);
1352 free(fv->fv_fves, M_DEVBUF);
1358 struct flowvalve *fv;
1364 f = fv->fv_p2ftab[BRTT_SIZE-1];
1365 else if ((val = (p & BRTT_MASK)))
1366 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1368 f = fv->fv_p2ftab[1];
1373 * check if an arriving packet should be pre-dropped.
1374 * called from red_addq() when a packet arrives.
1375 * returns 1 when the packet should be pre-dropped.
1376 * should be called in splimp.
1379 fv_checkflow(fv, pktattr, fcache)
1380 struct flowvalve *fv;
1381 struct altq_pktattr *pktattr;
1382 struct fve **fcache;
1390 if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1391 /* no matching entry in the flowlist */
1396 /* update fraction f for every FV_N packets */
1397 if (++fve->fve_count == FV_N) {
1399 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1402 (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1403 + fve->fve_f - FV_FUNSCALE(fve->fve_f);
1404 fve->fve_ifseq = fv->fv_ifseq;
1411 if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1414 /* calculate a threshold */
1415 fthresh = fv_p2f(fv, fve->fve_p);
1416 if (fve->fve_f > fthresh)
1417 fve->fve_state = Red;
1420 if (fve->fve_state == Red) {
1424 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1425 /* no drop for at least FV_BACKOFFTHRESH sec */
1427 fve->fve_state = Green;
1429 fv->fv_stats.escape++;
1432 /* block this flow */
1433 flowlist_move_to_head(fv, fve);
1434 fve->fve_lastdrop = now;
1436 fv->fv_stats.predrop++;
1445 fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1449 fv->fv_stats.pass++;
1455 * called from red_addq when a packet is dropped by red.
1456 * should be called in splimp.
1458 static void fv_dropbyred(fv, pktattr, fcache)
1459 struct flowvalve *fv;
1460 struct altq_pktattr *pktattr;
1466 if (pktattr == NULL)
1471 /* the fve of this packet is already cached */
1473 else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1474 fve = flowlist_reclaim(fv, pktattr);
1476 flowlist_move_to_head(fv, fve);
1479 * update p: the following line cancels the update
1480 * in fv_checkflow() and calculate
1481 * p = Wp + (1 - Wp) * p
1483 fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1485 fve->fve_lastdrop = now;
1488 #endif /* ALTQ_FLOWVALVE */
1492 static struct altqsw red_sw =
1493 {"red", redopen, redclose, redioctl};
1495 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1496 MODULE_VERSION(altq_red, 1);
1498 #endif /* KLD_MODULE */
1499 #endif /* ALTQ3_COMPAT */
1501 #endif /* ALTQ_RED */