2 * Copyright (C) 1997-2003
3 * Sony Computer Science Laboratories Inc. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * Copyright (c) 1990-1994 Regents of the University of California.
29 * All rights reserved.
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
34 * 1. Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in the
38 * documentation and/or other materials provided with the distribution.
39 * 3. All advertising materials mentioning features or use of this software
40 * must display the following acknowledgement:
41 * This product includes software developed by the Computer Systems
42 * Engineering Group at Lawrence Berkeley Laboratory.
43 * 4. Neither the name of the University nor of the Laboratory may be used
44 * to endorse or promote products derived from this software without
45 * specific prior written permission.
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
65 #include "opt_inet6.h"
66 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
68 #include <sys/param.h>
69 #include <sys/malloc.h>
71 #include <sys/socket.h>
72 #include <sys/systm.h>
73 #include <sys/errno.h>
74 #if 1 /* ALTQ3_COMPAT */
75 #include <sys/sockio.h>
77 #include <sys/kernel.h>
79 #include <sys/queue.h>
82 #endif /* ALTQ3_COMPAT */
85 #include <net/if_var.h>
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/ip.h>
91 #include <netinet/ip6.h>
94 #include <netpfil/pf/pf.h>
95 #include <netpfil/pf/pf_altq.h>
96 #include <netpfil/pf/pf_mtag.h>
97 #include <net/altq/altq.h>
98 #include <net/altq/altq_red.h>
100 #include <net/altq/altq_conf.h>
101 #ifdef ALTQ_FLOWVALVE
102 #include <net/altq/altq_flowvalve.h>
107 * ALTQ/RED (Random Early Detection) implementation using 32-bit
108 * fixed-point calculation.
110 * written by kjc using the ns code as a reference.
111 * you can learn more about red and ns from Sally's home page at
112 * http://www-nrg.ee.lbl.gov/floyd/
114 * most of the red parameter values are fixed in this implementation
115 * to prevent fixed-point overflow/underflow.
116 * if you change the parameters, watch out for overflow/underflow!
118 * the parameters used are recommended values by Sally.
119 * the corresponding ns config looks:
121 * minthresh=5 maxthresh=15 queue-size=60
124 * bytes=false (can't be handled by 32-bit fixed-point)
125 * doubleq=false dqthresh=false
129 * alternative red parameters for a slow link.
131 * assume the queue length becomes from zero to L and keeps L, it takes
132 * N packets for q_avg to reach 63% of L.
133 * when q_weight is 0.002, N is about 500 packets.
134 * for a slow link like dial-up, 500 packets takes more than 1 minute!
135 * when q_weight is 0.008, N is about 127 packets.
136 * when q_weight is 0.016, N is about 63 packets.
137 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
138 * are allowed for 0.016.
139 * see Sally's paper for more details.
/*
 * NOTE(review): this excerpt is truncated -- stray listing numbers remain at
 * the start of each line and some lines were elided.  Tokens preserved
 * byte-identical; restore from the upstream altq_red.c before building.
 *
 * Fixed RED parameters (fixed-point, FP_SHIFT decimal bits).  The values
 * below are the recommended defaults; W_WEIGHT_1/W_WEIGHT_2 are the
 * slow-link alternatives described in the header comment above.
 */
141 /* normal red parameters */
142 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
143 /* q_weight = 0.00195 */
145 /* red parameters for a slow link */
146 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
147 /* q_weight = 0.0078125 */
149 /* red parameters for a very slow link (e.g., dialup) */
150 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
151 /* q_weight = 0.015625 */
153 /* fixed-point uses 12-bit decimal places */
154 #define FP_SHIFT 12 /* fixed-point shift */
156 /* red parameters for drop probability */
157 #define INV_P_MAX 10 /* inverse of max drop probability */
158 #define TH_MIN 5 /* min threshold */
159 #define TH_MAX 15 /* max threshold */
161 #define RED_LIMIT 60 /* default max queue length */
162 #define RED_STATS /* collect statistics */
165 * our default policy for forced-drop is drop-tail.
166 * (in altq-1.1.2 or earlier, the default was random-drop.
167 * but it makes more sense to punish the cause of the surge.)
168 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
/*
 * NOTE(review): truncated excerpt -- the #ifdef/#endif nesting below does not
 * balance because interior lines were elided; preserved byte-identical.
 * File-scope state and forward declarations for the RED discipline.
 */
172 #ifdef ALTQ_FLOWVALVE
174 * flow-valve is an extension to protect red from unresponsive flows
175 * and to promote end-to-end congestion control.
176 * flow-valve observes the average drop rates of the flows that have
177 * experienced packet drops in the recent past.
178 * when the average drop rate exceeds the threshold, the flow is
179 * blocked by the flow-valve. the trapped flow should back off
180 * exponentially to escape from the flow-valve.
182 #ifdef RED_RANDOM_DROP
183 #error "random-drop can't be used with flow-valve!"
185 #endif /* ALTQ_FLOWVALVE */
/* red_list: head of the singly linked list of all allocated red_queue_t's
 * (ALTQ3 compat ioctl interface walks this list). */
187 /* red_list keeps all red_queue_t's allocated. */
188 static red_queue_t *red_list = NULL;
190 #endif /* ALTQ3_COMPAT */
192 /* default red parameter values */
193 static int default_th_min = TH_MIN;
194 static int default_th_max = TH_MAX;
195 static int default_inv_pmax = INV_P_MAX;
198 /* internal function prototypes */
199 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
200 static struct mbuf *red_dequeue(struct ifaltq *, int);
201 static int red_request(struct ifaltq *, int, void *);
202 static void red_purgeq(red_queue_t *);
203 static int red_detach(red_queue_t *);
204 #ifdef ALTQ_FLOWVALVE
205 static __inline struct fve *flowlist_lookup(struct flowvalve *,
206 struct altq_pktattr *, struct timeval *);
207 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
208 struct altq_pktattr *);
209 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
210 static __inline int fv_p2f(struct flowvalve *, int);
211 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
212 static struct flowvalve *fv_alloc(struct red *);
214 static void fv_destroy(struct flowvalve *);
215 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
217 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
220 #endif /* ALTQ3_COMPAT */
223 * red support routines
/*
 * red_alloc: allocate and initialize a red_t state block.
 * Zero arguments select the compiled-in defaults (W_WEIGHT, default_th_*,
 * default_inv_pmax, 800 us/packet); the weight is further lowered for slow
 * links based on packets/sec derived from red_pkttime.
 * NOTE(review): this body is truncated -- the return-type line, locals,
 * several if/else and brace lines, and the return are elided (stray listing
 * numbers remain).  Code preserved byte-identical; do not compile as-is.
 */
226 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
233 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
238 rp->red_weight = W_WEIGHT;
240 rp->red_weight = weight;
242 /* allocate weight table */
243 rp->red_wtab = wtab_alloc(rp->red_weight);
244 if (rp->red_wtab == NULL) {
253 rp->red_inv_pmax = default_inv_pmax;
255 rp->red_inv_pmax = inv_pmax;
257 rp->red_thmin = default_th_min;
259 rp->red_thmin = th_min;
261 rp->red_thmax = default_th_max;
263 rp->red_thmax = th_max;
265 rp->red_flags = flags;
268 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
269 rp->red_pkttime = 800;
271 rp->red_pkttime = pkttime;
274 /* when the link is very slow, adjust red parameters */
275 npkts_per_sec = 1000000 / rp->red_pkttime;
276 if (npkts_per_sec < 50) {
277 /* up to about 400Kbps */
278 rp->red_weight = W_WEIGHT_2;
279 } else if (npkts_per_sec < 300) {
280 /* up to about 2.4Mbps */
281 rp->red_weight = W_WEIGHT_1;
285 /* calculate wshift. weight must be power of 2 */
287 for (i = 0; w > 1; i++)
290 w = 1 << rp->red_wshift;
291 if (w != rp->red_weight) {
292 printf("invalid weight value %d for red! use %d\n",
/* thmin_s/thmax_s: thresholds pre-scaled by (wshift + FP_SHIFT) so they
 * compare directly against the scaled red_avg. */
298 * thmin_s and thmax_s are scaled versions of th_min and th_max
299 * to be compared with avg.
301 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
302 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
305 * precompute probability denominator
306 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
308 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
309 * rp->red_inv_pmax) << FP_SHIFT;
311 microtime(&rp->red_last);
316 red_destroy(red_t *rp)
319 #ifdef ALTQ_FLOWVALVE
320 if (rp->red_flowvalve != NULL)
321 fv_destroy(rp->red_flowvalve);
323 #endif /* ALTQ3_COMPAT */
324 wtab_destroy(rp->red_wtab);
329 red_getstats(red_t *rp, struct redstats *sp)
331 sp->q_avg = rp->red_avg >> rp->red_wshift;
332 sp->xmit_cnt = rp->red_stats.xmit_cnt;
333 sp->drop_cnt = rp->red_stats.drop_cnt;
334 sp->drop_forced = rp->red_stats.drop_forced;
335 sp->drop_unforced = rp->red_stats.drop_unforced;
336 sp->marked_packets = rp->red_stats.marked_packets;
/*
 * red_addq: RED admission decision for an arriving mbuf.
 * Runs the EWMA queue-length estimator (compensating for idle time via
 * pow_w), then classifies the packet as DTYPE_NODROP / DTYPE_EARLY
 * (probabilistic early drop or ECN mark) / DTYPE_FORCED (avg >= th_max or
 * hard queue limit).  Returns <0 on drop (per red_enqueue's use of it).
 * NOTE(review): body is truncated -- locals, several braces/else arms and
 * the enqueue/return lines are elided (stray listing numbers remain).
 * Code preserved byte-identical; restore from upstream before building.
 */
340 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
341 struct altq_pktattr *pktattr)
346 #ifdef ALTQ_FLOWVALVE
347 struct fve *fve = NULL;
349 if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
350 if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
355 #endif /* ALTQ3_COMPAT */
360 * if we were idle, we pretend that n packets arrived during
369 t = (now.tv_sec - rp->red_last.tv_sec);
372 * being idle for more than 1 minute, set avg to zero.
373 * this prevents t from overflow.
377 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
378 n = t / rp->red_pkttime - 1;
380 /* the following line does (avg = (1 - Wq)^n * avg) */
382 avg = (avg >> FP_SHIFT) *
383 pow_w(rp->red_wtab, n);
387 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
388 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
389 rp->red_avg = avg; /* save the new value */
392 * red_count keeps a tally of arriving traffic that has not
397 /* see if we drop early */
398 droptype = DTYPE_NODROP;
399 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
400 if (avg >= rp->red_thmax_s) {
401 /* avg >= th_max: forced drop */
402 droptype = DTYPE_FORCED;
403 } else if (rp->red_old == 0) {
404 /* first exceeds th_min */
407 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
408 rp->red_probd, rp->red_count)) {
409 /* mark or drop by red */
410 if ((rp->red_flags & REDF_ECN) &&
411 mark_ecn(m, pktattr, rp->red_flags)) {
412 /* successfully marked. do not drop. */
415 rp->red_stats.marked_packets++;
418 /* unforced drop by red */
419 droptype = DTYPE_EARLY;
428 * if the queue length hits the hard limit, it's a forced drop.
430 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
431 droptype = DTYPE_FORCED;
433 #ifdef RED_RANDOM_DROP
434 /* if successful or forced drop, enqueue this packet. */
435 if (droptype != DTYPE_EARLY)
438 /* if successful, enqueue this packet. */
439 if (droptype == DTYPE_NODROP)
442 if (droptype != DTYPE_NODROP) {
443 if (droptype == DTYPE_EARLY) {
444 /* drop the incoming packet */
446 rp->red_stats.drop_unforced++;
449 /* forced drop, select a victim packet in the queue. */
450 #ifdef RED_RANDOM_DROP
454 rp->red_stats.drop_forced++;
458 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
462 #ifdef ALTQ_FLOWVALVE
463 if (rp->red_flowvalve != NULL)
464 fv_dropbyred(rp->red_flowvalve, pktattr, fve);
466 #endif /* ALTQ3_COMPAT */
470 /* successfully queued */
472 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
478 * early-drop probability is calculated as follows:
479 * prob = p_max * (avg - th_min) / (th_max - th_min)
480 * prob_a = prob / (2 - count*prob)
481 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
482 * here prob_a increases as successive undrop count increases.
483 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
484 * becomes 1 when (count >= (2 / prob))).
487 drop_early(int fp_len, int fp_probd, int count)
489 int d; /* denominator of drop-probability */
491 d = fp_probd - count * fp_len;
493 /* count exceeds the hard limit: drop or mark */
497 * now the range of d is [1..600] in fixed-point. (when
498 * th_max-th_min=10 and p_max=1/30)
499 * drop probability = (avg - TH_MIN) / d
502 if ((arc4random() % d) < fp_len) {
511 * try to mark CE bit to the packet.
512 * returns 1 if successfully marked, 0 otherwise.
/*
 * mark_ecn: try to set the ECN CE codepoint on the packet's IP/IPv6 header.
 * Returns 1 when marked (or already marked), 0 when the packet is not
 * ECN-capable, the header version mismatches, or the header is stale.
 * IPv4: sets CE in ip_tos and incrementally updates ip_sum per RFC 1624;
 * IPv6: sets CE inside the traffic-class bits of ip6_flow (no checksum).
 * NOTE(review): body is truncated -- locals, several braces, returns and
 * the switch arms' tails are elided (stray listing numbers remain).
 * Code preserved byte-identical; restore from upstream before building.
 */
515 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
521 at = pf_find_mtag(m);
525 } else if (pktattr != NULL) {
526 af = pktattr->pattr_af;
527 hdr = pktattr->pattr_hdr;
528 #endif /* ALTQ3_COMPAT */
532 /* verify that pattr_hdr is within the mbuf data */
533 for (m0 = m; m0 != NULL; m0 = m0->m_next)
534 if (((caddr_t)hdr >= m0->m_data) &&
535 ((caddr_t)hdr < m0->m_data + m0->m_len))
538 /* ick, tag info is stale */
542 switch (((struct ip *)hdr)->ip_v) {
544 if (flags & REDF_ECN4) {
550 return (0); /* version mismatch! */
552 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
553 return (0); /* not-ECT */
554 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
555 return (1); /* already marked */
558 * ecn-capable but not marked,
559 * mark CE and update checksum
562 ip->ip_tos |= IPTOS_ECN_CE;
564 * update checksum (from RFC1624)
565 * HC' = ~(~HC + ~m + m')
567 sum = ~ntohs(ip->ip_sum) & 0xffff;
568 sum += (~otos & 0xffff) + ip->ip_tos;
569 sum = (sum >> 16) + (sum & 0xffff);
570 sum += (sum >> 16); /* add carry */
571 ip->ip_sum = htons(~sum & 0xffff);
576 case (IPV6_VERSION >> 4):
577 if (flags & REDF_ECN6) {
578 struct ip6_hdr *ip6 = hdr;
581 flowlabel = ntohl(ip6->ip6_flow);
582 if ((flowlabel >> 28) != 6)
583 return (0); /* version mismatch! */
584 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
585 (IPTOS_ECN_NOTECT << 20))
586 return (0); /* not-ECT */
587 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
588 (IPTOS_ECN_CE << 20))
589 return (1); /* already marked */
591 * ecn-capable but not marked, mark CE
593 flowlabel |= (IPTOS_ECN_CE << 20);
594 ip6->ip6_flow = htonl(flowlabel);
/*
 * NOTE(review): the first three lines below are an orphaned fragment of
 * red_getq() (its header and tail were elided): on empty queue it records
 * the idle transition timestamp.  The rest is the weight-table machinery:
 * wtab_alloc() returns a (refcounted, list-shared) table of
 * (1-Wq)^(2^i) values used by pow_w() to age the average during idle.
 * Several lines (locals, refcount++, return) are elided; code preserved
 * byte-identical -- restore from upstream before building.
 */
612 if ((m = _getq(q)) == NULL) {
613 if (rp->red_idle == 0) {
615 microtime(&rp->red_last);
625 * helper routine to calibrate avg during idle.
626 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
627 * here Wq = 1/weight and the code assumes Wq is close to zero.
629 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
631 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
634 wtab_alloc(int weight)
639 for (w = wtab_list; w != NULL; w = w->w_next)
640 if (w->w_weight == weight) {
645 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
648 w->w_weight = weight;
650 w->w_next = wtab_list;
653 /* initialize the weight table */
654 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
655 for (i = 1; i < 32; i++) {
656 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
657 if (w->w_tab[i] == 0 && w->w_param_max == 0)
658 w->w_param_max = 1 << i;
/*
 * NOTE(review): two truncated fragments, preserved byte-identical.
 * wtab_destroy: drop a reference; when the last reference goes away,
 * unlink the table from wtab_list (head or interior) and free it.
 * pow_w: evaluate (1-Wq)^n by multiplying the precomputed squared
 * factors selected by the set bits of n; returns 0 once n is large
 * enough (>= w_param_max) that the result underflows the fixed point.
 */
665 wtab_destroy(struct wtab *w)
669 if (--w->w_refcount > 0)
673 wtab_list = w->w_next;
674 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
675 if (prev->w_next == w) {
676 prev->w_next = w->w_next;
685 pow_w(struct wtab *w, int n)
690 if (n >= w->w_param_max)
701 val = (val * w->w_tab[i]) >> FP_SHIFT;
/*
 * NOTE(review): truncated fragments of the ALTQ3 character-device entry
 * points, preserved byte-identical.  redopen() is a no-op (attach does the
 * work); redclose() detaches every state on red_list, keeping the first
 * error encountered.
 */
712 * red device interface
717 redopen(dev, flag, fmt, p)
720 #if (__FreeBSD_version > 500000)
726 /* everything will be done when the queueing scheme is attached. */
731 redclose(dev, flag, fmt, p)
734 #if (__FreeBSD_version > 500000)
743 while ((rqp = red_list) != NULL) {
745 err = red_detach(rqp);
746 if (err != 0 && error == 0)
/*
 * redioctl: ALTQ3-compat ioctl handler (enable/disable/attach/detach,
 * statistics, configuration, defaults).
 * FIX(review): the preprocessor directive `#elsif` is not valid C --
 * the C preprocessor only defines `#elif`.  Changed `#elsif` -> `#elif`
 * in the privilege-check version ladder below; no other token touched.
 * NOTE(review): this body is otherwise truncated (stray listing numbers,
 * elided lines/braces) and is preserved as-is -- restore from upstream.
 */
754 redioctl(dev, cmd, addr, flag, p)
759 #if (__FreeBSD_version > 500000)
766 struct red_interface *ifacep;
770 /* check super-user privilege */
775 #if (__FreeBSD_version > 700000)
776 if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
777 #elif (__FreeBSD_version > 400000)
778 if ((error = suser(p)) != 0)
780 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
789 ifacep = (struct red_interface *)addr;
790 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
794 error = altq_enable(rqp->rq_ifq);
798 ifacep = (struct red_interface *)addr;
799 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
803 error = altq_disable(rqp->rq_ifq);
807 ifp = ifunit(((struct red_interface *)addr)->red_ifname);
813 /* allocate and initialize red_queue_t */
814 rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
819 bzero(rqp, sizeof(red_queue_t));
821 rqp->rq_q = malloc(sizeof(class_queue_t),
823 if (rqp->rq_q == NULL) {
828 bzero(rqp->rq_q, sizeof(class_queue_t));
830 rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
831 if (rqp->rq_red == NULL) {
832 free(rqp->rq_q, M_DEVBUF);
838 rqp->rq_ifq = &ifp->if_snd;
839 qtail(rqp->rq_q) = NULL;
841 qlimit(rqp->rq_q) = RED_LIMIT;
842 qtype(rqp->rq_q) = Q_RED;
845 * set RED to this ifnet structure.
847 error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
848 red_enqueue, red_dequeue, red_request,
851 red_destroy(rqp->rq_red);
852 free(rqp->rq_q, M_DEVBUF);
857 /* add this state to the red list */
858 rqp->rq_next = red_list;
863 ifacep = (struct red_interface *)addr;
864 if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
868 error = red_detach(rqp);
873 struct red_stats *q_stats;
876 q_stats = (struct red_stats *)addr;
877 if ((rqp = altq_lookup(q_stats->iface.red_ifname,
878 ALTQT_RED)) == NULL) {
883 q_stats->q_len = qlen(rqp->rq_q);
884 q_stats->q_limit = qlimit(rqp->rq_q);
887 q_stats->q_avg = rp->red_avg >> rp->red_wshift;
888 q_stats->xmit_cnt = rp->red_stats.xmit_cnt;
889 q_stats->drop_cnt = rp->red_stats.drop_cnt;
890 q_stats->drop_forced = rp->red_stats.drop_forced;
891 q_stats->drop_unforced = rp->red_stats.drop_unforced;
892 q_stats->marked_packets = rp->red_stats.marked_packets;
894 q_stats->weight = rp->red_weight;
895 q_stats->inv_pmax = rp->red_inv_pmax;
896 q_stats->th_min = rp->red_thmin;
897 q_stats->th_max = rp->red_thmax;
899 #ifdef ALTQ_FLOWVALVE
900 if (rp->red_flowvalve != NULL) {
901 struct flowvalve *fv = rp->red_flowvalve;
902 q_stats->fv_flows = fv->fv_flows;
903 q_stats->fv_pass = fv->fv_stats.pass;
904 q_stats->fv_predrop = fv->fv_stats.predrop;
905 q_stats->fv_alloc = fv->fv_stats.alloc;
906 q_stats->fv_escape = fv->fv_stats.escape;
908 #endif /* ALTQ_FLOWVALVE */
909 q_stats->fv_flows = 0;
910 q_stats->fv_pass = 0;
911 q_stats->fv_predrop = 0;
912 q_stats->fv_alloc = 0;
913 q_stats->fv_escape = 0;
914 #ifdef ALTQ_FLOWVALVE
916 #endif /* ALTQ_FLOWVALVE */
917 } while (/*CONSTCOND*/ 0);
926 fc = (struct red_conf *)addr;
927 if ((rqp = altq_lookup(fc->iface.red_ifname,
928 ALTQT_RED)) == NULL) {
932 new = red_alloc(fc->red_weight,
945 limit = fc->red_limit;
946 if (limit < fc->red_thmax)
947 limit = fc->red_thmax;
948 qlimit(rqp->rq_q) = limit;
949 fc->red_limit = limit; /* write back the new value */
951 red_destroy(rqp->rq_red);
956 /* write back new values */
957 fc->red_limit = limit;
958 fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
959 fc->red_thmin = rqp->rq_red->red_thmin;
960 fc->red_thmax = rqp->rq_red->red_thmax;
962 } while (/*CONSTCOND*/ 0);
965 case RED_SETDEFAULTS:
967 struct redparams *rp;
969 rp = (struct redparams *)addr;
971 default_th_min = rp->th_min;
972 default_th_max = rp->th_max;
973 default_inv_pmax = rp->inv_pmax;
974 } while (/*CONSTCOND*/ 0);
/*
 * NOTE(review): truncated tail of red_detach() (header elided), preserved
 * byte-identical: disable/detach the ALTQ hook, unlink rqp from red_list
 * (head or interior), then free the RED state, the queue and the record.
 */
991 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
992 altq_disable(rqp->rq_ifq);
994 if ((error = altq_detach(rqp->rq_ifq)))
998 red_list = rqp->rq_next;
1000 for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1001 if (tmp->rq_next == rqp) {
1002 tmp->rq_next = rqp->rq_next;
1006 printf("red_detach: no state found in red_list!\n");
1009 red_destroy(rqp->rq_red);
1010 free(rqp->rq_q, M_DEVBUF);
1011 free(rqp, M_DEVBUF);
1018 * returns: 0 when successfully queued.
1019 * ENOBUFS when drop occurs.
1022 red_enqueue(ifq, m, pktattr)
1025 struct altq_pktattr *pktattr;
1027 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1029 IFQ_LOCK_ASSERT(ifq);
1031 if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1039 * must be called in splimp.
1041 * returns: mbuf dequeued.
1042 * NULL when no packet is available in the queue.
1045 static struct mbuf *
1046 red_dequeue(ifq, op)
1050 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1053 IFQ_LOCK_ASSERT(ifq);
1055 if (op == ALTDQ_POLL)
1056 return qhead(rqp->rq_q);
1058 /* op == ALTDQ_REMOVE */
1059 m = red_getq(rqp->rq_red, rqp->rq_q);
/*
 * NOTE(review): truncated fragment of red_request() (switch body elided)
 * followed by the flow-valve fixed-point parameter macros; preserved
 * byte-identical.  The visible lines handle the purge path: zero the
 * interface queue length when ALTQ is enabled.
 */
1066 red_request(ifq, req, arg)
1071 red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1073 IFQ_LOCK_ASSERT(ifq);
1088 if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1089 rqp->rq_ifq->ifq_len = 0;
1092 #ifdef ALTQ_FLOWVALVE
/* fve_p (drop rate) is scaled by 2^FV_PSHIFT, fve_f (bandwidth fraction)
 * by 2^FV_FSHIFT, on top of the global FP_SHIFT fixed point. */
1094 #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */
1095 #define FV_PSCALE(x) ((x) << FV_PSHIFT)
1096 #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT)
1097 #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */
1098 #define FV_FSCALE(x) ((x) << FV_FSHIFT)
1099 #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT)
1101 #define FV_TIMER (3 * hz) /* timer value for garbage collector */
1102 #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */
1104 #define FV_N 10 /* update fve_f every FV_N packets */
1106 #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */
1107 #define FV_TTHRESH 3 /* time threshold to delete fve */
1108 #define FV_ALPHA 5 /* extra packet count */
1112 #if (__FreeBSD_version > 300000)
1113 #define FV_TIMESTAMP(tp) getmicrotime(tp)
1115 #define FV_TIMESTAMP(tp) { (*(tp)) = time; }
1119 * Brtt table: 127 entry table to convert drop rate (p) to
1120 * the corresponding bandwidth fraction (f)
1121 * the following equation is implemented to use scaled values,
1122 * fve_p and fve_f, in the fixed point format.
1124 * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1125 * f = Brtt(p) / (max_th + alpha)
1127 #define BRTT_SIZE 128
1128 #define BRTT_SHIFT 12
1129 #define BRTT_MASK 0x0007f000
1130 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT))
1132 const int brtt_tab[BRTT_SIZE] = {
1133 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1134 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1135 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1136 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1137 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1138 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1139 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1140 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1141 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1142 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1143 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1144 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1145 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1146 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1147 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1148 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
/*
 * NOTE(review): truncated bodies of flowlist_lookup() and
 * flowlist_reclaim(), preserved byte-identical (locals, loop tails,
 * braces and returns are elided; stray listing numbers remain).
 * flowlist_lookup: scan the LRU flow list for an entry matching the
 * packet's IPv4/IPv6 src/dst, expiring entries idle longer than
 * FV_TTHRESH and counting the live flows into fv->fv_flows.
 * flowlist_reclaim: recycle the tail (least-recently-used) entry for the
 * arriving packet's addresses and reset its state to Green.
 */
1151 static __inline struct fve *
1152 flowlist_lookup(fv, pktattr, now)
1153 struct flowvalve *fv;
1154 struct altq_pktattr *pktattr;
1155 struct timeval *now;
1161 struct ip6_hdr *ip6;
1163 struct timeval tthresh;
1165 if (pktattr == NULL)
1168 tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1171 * search the flow list
1173 switch (pktattr->pattr_af) {
1175 ip = (struct ip *)pktattr->pattr_hdr;
1176 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1177 if (fve->fve_lastdrop.tv_sec == 0)
1179 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1180 fve->fve_lastdrop.tv_sec = 0;
1183 if (fve->fve_flow.flow_af == AF_INET &&
1184 fve->fve_flow.flow_ip.ip_src.s_addr ==
1185 ip->ip_src.s_addr &&
1186 fve->fve_flow.flow_ip.ip_dst.s_addr ==
1194 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1195 TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1196 if (fve->fve_lastdrop.tv_sec == 0)
1198 if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1199 fve->fve_lastdrop.tv_sec = 0;
1202 if (fve->fve_flow.flow_af == AF_INET6 &&
1203 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1205 IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1214 /* unknown protocol. no drop. */
1217 fv->fv_flows = flows; /* save the number of active fve's */
1221 static __inline struct fve *
1222 flowlist_reclaim(fv, pktattr)
1223 struct flowvalve *fv;
1224 struct altq_pktattr *pktattr;
1229 struct ip6_hdr *ip6;
1233 * get an entry from the tail of the LRU list.
1235 fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1237 switch (pktattr->pattr_af) {
1239 ip = (struct ip *)pktattr->pattr_hdr;
1240 fve->fve_flow.flow_af = AF_INET;
1241 fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1242 fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1246 ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1247 fve->fve_flow.flow_af = AF_INET6;
1248 fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1249 fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1254 fve->fve_state = Green;
1257 fve->fve_ifseq = fv->fv_ifseq - 1;
1262 fv->fv_stats.alloc++;
1267 static __inline void
1268 flowlist_move_to_head(fv, fve)
1269 struct flowvalve *fv;
1272 if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1273 TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1274 TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
/*
 * NOTE(review): truncated body of fv_alloc() (compiled out under #if 0),
 * preserved byte-identical.  Allocates the flowvalve, its FV_FLOWLISTSIZE
 * fve array (linked into an LRU TAILQ), the drop-rate threshold scaled
 * from rp->red_inv_pmax, and the p->f lookup table derived from brtt_tab.
 * Locals, NULL-checks' bodies, braces and the return are elided.
 */
1278 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1280 * allocate flowvalve structure
1282 static struct flowvalve *
1286 struct flowvalve *fv;
1290 num = FV_FLOWLISTSIZE;
1291 fv = malloc(sizeof(struct flowvalve),
1292 M_DEVBUF, M_WAITOK);
1295 bzero(fv, sizeof(struct flowvalve));
1297 fv->fv_fves = malloc(sizeof(struct fve) * num,
1298 M_DEVBUF, M_WAITOK);
1299 if (fv->fv_fves == NULL) {
1303 bzero(fv->fv_fves, sizeof(struct fve) * num);
1306 TAILQ_INIT(&fv->fv_flowlist);
1307 for (i = 0; i < num; i++) {
1308 fve = &fv->fv_fves[i];
1309 fve->fve_lastdrop.tv_sec = 0;
1310 TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1313 /* initialize drop rate threshold in scaled fixed-point */
1314 fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1316 /* initialize drop rate to fraction table */
1317 fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1318 M_DEVBUF, M_WAITOK);
1319 if (fv->fv_p2ftab == NULL) {
1320 free(fv->fv_fves, M_DEVBUF);
1325 * create the p2f table.
1326 * (shift is used to keep the precision)
1328 for (i = 1; i < BRTT_SIZE; i++) {
1331 f = brtt_tab[i] << 8;
1332 fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1339 static void fv_destroy(fv)
1340 struct flowvalve *fv;
1342 free(fv->fv_p2ftab, M_DEVBUF);
1343 free(fv->fv_fves, M_DEVBUF);
/*
 * NOTE(review): truncated fragments of fv_p2f(), fv_checkflow() and
 * fv_dropbyred(), preserved byte-identical (headers, locals, braces and
 * returns are elided; stray listing numbers remain).
 * fv_p2f: map a scaled drop rate p to a bandwidth fraction via
 * fv_p2ftab, clamping to the table's last entry / index 1.
 * fv_checkflow: per-packet flow-valve check -- update the flow's EWMA
 * fraction every FV_N packets, trip Green->Red when the drop rate
 * exceeds fv_pthresh and the fraction exceeds the p-derived threshold,
 * release Red flows that have backed off, and pre-drop blocked flows.
 * fv_dropbyred: charge a RED drop to the flow (p = Wp + (1-Wp)*p),
 * allocating/reclaiming an fve when none is cached.
 */
1349 struct flowvalve *fv;
1355 f = fv->fv_p2ftab[BRTT_SIZE-1];
1356 else if ((val = (p & BRTT_MASK)))
1357 f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1359 f = fv->fv_p2ftab[1];
1364 * check if an arriving packet should be pre-dropped.
1365 * called from red_addq() when a packet arrives.
1366 * returns 1 when the packet should be pre-dropped.
1367 * should be called in splimp.
1370 fv_checkflow(fv, pktattr, fcache)
1371 struct flowvalve *fv;
1372 struct altq_pktattr *pktattr;
1373 struct fve **fcache;
1381 if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1382 /* no matching entry in the flowlist */
1387 /* update fraction f for every FV_N packets */
1388 if (++fve->fve_count == FV_N) {
1390 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1393 (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1394 + fve->fve_f - FV_FUNSCALE(fve->fve_f);
1395 fve->fve_ifseq = fv->fv_ifseq;
1402 if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1405 /* calculate a threshold */
1406 fthresh = fv_p2f(fv, fve->fve_p);
1407 if (fve->fve_f > fthresh)
1408 fve->fve_state = Red;
1411 if (fve->fve_state == Red) {
1415 if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1416 /* no drop for at least FV_BACKOFFTHRESH sec */
1418 fve->fve_state = Green;
1420 fv->fv_stats.escape++;
1423 /* block this flow */
1424 flowlist_move_to_head(fv, fve);
1425 fve->fve_lastdrop = now;
1427 fv->fv_stats.predrop++;
1436 fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1440 fv->fv_stats.pass++;
1446 * called from red_addq when a packet is dropped by red.
1447 * should be called in splimp.
1449 static void fv_dropbyred(fv, pktattr, fcache)
1450 struct flowvalve *fv;
1451 struct altq_pktattr *pktattr;
1457 if (pktattr == NULL)
1462 /* the fve of this packet is already cached */
1464 else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1465 fve = flowlist_reclaim(fv, pktattr);
1467 flowlist_move_to_head(fv, fve);
1470 * update p: the following line cancels the update
1471 * in fv_checkflow() and calculate
1472 * p = Wp + (1 - Wp) * p
1474 fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1476 fve->fve_lastdrop = now;
1479 #endif /* ALTQ_FLOWVALVE */
/*
 * NOTE(review): KLD module glue, preserved byte-identical.  red_sw wires
 * the device entry points (redopen/redclose/redioctl) into the ALTQ
 * switch table; trailing #endifs close conditionals opened earlier.
 */
1483 static struct altqsw red_sw =
1484 {"red", redopen, redclose, redioctl};
1486 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1487 MODULE_VERSION(altq_red, 1);
1489 #endif /* KLD_MODULE */
1490 #endif /* ALTQ3_COMPAT */
1492 #endif /* ALTQ_RED */