2 * Copyright (C) 1997-2003
3 * Sony Computer Science Laboratories Inc. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * Copyright (c) 1990-1994 Regents of the University of California.
29 * All rights reserved.
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
34 * 1. Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in the
38 * documentation and/or other materials provided with the distribution.
39 * 3. All advertising materials mentioning features or use of this software
40 * must display the following acknowledgement:
41 * This product includes software developed by the Computer Systems
42 * Engineering Group at Lawrence Berkeley Laboratory.
43 * 4. Neither the name of the University nor of the Laboratory may be used
44 * to endorse or promote products derived from this software without
45 * specific prior written permission.
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
65 #include "opt_inet6.h"
66 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */
68 #include <sys/param.h>
69 #include <sys/malloc.h>
71 #include <sys/socket.h>
72 #include <sys/systm.h>
73 #include <sys/errno.h>
74 #if 1 /* ALTQ3_COMPAT */
75 #include <sys/sockio.h>
77 #include <sys/kernel.h>
79 #include <sys/queue.h>
82 #endif /* ALTQ3_COMPAT */
85 #include <net/if_var.h>
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/ip.h>
91 #include <netinet/ip6.h>
94 #include <netpfil/pf/pf.h>
95 #include <netpfil/pf/pf_altq.h>
96 #include <netpfil/pf/pf_mtag.h>
97 #include <net/altq/altq.h>
98 #include <net/altq/altq_red.h>
101 * ALTQ/RED (Random Early Detection) implementation using 32-bit
102 * fixed-point calculation.
104 * written by kjc using the ns code as a reference.
105 * you can learn more about red and ns from Sally's home page at
106 * http://www-nrg.ee.lbl.gov/floyd/
108 * most of the red parameter values are fixed in this implementation
109 * to prevent fixed-point overflow/underflow.
110 * if you change the parameters, watch out for overflow/underflow!
112 * the parameters used are the values recommended by Sally.
113 * the corresponding ns config looks like:
115 * minthresh=5 maxthresh=15 queue-size=60
118 * bytes=false (can't be handled by 32-bit fixed-point)
119 * doubleq=false dqthresh=false
123 * alternative red parameters for a slow link.
125 * assume the queue length jumps from zero to L and then stays at L;
126 * it takes N packets for q_avg to reach 63% of L.
127 * when q_weight is 0.002, N is about 500 packets.
128 * for a slow link like dial-up, 500 packets takes more than 1 minute!
129 * when q_weight is 0.008, N is about 127 packets.
130 * when q_weight is 0.016, N is about 63 packets.
131 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
132 * are allowed for 0.016.
133 * see Sally's paper for more details.
/*
 * Compile-time RED parameters.
 * The three W_WEIGHT* values are the inverse of the EWMA weight q_weight;
 * red_alloc() states that a weight must be a power of 2, which 512, 128
 * and 64 all are.  FP_SHIFT is the number of fraction bits: fixed-point
 * values in this file are formed with "<< FP_SHIFT".
 */
135 /* normal red parameters */
136 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
137 /* q_weight = 0.00195 */
139 /* red parameters for a slow link */
140 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
141 /* q_weight = 0.0078125 */
143 /* red parameters for a very slow link (e.g., dialup) */
144 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
145 /* q_weight = 0.015625 */
147 /* fixed-point uses 12-bit decimal places */
148 #define FP_SHIFT 12 /* fixed-point shift */
150 /* red parameters for drop probability */
151 #define INV_P_MAX 10 /* inverse of max drop probability */
152 #define TH_MIN 5 /* min threshold */
153 #define TH_MAX 15 /* max threshold */
155 #define RED_LIMIT 60 /* default max queue length */
156 #define RED_STATS /* collect statistics */
159 * our default policy for forced-drop is drop-tail.
160 * (in altq-1.1.2 or earlier, the default was random-drop.
161 * but it makes more sense to punish the cause of the surge.)
162 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
/*
 * File-local defaults, initialised from TH_MIN, TH_MAX and INV_P_MAX.
 * red_alloc() copies them into rp->red_thmin, rp->red_thmax and
 * rp->red_inv_pmax on one of its two assignment paths (the condition
 * choosing the path is not visible in this excerpt).
 */
166 /* default red parameter values */
167 static int default_th_min = TH_MIN;
168 static int default_th_max = TH_MAX;
169 static int default_inv_pmax = INV_P_MAX;
173 * red support routines
/*
 * red_alloc: allocate a zero-filled red_t (malloc with M_NOWAIT | M_ZERO)
 * and fill in its RED parameters: weight, weight table, inv_pmax, thmin,
 * thmax, flags and pkttime, followed by the derived scaled thresholds
 * (red_thmin_s, red_thmax_s) and the drop-probability denominator
 * (red_probd).  red_last is stamped with microtime().
 *
 * Each parameter has two visible assignments, one from a built-in default
 * and one from the caller's argument.
 *
 * NOTE(review): the embedded line numbers jump (176 -> 183 -> 188 ...), so
 * the conditions selecting between the paired assignments, the failure
 * paths and the return statement are not visible here; only the visible
 * statements are described.
 */
176 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
183 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
188 rp->red_weight = W_WEIGHT;
190 rp->red_weight = weight;
/*
 * NOTE(review): the table is built from red_weight as it stands here, but
 * the slow-link branch further down may still replace red_weight with
 * W_WEIGHT_1 or W_WEIGHT_2, and no later wtab_alloc() call is visible.
 * Confirm the table is meant to follow the pre-adjustment weight.
 */
192 /* allocate weight table */
193 rp->red_wtab = wtab_alloc(rp->red_weight);
194 if (rp->red_wtab == NULL) {
203 rp->red_inv_pmax = default_inv_pmax;
205 rp->red_inv_pmax = inv_pmax;
207 rp->red_thmin = default_th_min;
209 rp->red_thmin = th_min;
211 rp->red_thmax = default_th_max;
213 rp->red_thmax = th_max;
215 rp->red_flags = flags;
218 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
219 rp->red_pkttime = 800;
221 rp->red_pkttime = pkttime;
/*
 * red_pkttime is used as microseconds per packet: 1000000 / red_pkttime
 * yields packets per second.  The division needs red_pkttime != 0;
 * presumably the default of 800 above covers pkttime == 0 -- confirm
 * against the condition that is not shown.
 */
224 /* when the link is very slow, adjust red parameters */
225 npkts_per_sec = 1000000 / rp->red_pkttime;
226 if (npkts_per_sec < 50) {
227 /* up to about 400Kbps */
228 rp->red_weight = W_WEIGHT_2;
229 } else if (npkts_per_sec < 300) {
230 /* up to about 2.4Mbps */
231 rp->red_weight = W_WEIGHT_1;
235 /* calculate wshift. weight must be power of 2 */
237 for (i = 0; w > 1; i++)
240 w = 1 << rp->red_wshift;
241 if (w != rp->red_weight) {
242 printf("invalid weight value %d for red! use %d\n",
248 * thmin_s and thmax_s are scaled versions of th_min and th_max
249 * to be compared with avg.
/* scale factor is 2^(red_wshift + FP_SHIFT): weight scaling plus fraction bits */
251 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
252 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
255 * precompute probability denominator
256 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
258 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
259 * rp->red_inv_pmax) << FP_SHIFT;
261 microtime(&rp->red_last);
/*
 * red_destroy: release rp's weight table through wtab_destroy().
 * NOTE(review): the rest of the routine is not in this excerpt, so what
 * is done with rp itself afterwards cannot be confirmed from here.
 */
266 red_destroy(red_t *rp)
268 wtab_destroy(rp->red_wtab);
/*
 * red_getstats: copy RED state from rp into the caller's *sp.
 * q_avg is derived from rp->red_avg; the counters are copied unchanged
 * from rp->red_stats.
 */
273 red_getstats(red_t *rp, struct redstats *sp)
/*
 * red_avg is kept scaled by the weight (see the estimator in red_addq);
 * shifting by red_wshift removes that factor.  The result presumably
 * still carries the FP_SHIFT fraction bits -- confirm with consumers.
 */
275 sp->q_avg = rp->red_avg >> rp->red_wshift;
276 sp->xmit_cnt = rp->red_stats.xmit_cnt;
277 sp->drop_cnt = rp->red_stats.drop_cnt;
278 sp->drop_forced = rp->red_stats.drop_forced;
279 sp->drop_unforced = rp->red_stats.drop_unforced;
280 sp->marked_packets = rp->red_stats.marked_packets;
/*
 * red_addq: RED enqueue decision for packet m on queue q.
 * Visible steps:
 *  1. after an idle period, decay avg according to the elapsed time;
 *  2. run the EWMA estimator on the current queue length and store the
 *     result in rp->red_avg;
 *  3. classify the packet as DTYPE_NODROP, DTYPE_EARLY or DTYPE_FORCED
 *     from avg, the scaled thresholds, drop_early() and the queue limit;
 *  4. update the counters in rp->red_stats for the outcome.
 * When REDF_ECN is set and mark_ecn() succeeds, the packet is counted in
 * marked_packets rather than dropped.
 *
 * NOTE(review): declarations, the enqueue/free calls and the return
 * statements are on lines missing from this excerpt (the embedded
 * numbering has gaps); only the visible statements are described.
 */
284 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
285 struct altq_pktattr *pktattr)
293 * if we were idle, we pretend that n packets arrived during
302 t = (now.tv_sec - rp->red_last.tv_sec);
305 * being idle for more than 1 minute, set avg to zero.
306 * this prevents t from overflow.
/*
 * t becomes the idle time in microseconds; n is the number of
 * red_pkttime-sized slots that fit into it, minus one.
 */
310 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
311 n = t / rp->red_pkttime - 1;
313 /* the following line does (avg = (1 - Wq)^n * avg) */
315 avg = (avg >> FP_SHIFT) *
316 pow_w(rp->red_wtab, n);
/*
 * EWMA step in scaled form: add the current queue length (with FP_SHIFT
 * fraction bits) and subtract avg / weight (the >> red_wshift).
 */
320 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
321 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
322 rp->red_avg = avg; /* save the new value */
325 * red_count keeps a tally of arriving traffic that has not
330 /* see if we drop early */
331 droptype = DTYPE_NODROP;
332 if (avg >= rp->red_thmin_s && qlen(q) > 1) {
333 if (avg >= rp->red_thmax_s) {
334 /* avg >= th_max: forced drop */
335 droptype = DTYPE_FORCED;
336 } else if (rp->red_old == 0) {
337 /* first exceeds th_min */
/*
 * drop_early() receives the excess over th_min with the weight scaling
 * removed, the precomputed denominator red_probd, and red_count.
 */
340 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
341 rp->red_probd, rp->red_count)) {
342 /* mark or drop by red */
343 if ((rp->red_flags & REDF_ECN) &&
344 mark_ecn(m, pktattr, rp->red_flags)) {
345 /* successfully marked. do not drop. */
348 rp->red_stats.marked_packets++;
351 /* unforced drop by red */
352 droptype = DTYPE_EARLY;
361 * if the queue length hits the hard limit, it's a forced drop.
363 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
364 droptype = DTYPE_FORCED;
366 #ifdef RED_RANDOM_DROP
367 /* if successful or forced drop, enqueue this packet. */
368 if (droptype != DTYPE_EARLY)
371 /* if successful, enqueue this packet. */
372 if (droptype == DTYPE_NODROP)
375 if (droptype != DTYPE_NODROP) {
376 if (droptype == DTYPE_EARLY) {
377 /* drop the incoming packet */
379 rp->red_stats.drop_unforced++;
382 /* forced drop, select a victim packet in the queue. */
383 #ifdef RED_RANDOM_DROP
387 rp->red_stats.drop_forced++;
391 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
397 /* successfully queued */
399 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
405 * early-drop probability is calculated as follows:
406 * prob = p_max * (avg - th_min) / (th_max - th_min)
407 * prob_a = prob / (2 - count*prob)
408 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
409 * here prob_a increases as the count of successive undropped packets increases.
410 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
411 * becomes 1 when (count >= (2 / prob))).
/*
 * drop_early: decide whether the arriving packet should be dropped or
 * marked early.
 *   fp_len   - fixed-point excess of avg over th_min (numerator)
 *   fp_probd - fixed-point denominator precomputed in red_alloc()
 *   count    - tally passed by the caller (rp->red_count in red_addq)
 * d = fp_probd - count * fp_len is the effective denominator.  The visible
 * test draws arc4random() % d and compares it with fp_len, so it succeeds
 * with probability of roughly fp_len / d.
 *
 * NOTE(review): the guard on d and the return statements are on lines
 * missing from this excerpt (numbering jumps 418 -> 420 -> 424 -> 429).
 */
414 drop_early(int fp_len, int fp_probd, int count)
416 int d; /* denominator of drop-probability */
418 d = fp_probd - count * fp_len;
420 /* count exceeds the hard limit: drop or mark */
424 * now the range of d is [1..600] in fixed-point. (when
425 * th_max-th_min=10 and p_max=1/30)
426 * drop probability = (avg - TH_MIN) / d
/*
 * arc4random() returns an unsigned 32-bit value, so both the % and the <
 * are evaluated in unsigned arithmetic.  d must be > 0 and fp_len >= 0
 * here; presumably the check above and the caller's avg >= red_thmin_s
 * test guarantee that -- confirm.
 */
429 if ((arc4random() % d) < fp_len) {
438 * try to mark CE bit to the packet.
439 * returns 1 if successfully marked, 0 otherwise.
/*
 * mark_ecn: try to set the ECN CE codepoint in the IP header of m.
 * Visible behaviour:
 *  - a pf mbuf tag is looked up with pf_find_mtag(); the header pointer
 *    hdr is checked to lie inside one of the mbufs of the chain before it
 *    is dereferenced;
 *  - dispatch is on the IP version nibble of hdr;
 *  - IPv4 (only with REDF_ECN4): returns 0 for not-ECT, 1 if CE is already
 *    set; otherwise ORs IPTOS_ECN_CE into ip_tos and updates ip_sum
 *    incrementally;
 *  - IPv6 (only with REDF_ECN6): same checks on the ECN bits of ip6_flow,
 *    then ORs in CE.  No checksum update appears in the IPv6 branch.
 *
 * NOTE(review): declarations, where hdr is taken from, several case
 * labels and the final returns are on lines missing from this excerpt.
 */
442 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
448 at = pf_find_mtag(m);
454 /* verify that pattr_hdr is within the mbuf data */
455 for (m0 = m; m0 != NULL; m0 = m0->m_next)
456 if (((caddr_t)hdr >= m0->m_data) &&
457 ((caddr_t)hdr < m0->m_data + m0->m_len))
460 /* ick, tag info is stale */
464 switch (((struct ip *)hdr)->ip_v) {
466 if (flags & REDF_ECN4) {
472 return (0); /* version mismatch! */
474 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
475 return (0); /* not-ECT */
476 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
477 return (1); /* already marked */
480 * ecn-capable but not marked,
481 * mark CE and update checksum
484 ip->ip_tos |= IPTOS_ECN_CE;
486 * update checksum (from RFC1624)
487 * HC' = ~(~HC + ~m + m')
/*
 * In the formula, m is the old TOS and m' the new one: ~otos contributes
 * ~m (otos presumably holds ip_tos from before the |= above -- its
 * assignment is not shown) and ip->ip_tos contributes m'.  The two folds
 * bring the carries back into 16 bits.
 */
489 sum = ~ntohs(ip->ip_sum) & 0xffff;
490 sum += (~otos & 0xffff) + ip->ip_tos;
491 sum = (sum >> 16) + (sum & 0xffff);
492 sum += (sum >> 16); /* add carry */
493 ip->ip_sum = htons(~sum & 0xffff);
498 case (IPV6_VERSION >> 4):
499 if (flags & REDF_ECN6) {
500 struct ip6_hdr *ip6 = hdr;
/*
 * Work on a host-order copy of ip6_flow: the top 4 bits (>> 28) are the
 * version, and the IPv4 ECN constants shifted left by 20 line up with the
 * ECN bits inside this word.
 */
503 flowlabel = ntohl(ip6->ip6_flow);
504 if ((flowlabel >> 28) != 6)
505 return (0); /* version mismatch! */
506 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
507 (IPTOS_ECN_NOTECT << 20))
508 return (0); /* not-ECT */
509 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
510 (IPTOS_ECN_CE << 20))
511 return (1); /* already marked */
513 * ecn-capable but not marked, mark CE
515 flowlabel |= (IPTOS_ECN_CE << 20);
516 ip6->ip6_flow = htonl(flowlabel);
534 if ((m = _getq(q)) == NULL) {
535 if (rp->red_idle == 0) {
537 microtime(&rp->red_last);
547 * helper routine to calibrate avg during idle.
548 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
549 * here Wq = 1/weight and the code assumes Wq is close to zero.
551 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
/*
 * Head of the singly linked list (chained through w_next) of weight
 * tables.  wtab_alloc() searches it by w_weight and links new tables in at
 * the head; wtab_destroy() unlinks entries from it.
 */
553 static struct wtab *wtab_list = NULL; /* pointer to wtab list */
/*
 * wtab_alloc: obtain the weight table for `weight`.
 * The list at wtab_list is searched first for an entry whose w_weight
 * matches.  Otherwise a zero-filled struct wtab is allocated
 * (M_NOWAIT | M_ZERO), given w_weight = weight and linked in front of the
 * current list head.
 *
 * Table contents: w_tab[0] = (weight - 1) / weight in fixed-point, i.e.
 * (1 - Wq); each following entry is the square of the previous one, so
 * w_tab[i] = (1 - Wq)^(2^i).  w_param_max is set once, to 1 << i, for the
 * first i whose entry underflows to 0; the "== 0" test relies on the
 * zero-filled allocation.
 *
 * NOTE(review): what happens on a list hit, the malloc failure path, the
 * update of wtab_list and the return are on lines missing from this
 * excerpt.  weight == 0 would divide by zero at w_tab[0]; presumably
 * callers never pass 0 -- confirm.
 */
556 wtab_alloc(int weight)
561 for (w = wtab_list; w != NULL; w = w->w_next)
562 if (w->w_weight == weight) {
567 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
570 w->w_weight = weight;
572 w->w_next = wtab_list;
575 /* initialize the weight table */
576 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
577 for (i = 1; i < 32; i++) {
578 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
579 if (w->w_tab[i] == 0 && w->w_param_max == 0)
580 w->w_param_max = 1 << i;
/*
 * wtab_destroy: drop one reference on weight table w.
 * w_refcount is decremented first; the statement taken while it is still
 * positive is not shown.  Otherwise w is unlinked from wtab_list, either
 * by moving the list head to w->w_next or by walking the list and
 * patching the predecessor's w_next.
 *
 * NOTE(review): the test choosing between the two unlink paths and the
 * release of w itself are on lines missing from this excerpt.
 */
587 wtab_destroy(struct wtab *w)
591 if (--w->w_refcount > 0)
595 wtab_list = w->w_next;
596 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
597 if (prev->w_next == w) {
598 prev->w_next = w->w_next;
/*
 * pow_w: per the file comment, returns (1 - Wq)^n in fixed-point using the
 * precomputed powers in w->w_tab.
 * Visible pieces: n is first compared with w->w_param_max (set by
 * wtab_alloc() to the power of two at which table entries underflow to
 * 0), and the result is accumulated by multiplying val with selected
 * w_tab[i] entries, shifting right by FP_SHIFT after each product to stay
 * in fixed-point.
 *
 * NOTE(review): the loop that selects i, the initial value of val and the
 * return statements are on lines missing from this excerpt.
 */
607 pow_w(struct wtab *w, int n)
612 if (n >= w->w_param_max)
623 val = (val * w->w_tab[i]) >> FP_SHIFT;
633 #endif /* ALTQ_RED */