3 * SPDX-License-Identifier: BSD-3-Clause
5 * Copyright (c) 2018-2019
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * Author: Randall Stewart <rrs@netflix.com>
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_ratelimit.h"
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/eventhandler.h>
50 #include <sys/mutex.h>
52 #define TCPSTATES /* for logging */
53 #include <netinet/in.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/tcp_var.h>
57 #include <netinet6/tcp6_var.h>
59 #include <netinet/tcp_ratelimit.h>
60 #ifndef USECS_IN_SECOND
61 #define USECS_IN_SECOND 1000000
64 * For the purposes of each send, what is the size
65 * of an ethernet frame.
67 #ifndef ETHERNET_SEGMENT_SIZE
68 #define ETHERNET_SEGMENT_SIZE 1500
/*
 * M_TCPPACE is the malloc type used for the rate sets and rate tables
 * allocated and freed in this file.
 *
 * desired_rates[] is the canned table of rates, in bytes per second, that
 * rt_setup_new_rs() uses as its source table when the driver reports
 * RT_IS_SELECTABLE.  COMMON_RATE is the max_rate that
 * rt_find_real_interface() places in its tag-allocation request.
 * MAX_HDWR_RATES is the number of entries in desired_rates[].
 * NOTE(review): this listing does not show every table entry.
 */
70 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
73 #define COMMON_RATE 180500
74 uint64_t desired_rates[] = {
76 180500, /* 1.44Mbps */
87 12500000, /* 100Mbps */
88 25000000, /* 200Mbps */
89 50000000, /* 400Mbps */
90 100000000, /* 800Mbps */
98 10000000, /* 80Mbps */
99 18750000, /* 150Mbps */
100 20000000, /* labelled 250Mbps; NOTE(review): 20000000 B/s * 8 is 160Mbps - confirm value or label */
101 37500000, /* labelled 350Mbps; NOTE(review): 37500000 B/s * 8 is 300Mbps - confirm value or label */
102 62500000, /* 500Mbps */
103 78125000, /* 625Mbps */
104 125000000, /* 1Gbps */
106 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
/*
 * Table-layout constants and bit-rate thresholds.
 * RS_ORDERED_COUNT and RS_NEXT_ORDER_GROUP describe the two ordered groups
 * inside desired_rates[]; populate_canned_table() starts its second cursor
 * at RS_NEXT_ORDER_GROUP.  ALL_HARDWARE_RATES is the size of the generated
 * table that rt_setup_new_rs() builds when the interface offers at least
 * that many rates.  The RS_..._PERSEC values are in bits per second and
 * are compared against bytes_per_sec * 8 in tcp_int_find_suitable_rate().
 */
107 #define RS_ORDERED_COUNT 16 /*
108 * Number that are in order
109 * at the beginning of the table,
110 * over this a sort is required.
112 #define RS_NEXT_ORDER_GROUP 16 /*
113 * The point in our table where
114 * we come fill in a second ordered
115 * group (index wise means -1).
117 #define ALL_HARDWARE_RATES 1004 /*
118 * 1Meg - 1Gig in 1 Meg steps
119 * plus 100, 200k and 500k and
123 #define RS_ONE_MEGABIT_PERSEC 1000000
124 #define RS_ONE_GIGABIT_PERSEC 1000000000
125 #define RS_TEN_GIGABIT_PERSEC 10000000000
/*
 * Module-wide state.
 * int_rs is the list of per-interface rate sets: it is walked with
 * CK_LIST_FOREACH and new sets are added with CK_LIST_INSERT_HEAD.
 * rs_mtx is a MTX_DEF mutex initialised in tcp_rs_init().
 * rs_number_alive and rs_number_dead are zeroed in tcp_rs_init();
 * presumably they back the "alive" and "dead" sysctls declared below
 * (the lines binding them are not visible in this listing - confirm).
 * The "rl" node under _net_inet_tcp is the parent of the per-interface
 * nodes that rt_setup_new_rs() creates.
 */
127 static struct head_tcp_rate_set int_rs;
128 static struct mtx rs_mtx;
129 uint32_t rs_number_alive;
130 uint32_t rs_number_dead;
132 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
133 "TCP Ratelimit stats");
134 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
136 "Number of interfaces initialized for ratelimiting");
137 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
139 "Number of interfaces departing from ratelimiting");
/*
 * rl_add_syctl_entries() - publish one rate set under its sysctl node.
 *
 * rl_sysctl_root: node the entries are attached to (created by the caller).
 * rs:             rate set whose fields are exported.  Every OID is added
 *                 to rs->sysctl_ctx, the context rs_destroy() later hands
 *                 to sysctl_ctx_free().
 *
 * "disable" is added read-only when the set is flagged RS_INTF_NO_SUP and
 * read-write otherwise.  "minseg" and "flow_limit" are writable; "highest",
 * "lowest", "flags", "numrates" and "flows_using" are read-only.  When
 * DETAILED_RATELIMIT_SYSCTL is defined and the set has a rate table, one
 * sub-node per rate exports that rate's flags, pacetime and rate.
 *
 * The "flags" and "numrates" descriptions had typos ("What lags",
 * "rates re there"); they are user-visible sysctl text and are corrected
 * here.
 */
142 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
145 * Add sysctl entries for this interface.
147 if (rs->rs_flags & RS_INTF_NO_SUP) {
148 SYSCTL_ADD_S32(&rs->sysctl_ctx,
149 SYSCTL_CHILDREN(rl_sysctl_root),
150 OID_AUTO, "disable", CTLFLAG_RD,
152 "Disable this interface from new hdwr limiting?");
154 SYSCTL_ADD_S32(&rs->sysctl_ctx,
155 SYSCTL_CHILDREN(rl_sysctl_root),
156 OID_AUTO, "disable", CTLFLAG_RW,
158 "Disable this interface from new hdwr limiting?");
160 SYSCTL_ADD_S32(&rs->sysctl_ctx,
161 SYSCTL_CHILDREN(rl_sysctl_root),
162 OID_AUTO, "minseg", CTLFLAG_RW,
164 "What is the minimum we need to send on this interface?");
165 SYSCTL_ADD_U64(&rs->sysctl_ctx,
166 SYSCTL_CHILDREN(rl_sysctl_root),
167 OID_AUTO, "flow_limit", CTLFLAG_RW,
168 &rs->rs_flow_limit, 0,
169 "What is the limit for number of flows (0=unlimited)?");
170 SYSCTL_ADD_S32(&rs->sysctl_ctx,
171 SYSCTL_CHILDREN(rl_sysctl_root),
172 OID_AUTO, "highest", CTLFLAG_RD,
173 &rs->rs_highest_valid, 0,
174 "Highest valid rate");
175 SYSCTL_ADD_S32(&rs->sysctl_ctx,
176 SYSCTL_CHILDREN(rl_sysctl_root),
177 OID_AUTO, "lowest", CTLFLAG_RD,
178 &rs->rs_lowest_valid, 0,
179 "Lowest valid rate");
180 SYSCTL_ADD_S32(&rs->sysctl_ctx,
181 SYSCTL_CHILDREN(rl_sysctl_root),
182 OID_AUTO, "flags", CTLFLAG_RD,
184 "What flags are on the entry?");
185 SYSCTL_ADD_S32(&rs->sysctl_ctx,
186 SYSCTL_CHILDREN(rl_sysctl_root),
187 OID_AUTO, "numrates", CTLFLAG_RD,
189 "How many rates are there?");
190 SYSCTL_ADD_U64(&rs->sysctl_ctx,
191 SYSCTL_CHILDREN(rl_sysctl_root),
192 OID_AUTO, "flows_using", CTLFLAG_RD,
193 &rs->rs_flows_using, 0,
194 "How many flows are using this interface now?");
195 #ifdef DETAILED_RATELIMIT_SYSCTL
196 if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
197 /* Lets display the rates */
199 struct sysctl_oid *rl_rates;
200 struct sysctl_oid *rl_rate_num;
202 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
203 SYSCTL_CHILDREN(rl_sysctl_root),
208 for( i = 0; i < rs->rs_rate_cnt; i++) {
209 sprintf(rate_num, "%d", i);
210 rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
211 SYSCTL_CHILDREN(rl_rates),
216 SYSCTL_ADD_U32(&rs->sysctl_ctx,
217 SYSCTL_CHILDREN(rl_rate_num),
218 OID_AUTO, "flags", CTLFLAG_RD,
219 &rs->rs_rlt[i].flags, 0,
220 "Flags on this rate");
221 SYSCTL_ADD_U32(&rs->sysctl_ctx,
222 SYSCTL_CHILDREN(rl_rate_num),
223 OID_AUTO, "pacetime", CTLFLAG_RD,
224 &rs->rs_rlt[i].time_between, 0,
225 "Time hardware inserts between 1500 byte sends");
226 SYSCTL_ADD_U64(&rs->sysctl_ctx,
227 SYSCTL_CHILDREN(rl_rate_num),
228 OID_AUTO, "rate", CTLFLAG_RD,
229 &rs->rs_rlt[i].rate, 0,
230 "Rate in bytes per second");
/*
 * rs_destroy() - deferred teardown of a rate set; passed to epoch_call()
 * as the callback by the departure, shutdown and release paths.
 *
 * ctx is the rs_epoch_ctx member embedded in the rate set; the owning
 * struct tcp_rate_set is recovered with __containerof().  The pending
 * funeral flag (RS_FUNERAL_SCHD) is cleared and rs_flows_using is then
 * re-checked, because (per the comment below) a lookup may have taken a
 * new reference while the funeral was being scheduled.  The sysctl context
 * and the rate table are released at the end.
 * NOTE(review): the locking and the branch on do_free_rs are not visible
 * in this listing - confirm against the full source.
 */
237 rs_destroy(epoch_context_t ctx)
239 struct tcp_rate_set *rs;
242 rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
245 rs->rs_flags &= ~RS_FUNERAL_SCHD;
247 * In theory its possible (but unlikely)
248 * that while the delete was occuring
249 * and we were applying the DEAD flag
250 * someone slipped in and found the
251 * interface in a lookup. While we
252 * decided rs_flows_using were 0 and
253 * scheduling the epoch_call, the other
254 * thread incremented rs_flow_using. This
255 * is because users have a pointer and
256 * we only use the rs_flows_using in an
257 * atomic fashion, i.e. the other entities
258 * are not protected. To assure this did
259 * not occur, we check rs_flows_using here
262 do_free_rs = (rs->rs_flows_using == 0);
267 sysctl_ctx_free(&rs->sysctl_ctx);
268 free(rs->rs_rlt, M_TCPPACE);
/*
 * Statistics counters declared extern here (defined outside this listing);
 * rl_attach_txrtlmt() adds to them with counter_u64_add().
 */
274 extern counter_u64_t rate_limit_set_ok;
275 extern counter_u64_t rate_limit_active;
276 extern counter_u64_t rate_limit_alloc_fail;
/*
 * rl_attach_txrtlmt() - ask the driver for a rate-limit send tag.
 *
 * ifp: interface whose if_snd_tag_alloc method is called (the NULL case
 *      is handled separately).
 * tag: out-parameter that receives the allocated send tag.
 * The flowtype, flowid and cfg_rate values are copied into the
 * IF_SND_TAG_TYPE_RATE_LIMIT request; the allocation is M_NOWAIT.
 *
 * rate_limit_set_ok / rate_limit_active and rate_limit_alloc_fail are
 * incremented below; presumably on success and failure respectively (the
 * branch lines are not visible in this listing - confirm).
 *
 * The argument to if_snd_tag_alloc() had been mangled into a pilcrow
 * sequence by an HTML-entity decode; it is the address of the local
 * "params" union and is restored here.
 */
280 rl_attach_txrtlmt(struct ifnet *ifp,
284 struct m_snd_tag **tag)
287 union if_snd_tag_alloc_params params = {
288 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
289 .rate_limit.hdr.flowid = flowid,
290 .rate_limit.hdr.flowtype = flowtype,
291 .rate_limit.max_rate = cfg_rate,
292 .rate_limit.flags = M_NOWAIT,
295 if (ifp->if_snd_tag_alloc == NULL) {
298 error = ifp->if_snd_tag_alloc(ifp, &params, tag);
302 counter_u64_add(rate_limit_set_ok, 1);
303 counter_u64_add(rate_limit_active, 1);
305 counter_u64_add(rate_limit_alloc_fail, 1);
/*
 * populate_canned_table() - fill rs->rs_rlt[] by merging the two ordered
 * groups of rate_table_act.
 *
 * rs:             rate set whose slots 0 .. rs_rate_cnt-1 are written; the
 *                 flags and time_between of every slot are zeroed.
 * rate_table_act: source table; at_low walks it from index 0 and at_high
 *                 from RS_NEXT_ORDER_GROUP.
 *
 * A slot takes the low-group candidate when it is smaller than the
 * high-group candidate, otherwise the high-group one.  The low cursor is
 * tested against RS_NEXT_ORDER_GROUP and the high cursor against
 * MAX_HDWR_RATES to detect the end of each group.
 * NOTE(review): the cursor increments and the statements that set
 * low_disabled / high_disabled are not visible in this listing - confirm.
 */
312 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
315 * The internal table is "special", it
316 * is two seperate ordered tables that
317 * must be merged. We get here when the
318 * adapter specifies a number of rates that
319 * covers both ranges in the table in some
322 int i, at_low, at_high;
323 uint8_t low_disabled = 0, high_disabled = 0;
325 for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
326 rs->rs_rlt[i].flags = 0;
327 rs->rs_rlt[i].time_between = 0;
328 if ((low_disabled == 0) &&
330 (rate_table_act[at_low] < rate_table_act[at_high]))) {
331 rs->rs_rlt[i].rate = rate_table_act[at_low];
333 if (at_low == RS_NEXT_ORDER_GROUP)
335 } else if (high_disabled == 0) {
336 rs->rs_rlt[i].rate = rate_table_act[at_high];
338 if (at_high == MAX_HDWR_RATES)
/*
 * rt_setup_new_rs() - build the rate set for an interface and link it
 * onto int_rs.
 *
 * ifp:   interface to query through ifp->if_ratelimit_query.
 * error: out-parameter for failure codes (the assignments are not visible
 *        in this listing).
 *
 * The driver's answer in rl.flags selects one of these paths:
 *  - RT_IS_UNUSABLE:    a placeholder set flagged RS_INTF_NO_SUP.
 *  - RT_IS_INDIRECT:    a placeholder set flagged RS_IS_DEFF.
 *  - RT_IS_FIXED_TABLE: the driver's rl.rate_table is the source table and
 *                       the set is flagged RS_IS_INTF | RS_NO_PRE.
 *  - RT_IS_SELECTABLE:  desired_rates[] is the source table, clamped to
 *                       MAX_HDWR_RATES; RS_INT_TBL is added when more than
 *                       RS_ORDERED_COUNT rates are used.  When the driver
 *                       offers ALL_HARDWARE_RATES or more, a generated
 *                       table is built instead (12500, 25000, 62500, then
 *                       steps starting at 125000, last slot 1250000000).
 * For real tables each entry's time_between is ETHERNET_SEGMENT_SIZE *
 * USECS_IN_SECOND / rate (1 is stored in a branch whose condition is not
 * visible here).  Entries are initialised from the highest index down;
 * with RS_NO_PRE they are simply marked HDWRPACE_INITED, otherwise a send
 * tag is requested through rl_attach_txrtlmt().
 * Each path that keeps the set creates a sysctl node (passing
 * rs->rs_ifp->if_xname) and calls rl_add_syctl_entries().
 *
 * Both allocations use M_NOWAIT; the rate-table one is checked for NULL
 * below.
 * NOTE(review): returns, lock calls and several branches are missing from
 * this listing - confirm details against the full source.
 */
344 static struct tcp_rate_set *
345 rt_setup_new_rs(struct ifnet *ifp, int *error)
347 struct tcp_rate_set *rs;
348 const uint64_t *rate_table_act;
349 uint64_t lentim, res;
353 struct if_ratelimit_query_results rl;
354 struct sysctl_oid *rl_sysctl_root;
356 * We expect to enter with the
360 if (ifp->if_ratelimit_query == NULL) {
362 * We can do nothing if we cannot
363 * get a query back from the driver.
367 rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
373 rl.flags = RT_NOSUPPORT;
374 ifp->if_ratelimit_query(ifp, &rl);
375 if (rl.flags & RT_IS_UNUSABLE) {
377 * The interface does not really support
380 memset(rs, 0, sizeof(struct tcp_rate_set));
382 rs->rs_if_dunit = ifp->if_dunit;
383 rs->rs_flags = RS_INTF_NO_SUP;
386 sysctl_ctx_init(&rs->sysctl_ctx);
387 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
388 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
390 rs->rs_ifp->if_xname,
393 rl_add_syctl_entries(rl_sysctl_root, rs);
395 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
398 } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
399 memset(rs, 0, sizeof(struct tcp_rate_set));
401 rs->rs_if_dunit = ifp->if_dunit;
402 rs->rs_flags = RS_IS_DEFF;
404 sysctl_ctx_init(&rs->sysctl_ctx);
405 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
406 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
408 rs->rs_ifp->if_xname,
411 rl_add_syctl_entries(rl_sysctl_root, rs);
413 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
416 } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
417 /* Mellanox most likely */
419 rs->rs_if_dunit = ifp->if_dunit;
420 rs->rs_rate_cnt = rl.number_of_rates;
421 rs->rs_min_seg = rl.min_segment_burst;
422 rs->rs_highest_valid = 0;
423 rs->rs_flow_limit = rl.max_flows;
424 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
426 rate_table_act = rl.rate_table;
427 } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
430 rs->rs_if_dunit = ifp->if_dunit;
431 rs->rs_rate_cnt = rl.number_of_rates;
432 rs->rs_min_seg = rl.min_segment_burst;
434 rs->rs_flow_limit = rl.max_flows;
435 rate_table_act = desired_rates;
436 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
437 (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
439 * Our desired table is not big
440 * enough, do what we can.
442 rs->rs_rate_cnt = MAX_HDWR_RATES;
444 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
445 rs->rs_flags = RS_IS_INTF;
447 rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
448 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
449 rs->rs_rate_cnt = ALL_HARDWARE_RATES;
451 printf("Interface:%s unit:%d not one known to have rate-limits\n",
457 sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
458 rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
459 if (rs->rs_rlt == NULL) {
466 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
468 * The interface supports all
469 * the rates we could possibly want.
473 rs->rs_rlt[0].rate = 12500; /* 100k */
474 rs->rs_rlt[1].rate = 25000; /* 200k */
475 rs->rs_rlt[2].rate = 62500; /* 500k */
476 /* Note 125000 == 1Megabit
477 * populate 1Meg - 1000meg.
479 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
480 rs->rs_rlt[i].rate = rat;
483 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
484 } else if (rs->rs_flags & RS_INT_TBL) {
485 /* We populate this in a special way */
486 populate_canned_table(rs, rate_table_act);
489 * Just copy in the rates from
490 * the table, it is in order.
492 for (i=0; i<rs->rs_rate_cnt; i++) {
493 rs->rs_rlt[i].rate = rate_table_act[i];
494 rs->rs_rlt[i].time_between = 0;
495 rs->rs_rlt[i].flags = 0;
498 for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
500 * We go backwards through the list so that if we can't get
501 * a rate and fail to init one, we have at least a chance of
502 * getting the highest one.
504 rs->rs_rlt[i].ptbl = rs;
505 rs->rs_rlt[i].tag = NULL;
507 * Calculate the time between.
509 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
510 res = lentim / rs->rs_rlt[i].rate;
512 rs->rs_rlt[i].time_between = res;
514 rs->rs_rlt[i].time_between = 1;
515 if (rs->rs_flags & RS_NO_PRE) {
516 rs->rs_rlt[i].flags = HDWRPACE_INITED;
517 rs->rs_lowest_valid = i;
521 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
523 hash_type = M_HASHTYPE_OPAQUE_HASH;
525 err = rl_attach_txrtlmt(ifp,
531 if (i == (rs->rs_rate_cnt - 1)) {
533 * Huh - first rate and we can't get
536 free(rs->rs_rlt, M_TCPPACE);
546 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
547 rs->rs_lowest_valid = i;
551 /* Did we get at least 1 rate? */
552 if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
553 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
555 free(rs->rs_rlt, M_TCPPACE);
559 sysctl_ctx_init(&rs->sysctl_ctx);
560 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
561 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
563 rs->rs_ifp->if_xname,
566 rl_add_syctl_entries(rl_sysctl_root, rs);
568 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
/*
 * tcp_int_find_suitable_rate() - index-based lookup for rate sets that
 * carry the full ALL_HARDWARE_RATES table.
 *
 * rs:            rate set to search; tcp_find_suitable_rate() only calls
 *                this for RS_INT_TBL sets with >= ALL_HARDWARE_RATES rates.
 * bytes_per_sec: requested rate.
 * flags:         RS_PACING_ selection flags.
 *
 * The request is converted to bits per second (the local is named
 * mbits_per_sec but holds bytes_per_sec * 8) and falls into one of three
 * ranges: below RS_ONE_MEGABIT_PERSEC, where only slots 0-2 are examined;
 * above RS_ONE_GIGABIT_PERSEC, where only the last slot is examined; and
 * the range in between, where the slot index is computed from the megabit
 * count instead of being searched for.  The three outer branches handle
 * RS_PACING_LT, RS_PACING_EXACT_MATCH, and the remaining greater-than /
 * RS_PACING_GEQ case.  arte records an alternate entry that is considered
 * when RS_PACING_SUB_OK is set.
 * NOTE(review): the index adjustments and the final return are not visible
 * in this listing - confirm.
 */
573 static const struct tcp_hwrate_limit_table *
574 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
575 uint64_t bytes_per_sec, uint32_t flags)
577 struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
578 uint64_t mbits_per_sec, ind_calc;
581 mbits_per_sec = (bytes_per_sec * 8);
582 if (flags & RS_PACING_LT) {
583 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
584 (rs->rs_lowest_valid <= 2)){
586 * Smaller than 1Meg, only
587 * 3 entries can match it.
589 for(i = rs->rs_lowest_valid; i < 3; i++) {
590 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
591 rte = &rs->rs_rlt[i];
593 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
594 arte = &rs->rs_rlt[i];
598 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
599 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
601 * Larger than 1G (the majority of
604 if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
605 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
607 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
611 * If we reach here its in our table (between 1Meg - 1000Meg),
612 * just take the rounded down mbits per second, and add
613 * 1Megabit to it, from this we can calculate
614 * the index in the table.
616 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
617 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
619 /* our table is offset by 3, we add 2 */
621 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
622 /* This should not happen */
623 ind_calc = ALL_HARDWARE_RATES-1;
625 if ((ind_calc >= rs->rs_lowest_valid) &&
626 (ind_calc <= rs->rs_highest_valid))
627 rte = &rs->rs_rlt[ind_calc];
628 } else if (flags & RS_PACING_EXACT_MATCH) {
629 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
630 (rs->rs_lowest_valid <= 2)){
631 for(i = rs->rs_lowest_valid; i < 3; i++) {
632 if (bytes_per_sec == rs->rs_rlt[i].rate) {
633 rte = &rs->rs_rlt[i];
637 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
638 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
639 /* > 1Gbps only one rate */
640 if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
642 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
645 /* Ok it must be a exact meg (its between 1G and 1Meg) */
646 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
647 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
648 /* its an exact Mbps */
650 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
651 /* This should not happen */
652 ind_calc = ALL_HARDWARE_RATES-1;
654 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
655 rte = &rs->rs_rlt[ind_calc];
659 /* we want greater than the requested rate */
660 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
661 (rs->rs_lowest_valid <= 2)){
662 arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
663 for (i=2; i>=rs->rs_lowest_valid; i--) {
664 if (bytes_per_sec < rs->rs_rlt[i].rate) {
665 rte = &rs->rs_rlt[i];
667 } else if ((flags & RS_PACING_GEQ) &&
668 (bytes_per_sec == rs->rs_rlt[i].rate)) {
669 rte = &rs->rs_rlt[i];
672 arte = &rs->rs_rlt[i]; /* new alternate */
675 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
676 if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
677 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
678 /* Our top rate is larger than the request */
679 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
680 } else if ((flags & RS_PACING_GEQ) &&
681 (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
682 (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
683 /* It matches our top rate */
684 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
685 } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
686 /* The top rate is an alternative */
687 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
690 /* Its in our range 1Meg - 1Gig */
691 if (flags & RS_PACING_GEQ) {
692 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
693 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
694 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
695 /* This should not happen */
696 ind_calc = (ALL_HARDWARE_RATES-1);
698 rte = &rs->rs_rlt[ind_calc];
702 ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
704 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
705 /* This should not happen */
706 ind_calc = ALL_HARDWARE_RATES-1;
708 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
709 rte = &rs->rs_rlt[ind_calc];
715 (flags & RS_PACING_SUB_OK)) {
716 /* We can use the substitute */
/*
 * tcp_find_suitable_rate() - pick an entry from rs->rs_rlt[] for a request.
 *
 * rs:            rate set to search.
 * bytes_per_sec: requested rate.
 * flags:         RS_PACING_ selection flags (listed in the comment below).
 *
 * Rate sets flagged RS_INT_TBL with at least ALL_HARDWARE_RATES entries are
 * delegated to tcp_int_find_suitable_rate().  Otherwise the slots between
 * rs_lowest_valid and rs_highest_valid are scanned: forward for
 * RS_PACING_LT / RS_PACING_EXACT_MATCH, backward for the greater-than and
 * RS_PACING_GEQ cases.  When nothing matched and RS_PACING_SUB_OK is set,
 * the lowest valid entry (forward scan, RS_PACING_LT only) or the highest
 * valid entry (backward scan) is substituted.
 * NOTE(review): in the forward scan RS_PACING_LT accepts the first entry
 * whose rate is >= bytes_per_sec, which reads oddly next to the "must be
 * less than" description below - confirm intent.
 */
722 static const struct tcp_hwrate_limit_table *
723 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
726 * Hunt the rate table with the restrictions in flags and find a
727 * suitable rate if possible.
728 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
729 * RS_PACING_GT - must be greater than.
730 * RS_PACING_GEQ - must be greater than or equal.
731 * RS_PACING_LT - must be less than.
732 * RS_PACING_SUB_OK - If we don't meet criteria a
736 struct tcp_hwrate_limit_table *rte = NULL;
739 if ((rs->rs_flags & RS_INT_TBL) &&
740 (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
742 * Here we don't want to paw thru
743 * a big table, we have everything
744 * from 1Meg - 1000Meg in 1Meg increments.
745 * Use an alternate method to "lookup".
747 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
749 if ((flags & RS_PACING_LT) ||
750 (flags & RS_PACING_EXACT_MATCH)) {
752 * For exact and less than we go forward through the table.
753 * This way when we find one larger we stop (exact was a
756 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
757 if ((flags & RS_PACING_EXACT_MATCH) &&
758 (bytes_per_sec == rs->rs_rlt[i].rate)) {
759 rte = &rs->rs_rlt[i];
762 } else if ((flags & RS_PACING_LT) &&
763 (bytes_per_sec <= rs->rs_rlt[i].rate)) {
764 rte = &rs->rs_rlt[i];
768 if (bytes_per_sec > rs->rs_rlt[i].rate)
771 if ((matched == 0) &&
772 (flags & RS_PACING_LT) &&
773 (flags & RS_PACING_SUB_OK)) {
774 /* Kick in a substitute (the lowest) */
775 rte = &rs->rs_rlt[rs->rs_lowest_valid];
779 * Here we go backward through the table so that we can find
780 * the one greater in theory faster (but its probably a
783 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
784 if (rs->rs_rlt[i].rate > bytes_per_sec) {
785 /* A possible candidate */
786 rte = &rs->rs_rlt[i];
788 if ((flags & RS_PACING_GEQ) &&
789 (bytes_per_sec == rs->rs_rlt[i].rate)) {
790 /* An exact match and we want equal */
792 rte = &rs->rs_rlt[i];
796 * Found one that is larger than but don't
797 * stop, there may be a more closer match.
801 if (rs->rs_rlt[i].rate < bytes_per_sec) {
803 * We found a table entry that is smaller,
804 * stop there will be none greater or equal.
809 if ((matched == 0) &&
810 (flags & RS_PACING_SUB_OK)) {
811 /* Kick in a substitute (the highest) */
812 rte = &rs->rs_rlt[rs->rs_highest_valid];
/*
 * rt_find_real_interface() - resolve an indirect interface to the
 * interface that really carries the traffic.
 *
 * ifp:   interface whose if_snd_tag_alloc method is used for the probe.
 * inp:   connection; inp_vflag selects the IPv6 or IPv4 RSS flow type in
 *        one build variant, M_HASHTYPE_OPAQUE_HASH is used in the other.
 * error: out-parameter (assignments not visible in this listing).
 *
 * A throw-away rate-limit tag is requested at COMMON_RATE with flowid 1;
 * the tag is then released through tifp->if_snd_tag_free().
 * NOTE(review): how tifp is derived from the tag and what is returned are
 * not visible in this listing - confirm.
 *
 * The argument to if_snd_tag_alloc() had been mangled into a pilcrow
 * sequence by an HTML-entity decode; it is the address of the local
 * "params" union and is restored here.
 */
818 static struct ifnet *
819 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
822 struct m_snd_tag *tag;
823 union if_snd_tag_alloc_params params = {
824 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
825 .rate_limit.hdr.flowid = 1,
826 .rate_limit.max_rate = COMMON_RATE,
827 .rate_limit.flags = M_NOWAIT,
831 params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
832 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
834 params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
837 if (ifp->if_snd_tag_alloc) {
842 err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
844 /* Failed to setup a tag? */
850 tifp->if_snd_tag_free(tag);
/*
 * rt_setup_rate() - find the rate set for ifp and attach a rate to inp.
 *
 * inp:           connection the rate is attached to.
 * ifp:           egress interface (may be indirect, see RS_IS_DEFF below).
 * bytes_per_sec: requested rate.
 * flags:         RS_PACING_ selection flags for tcp_find_suitable_rate().
 * error:         optional out-parameter; only written when non-NULL.
 *
 * The list walk and everything after it run between epoch_enter_preempt()
 * and epoch_exit_preempt(); every visible early-exit path leaves the epoch
 * first.  Dead sets are skipped during the walk.  No rate is set up when
 * no set matches, when the set is RS_INTF_NO_SUP or RS_IS_DEAD, when it is
 * disabled, or when rs_flow_limit would be exceeded.  An RS_IS_DEFF set is
 * resolved through rt_find_real_interface() before continuing.  On the
 * path that attaches a rate, rs_flows_using is bumped atomically.
 *
 * CK_LIST_FOREACH leaves rs NULL when nothing matched, and the two checks
 * that follow are entered in exactly that case, so their reads of
 * rs->rs_disable are guarded against NULL here.
 * NOTE(review): the error values and return statements are not visible in
 * this listing - confirm that callers get a sensible *error when rs is
 * NULL.
 */
854 static const struct tcp_hwrate_limit_table *
855 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
856 uint32_t flags, int *error)
858 /* First lets find the interface if it exists */
859 const struct tcp_hwrate_limit_table *rte;
860 struct tcp_rate_set *rs;
861 struct epoch_tracker et;
864 epoch_enter_preempt(net_epoch_preempt, &et);
866 CK_LIST_FOREACH(rs, &int_rs, next) {
868 * Note we don't look with the lock since we either see a
869 * new entry or will get one when we try to add it.
871 if (rs->rs_flags & RS_IS_DEAD) {
872 /* The dead are not looked at */
875 if ((rs->rs_ifp == ifp) &&
876 (rs->rs_if_dunit == ifp->if_dunit)) {
882 (rs->rs_flags & RS_INTF_NO_SUP) ||
883 (rs->rs_flags & RS_IS_DEAD)) {
885 * This means we got a packet *before*
886 * the IF-UP was processed below, <or>
887 * while or after we already received an interface
888 * departed event. In either case we really don't
889 * want to do anything with pacing, in
890 * the departing case the packet is not
891 * going to go very far. The new case
892 * might be arguable, but its impossible
893 * to tell from the departing case.
895 if (rs != NULL && rs->rs_disable && error)
897 epoch_exit_preempt(net_epoch_preempt, &et);
901 if ((rs == NULL) || (rs->rs_disable != 0)) {
902 if (rs != NULL && rs->rs_disable && error)
904 epoch_exit_preempt(net_epoch_preempt, &et);
907 if (rs->rs_flags & RS_IS_DEFF) {
908 /* We need to find the real interface */
911 tifp = rt_find_real_interface(ifp, inp, error);
913 if (rs->rs_disable && error)
915 epoch_exit_preempt(net_epoch_preempt, &et);
918 goto use_real_interface;
920 if (rs->rs_flow_limit &&
921 ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
924 epoch_exit_preempt(net_epoch_preempt, &et);
927 rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
929 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
935 /* Failed to attach */
943 * We use an atomic here for accounting so we don't have to
944 * use locks when freeing.
946 atomic_add_64(&rs->rs_flows_using, 1);
948 epoch_exit_preempt(net_epoch_preempt, &et);
/*
 * tcp_rl_ifnet_link() - link-state handler that creates a rate set the
 * first time a rate-limit capable interface comes up.
 * Presumably this is the ifnet_link_event handler registered in
 * tcp_rs_init() (the registration line naming it is not visible in this
 * listing - confirm).
 *
 * arg:        unused.
 * ifp:        interface whose link state changed.
 * link_state: new state; anything other than LINK_STATE_UP is ignored, as
 *             are interfaces without IFCAP_TXRTLMT.
 *
 * int_rs is searched for an existing set matching ifp and its if_dunit;
 * when none exists, rt_setup_new_rs() is called.
 */
953 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
956 struct tcp_rate_set *rs;
958 if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
959 (link_state != LINK_STATE_UP)) {
961 * We only care on an interface going up that is rate-limit
967 CK_LIST_FOREACH(rs, &int_rs, next) {
968 if ((rs->rs_ifp == ifp) &&
969 (rs->rs_if_dunit == ifp->if_dunit)) {
970 /* We already have initialized this guy */
976 rt_setup_new_rs(ifp, &error);
/*
 * tcp_rl_ifnet_departure() - ifnet_departure_event handler (registered in
 * tcp_rs_init()).
 *
 * arg: unused.
 * ifp: interface that is going away.
 *
 * The rate set matching ifp and its if_dunit is unlinked from int_rs and
 * marked RS_IS_DEAD.  Every rate holding a send tag (HDWRPACE_TAGPRESENT)
 * has the tag detached with in_pcbdetach_tag() and cleared, and each
 * rate's flags are set to HDWRPACE_IFPDEPARTED.  When no flows are using
 * the set, RS_FUNERAL_SCHD is set and rs_destroy() is scheduled through
 * epoch_call(); otherwise tcp_rel_pacing_rate() can schedule it later.
 * rs_mtx is released at the end; the matching lock call is not visible in
 * this listing.
 */
980 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
982 struct tcp_rate_set *rs, *nrs;
987 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
988 if ((rs->rs_ifp == ifp) &&
989 (rs->rs_if_dunit == ifp->if_dunit)) {
990 CK_LIST_REMOVE(rs, next);
993 rs->rs_flags |= RS_IS_DEAD;
994 for (i = 0; i < rs->rs_rate_cnt; i++) {
995 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
996 tifp = rs->rs_rlt[i].tag->ifp;
997 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
998 rs->rs_rlt[i].tag = NULL;
1000 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1002 if (rs->rs_flows_using == 0) {
1004 * No references left, so we can schedule the
1005 * destruction after the epoch (with a caveat).
1007 rs->rs_flags |= RS_FUNERAL_SCHD;
1008 epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1013 mtx_unlock(&rs_mtx);
/*
 * tcp_rl_shutdown() - shutdown_pre_sync handler (registered in
 * tcp_rs_init() with SHUTDOWN_PRI_FIRST).
 *
 * arg, howto: unused.
 *
 * Every rate set on int_rs is unlinked and marked RS_IS_DEAD.  Send tags
 * are detached with in_pcbdetach_tag() and each rate's flags are set to
 * HDWRPACE_IFPDEPARTED, exactly as in tcp_rl_ifnet_departure().  A set is
 * handed to rs_destroy() through epoch_call(), with RS_FUNERAL_SCHD set,
 * on the path taken when no flows use it.
 * rs_mtx is released at the end; the matching lock call and the else
 * structure around the rs_flows_using test are not visible in this
 * listing.
 */
1017 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1019 struct tcp_rate_set *rs, *nrs;
1024 CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1025 CK_LIST_REMOVE(rs, next);
1028 rs->rs_flags |= RS_IS_DEAD;
1029 for (i = 0; i < rs->rs_rate_cnt; i++) {
1030 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1031 tifp = rs->rs_rlt[i].tag->ifp;
1032 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1033 rs->rs_rlt[i].tag = NULL;
1035 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1037 if (rs->rs_flows_using != 0) {
1039 * We dont hold a reference
1040 * so we have nothing left to
1045 * No references left, so we can destroy it
1048 rs->rs_flags |= RS_FUNERAL_SCHD;
1049 epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1052 mtx_unlock(&rs_mtx);
/*
 * tcp_set_pacing_rate() - attach a hardware pacing rate to a connection.
 *
 * tp:            connection; the first-time path is taken when its inpcb
 *                has no send tag (inp_snd_tag == NULL).
 * ifp:           egress interface; must advertise IFCAP_TXRTLMT.
 * bytes_per_sec: requested rate.
 * flags:         RS_PACING_ selection flags, passed to rt_setup_rate().
 * error:         out-parameter, passed through to rt_setup_rate().
 *
 * On the first-time path, interfaces without IFCAP_TXRTLMT and sockets
 * whose send buffer has SB_TLS_IFNET set are rejected before
 * rt_setup_rate() is called.  The other branch handles a connection that
 * already has a tag.
 * NOTE(review): error codes and return statements are not visible in this
 * listing - confirm.
 */
1055 const struct tcp_hwrate_limit_table *
1056 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1057 uint64_t bytes_per_sec, int flags, int *error)
1059 const struct tcp_hwrate_limit_table *rte;
1061 if (tp->t_inpcb->inp_snd_tag == NULL) {
1063 * We are setting up a rate for the first time.
1065 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
1066 /* Not supported by the egress */
1072 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
1074 * We currently can't do both TLS and hardware
1082 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
1085 * We are modifying a rate, wrong interface?
/*
 * tcp_chg_pacing_rate() - move a connection from its current rate entry to
 * one that suits a new request.
 *
 * crte:          rate entry the connection currently uses.
 * tp:            connection; must already hold a send tag.
 * ifp:           egress interface the caller believes is in use.
 * bytes_per_sec: newly requested rate.
 * flags:         RS_PACING_ selection flags.
 * error:         out-parameter for failure codes.
 *
 * When the rate set is RS_IS_DEAD or the entry is HDWRPACE_IFPDEPARTED the
 * old rate is released and tcp_set_pacing_rate() starts over.  Otherwise
 * the interface is validated against the rate set (through
 * rt_find_real_interface() for indirect sets), a new entry is chosen with
 * tcp_find_suitable_rate(), and the tag is updated with
 * in_pcbmodify_txrtlmt().
 * NOTE(review): how rs is obtained from crte, and the return statements,
 * are not visible in this listing - confirm.
 *
 * The indirect test now uses RS_IS_DEFF.  rs_flags only ever stores RS_
 * flags (rt_setup_new_rs() stores RS_IS_DEFF for indirect interfaces), so
 * testing it against RT_IS_INDIRECT, a driver-query flag, was correct only
 * if the two constants happen to share a value.
 */
1094 const struct tcp_hwrate_limit_table *
1095 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1096 struct tcpcb *tp, struct ifnet *ifp,
1097 uint64_t bytes_per_sec, int flags, int *error)
1099 const struct tcp_hwrate_limit_table *nrte;
1100 const struct tcp_rate_set *rs;
1101 int is_indirect = 0;
1105 if ((tp->t_inpcb->inp_snd_tag == NULL) ||
1107 /* Wrong interface */
1113 if ((rs->rs_flags & RS_IS_DEAD) ||
1114 (crte->flags & HDWRPACE_IFPDEPARTED)) {
1115 /* Release the rate, and try anew */
1117 tcp_rel_pacing_rate(crte, tp);
1118 nrte = tcp_set_pacing_rate(tp, ifp,
1119 bytes_per_sec, flags, error);
1122 if ((rs->rs_flags & RS_IS_DEFF) == RS_IS_DEFF)
1126 if ((is_indirect == 0) &&
1127 ((ifp != rs->rs_ifp) ||
1128 (ifp->if_dunit != rs->rs_if_dunit))) {
1130 * Something changed, the user is not pointing to the same
1131 * ifp? Maybe a route updated on this guy?
1134 } else if (is_indirect) {
1136 * For indirect we have to dig in and find the real interface.
1140 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
1142 /* Can't find it? */
1145 if ((rifp != rs->rs_ifp) ||
1146 (ifp->if_dunit != rs->rs_if_dunit)) {
1150 nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
1158 /* Release the old rate */
1159 tcp_rel_pacing_rate(crte, tp);
1162 /* Change rates to our new entry */
1163 err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
/*
 * tcp_rel_pacing_rate() - drop a connection's reference on its rate set
 * and detach its send tag.
 *
 * crte: rate entry the connection was using.
 * tp:   connection whose inpcb tag is detached with in_pcbdetach_txrtlmt().
 *
 * The const rate-set pointer is de-consted so rs_flows_using can be
 * decremented atomically; "pre" receives the value before the decrement.
 * If the set is RS_IS_DEAD and no funeral is pending (RS_FUNERAL_SCHD
 * clear), the flag is set and rs_destroy() is scheduled through
 * epoch_call().
 * NOTE(review): how crs is derived from crte, the test on "pre" and the
 * mtx_lock() that pairs with the unlock below are not visible in this
 * listing - confirm.
 */
1175 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1177 const struct tcp_rate_set *crs;
1178 struct tcp_rate_set *rs;
1183 * Now we must break the const
1184 * in order to release our refcount.
1186 rs = __DECONST(struct tcp_rate_set *, crs);
1187 pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1193 if ((rs->rs_flags & RS_IS_DEAD) &&
1194 ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
1197 * and a funeral is not pending, so
1198 * we must schedule it.
1200 rs->rs_flags |= RS_FUNERAL_SCHD;
1201 epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1203 mtx_unlock(&rs_mtx);
1205 in_pcbdetach_txrtlmt(tp->t_inpcb);
/*
 * Event-handler registrations made by tcp_rs_init(), kept so they can be
 * referred to later.
 */
1208 static eventhandler_tag rl_ifnet_departs;
1209 static eventhandler_tag rl_ifnet_arrives;
1210 static eventhandler_tag rl_shutdown_start;
/*
 * tcp_rs_init() - one-time module initialisation, run through the SYSINIT
 * below.
 *
 * st: unused.
 *
 * Initialises the int_rs list, zeroes the alive/dead counters, sets up
 * rs_mtx as a MTX_DEF mutex, and registers handlers for
 * ifnet_departure_event (tcp_rl_ifnet_departure), ifnet_link_event and
 * shutdown_pre_sync (tcp_rl_shutdown, SHUTDOWN_PRI_FIRST).
 * A stray empty statement after the rs_number_dead assignment was removed.
 */
1213 tcp_rs_init(void *st __unused)
1215 CK_LIST_INIT(&int_rs);
1216 rs_number_alive = 0;
1217 rs_number_dead = 0;
1218 mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1219 rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1220 tcp_rl_ifnet_departure,
1221 NULL, EVENTHANDLER_PRI_ANY);
1222 rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1224 NULL, EVENTHANDLER_PRI_ANY);
1225 rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1226 tcp_rl_shutdown, NULL,
1227 SHUTDOWN_PRI_FIRST);
1228 printf("TCP_ratelimit: Is now initialized\n");
1231 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);