]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ratelimit.c
Merge ^/vendor/llvm-openmp/dist up to its last change, and resolve conflicts.
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ratelimit.c
1 /*-
2  *
3  * SPDX-License-Identifier: BSD-3-Clause
4  *
5  * Copyright (c) 2018-2019
6  *      Netflix Inc.
7  *      All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 /**
32  * Author: Randall Stewart <rrs@netflix.com>
33  */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_ratelimit.h"
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/eventhandler.h>
50 #include <sys/mutex.h>
51 #include <sys/ck.h>
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #define TCPSTATES               /* for logging */
57 #include <netinet/tcp_var.h>
58 #ifdef INET6
59 #include <netinet6/tcp6_var.h>
60 #endif
61 #include <netinet/tcp_ratelimit.h>
62 #ifndef USECS_IN_SECOND
63 #define USECS_IN_SECOND 1000000
64 #endif
65 /*
66  * For the purposes of each send, what is the size
67  * of an ethernet frame.
68  */
69 #ifndef ETHERNET_SEGMENT_SIZE
70 #define ETHERNET_SEGMENT_SIZE 1500
71 #endif
72 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
73 #ifdef RATELIMIT
74
#define COMMON_RATE 180500
/*
 * Canned rate table, in bytes per second, used as the rate source for
 * interfaces that report RT_IS_SELECTABLE.
 *
 * The table is two ascending groups laid end to end: entries
 * [0, RS_NEXT_ORDER_GROUP) and entries [RS_NEXT_ORDER_GROUP, end).
 * A consumer that uses more than the first group has to merge the two
 * groups to obtain a sorted list (see populate_canned_table()).
 *
 * The table is never modified at run time, hence const.
 */
const uint64_t desired_rates[] = {
	62500,			/* 500Kbps */
	180500,			/* 1.44Mbps (COMMON_RATE) */
	375000,			/* 3Mbps */
	500000,			/* 4Mbps */
	625000,			/* 5Mbps */
	750000,			/* 6Mbps */
	1000000,		/* 8Mbps */
	1250000,		/* 10Mbps */
	2500000,		/* 20Mbps */
	3750000,		/* 30Mbps */
	5000000,		/* 40Mbps */
	6250000,		/* 50Mbps */
	12500000,		/* 100Mbps */
	25000000,		/* 200Mbps */
	50000000,		/* 400Mbps */
	100000000,		/* 800Mbps */
	/* Second ascending group starts here (index RS_NEXT_ORDER_GROUP). */
	12500,			/* 100kbps */
	25000,			/* 200kbps */
	875000,			/* 7Mbps */
	1125000,		/* 9Mbps */
	1875000,		/* 15Mbps */
	3125000,		/* 25Mbps */
	8125000,		/* 65Mbps */
	10000000,		/* 80Mbps */
	18750000,		/* 150Mbps */
	/*
	 * NOTE(review): this entry used to be labelled 250Mbps, but
	 * 20000000 bytes/s is 160Mbps (250Mbps would be 31250000).
	 * The value is left untouched; confirm which one was intended.
	 */
	20000000,		/* 160Mbps */
	37500000,		/* 350Mbps */
	62500000,		/* 500Mbps */
	78125000,		/* 625Mbps */
	125000000,		/* 1Gbps */
};
/* Number of entries in desired_rates[]. */
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16     /*
                                 * Number of entries at the beginning
                                 * of desired_rates[] that are already
                                 * in ascending order; using more
                                 * entries than this requires merging
                                 * in the second group.
                                 */
#define RS_NEXT_ORDER_GROUP 16  /*
                                 * Index in desired_rates[] at which
                                 * the second ascending group begins.
                                 */
#define ALL_HARDWARE_RATES 1004 /*
                                 * 1Meg - 1Gig in 1 Meg steps
                                 * plus 100k, 200k and 500k and
                                 * 10Gig
                                 */

/* Bit-rate thresholds, in bits per second. */
#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
128
/* List of per-interface rate sets; entries are inserted under rs_mtx. */
static struct head_tcp_rate_set int_rs;
/* Mutex taken around int_rs insertion and the deferred-destroy bookkeeping. */
static struct mtx rs_mtx;
/* Incremented each time rt_setup_new_rs() publishes a rate set. */
uint32_t rs_number_alive;
/* Rate sets with a deferred destroy scheduled but not yet run. */
uint32_t rs_number_dead;

/* net.inet.tcp.rl: root node for the counters above and per-interface nodes. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
142
143 static void
144 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
145 {
146         /*
147          * Add sysctl entries for thus interface.
148          */
149         if (rs->rs_flags & RS_INTF_NO_SUP) {
150                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
151                    SYSCTL_CHILDREN(rl_sysctl_root),
152                    OID_AUTO, "disable", CTLFLAG_RD,
153                    &rs->rs_disable, 0,
154                    "Disable this interface from new hdwr limiting?");
155         } else {
156                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
157                    SYSCTL_CHILDREN(rl_sysctl_root),
158                    OID_AUTO, "disable", CTLFLAG_RW,
159                    &rs->rs_disable, 0,
160                    "Disable this interface from new hdwr limiting?");
161         }
162         SYSCTL_ADD_S32(&rs->sysctl_ctx,
163             SYSCTL_CHILDREN(rl_sysctl_root),
164             OID_AUTO, "minseg", CTLFLAG_RW,
165             &rs->rs_min_seg, 0,
166             "What is the minimum we need to send on this interface?");
167         SYSCTL_ADD_U64(&rs->sysctl_ctx,
168             SYSCTL_CHILDREN(rl_sysctl_root),
169             OID_AUTO, "flow_limit", CTLFLAG_RW,
170             &rs->rs_flow_limit, 0,
171             "What is the limit for number of flows (0=unlimited)?");
172         SYSCTL_ADD_S32(&rs->sysctl_ctx,
173             SYSCTL_CHILDREN(rl_sysctl_root),
174             OID_AUTO, "highest", CTLFLAG_RD,
175             &rs->rs_highest_valid, 0,
176             "Highest valid rate");
177         SYSCTL_ADD_S32(&rs->sysctl_ctx,
178             SYSCTL_CHILDREN(rl_sysctl_root),
179             OID_AUTO, "lowest", CTLFLAG_RD,
180             &rs->rs_lowest_valid, 0,
181             "Lowest valid rate");
182         SYSCTL_ADD_S32(&rs->sysctl_ctx,
183             SYSCTL_CHILDREN(rl_sysctl_root),
184             OID_AUTO, "flags", CTLFLAG_RD,
185             &rs->rs_flags, 0,
186             "What lags are on the entry?");
187         SYSCTL_ADD_S32(&rs->sysctl_ctx,
188             SYSCTL_CHILDREN(rl_sysctl_root),
189             OID_AUTO, "numrates", CTLFLAG_RD,
190             &rs->rs_rate_cnt, 0,
191             "How many rates re there?");
192         SYSCTL_ADD_U64(&rs->sysctl_ctx,
193             SYSCTL_CHILDREN(rl_sysctl_root),
194             OID_AUTO, "flows_using", CTLFLAG_RD,
195             &rs->rs_flows_using, 0,
196             "How many flows are using this interface now?");
197 #ifdef DETAILED_RATELIMIT_SYSCTL
198         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
199                 /*  Lets display the rates */
200                 int i;
201                 struct sysctl_oid *rl_rates;
202                 struct sysctl_oid *rl_rate_num;
203                 char rate_num[16];
204                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
205                                             SYSCTL_CHILDREN(rl_sysctl_root),
206                                             OID_AUTO,
207                                             "rate",
208                                             CTLFLAG_RW, 0,
209                                             "Ratelist");
210                 for( i = 0; i < rs->rs_rate_cnt; i++) {
211                         sprintf(rate_num, "%d", i);
212                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
213                                             SYSCTL_CHILDREN(rl_rates),
214                                             OID_AUTO,
215                                             rate_num,
216                                             CTLFLAG_RW, 0,
217                                             "Individual Rate");
218                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
219                                        SYSCTL_CHILDREN(rl_rate_num),
220                                        OID_AUTO, "flags", CTLFLAG_RD,
221                                        &rs->rs_rlt[i].flags, 0,
222                                        "Flags on this rate");
223                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
224                                        SYSCTL_CHILDREN(rl_rate_num),
225                                        OID_AUTO, "pacetime", CTLFLAG_RD,
226                                        &rs->rs_rlt[i].time_between, 0,
227                                        "Time hardware inserts between 1500 byte sends");
228                         SYSCTL_ADD_U64(&rs->sysctl_ctx,
229                                        SYSCTL_CHILDREN(rl_rate_num),
230                                        OID_AUTO, "rate", CTLFLAG_RD,
231                                        &rs->rs_rlt[i].rate, 0,
232                                        "Rate in bytes per second");
233                 }
234         }
235 #endif
236 }
237
238 static void
239 rs_destroy(epoch_context_t ctx)
240 {
241         struct tcp_rate_set *rs;
242         bool do_free_rs;
243
244         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
245
246         mtx_lock(&rs_mtx);
247         rs->rs_flags &= ~RS_FUNERAL_SCHD;
248         /*
249          * In theory its possible (but unlikely)
250          * that while the delete was occuring
251          * and we were applying the DEAD flag
252          * someone slipped in and found the
253          * interface in a lookup. While we
254          * decided rs_flows_using were 0 and
255          * scheduling the epoch_call, the other
256          * thread incremented rs_flow_using. This
257          * is because users have a pointer and
258          * we only use the rs_flows_using in an
259          * atomic fashion, i.e. the other entities
260          * are not protected. To assure this did
261          * not occur, we check rs_flows_using here
262          * before deleting.
263          */
264         do_free_rs = (rs->rs_flows_using == 0);
265         rs_number_dead--;
266         mtx_unlock(&rs_mtx);
267
268         if (do_free_rs) {
269                 sysctl_ctx_free(&rs->sysctl_ctx);
270                 free(rs->rs_rlt, M_TCPPACE);
271                 free(rs, M_TCPPACE);
272         }
273 }
274
275 static void
276 rs_defer_destroy(struct tcp_rate_set *rs)
277 {
278
279         mtx_assert(&rs_mtx, MA_OWNED);
280
281         /* Check if already pending. */
282         if (rs->rs_flags & RS_FUNERAL_SCHD)
283                 return;
284
285         rs_number_dead++;
286
287         /* Set flag to only defer once. */
288         rs->rs_flags |= RS_FUNERAL_SCHD;
289         NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
290 }
291
292 #ifdef INET
293 extern counter_u64_t rate_limit_set_ok;
294 extern counter_u64_t rate_limit_active;
295 extern counter_u64_t rate_limit_alloc_fail;
296 #endif
297
298 static int
299 rl_attach_txrtlmt(struct ifnet *ifp,
300     uint32_t flowtype,
301     int flowid,
302     uint64_t cfg_rate,
303     struct m_snd_tag **tag)
304 {
305         int error;
306         union if_snd_tag_alloc_params params = {
307                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
308                 .rate_limit.hdr.flowid = flowid,
309                 .rate_limit.hdr.flowtype = flowtype,
310                 .rate_limit.max_rate = cfg_rate,
311                 .rate_limit.flags = M_NOWAIT,
312         };
313
314         if (ifp->if_snd_tag_alloc == NULL) {
315                 error = EOPNOTSUPP;
316         } else {
317                 error = ifp->if_snd_tag_alloc(ifp, &params, tag);
318 #ifdef INET
319                 if (error == 0) {
320                         if_ref((*tag)->ifp);
321                         counter_u64_add(rate_limit_set_ok, 1);
322                         counter_u64_add(rate_limit_active, 1);
323                 } else
324                         counter_u64_add(rate_limit_alloc_fail, 1);
325 #endif
326         }
327         return (error);
328 }
329
330 static void
331 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
332 {
333         /*
334          * The internal table is "special", it
335          * is two seperate ordered tables that
336          * must be merged. We get here when the
337          * adapter specifies a number of rates that
338          * covers both ranges in the table in some
339          * form.
340          */
341         int i, at_low, at_high;
342         uint8_t low_disabled = 0, high_disabled = 0;
343
344         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
345                 rs->rs_rlt[i].flags = 0;
346                 rs->rs_rlt[i].time_between = 0;
347                 if ((low_disabled == 0) &&
348                     (high_disabled ||
349                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
350                         rs->rs_rlt[i].rate = rate_table_act[at_low];
351                         at_low++;
352                         if (at_low == RS_NEXT_ORDER_GROUP)
353                                 low_disabled = 1;
354                 } else if (high_disabled == 0) {
355                         rs->rs_rlt[i].rate = rate_table_act[at_high];
356                         at_high++;
357                         if (at_high == MAX_HDWR_RATES)
358                                 high_disabled = 1;
359                 }
360         }
361 }
362
363 static struct tcp_rate_set *
364 rt_setup_new_rs(struct ifnet *ifp, int *error)
365 {
366         struct tcp_rate_set *rs;
367         const uint64_t *rate_table_act;
368         uint64_t lentim, res;
369         size_t sz;
370         uint32_t hash_type;
371         int i;
372         struct if_ratelimit_query_results rl;
373         struct sysctl_oid *rl_sysctl_root;
374         /*
375          * We expect to enter with the 
376          * mutex locked.
377          */
378
379         if (ifp->if_ratelimit_query == NULL) {
380                 /*
381                  * We can do nothing if we cannot
382                  * get a query back from the driver.
383                  */
384                 return (NULL);
385         }
386         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
387         if (rs == NULL) {
388                 if (error)
389                         *error = ENOMEM;
390                 return (NULL);
391         }
392         rl.flags = RT_NOSUPPORT;
393         ifp->if_ratelimit_query(ifp, &rl);
394         if (rl.flags & RT_IS_UNUSABLE) {
395                 /* 
396                  * The interface does not really support 
397                  * the rate-limiting.
398                  */
399                 memset(rs, 0, sizeof(struct tcp_rate_set));
400                 rs->rs_ifp = ifp;
401                 rs->rs_if_dunit = ifp->if_dunit;
402                 rs->rs_flags = RS_INTF_NO_SUP;
403                 rs->rs_disable = 1;
404                 rs_number_alive++;
405                 sysctl_ctx_init(&rs->sysctl_ctx);
406                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
407                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
408                     OID_AUTO,
409                     rs->rs_ifp->if_xname,
410                     CTLFLAG_RW, 0,
411                     "");
412                 rl_add_syctl_entries(rl_sysctl_root, rs);
413                 mtx_lock(&rs_mtx);
414                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
415                 mtx_unlock(&rs_mtx);
416                 return (rs);
417         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
418                 memset(rs, 0, sizeof(struct tcp_rate_set));
419                 rs->rs_ifp = ifp;
420                 rs->rs_if_dunit = ifp->if_dunit;
421                 rs->rs_flags = RS_IS_DEFF;
422                 rs_number_alive++;
423                 sysctl_ctx_init(&rs->sysctl_ctx);
424                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
425                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
426                     OID_AUTO,
427                     rs->rs_ifp->if_xname,
428                     CTLFLAG_RW, 0,
429                     "");
430                 rl_add_syctl_entries(rl_sysctl_root, rs);
431                 mtx_lock(&rs_mtx);
432                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
433                 mtx_unlock(&rs_mtx);
434                 return (rs);
435         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
436                 /* Mellanox most likely */
437                 rs->rs_ifp = ifp;
438                 rs->rs_if_dunit = ifp->if_dunit;
439                 rs->rs_rate_cnt = rl.number_of_rates;
440                 rs->rs_min_seg = rl.min_segment_burst;
441                 rs->rs_highest_valid = 0;
442                 rs->rs_flow_limit = rl.max_flows;
443                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
444                 rs->rs_disable = 0;
445                 rate_table_act = rl.rate_table;
446         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
447                 /* Chelsio */
448                 rs->rs_ifp = ifp;
449                 rs->rs_if_dunit = ifp->if_dunit;
450                 rs->rs_rate_cnt = rl.number_of_rates;
451                 rs->rs_min_seg = rl.min_segment_burst;
452                 rs->rs_disable = 0;
453                 rs->rs_flow_limit = rl.max_flows;
454                 rate_table_act = desired_rates;
455                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
456                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
457                         /*
458                          * Our desired table is not big
459                          * enough, do what we can.
460                          */
461                         rs->rs_rate_cnt = MAX_HDWR_RATES;
462                  }
463                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
464                         rs->rs_flags = RS_IS_INTF;
465                 else
466                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
467                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
468                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
469         } else {
470                 printf("Interface:%s unit:%d not one known to have rate-limits\n",
471                     ifp->if_dname,
472                     ifp->if_dunit);
473                 free(rs, M_TCPPACE);
474                 return (NULL);
475         }
476         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
477         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
478         if (rs->rs_rlt == NULL) {
479                 if (error)
480                         *error = ENOMEM;
481 bail:
482                 free(rs, M_TCPPACE);
483                 return (NULL);
484         }
485         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
486                 /*
487                  * The interface supports all
488                  * the rates we could possibly want.
489                  */
490                 uint64_t rat;
491
492                 rs->rs_rlt[0].rate = 12500;     /* 100k */
493                 rs->rs_rlt[1].rate = 25000;     /* 200k */
494                 rs->rs_rlt[2].rate = 62500;     /* 500k */
495                 /* Note 125000 == 1Megabit
496                  * populate 1Meg - 1000meg.
497                  */
498                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
499                         rs->rs_rlt[i].rate = rat;
500                         rat += 125000;
501                 }
502                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
503         } else if (rs->rs_flags & RS_INT_TBL) {
504                 /* We populate this in a special way */
505                 populate_canned_table(rs, rate_table_act);
506         } else {
507                 /*
508                  * Just copy in the rates from
509                  * the table, it is in order.
510                  */
511                 for (i=0; i<rs->rs_rate_cnt; i++) {
512                         rs->rs_rlt[i].rate = rate_table_act[i];
513                         rs->rs_rlt[i].time_between = 0;
514                         rs->rs_rlt[i].flags = 0;
515                 }
516         }
517         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
518                 /*
519                  * We go backwards through the list so that if we can't get
520                  * a rate and fail to init one, we have at least a chance of
521                  * getting the highest one.
522                  */
523                 rs->rs_rlt[i].ptbl = rs;
524                 rs->rs_rlt[i].tag = NULL;
525                 /*
526                  * Calculate the time between.
527                  */
528                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
529                 res = lentim / rs->rs_rlt[i].rate;
530                 if (res > 0)
531                         rs->rs_rlt[i].time_between = res;
532                 else
533                         rs->rs_rlt[i].time_between = 1;
534                 if (rs->rs_flags & RS_NO_PRE) {
535                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
536                         rs->rs_lowest_valid = i;
537                 } else {
538                         int err;
539 #ifdef RSS
540                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
541 #else
542                         hash_type = M_HASHTYPE_OPAQUE_HASH;
543 #endif
544                         err = rl_attach_txrtlmt(ifp,
545                             hash_type,
546                             (i + 1),
547                             rs->rs_rlt[i].rate,
548                             &rs->rs_rlt[i].tag);
549                         if (err) {
550                                 if (i == (rs->rs_rate_cnt - 1)) {
551                                         /*
552                                          * Huh - first rate and we can't get
553                                          * it?
554                                          */
555                                         free(rs->rs_rlt, M_TCPPACE);
556                                         if (error)
557                                                 *error = err;
558                                         goto bail;
559                                 } else {
560                                         if (error)
561                                                 *error = err;
562                                 }
563                                 break;
564                         } else {
565                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
566                                 rs->rs_lowest_valid = i;
567                         }
568                 }
569         }
570         /* Did we get at least 1 rate? */
571         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
572                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
573         else {
574                 free(rs->rs_rlt, M_TCPPACE);
575                 goto bail;
576         }
577         rs_number_alive++;
578         sysctl_ctx_init(&rs->sysctl_ctx);
579         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
580             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
581             OID_AUTO,
582             rs->rs_ifp->if_xname,
583             CTLFLAG_RW, 0,
584             "");
585         rl_add_syctl_entries(rl_sysctl_root, rs);
586         mtx_lock(&rs_mtx);
587         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
588         mtx_unlock(&rs_mtx);
589         return (rs);
590 }
591
592 static const struct tcp_hwrate_limit_table *
593 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
594     uint64_t bytes_per_sec, uint32_t flags)
595 {
596         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
597         uint64_t mbits_per_sec, ind_calc;
598         int i;
599
600         mbits_per_sec = (bytes_per_sec * 8);
601         if (flags & RS_PACING_LT) {
602                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
603                     (rs->rs_lowest_valid <= 2)){
604                         /*
605                          * Smaller than 1Meg, only
606                          * 3 entries can match it.
607                          */
608                         for(i = rs->rs_lowest_valid; i < 3; i++) {
609                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
610                                         rte = &rs->rs_rlt[i];
611                                         break;
612                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
613                                         arte = &rs->rs_rlt[i];
614                                 }
615                         }
616                         goto done;
617                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
618                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
619                         /*
620                          * Larger than 1G (the majority of
621                          * our table.
622                          */
623                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
624                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
625                         else
626                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
627                         goto done;
628                 }
629                 /*
630                  * If we reach here its in our table (between 1Meg - 1000Meg),
631                  * just take the rounded down mbits per second, and add
632                  * 1Megabit to it, from this we can calculate
633                  * the index in the table.
634                  */
635                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
636                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
637                         ind_calc++;
638                 /* our table is offset by 3, we add 2 */
639                 ind_calc += 2;
640                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
641                         /* This should not happen */
642                         ind_calc = ALL_HARDWARE_RATES-1;
643                 }
644                 if ((ind_calc >= rs->rs_lowest_valid) &&
645                     (ind_calc <= rs->rs_highest_valid))
646                 rte = &rs->rs_rlt[ind_calc];
647         } else if (flags & RS_PACING_EXACT_MATCH) {
648                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
649                     (rs->rs_lowest_valid <= 2)){
650                         for(i = rs->rs_lowest_valid; i < 3; i++) {
651                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
652                                         rte = &rs->rs_rlt[i];
653                                         break;
654                                 }
655                         }
656                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
657                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
658                         /* > 1Gbps only one rate */
659                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
660                                 /* Its 10G wow */
661                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
662                         }
663                 } else {
664                         /* Ok it must be a exact meg (its between 1G and 1Meg) */
665                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
666                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
667                                 /* its an exact Mbps */
668                                 ind_calc += 2;
669                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
670                                         /* This should not happen */
671                                         ind_calc = ALL_HARDWARE_RATES-1;
672                                 }
673                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
674                                         rte = &rs->rs_rlt[ind_calc];
675                         }
676                 }
677         } else {
678                 /* we want greater than the requested rate */
679                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
680                     (rs->rs_lowest_valid <= 2)){
681                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
682                         for (i=2; i>=rs->rs_lowest_valid; i--) {
683                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
684                                         rte = &rs->rs_rlt[i];
685                                         break;
686                                 } else if ((flags & RS_PACING_GEQ) &&
687                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
688                                         rte = &rs->rs_rlt[i];
689                                         break;
690                                 } else {
691                                         arte = &rs->rs_rlt[i]; /* new alternate */
692                                 }
693                         }
694                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
695                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
696                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
697                                 /* Our top rate is larger than the request */
698                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
699                         } else if ((flags & RS_PACING_GEQ) &&
700                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
701                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
702                                 /* It matches our top rate */
703                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
704                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
705                                 /* The top rate is an alternative */
706                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
707                         }
708                 } else {
709                         /* Its in our range 1Meg - 1Gig */
710                         if (flags & RS_PACING_GEQ) {
711                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
712                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
713                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
714                                                 /* This should not happen */
715                                                 ind_calc = (ALL_HARDWARE_RATES-1);
716                                         }
717                                         rte = &rs->rs_rlt[ind_calc];
718                                 }
719                                 goto done;
720                         }
721                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
722                         ind_calc += 2;
723                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
724                                 /* This should not happen */
725                                 ind_calc = ALL_HARDWARE_RATES-1;
726                         }
727                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
728                                 rte = &rs->rs_rlt[ind_calc];
729                 }
730         }
731 done:
732         if ((rte == NULL) &&
733             (arte != NULL) &&
734             (flags & RS_PACING_SUB_OK)) {
735                 /* We can use the substitute */
736                 rte = arte;
737         }
738         return (rte);
739 }
740
741 static const struct tcp_hwrate_limit_table *
742 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
743 {
744         /**
745          * Hunt the rate table with the restrictions in flags and find a
746          * suitable rate if possible.
747          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
748          * RS_PACING_GT     - must be greater than.
749          * RS_PACING_GEQ    - must be greater than or equal.
750          * RS_PACING_LT     - must be less than.
751          * RS_PACING_SUB_OK - If we don't meet criteria a
752          *                    substitute is ok.
753          */
754         int i, matched;
755         struct tcp_hwrate_limit_table *rte = NULL;
756
757
758         if ((rs->rs_flags & RS_INT_TBL) &&
759             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
760                 /*
761                  * Here we don't want to paw thru
762                  * a big table, we have everything
763                  * from 1Meg - 1000Meg in 1Meg increments.
764                  * Use an alternate method to "lookup".
765                  */
766                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
767         }
768         if ((flags & RS_PACING_LT) ||
769             (flags & RS_PACING_EXACT_MATCH)) {
770                 /*
771                  * For exact and less than we go forward through the table.
772                  * This way when we find one larger we stop (exact was a
773                  * toss up).
774                  */
775                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
776                         if ((flags & RS_PACING_EXACT_MATCH) &&
777                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
778                                 rte = &rs->rs_rlt[i];
779                                 matched = 1;
780                                 break;
781                         } else if ((flags & RS_PACING_LT) &&
782                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
783                                 rte = &rs->rs_rlt[i];
784                                 matched = 1;
785                                 break;
786                         }
787                         if (bytes_per_sec > rs->rs_rlt[i].rate)
788                                 break;
789                 }
790                 if ((matched == 0) &&
791                     (flags & RS_PACING_LT) &&
792                     (flags & RS_PACING_SUB_OK)) {
793                         /* Kick in a substitute (the lowest) */
794                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
795                 }
796         } else {
797                 /*
798                  * Here we go backward through the table so that we can find
799                  * the one greater in theory faster (but its probably a
800                  * wash).
801                  */
802                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
803                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
804                                 /* A possible candidate */
805                                 rte = &rs->rs_rlt[i];
806                         }
807                         if ((flags & RS_PACING_GEQ) &&
808                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
809                                 /* An exact match and we want equal */
810                                 matched = 1;
811                                 rte = &rs->rs_rlt[i];
812                                 break;
813                         } else if (rte) {
814                                 /*
815                                  * Found one that is larger than but don't
816                                  * stop, there may be a more closer match.
817                                  */
818                                 matched = 1;
819                         }
820                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
821                                 /*
822                                  * We found a table entry that is smaller,
823                                  * stop there will be none greater or equal.
824                                  */
825                                 break;
826                         }
827                 }
828                 if ((matched == 0) &&
829                     (flags & RS_PACING_SUB_OK)) {
830                         /* Kick in a substitute (the highest) */
831                         rte = &rs->rs_rlt[rs->rs_highest_valid];
832                 }
833         }
834         return (rte);
835 }
836
837 static struct ifnet *
838 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
839 {
840         struct ifnet *tifp;
841         struct m_snd_tag *tag;
842         union if_snd_tag_alloc_params params = {
843                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
844                 .rate_limit.hdr.flowid = 1,
845                 .rate_limit.max_rate = COMMON_RATE,
846                 .rate_limit.flags = M_NOWAIT,
847         };
848         int err;
849 #ifdef RSS
850         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
851             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
852 #else
853         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
854 #endif
855         tag = NULL;
856         if (ifp->if_snd_tag_alloc) {
857                 if (error)
858                         *error = ENODEV;
859                 return (NULL);
860         }
861         err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
862         if (err) {
863                 /* Failed to setup a tag? */
864                 if (error)
865                         *error = err;
866                 return (NULL);
867         }
868         tifp = tag->ifp;
869         tifp->if_snd_tag_free(tag);
870         return (tifp);
871 }
872
873 static const struct tcp_hwrate_limit_table *
874 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
875     uint32_t flags, int *error)
876 {
877         /* First lets find the interface if it exists */
878         const struct tcp_hwrate_limit_table *rte;
879         struct tcp_rate_set *rs;
880         struct epoch_tracker et;
881         int err;
882
883         NET_EPOCH_ENTER(et);
884 use_real_interface:
885         CK_LIST_FOREACH(rs, &int_rs, next) {
886                 /*
887                  * Note we don't look with the lock since we either see a
888                  * new entry or will get one when we try to add it.
889                  */
890                 if (rs->rs_flags & RS_IS_DEAD) {
891                         /* The dead are not looked at */
892                         continue;
893                 }
894                 if ((rs->rs_ifp == ifp) &&
895                     (rs->rs_if_dunit == ifp->if_dunit)) {
896                         /* Ok we found it */
897                         break;
898                 }
899         }
900         if ((rs == NULL) ||
901             (rs->rs_flags & RS_INTF_NO_SUP) ||
902             (rs->rs_flags & RS_IS_DEAD)) {
903                 /*
904                  * This means we got a packet *before*
905                  * the IF-UP was processed below, <or>
906                  * while or after we already received an interface
907                  * departed event. In either case we really don't
908                  * want to do anything with pacing, in
909                  * the departing case the packet is not
910                  * going to go very far. The new case
911                  * might be arguable, but its impossible
912                  * to tell from the departing case.
913                  */
914                 if (rs->rs_disable && error)
915                         *error = ENODEV;
916                 NET_EPOCH_EXIT(et);
917                 return (NULL);
918         }
919
920         if ((rs == NULL) || (rs->rs_disable != 0)) {
921                 if (rs->rs_disable && error)
922                         *error = ENOSPC;
923                 NET_EPOCH_EXIT(et);
924                 return (NULL);
925         }
926         if (rs->rs_flags & RS_IS_DEFF) {
927                 /* We need to find the real interface */
928                 struct ifnet *tifp;
929
930                 tifp = rt_find_real_interface(ifp, inp, error);
931                 if (tifp == NULL) {
932                         if (rs->rs_disable && error)
933                                 *error = ENOTSUP;
934                         NET_EPOCH_EXIT(et);
935                         return (NULL);
936                 }
937                 goto use_real_interface;
938         }
939         if (rs->rs_flow_limit &&
940             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
941                 if (error)
942                         *error = ENOSPC;
943                 NET_EPOCH_EXIT(et);
944                 return (NULL);
945         }
946         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
947         if (rte) {
948                 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
949                     inp->inp_flowtype,
950                     inp->inp_flowid,
951                     rte->rate,
952                     &inp->inp_snd_tag);
953                 if (err) {
954                         /* Failed to attach */
955                         if (error)
956                                 *error = err;
957                         rte = NULL;
958                 }
959         }
960         if (rte) {
961                 /*
962                  * We use an atomic here for accounting so we don't have to
963                  * use locks when freeing.
964                  */
965                 atomic_add_64(&rs->rs_flows_using, 1);
966         }
967         NET_EPOCH_EXIT(et);
968         return (rte);
969 }
970
971 static void
972 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
973 {
974         int error;
975         struct tcp_rate_set *rs;
976
977         if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
978             (link_state != LINK_STATE_UP)) {
979                 /*
980                  * We only care on an interface going up that is rate-limit
981                  * capable.
982                  */
983                 return;
984         }
985         mtx_lock(&rs_mtx);
986         CK_LIST_FOREACH(rs, &int_rs, next) {
987                 if ((rs->rs_ifp == ifp) &&
988                     (rs->rs_if_dunit == ifp->if_dunit)) {
989                         /* We already have initialized this guy */
990                         mtx_unlock(&rs_mtx);
991                         return;
992                 }
993         }
994         mtx_unlock(&rs_mtx);
995         rt_setup_new_rs(ifp, &error);
996 }
997
998 static void
999 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
1000 {
1001         struct tcp_rate_set *rs, *nrs;
1002         struct ifnet *tifp;
1003         int i;
1004
1005         mtx_lock(&rs_mtx);
1006         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1007                 if ((rs->rs_ifp == ifp) &&
1008                     (rs->rs_if_dunit == ifp->if_dunit)) {
1009                         CK_LIST_REMOVE(rs, next);
1010                         rs_number_alive--;
1011                         rs->rs_flags |= RS_IS_DEAD;
1012                         for (i = 0; i < rs->rs_rate_cnt; i++) {
1013                                 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1014                                         tifp = rs->rs_rlt[i].tag->ifp;
1015                                         in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1016                                         rs->rs_rlt[i].tag = NULL;
1017                                 }
1018                                 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1019                         }
1020                         if (rs->rs_flows_using == 0)
1021                                 rs_defer_destroy(rs);
1022                         break;
1023                 }
1024         }
1025         mtx_unlock(&rs_mtx);
1026 }
1027
1028 static void
1029 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1030 {
1031         struct tcp_rate_set *rs, *nrs;
1032         struct ifnet *tifp;
1033         int i;
1034
1035         mtx_lock(&rs_mtx);
1036         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1037                 CK_LIST_REMOVE(rs, next);
1038                 rs_number_alive--;
1039                 rs->rs_flags |= RS_IS_DEAD;
1040                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1041                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1042                                 tifp = rs->rs_rlt[i].tag->ifp;
1043                                 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1044                                 rs->rs_rlt[i].tag = NULL;
1045                         }
1046                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1047                 }
1048                 if (rs->rs_flows_using == 0)
1049                         rs_defer_destroy(rs);
1050         }
1051         mtx_unlock(&rs_mtx);
1052 }
1053
1054 const struct tcp_hwrate_limit_table *
1055 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1056     uint64_t bytes_per_sec, int flags, int *error)
1057 {
1058         const struct tcp_hwrate_limit_table *rte;
1059
1060         if (tp->t_inpcb->inp_snd_tag == NULL) {
1061                 /*
1062                  * We are setting up a rate for the first time.
1063                  */
1064                 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
1065                         /* Not supported by the egress */
1066                         if (error)
1067                                 *error = ENODEV;
1068                         return (NULL);
1069                 }
1070 #ifdef KERN_TLS
1071                 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
1072                         /*
1073                          * We currently can't do both TLS and hardware
1074                          * pacing
1075                          */
1076                         if (error)
1077                                 *error = EINVAL;
1078                         return (NULL);
1079                 }
1080 #endif
1081                 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
1082         } else {
1083                 /*
1084                  * We are modifying a rate, wrong interface?
1085                  */
1086                 if (error)
1087                         *error = EINVAL;
1088                 rte = NULL;
1089         }
1090         return (rte);
1091 }
1092
1093 const struct tcp_hwrate_limit_table *
1094 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1095     struct tcpcb *tp, struct ifnet *ifp,
1096     uint64_t bytes_per_sec, int flags, int *error)
1097 {
1098         const struct tcp_hwrate_limit_table *nrte;
1099         const struct tcp_rate_set *rs;
1100         int is_indirect = 0;
1101         int err;
1102
1103
1104         if ((tp->t_inpcb->inp_snd_tag == NULL) ||
1105             (crte == NULL)) {
1106                 /* Wrong interface */
1107                 if (error)
1108                         *error = EINVAL;
1109                 return (NULL);
1110         }
1111         rs = crte->ptbl;
1112         if ((rs->rs_flags & RS_IS_DEAD) ||
1113             (crte->flags & HDWRPACE_IFPDEPARTED)) {
1114                 /* Release the rate, and try anew */
1115 re_rate:
1116                 tcp_rel_pacing_rate(crte, tp);
1117                 nrte = tcp_set_pacing_rate(tp, ifp,
1118                     bytes_per_sec, flags, error);
1119                 return (nrte);
1120         }
1121         if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
1122                 is_indirect = 1;
1123         else
1124                 is_indirect = 0;
1125         if ((is_indirect == 0) &&
1126             ((ifp != rs->rs_ifp) ||
1127             (ifp->if_dunit != rs->rs_if_dunit))) {
1128                 /*
1129                  * Something changed, the user is not pointing to the same
1130                  * ifp? Maybe a route updated on this guy?
1131                  */
1132                 goto re_rate;
1133         } else if (is_indirect) {
1134                 /*
1135                  * For indirect we have to dig in and find the real interface.
1136                  */
1137                 struct ifnet *rifp;
1138
1139                 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
1140                 if (rifp == NULL) {
1141                         /* Can't find it? */
1142                         goto re_rate;
1143                 }
1144                 if ((rifp != rs->rs_ifp) ||
1145                     (ifp->if_dunit != rs->rs_if_dunit)) {
1146                         goto re_rate;
1147                 }
1148         }
1149         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
1150         if (nrte == crte) {
1151                 /* No change */
1152                 if (error)
1153                         *error = 0;
1154                 return (crte);
1155         }
1156         if (nrte == NULL) {
1157                 /* Release the old rate */
1158                 tcp_rel_pacing_rate(crte, tp);
1159                 return (NULL);
1160         }
1161         /* Change rates to our new entry */
1162         err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
1163         if (err) {
1164                 if (error)
1165                         *error = err;
1166                 return (NULL);
1167         }
1168         if (error)
1169                 *error = 0;
1170         return (nrte);
1171 }
1172
1173 void
1174 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1175 {
1176         const struct tcp_rate_set *crs;
1177         struct tcp_rate_set *rs;
1178         uint64_t pre;
1179
1180         crs = crte->ptbl;
1181         /*
1182          * Now we must break the const
1183          * in order to release our refcount.
1184          */
1185         rs = __DECONST(struct tcp_rate_set *, crs);
1186         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1187         if (pre == 1) {
1188                 mtx_lock(&rs_mtx);
1189                 /*
1190                  * Is it dead?
1191                  */
1192                 if (rs->rs_flags & RS_IS_DEAD)
1193                         rs_defer_destroy(rs);
1194                 mtx_unlock(&rs_mtx);
1195         }
1196         in_pcbdetach_txrtlmt(tp->t_inpcb);
1197 }
1198
1199 static eventhandler_tag rl_ifnet_departs;
1200 static eventhandler_tag rl_ifnet_arrives;
1201 static eventhandler_tag rl_shutdown_start;
1202
1203 static void
1204 tcp_rs_init(void *st __unused)
1205 {
1206         CK_LIST_INIT(&int_rs);
1207         rs_number_alive = 0;
1208         rs_number_dead = 0;;
1209         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1210         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1211             tcp_rl_ifnet_departure,
1212             NULL, EVENTHANDLER_PRI_ANY);
1213         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1214             tcp_rl_ifnet_link,
1215             NULL, EVENTHANDLER_PRI_ANY);
1216         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1217             tcp_rl_shutdown, NULL,
1218             SHUTDOWN_PRI_FIRST);
1219         printf("TCP_ratelimit: Is now initialized\n");
1220 }
1221
1222 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
1223 #endif