]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ratelimit.c
MFV r353613: 10731 zfs: NULL pointer errors
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ratelimit.c
1 /*-
2  *
3  * SPDX-License-Identifier: BSD-3-Clause
4  *
5  * Copyright (c) 2018-2019
6  *      Netflix Inc.
7  *      All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 /**
32  * Author: Randall Stewart <rrs@netflix.com>
33  */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_ratelimit.h"
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/eventhandler.h>
50 #include <sys/mutex.h>
51 #include <sys/ck.h>
52 #define TCPSTATES               /* for logging */
53 #include <netinet/in.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/tcp_var.h>
56 #ifdef INET6
57 #include <netinet6/tcp6_var.h>
58 #endif
59 #include <netinet/tcp_ratelimit.h>
60 #ifndef USECS_IN_SECOND
61 #define USECS_IN_SECOND 1000000
62 #endif
63 /*
64  * For the purposes of each send, what is the size
65  * of an ethernet frame.
66  */
67 #ifndef ETHERNET_SEGMENT_SIZE
68 #define ETHERNET_SEGMENT_SIZE 1500
69 #endif
70 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
71 #ifdef RATELIMIT
72
/*
 * Same value as the 1.44Mbps entry of desired_rates[] below.  It is not
 * referenced in the part of the file reviewed here.
 */
#define COMMON_RATE 180500
/*
 * Canned rate table.  Each value is in bytes per second; the trailing
 * comment gives the equivalent bit rate (value * 8).
 *
 * The table consists of two independently ascending groups: entries
 * 0..15 and entries 16..29.  populate_canned_table() merges the two
 * groups when more than RS_ORDERED_COUNT rates are wanted.
 *
 * NOTE(review): two entries carried labels that do not match their
 * values (20000000 was labelled 250Mbps, 37500000 was labelled
 * 350Mbps).  The values are untouched here; only the comments now
 * state what the numbers actually are.  Confirm which was intended.
 */
uint64_t desired_rates[] = {
        62500,                  /* 500Kbps */
        180500,                 /* 1.44Mbps */
        375000,                 /* 3Mbps */
        500000,                 /* 4Mbps */
        625000,                 /* 5Mbps */
        750000,                 /* 6Mbps */
        1000000,                /* 8Mbps */
        1250000,                /* 10Mbps */
        2500000,                /* 20Mbps */
        3750000,                /* 30Mbps */
        5000000,                /* 40Mbps */
        6250000,                /* 50Mbps */
        12500000,               /* 100Mbps */
        25000000,               /* 200Mbps */
        50000000,               /* 400Mbps */
        100000000,              /* 800Mbps */
        12500,                  /* 100kbps */
        25000,                  /* 200kbps */
        875000,                 /* 7Mbps */
        1125000,                /* 9Mbps */
        1875000,                /* 15Mbps */
        3125000,                /* 25Mbps */
        8125000,                /* 65Mbps */
        10000000,               /* 80Mbps */
        18750000,               /* 150Mbps */
        20000000,               /* 160Mbps (was labelled 250Mbps, which would be 31250000) */
        37500000,               /* 300Mbps (was labelled 350Mbps, which would be 43750000) */
        62500000,               /* 500Mbps */
        78125000,               /* 625Mbps */
        125000000,              /* 1Gbps */
};
/* Number of entries in desired_rates[]. */
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16     /*
                                 * Number of entries at the beginning
                                 * of the table that are already in
                                 * ascending order; using more entries
                                 * than this requires a merge.
                                 */
#define RS_NEXT_ORDER_GROUP 16  /*
                                 * Index in the table at which the
                                 * second ascending group starts
                                 * (i.e. the last entry of the first
                                 * group is at this value minus 1).
                                 */
#define ALL_HARDWARE_RATES 1004 /*
                                 * 1Mbps - 1Gbps in 1Mbps steps
                                 * (1000 entries) plus 100k, 200k
                                 * and 500k and 10Gbps.
                                 */

/* Thresholds in bits per second. */
#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
126
/* List of per-interface rate sets; rt_setup_new_rs() inserts at the head. */
static struct head_tcp_rate_set int_rs;
/*
 * Held while inserting into int_rs and while rs_destroy() /
 * rs_defer_destroy() update the funeral flag and the dead counter.
 */
static struct mtx rs_mtx;
/* Incremented each time rt_setup_new_rs() publishes a rate set. */
uint32_t rs_number_alive;
/*
 * Incremented when a destroy is deferred via rs_defer_destroy() and
 * decremented again when rs_destroy() runs.
 */
uint32_t rs_number_dead;

/* net.inet.tcp.rl: root node plus the two counters above. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
140
141 static void
142 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
143 {
144         /*
145          * Add sysctl entries for thus interface.
146          */
147         if (rs->rs_flags & RS_INTF_NO_SUP) {
148                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
149                    SYSCTL_CHILDREN(rl_sysctl_root),
150                    OID_AUTO, "disable", CTLFLAG_RD,
151                    &rs->rs_disable, 0,
152                    "Disable this interface from new hdwr limiting?");
153         } else {
154                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
155                    SYSCTL_CHILDREN(rl_sysctl_root),
156                    OID_AUTO, "disable", CTLFLAG_RW,
157                    &rs->rs_disable, 0,
158                    "Disable this interface from new hdwr limiting?");
159         }
160         SYSCTL_ADD_S32(&rs->sysctl_ctx,
161             SYSCTL_CHILDREN(rl_sysctl_root),
162             OID_AUTO, "minseg", CTLFLAG_RW,
163             &rs->rs_min_seg, 0,
164             "What is the minimum we need to send on this interface?");
165         SYSCTL_ADD_U64(&rs->sysctl_ctx,
166             SYSCTL_CHILDREN(rl_sysctl_root),
167             OID_AUTO, "flow_limit", CTLFLAG_RW,
168             &rs->rs_flow_limit, 0,
169             "What is the limit for number of flows (0=unlimited)?");
170         SYSCTL_ADD_S32(&rs->sysctl_ctx,
171             SYSCTL_CHILDREN(rl_sysctl_root),
172             OID_AUTO, "highest", CTLFLAG_RD,
173             &rs->rs_highest_valid, 0,
174             "Highest valid rate");
175         SYSCTL_ADD_S32(&rs->sysctl_ctx,
176             SYSCTL_CHILDREN(rl_sysctl_root),
177             OID_AUTO, "lowest", CTLFLAG_RD,
178             &rs->rs_lowest_valid, 0,
179             "Lowest valid rate");
180         SYSCTL_ADD_S32(&rs->sysctl_ctx,
181             SYSCTL_CHILDREN(rl_sysctl_root),
182             OID_AUTO, "flags", CTLFLAG_RD,
183             &rs->rs_flags, 0,
184             "What lags are on the entry?");
185         SYSCTL_ADD_S32(&rs->sysctl_ctx,
186             SYSCTL_CHILDREN(rl_sysctl_root),
187             OID_AUTO, "numrates", CTLFLAG_RD,
188             &rs->rs_rate_cnt, 0,
189             "How many rates re there?");
190         SYSCTL_ADD_U64(&rs->sysctl_ctx,
191             SYSCTL_CHILDREN(rl_sysctl_root),
192             OID_AUTO, "flows_using", CTLFLAG_RD,
193             &rs->rs_flows_using, 0,
194             "How many flows are using this interface now?");
195 #ifdef DETAILED_RATELIMIT_SYSCTL
196         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
197                 /*  Lets display the rates */
198                 int i;
199                 struct sysctl_oid *rl_rates;
200                 struct sysctl_oid *rl_rate_num;
201                 char rate_num[16];
202                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
203                                             SYSCTL_CHILDREN(rl_sysctl_root),
204                                             OID_AUTO,
205                                             "rate",
206                                             CTLFLAG_RW, 0,
207                                             "Ratelist");
208                 for( i = 0; i < rs->rs_rate_cnt; i++) {
209                         sprintf(rate_num, "%d", i);
210                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
211                                             SYSCTL_CHILDREN(rl_rates),
212                                             OID_AUTO,
213                                             rate_num,
214                                             CTLFLAG_RW, 0,
215                                             "Individual Rate");
216                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
217                                        SYSCTL_CHILDREN(rl_rate_num),
218                                        OID_AUTO, "flags", CTLFLAG_RD,
219                                        &rs->rs_rlt[i].flags, 0,
220                                        "Flags on this rate");
221                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
222                                        SYSCTL_CHILDREN(rl_rate_num),
223                                        OID_AUTO, "pacetime", CTLFLAG_RD,
224                                        &rs->rs_rlt[i].time_between, 0,
225                                        "Time hardware inserts between 1500 byte sends");
226                         SYSCTL_ADD_U64(&rs->sysctl_ctx,
227                                        SYSCTL_CHILDREN(rl_rate_num),
228                                        OID_AUTO, "rate", CTLFLAG_RD,
229                                        &rs->rs_rlt[i].rate, 0,
230                                        "Rate in bytes per second");
231                 }
232         }
233 #endif
234 }
235
236 static void
237 rs_destroy(epoch_context_t ctx)
238 {
239         struct tcp_rate_set *rs;
240         bool do_free_rs;
241
242         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
243
244         mtx_lock(&rs_mtx);
245         rs->rs_flags &= ~RS_FUNERAL_SCHD;
246         /*
247          * In theory its possible (but unlikely)
248          * that while the delete was occuring
249          * and we were applying the DEAD flag
250          * someone slipped in and found the
251          * interface in a lookup. While we
252          * decided rs_flows_using were 0 and
253          * scheduling the epoch_call, the other
254          * thread incremented rs_flow_using. This
255          * is because users have a pointer and
256          * we only use the rs_flows_using in an
257          * atomic fashion, i.e. the other entities
258          * are not protected. To assure this did
259          * not occur, we check rs_flows_using here
260          * before deleting.
261          */
262         do_free_rs = (rs->rs_flows_using == 0);
263         rs_number_dead--;
264         mtx_unlock(&rs_mtx);
265
266         if (do_free_rs) {
267                 sysctl_ctx_free(&rs->sysctl_ctx);
268                 free(rs->rs_rlt, M_TCPPACE);
269                 free(rs, M_TCPPACE);
270         }
271 }
272
273 static void
274 rs_defer_destroy(struct tcp_rate_set *rs)
275 {
276
277         mtx_assert(&rs_mtx, MA_OWNED);
278
279         /* Check if already pending. */
280         if (rs->rs_flags & RS_FUNERAL_SCHD)
281                 return;
282
283         rs_number_dead++;
284
285         /* Set flag to only defer once. */
286         rs->rs_flags |= RS_FUNERAL_SCHD;
287         epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
288 }
289
290 #ifdef INET
291 extern counter_u64_t rate_limit_set_ok;
292 extern counter_u64_t rate_limit_active;
293 extern counter_u64_t rate_limit_alloc_fail;
294 #endif
295
296 static int
297 rl_attach_txrtlmt(struct ifnet *ifp,
298     uint32_t flowtype,
299     int flowid,
300     uint64_t cfg_rate,
301     struct m_snd_tag **tag)
302 {
303         int error;
304         union if_snd_tag_alloc_params params = {
305                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
306                 .rate_limit.hdr.flowid = flowid,
307                 .rate_limit.hdr.flowtype = flowtype,
308                 .rate_limit.max_rate = cfg_rate,
309                 .rate_limit.flags = M_NOWAIT,
310         };
311
312         if (ifp->if_snd_tag_alloc == NULL) {
313                 error = EOPNOTSUPP;
314         } else {
315                 error = ifp->if_snd_tag_alloc(ifp, &params, tag);
316 #ifdef INET
317                 if (error == 0) {
318                         if_ref((*tag)->ifp);
319                         counter_u64_add(rate_limit_set_ok, 1);
320                         counter_u64_add(rate_limit_active, 1);
321                 } else
322                         counter_u64_add(rate_limit_alloc_fail, 1);
323 #endif
324         }
325         return (error);
326 }
327
328 static void
329 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
330 {
331         /*
332          * The internal table is "special", it
333          * is two seperate ordered tables that
334          * must be merged. We get here when the
335          * adapter specifies a number of rates that
336          * covers both ranges in the table in some
337          * form.
338          */
339         int i, at_low, at_high;
340         uint8_t low_disabled = 0, high_disabled = 0;
341
342         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
343                 rs->rs_rlt[i].flags = 0;
344                 rs->rs_rlt[i].time_between = 0;
345                 if ((low_disabled == 0) &&
346                     (high_disabled ||
347                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
348                         rs->rs_rlt[i].rate = rate_table_act[at_low];
349                         at_low++;
350                         if (at_low == RS_NEXT_ORDER_GROUP)
351                                 low_disabled = 1;
352                 } else if (high_disabled == 0) {
353                         rs->rs_rlt[i].rate = rate_table_act[at_high];
354                         at_high++;
355                         if (at_high == MAX_HDWR_RATES)
356                                 high_disabled = 1;
357                 }
358         }
359 }
360
361 static struct tcp_rate_set *
362 rt_setup_new_rs(struct ifnet *ifp, int *error)
363 {
364         struct tcp_rate_set *rs;
365         const uint64_t *rate_table_act;
366         uint64_t lentim, res;
367         size_t sz;
368         uint32_t hash_type;
369         int i;
370         struct if_ratelimit_query_results rl;
371         struct sysctl_oid *rl_sysctl_root;
372         /*
373          * We expect to enter with the 
374          * mutex locked.
375          */
376
377         if (ifp->if_ratelimit_query == NULL) {
378                 /*
379                  * We can do nothing if we cannot
380                  * get a query back from the driver.
381                  */
382                 return (NULL);
383         }
384         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
385         if (rs == NULL) {
386                 if (error)
387                         *error = ENOMEM;
388                 return (NULL);
389         }
390         rl.flags = RT_NOSUPPORT;
391         ifp->if_ratelimit_query(ifp, &rl);
392         if (rl.flags & RT_IS_UNUSABLE) {
393                 /* 
394                  * The interface does not really support 
395                  * the rate-limiting.
396                  */
397                 memset(rs, 0, sizeof(struct tcp_rate_set));
398                 rs->rs_ifp = ifp;
399                 rs->rs_if_dunit = ifp->if_dunit;
400                 rs->rs_flags = RS_INTF_NO_SUP;
401                 rs->rs_disable = 1;
402                 rs_number_alive++;
403                 sysctl_ctx_init(&rs->sysctl_ctx);
404                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
405                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
406                     OID_AUTO,
407                     rs->rs_ifp->if_xname,
408                     CTLFLAG_RW, 0,
409                     "");
410                 rl_add_syctl_entries(rl_sysctl_root, rs);
411                 mtx_lock(&rs_mtx);
412                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
413                 mtx_unlock(&rs_mtx);
414                 return (rs);
415         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
416                 memset(rs, 0, sizeof(struct tcp_rate_set));
417                 rs->rs_ifp = ifp;
418                 rs->rs_if_dunit = ifp->if_dunit;
419                 rs->rs_flags = RS_IS_DEFF;
420                 rs_number_alive++;
421                 sysctl_ctx_init(&rs->sysctl_ctx);
422                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
423                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
424                     OID_AUTO,
425                     rs->rs_ifp->if_xname,
426                     CTLFLAG_RW, 0,
427                     "");
428                 rl_add_syctl_entries(rl_sysctl_root, rs);
429                 mtx_lock(&rs_mtx);
430                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
431                 mtx_unlock(&rs_mtx);
432                 return (rs);
433         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
434                 /* Mellanox most likely */
435                 rs->rs_ifp = ifp;
436                 rs->rs_if_dunit = ifp->if_dunit;
437                 rs->rs_rate_cnt = rl.number_of_rates;
438                 rs->rs_min_seg = rl.min_segment_burst;
439                 rs->rs_highest_valid = 0;
440                 rs->rs_flow_limit = rl.max_flows;
441                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
442                 rs->rs_disable = 0;
443                 rate_table_act = rl.rate_table;
444         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
445                 /* Chelsio */
446                 rs->rs_ifp = ifp;
447                 rs->rs_if_dunit = ifp->if_dunit;
448                 rs->rs_rate_cnt = rl.number_of_rates;
449                 rs->rs_min_seg = rl.min_segment_burst;
450                 rs->rs_disable = 0;
451                 rs->rs_flow_limit = rl.max_flows;
452                 rate_table_act = desired_rates;
453                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
454                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
455                         /*
456                          * Our desired table is not big
457                          * enough, do what we can.
458                          */
459                         rs->rs_rate_cnt = MAX_HDWR_RATES;
460                  }
461                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
462                         rs->rs_flags = RS_IS_INTF;
463                 else
464                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
465                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
466                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
467         } else {
468                 printf("Interface:%s unit:%d not one known to have rate-limits\n",
469                     ifp->if_dname,
470                     ifp->if_dunit);
471                 free(rs, M_TCPPACE);
472                 return (NULL);
473         }
474         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
475         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
476         if (rs->rs_rlt == NULL) {
477                 if (error)
478                         *error = ENOMEM;
479 bail:
480                 free(rs, M_TCPPACE);
481                 return (NULL);
482         }
483         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
484                 /*
485                  * The interface supports all
486                  * the rates we could possibly want.
487                  */
488                 uint64_t rat;
489
490                 rs->rs_rlt[0].rate = 12500;     /* 100k */
491                 rs->rs_rlt[1].rate = 25000;     /* 200k */
492                 rs->rs_rlt[2].rate = 62500;     /* 500k */
493                 /* Note 125000 == 1Megabit
494                  * populate 1Meg - 1000meg.
495                  */
496                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
497                         rs->rs_rlt[i].rate = rat;
498                         rat += 125000;
499                 }
500                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
501         } else if (rs->rs_flags & RS_INT_TBL) {
502                 /* We populate this in a special way */
503                 populate_canned_table(rs, rate_table_act);
504         } else {
505                 /*
506                  * Just copy in the rates from
507                  * the table, it is in order.
508                  */
509                 for (i=0; i<rs->rs_rate_cnt; i++) {
510                         rs->rs_rlt[i].rate = rate_table_act[i];
511                         rs->rs_rlt[i].time_between = 0;
512                         rs->rs_rlt[i].flags = 0;
513                 }
514         }
515         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
516                 /*
517                  * We go backwards through the list so that if we can't get
518                  * a rate and fail to init one, we have at least a chance of
519                  * getting the highest one.
520                  */
521                 rs->rs_rlt[i].ptbl = rs;
522                 rs->rs_rlt[i].tag = NULL;
523                 /*
524                  * Calculate the time between.
525                  */
526                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
527                 res = lentim / rs->rs_rlt[i].rate;
528                 if (res > 0)
529                         rs->rs_rlt[i].time_between = res;
530                 else
531                         rs->rs_rlt[i].time_between = 1;
532                 if (rs->rs_flags & RS_NO_PRE) {
533                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
534                         rs->rs_lowest_valid = i;
535                 } else {
536                         int err;
537 #ifdef RSS
538                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
539 #else
540                         hash_type = M_HASHTYPE_OPAQUE_HASH;
541 #endif
542                         err = rl_attach_txrtlmt(ifp,
543                             hash_type,
544                             (i + 1),
545                             rs->rs_rlt[i].rate,
546                             &rs->rs_rlt[i].tag);
547                         if (err) {
548                                 if (i == (rs->rs_rate_cnt - 1)) {
549                                         /*
550                                          * Huh - first rate and we can't get
551                                          * it?
552                                          */
553                                         free(rs->rs_rlt, M_TCPPACE);
554                                         if (error)
555                                                 *error = err;
556                                         goto bail;
557                                 } else {
558                                         if (error)
559                                                 *error = err;
560                                 }
561                                 break;
562                         } else {
563                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
564                                 rs->rs_lowest_valid = i;
565                         }
566                 }
567         }
568         /* Did we get at least 1 rate? */
569         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
570                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
571         else {
572                 free(rs->rs_rlt, M_TCPPACE);
573                 goto bail;
574         }
575         rs_number_alive++;
576         sysctl_ctx_init(&rs->sysctl_ctx);
577         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
578             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
579             OID_AUTO,
580             rs->rs_ifp->if_xname,
581             CTLFLAG_RW, 0,
582             "");
583         rl_add_syctl_entries(rl_sysctl_root, rs);
584         mtx_lock(&rs_mtx);
585         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
586         mtx_unlock(&rs_mtx);
587         return (rs);
588 }
589
590 static const struct tcp_hwrate_limit_table *
591 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
592     uint64_t bytes_per_sec, uint32_t flags)
593 {
594         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
595         uint64_t mbits_per_sec, ind_calc;
596         int i;
597
598         mbits_per_sec = (bytes_per_sec * 8);
599         if (flags & RS_PACING_LT) {
600                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
601                     (rs->rs_lowest_valid <= 2)){
602                         /*
603                          * Smaller than 1Meg, only
604                          * 3 entries can match it.
605                          */
606                         for(i = rs->rs_lowest_valid; i < 3; i++) {
607                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
608                                         rte = &rs->rs_rlt[i];
609                                         break;
610                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
611                                         arte = &rs->rs_rlt[i];
612                                 }
613                         }
614                         goto done;
615                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
616                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
617                         /*
618                          * Larger than 1G (the majority of
619                          * our table.
620                          */
621                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
622                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
623                         else
624                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
625                         goto done;
626                 }
627                 /*
628                  * If we reach here its in our table (between 1Meg - 1000Meg),
629                  * just take the rounded down mbits per second, and add
630                  * 1Megabit to it, from this we can calculate
631                  * the index in the table.
632                  */
633                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
634                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
635                         ind_calc++;
636                 /* our table is offset by 3, we add 2 */
637                 ind_calc += 2;
638                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
639                         /* This should not happen */
640                         ind_calc = ALL_HARDWARE_RATES-1;
641                 }
642                 if ((ind_calc >= rs->rs_lowest_valid) &&
643                     (ind_calc <= rs->rs_highest_valid))
644                 rte = &rs->rs_rlt[ind_calc];
645         } else if (flags & RS_PACING_EXACT_MATCH) {
646                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
647                     (rs->rs_lowest_valid <= 2)){
648                         for(i = rs->rs_lowest_valid; i < 3; i++) {
649                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
650                                         rte = &rs->rs_rlt[i];
651                                         break;
652                                 }
653                         }
654                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
655                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
656                         /* > 1Gbps only one rate */
657                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
658                                 /* Its 10G wow */
659                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
660                         }
661                 } else {
662                         /* Ok it must be a exact meg (its between 1G and 1Meg) */
663                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
664                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
665                                 /* its an exact Mbps */
666                                 ind_calc += 2;
667                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
668                                         /* This should not happen */
669                                         ind_calc = ALL_HARDWARE_RATES-1;
670                                 }
671                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
672                                         rte = &rs->rs_rlt[ind_calc];
673                         }
674                 }
675         } else {
676                 /* we want greater than the requested rate */
677                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
678                     (rs->rs_lowest_valid <= 2)){
679                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
680                         for (i=2; i>=rs->rs_lowest_valid; i--) {
681                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
682                                         rte = &rs->rs_rlt[i];
683                                         break;
684                                 } else if ((flags & RS_PACING_GEQ) &&
685                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
686                                         rte = &rs->rs_rlt[i];
687                                         break;
688                                 } else {
689                                         arte = &rs->rs_rlt[i]; /* new alternate */
690                                 }
691                         }
692                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
693                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
694                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
695                                 /* Our top rate is larger than the request */
696                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
697                         } else if ((flags & RS_PACING_GEQ) &&
698                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
699                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
700                                 /* It matches our top rate */
701                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
702                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
703                                 /* The top rate is an alternative */
704                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
705                         }
706                 } else {
707                         /* Its in our range 1Meg - 1Gig */
708                         if (flags & RS_PACING_GEQ) {
709                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
710                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
711                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
712                                                 /* This should not happen */
713                                                 ind_calc = (ALL_HARDWARE_RATES-1);
714                                         }
715                                         rte = &rs->rs_rlt[ind_calc];
716                                 }
717                                 goto done;
718                         }
719                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
720                         ind_calc += 2;
721                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
722                                 /* This should not happen */
723                                 ind_calc = ALL_HARDWARE_RATES-1;
724                         }
725                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
726                                 rte = &rs->rs_rlt[ind_calc];
727                 }
728         }
729 done:
730         if ((rte == NULL) &&
731             (arte != NULL) &&
732             (flags & RS_PACING_SUB_OK)) {
733                 /* We can use the substitute */
734                 rte = arte;
735         }
736         return (rte);
737 }
738
739 static const struct tcp_hwrate_limit_table *
740 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
741 {
742         /**
743          * Hunt the rate table with the restrictions in flags and find a
744          * suitable rate if possible.
745          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
746          * RS_PACING_GT     - must be greater than.
747          * RS_PACING_GEQ    - must be greater than or equal.
748          * RS_PACING_LT     - must be less than.
749          * RS_PACING_SUB_OK - If we don't meet criteria a
750          *                    substitute is ok.
751          */
752         int i, matched;
753         struct tcp_hwrate_limit_table *rte = NULL;
754
755
756         if ((rs->rs_flags & RS_INT_TBL) &&
757             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
758                 /*
759                  * Here we don't want to paw thru
760                  * a big table, we have everything
761                  * from 1Meg - 1000Meg in 1Meg increments.
762                  * Use an alternate method to "lookup".
763                  */
764                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
765         }
766         if ((flags & RS_PACING_LT) ||
767             (flags & RS_PACING_EXACT_MATCH)) {
768                 /*
769                  * For exact and less than we go forward through the table.
770                  * This way when we find one larger we stop (exact was a
771                  * toss up).
772                  */
773                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
774                         if ((flags & RS_PACING_EXACT_MATCH) &&
775                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
776                                 rte = &rs->rs_rlt[i];
777                                 matched = 1;
778                                 break;
779                         } else if ((flags & RS_PACING_LT) &&
780                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
781                                 rte = &rs->rs_rlt[i];
782                                 matched = 1;
783                                 break;
784                         }
785                         if (bytes_per_sec > rs->rs_rlt[i].rate)
786                                 break;
787                 }
788                 if ((matched == 0) &&
789                     (flags & RS_PACING_LT) &&
790                     (flags & RS_PACING_SUB_OK)) {
791                         /* Kick in a substitute (the lowest) */
792                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
793                 }
794         } else {
795                 /*
796                  * Here we go backward through the table so that we can find
797                  * the one greater in theory faster (but its probably a
798                  * wash).
799                  */
800                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
801                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
802                                 /* A possible candidate */
803                                 rte = &rs->rs_rlt[i];
804                         }
805                         if ((flags & RS_PACING_GEQ) &&
806                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
807                                 /* An exact match and we want equal */
808                                 matched = 1;
809                                 rte = &rs->rs_rlt[i];
810                                 break;
811                         } else if (rte) {
812                                 /*
813                                  * Found one that is larger than but don't
814                                  * stop, there may be a more closer match.
815                                  */
816                                 matched = 1;
817                         }
818                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
819                                 /*
820                                  * We found a table entry that is smaller,
821                                  * stop there will be none greater or equal.
822                                  */
823                                 break;
824                         }
825                 }
826                 if ((matched == 0) &&
827                     (flags & RS_PACING_SUB_OK)) {
828                         /* Kick in a substitute (the highest) */
829                         rte = &rs->rs_rlt[rs->rs_highest_valid];
830                 }
831         }
832         return (rte);
833 }
834
835 static struct ifnet *
836 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
837 {
838         struct ifnet *tifp;
839         struct m_snd_tag *tag;
840         union if_snd_tag_alloc_params params = {
841                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
842                 .rate_limit.hdr.flowid = 1,
843                 .rate_limit.max_rate = COMMON_RATE,
844                 .rate_limit.flags = M_NOWAIT,
845         };
846         int err;
847 #ifdef RSS
848         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
849             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
850 #else
851         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
852 #endif
853         tag = NULL;
854         if (ifp->if_snd_tag_alloc) {
855                 if (error)
856                         *error = ENODEV;
857                 return (NULL);
858         }
859         err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
860         if (err) {
861                 /* Failed to setup a tag? */
862                 if (error)
863                         *error = err;
864                 return (NULL);
865         }
866         tifp = tag->ifp;
867         tifp->if_snd_tag_free(tag);
868         return (tifp);
869 }
870
871 static const struct tcp_hwrate_limit_table *
872 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
873     uint32_t flags, int *error)
874 {
875         /* First lets find the interface if it exists */
876         const struct tcp_hwrate_limit_table *rte;
877         struct tcp_rate_set *rs;
878         struct epoch_tracker et;
879         int err;
880
881         epoch_enter_preempt(net_epoch_preempt, &et);
882 use_real_interface:
883         CK_LIST_FOREACH(rs, &int_rs, next) {
884                 /*
885                  * Note we don't look with the lock since we either see a
886                  * new entry or will get one when we try to add it.
887                  */
888                 if (rs->rs_flags & RS_IS_DEAD) {
889                         /* The dead are not looked at */
890                         continue;
891                 }
892                 if ((rs->rs_ifp == ifp) &&
893                     (rs->rs_if_dunit == ifp->if_dunit)) {
894                         /* Ok we found it */
895                         break;
896                 }
897         }
898         if ((rs == NULL) ||
899             (rs->rs_flags & RS_INTF_NO_SUP) ||
900             (rs->rs_flags & RS_IS_DEAD)) {
901                 /*
902                  * This means we got a packet *before*
903                  * the IF-UP was processed below, <or>
904                  * while or after we already received an interface
905                  * departed event. In either case we really don't
906                  * want to do anything with pacing, in
907                  * the departing case the packet is not
908                  * going to go very far. The new case
909                  * might be arguable, but its impossible
910                  * to tell from the departing case.
911                  */
912                 if (rs->rs_disable && error)
913                         *error = ENODEV;
914                 epoch_exit_preempt(net_epoch_preempt, &et);
915                 return (NULL);
916         }
917
918         if ((rs == NULL) || (rs->rs_disable != 0)) {
919                 if (rs->rs_disable && error)
920                         *error = ENOSPC;
921                 epoch_exit_preempt(net_epoch_preempt, &et);
922                 return (NULL);
923         }
924         if (rs->rs_flags & RS_IS_DEFF) {
925                 /* We need to find the real interface */
926                 struct ifnet *tifp;
927
928                 tifp = rt_find_real_interface(ifp, inp, error);
929                 if (tifp == NULL) {
930                         if (rs->rs_disable && error)
931                                 *error = ENOTSUP;
932                         epoch_exit_preempt(net_epoch_preempt, &et);
933                         return (NULL);
934                 }
935                 goto use_real_interface;
936         }
937         if (rs->rs_flow_limit &&
938             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
939                 if (error)
940                         *error = ENOSPC;
941                 epoch_exit_preempt(net_epoch_preempt, &et);
942                 return (NULL);
943         }
944         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
945         if (rte) {
946                 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
947                     inp->inp_flowtype,
948                     inp->inp_flowid,
949                     rte->rate,
950                     &inp->inp_snd_tag);
951                 if (err) {
952                         /* Failed to attach */
953                         if (error)
954                                 *error = err;
955                         rte = NULL;
956                 }
957         }
958         if (rte) {
959                 /*
960                  * We use an atomic here for accounting so we don't have to
961                  * use locks when freeing.
962                  */
963                 atomic_add_64(&rs->rs_flows_using, 1);
964         }
965         epoch_exit_preempt(net_epoch_preempt, &et);
966         return (rte);
967 }
968
969 static void
970 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
971 {
972         int error;
973         struct tcp_rate_set *rs;
974
975         if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
976             (link_state != LINK_STATE_UP)) {
977                 /*
978                  * We only care on an interface going up that is rate-limit
979                  * capable.
980                  */
981                 return;
982         }
983         mtx_lock(&rs_mtx);
984         CK_LIST_FOREACH(rs, &int_rs, next) {
985                 if ((rs->rs_ifp == ifp) &&
986                     (rs->rs_if_dunit == ifp->if_dunit)) {
987                         /* We already have initialized this guy */
988                         mtx_unlock(&rs_mtx);
989                         return;
990                 }
991         }
992         mtx_unlock(&rs_mtx);
993         rt_setup_new_rs(ifp, &error);
994 }
995
996 static void
997 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
998 {
999         struct tcp_rate_set *rs, *nrs;
1000         struct ifnet *tifp;
1001         int i;
1002
1003         mtx_lock(&rs_mtx);
1004         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1005                 if ((rs->rs_ifp == ifp) &&
1006                     (rs->rs_if_dunit == ifp->if_dunit)) {
1007                         CK_LIST_REMOVE(rs, next);
1008                         rs_number_alive--;
1009                         rs->rs_flags |= RS_IS_DEAD;
1010                         for (i = 0; i < rs->rs_rate_cnt; i++) {
1011                                 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1012                                         tifp = rs->rs_rlt[i].tag->ifp;
1013                                         in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1014                                         rs->rs_rlt[i].tag = NULL;
1015                                 }
1016                                 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1017                         }
1018                         if (rs->rs_flows_using == 0)
1019                                 rs_defer_destroy(rs);
1020                         break;
1021                 }
1022         }
1023         mtx_unlock(&rs_mtx);
1024 }
1025
1026 static void
1027 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1028 {
1029         struct tcp_rate_set *rs, *nrs;
1030         struct ifnet *tifp;
1031         int i;
1032
1033         mtx_lock(&rs_mtx);
1034         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1035                 CK_LIST_REMOVE(rs, next);
1036                 rs_number_alive--;
1037                 rs->rs_flags |= RS_IS_DEAD;
1038                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1039                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1040                                 tifp = rs->rs_rlt[i].tag->ifp;
1041                                 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1042                                 rs->rs_rlt[i].tag = NULL;
1043                         }
1044                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1045                 }
1046                 if (rs->rs_flows_using == 0)
1047                         rs_defer_destroy(rs);
1048         }
1049         mtx_unlock(&rs_mtx);
1050 }
1051
1052 const struct tcp_hwrate_limit_table *
1053 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1054     uint64_t bytes_per_sec, int flags, int *error)
1055 {
1056         const struct tcp_hwrate_limit_table *rte;
1057
1058         if (tp->t_inpcb->inp_snd_tag == NULL) {
1059                 /*
1060                  * We are setting up a rate for the first time.
1061                  */
1062                 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
1063                         /* Not supported by the egress */
1064                         if (error)
1065                                 *error = ENODEV;
1066                         return (NULL);
1067                 }
1068 #ifdef KERN_TLS
1069                 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
1070                         /*
1071                          * We currently can't do both TLS and hardware
1072                          * pacing
1073                          */
1074                         if (error)
1075                                 *error = EINVAL;
1076                         return (NULL);
1077                 }
1078 #endif
1079                 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
1080         } else {
1081                 /*
1082                  * We are modifying a rate, wrong interface?
1083                  */
1084                 if (error)
1085                         *error = EINVAL;
1086                 rte = NULL;
1087         }
1088         return (rte);
1089 }
1090
1091 const struct tcp_hwrate_limit_table *
1092 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1093     struct tcpcb *tp, struct ifnet *ifp,
1094     uint64_t bytes_per_sec, int flags, int *error)
1095 {
1096         const struct tcp_hwrate_limit_table *nrte;
1097         const struct tcp_rate_set *rs;
1098         int is_indirect = 0;
1099         int err;
1100
1101
1102         if ((tp->t_inpcb->inp_snd_tag == NULL) ||
1103             (crte == NULL)) {
1104                 /* Wrong interface */
1105                 if (error)
1106                         *error = EINVAL;
1107                 return (NULL);
1108         }
1109         rs = crte->ptbl;
1110         if ((rs->rs_flags & RS_IS_DEAD) ||
1111             (crte->flags & HDWRPACE_IFPDEPARTED)) {
1112                 /* Release the rate, and try anew */
1113 re_rate:
1114                 tcp_rel_pacing_rate(crte, tp);
1115                 nrte = tcp_set_pacing_rate(tp, ifp,
1116                     bytes_per_sec, flags, error);
1117                 return (nrte);
1118         }
1119         if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
1120                 is_indirect = 1;
1121         else
1122                 is_indirect = 0;
1123         if ((is_indirect == 0) &&
1124             ((ifp != rs->rs_ifp) ||
1125             (ifp->if_dunit != rs->rs_if_dunit))) {
1126                 /*
1127                  * Something changed, the user is not pointing to the same
1128                  * ifp? Maybe a route updated on this guy?
1129                  */
1130                 goto re_rate;
1131         } else if (is_indirect) {
1132                 /*
1133                  * For indirect we have to dig in and find the real interface.
1134                  */
1135                 struct ifnet *rifp;
1136
1137                 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
1138                 if (rifp == NULL) {
1139                         /* Can't find it? */
1140                         goto re_rate;
1141                 }
1142                 if ((rifp != rs->rs_ifp) ||
1143                     (ifp->if_dunit != rs->rs_if_dunit)) {
1144                         goto re_rate;
1145                 }
1146         }
1147         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
1148         if (nrte == crte) {
1149                 /* No change */
1150                 if (error)
1151                         *error = 0;
1152                 return (crte);
1153         }
1154         if (nrte == NULL) {
1155                 /* Release the old rate */
1156                 tcp_rel_pacing_rate(crte, tp);
1157                 return (NULL);
1158         }
1159         /* Change rates to our new entry */
1160         err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
1161         if (err) {
1162                 if (error)
1163                         *error = err;
1164                 return (NULL);
1165         }
1166         if (error)
1167                 *error = 0;
1168         return (nrte);
1169 }
1170
1171 void
1172 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1173 {
1174         const struct tcp_rate_set *crs;
1175         struct tcp_rate_set *rs;
1176         uint64_t pre;
1177
1178         crs = crte->ptbl;
1179         /*
1180          * Now we must break the const
1181          * in order to release our refcount.
1182          */
1183         rs = __DECONST(struct tcp_rate_set *, crs);
1184         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1185         if (pre == 1) {
1186                 mtx_lock(&rs_mtx);
1187                 /*
1188                  * Is it dead?
1189                  */
1190                 if (rs->rs_flags & RS_IS_DEAD)
1191                         rs_defer_destroy(rs);
1192                 mtx_unlock(&rs_mtx);
1193         }
1194         in_pcbdetach_txrtlmt(tp->t_inpcb);
1195 }
1196
1197 static eventhandler_tag rl_ifnet_departs;
1198 static eventhandler_tag rl_ifnet_arrives;
1199 static eventhandler_tag rl_shutdown_start;
1200
1201 static void
1202 tcp_rs_init(void *st __unused)
1203 {
1204         CK_LIST_INIT(&int_rs);
1205         rs_number_alive = 0;
1206         rs_number_dead = 0;;
1207         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1208         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1209             tcp_rl_ifnet_departure,
1210             NULL, EVENTHANDLER_PRI_ANY);
1211         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1212             tcp_rl_ifnet_link,
1213             NULL, EVENTHANDLER_PRI_ANY);
1214         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1215             tcp_rl_shutdown, NULL,
1216             SHUTDOWN_PRI_FIRST);
1217         printf("TCP_ratelimit: Is now initialized\n");
1218 }
1219
1220 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
1221 #endif