]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_ratelimit.c
sys/{x86,amd64}: remove one of doubled ;s
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_ratelimit.c
1 /*-
2  *
3  * SPDX-License-Identifier: BSD-3-Clause
4  *
5  * Copyright (c) 2018-2019
6  *      Netflix Inc.
7  *      All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 /**
32  * Author: Randall Stewart <rrs@netflix.com>
33  */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_ratelimit.h"
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #ifdef KERN_TLS
49 #include <sys/sockbuf_tls.h>
50 #endif
51 #include <sys/sysctl.h>
52 #include <sys/eventhandler.h>
53 #include <sys/mutex.h>
54 #include <sys/ck.h>
55 #define TCPSTATES               /* for logging */
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/tcp_var.h>
59 #ifdef INET6
60 #include <netinet6/tcp6_var.h>
61 #endif
62 #include <netinet/tcp_ratelimit.h>
63 #ifndef USECS_IN_SECOND
64 #define USECS_IN_SECOND 1000000
65 #endif
66 /*
67  * For the purposes of each send, what is the size
68  * of an ethernet frame.
69  */
70 #ifndef ETHERNET_SEGMENT_SIZE
71 #define ETHERNET_SEGMENT_SIZE 1500
72 #endif
73 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
74 #ifdef RATELIMIT
75
#define COMMON_RATE 180500
/*
 * Canned table of pacing rates, in bytes per second.  The bit rate
 * noted beside each entry is simply the byte value times 8.
 *
 * The table is two independently ascending runs laid end to end: the
 * first RS_ORDERED_COUNT entries, followed by the entries starting at
 * index RS_NEXT_ORDER_GROUP.  A consumer that wants more than the first
 * run has to merge the two runs (see populate_canned_table()).
 *
 * The table is only ever read, so it is const.
 */
const uint64_t desired_rates[] = {
	/* First ascending run. */
	62500,			/* 500Kbps */
	180500,			/* 1.444Mbps (COMMON_RATE) */
	375000,			/* 3Mbps */
	500000,			/* 4Mbps */
	625000,			/* 5Mbps */
	750000,			/* 6Mbps */
	1000000,		/* 8Mbps */
	1250000,		/* 10Mbps */
	2500000,		/* 20Mbps */
	3750000,		/* 30Mbps */
	5000000,		/* 40Mbps */
	6250000,		/* 50Mbps */
	12500000,		/* 100Mbps */
	25000000,		/* 200Mbps */
	50000000,		/* 400Mbps */
	100000000,		/* 800Mbps */
	/* Second ascending run. */
	12500,			/* 100kbps */
	25000,			/* 200kbps */
	875000,			/* 7Mbps */
	1125000,		/* 9Mbps */
	1875000,		/* 15Mbps */
	3125000,		/* 25Mbps */
	8125000,		/* 65Mbps */
	10000000,		/* 80Mbps */
	18750000,		/* 150Mbps */
	20000000,		/* 160Mbps */
	37500000,		/* 300Mbps */
	62500000,		/* 500Mbps */
	78125000,		/* 625Mbps */
	125000000,		/* 1Gbps */
};
/* Number of entries in desired_rates[]. */
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(desired_rates[0]))
/*
 * Number of entries at the start of desired_rates[] that are already
 * in ascending order; using more entries than this requires merging
 * in the second run.
 */
#define RS_ORDERED_COUNT 16
/*
 * Index in desired_rates[] at which the second ascending run begins.
 */
#define RS_NEXT_ORDER_GROUP 16
/*
 * Size of the "every rate" table: 1Mbps - 1Gbps in 1Mbps steps (1000
 * entries), plus 100kbps, 200kbps and 500kbps, plus 10Gbps.
 */
#define ALL_HARDWARE_RATES 1004

/* Bit-rate thresholds, in bits per second. */
#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000
129
/*
 * List of per-interface rate sets.  New sets are linked in with
 * CK_LIST_INSERT_HEAD() while rs_mtx is held.
 */
static struct head_tcp_rate_set int_rs;
/* Mutex held around updates to int_rs and the counters below. */
static struct mtx rs_mtx;
/* Incremented each time a rate set is set up and put on int_rs. */
uint32_t rs_number_alive;
/*
 * Decremented by rs_destroy() when a rate set is finally freed; the
 * matching increment is not in this part of the file.
 */
uint32_t rs_number_dead;

/* net.inet.tcp.rl: parent node for the ratelimit sysctls. */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
    "TCP Ratelimit stats");
/* net.inet.tcp.rl.alive exports rs_number_alive. */
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
/* net.inet.tcp.rl.dead exports rs_number_dead. */
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");
143
144 static void
145 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
146 {
147         /*
148          * Add sysctl entries for thus interface.
149          */
150         if (rs->rs_flags & RS_INTF_NO_SUP) {
151                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
152                    SYSCTL_CHILDREN(rl_sysctl_root),
153                    OID_AUTO, "disable", CTLFLAG_RD,
154                    &rs->rs_disable, 0,
155                    "Disable this interface from new hdwr limiting?");
156         } else {
157                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
158                    SYSCTL_CHILDREN(rl_sysctl_root),
159                    OID_AUTO, "disable", CTLFLAG_RW,
160                    &rs->rs_disable, 0,
161                    "Disable this interface from new hdwr limiting?");
162         }
163         SYSCTL_ADD_S32(&rs->sysctl_ctx,
164             SYSCTL_CHILDREN(rl_sysctl_root),
165             OID_AUTO, "minseg", CTLFLAG_RW,
166             &rs->rs_min_seg, 0,
167             "What is the minimum we need to send on this interface?");
168         SYSCTL_ADD_U64(&rs->sysctl_ctx,
169             SYSCTL_CHILDREN(rl_sysctl_root),
170             OID_AUTO, "flow_limit", CTLFLAG_RW,
171             &rs->rs_flow_limit, 0,
172             "What is the limit for number of flows (0=unlimited)?");
173         SYSCTL_ADD_S32(&rs->sysctl_ctx,
174             SYSCTL_CHILDREN(rl_sysctl_root),
175             OID_AUTO, "highest", CTLFLAG_RD,
176             &rs->rs_highest_valid, 0,
177             "Highest valid rate");
178         SYSCTL_ADD_S32(&rs->sysctl_ctx,
179             SYSCTL_CHILDREN(rl_sysctl_root),
180             OID_AUTO, "lowest", CTLFLAG_RD,
181             &rs->rs_lowest_valid, 0,
182             "Lowest valid rate");
183         SYSCTL_ADD_S32(&rs->sysctl_ctx,
184             SYSCTL_CHILDREN(rl_sysctl_root),
185             OID_AUTO, "flags", CTLFLAG_RD,
186             &rs->rs_flags, 0,
187             "What lags are on the entry?");
188         SYSCTL_ADD_S32(&rs->sysctl_ctx,
189             SYSCTL_CHILDREN(rl_sysctl_root),
190             OID_AUTO, "numrates", CTLFLAG_RD,
191             &rs->rs_rate_cnt, 0,
192             "How many rates re there?");
193         SYSCTL_ADD_U64(&rs->sysctl_ctx,
194             SYSCTL_CHILDREN(rl_sysctl_root),
195             OID_AUTO, "flows_using", CTLFLAG_RD,
196             &rs->rs_flows_using, 0,
197             "How many flows are using this interface now?");
198 #ifdef DETAILED_RATELIMIT_SYSCTL
199         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
200                 /*  Lets display the rates */
201                 int i;
202                 struct sysctl_oid *rl_rates;
203                 struct sysctl_oid *rl_rate_num;
204                 char rate_num[16];
205                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
206                                             SYSCTL_CHILDREN(rl_sysctl_root),
207                                             OID_AUTO,
208                                             "rate",
209                                             CTLFLAG_RW, 0,
210                                             "Ratelist");
211                 for( i = 0; i < rs->rs_rate_cnt; i++) {
212                         sprintf(rate_num, "%d", i);
213                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
214                                             SYSCTL_CHILDREN(rl_rates),
215                                             OID_AUTO,
216                                             rate_num,
217                                             CTLFLAG_RW, 0,
218                                             "Individual Rate");
219                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
220                                        SYSCTL_CHILDREN(rl_rate_num),
221                                        OID_AUTO, "flags", CTLFLAG_RD,
222                                        &rs->rs_rlt[i].flags, 0,
223                                        "Flags on this rate");
224                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
225                                        SYSCTL_CHILDREN(rl_rate_num),
226                                        OID_AUTO, "pacetime", CTLFLAG_RD,
227                                        &rs->rs_rlt[i].time_between, 0,
228                                        "Time hardware inserts between 1500 byte sends");
229                         SYSCTL_ADD_U64(&rs->sysctl_ctx,
230                                        SYSCTL_CHILDREN(rl_rate_num),
231                                        OID_AUTO, "rate", CTLFLAG_RD,
232                                        &rs->rs_rlt[i].rate, 0,
233                                        "Rate in bytes per second");
234                 }
235         }
236 #endif
237 }
238
239 static void
240 rs_destroy(epoch_context_t ctx)
241 {
242         struct tcp_rate_set *rs;
243
244         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
245         mtx_lock(&rs_mtx);
246         rs->rs_flags &= ~RS_FUNERAL_SCHD;
247         if (rs->rs_flows_using == 0) {
248                 /*
249                  * In theory its possible (but unlikely)
250                  * that while the delete was occuring
251                  * and we were applying the DEAD flag
252                  * someone slipped in and found the
253                  * interface in a lookup. While we
254                  * decided rs_flows_using were 0 and
255                  * scheduling the epoch_call, the other
256                  * thread incremented rs_flow_using. This
257                  * is because users have a pointer and
258                  * we only use the rs_flows_using in an
259                  * atomic fashion, i.e. the other entities
260                  * are not protected. To assure this did
261                  * not occur, we check rs_flows_using here
262                  * before deleteing.
263                  */
264                 sysctl_ctx_free(&rs->sysctl_ctx);
265                 free(rs->rs_rlt, M_TCPPACE);
266                 free(rs, M_TCPPACE);
267                 rs_number_dead--;
268         }
269         mtx_unlock(&rs_mtx);
270
271 }
272
273 #ifdef INET
274 extern counter_u64_t rate_limit_set_ok;
275 extern counter_u64_t rate_limit_active;
276 extern counter_u64_t rate_limit_alloc_fail;
277 #endif
278
279 static int
280 rl_attach_txrtlmt(struct ifnet *ifp,
281     uint32_t flowtype,
282     int flowid,
283     uint64_t cfg_rate,
284     struct m_snd_tag **tag)
285 {
286         int error;
287         union if_snd_tag_alloc_params params = {
288                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
289                 .rate_limit.hdr.flowid = flowid,
290                 .rate_limit.hdr.flowtype = flowtype,
291                 .rate_limit.max_rate = cfg_rate,
292                 .rate_limit.flags = M_NOWAIT,
293         };
294
295         if (ifp->if_snd_tag_alloc == NULL) {
296                 error = EOPNOTSUPP;
297         } else {
298                 error = ifp->if_snd_tag_alloc(ifp, &params, tag);
299 #ifdef INET
300                 if (error == 0) {
301                         if_ref((*tag)->ifp);
302                         counter_u64_add(rate_limit_set_ok, 1);
303                         counter_u64_add(rate_limit_active, 1);
304                 } else
305                         counter_u64_add(rate_limit_alloc_fail, 1);
306 #endif
307         }
308         return (error);
309 }
310
311 static void
312 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
313 {
314         /*
315          * The internal table is "special", it
316          * is two seperate ordered tables that
317          * must be merged. We get here when the
318          * adapter specifies a number of rates that
319          * covers both ranges in the table in some
320          * form.
321          */
322         int i, at_low, at_high;
323         uint8_t low_disabled = 0, high_disabled = 0;
324
325         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
326                 rs->rs_rlt[i].flags = 0;
327                 rs->rs_rlt[i].time_between = 0;
328                 if ((low_disabled == 0) &&
329                     (high_disabled ||
330                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
331                         rs->rs_rlt[i].rate = rate_table_act[at_low];
332                         at_low++;
333                         if (at_low == RS_NEXT_ORDER_GROUP)
334                                 low_disabled = 1;
335                 } else if (high_disabled == 0) {
336                         rs->rs_rlt[i].rate = rate_table_act[at_high];
337                         at_high++;
338                         if (at_high == MAX_HDWR_RATES)
339                                 high_disabled = 1;
340                 }
341         }
342 }
343
344 static struct tcp_rate_set *
345 rt_setup_new_rs(struct ifnet *ifp, int *error)
346 {
347         struct tcp_rate_set *rs;
348         const uint64_t *rate_table_act;
349         uint64_t lentim, res;
350         size_t sz;
351         uint32_t hash_type;
352         int i;
353         struct if_ratelimit_query_results rl;
354         struct sysctl_oid *rl_sysctl_root;
355         /*
356          * We expect to enter with the 
357          * mutex locked.
358          */
359
360         if (ifp->if_ratelimit_query == NULL) {
361                 /*
362                  * We can do nothing if we cannot
363                  * get a query back from the driver.
364                  */
365                 return (NULL);
366         }
367         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
368         if (rs == NULL) {
369                 if (error)
370                         *error = ENOMEM;
371                 return (NULL);
372         }
373         rl.flags = RT_NOSUPPORT;
374         ifp->if_ratelimit_query(ifp, &rl);
375         if (rl.flags & RT_IS_UNUSABLE) {
376                 /* 
377                  * The interface does not really support 
378                  * the rate-limiting.
379                  */
380                 memset(rs, 0, sizeof(struct tcp_rate_set));
381                 rs->rs_ifp = ifp;
382                 rs->rs_if_dunit = ifp->if_dunit;
383                 rs->rs_flags = RS_INTF_NO_SUP;
384                 rs->rs_disable = 1;
385                 rs_number_alive++;
386                 sysctl_ctx_init(&rs->sysctl_ctx);
387                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
388                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
389                     OID_AUTO,
390                     rs->rs_ifp->if_xname,
391                     CTLFLAG_RW, 0,
392                     "");
393                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
394                 /* Unlock to allow the sysctl stuff to allocate */
395                 mtx_unlock(&rs_mtx);
396                 rl_add_syctl_entries(rl_sysctl_root, rs);
397                 /* re-lock for our caller */
398                 mtx_lock(&rs_mtx);
399                 return (rs);
400         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
401                 memset(rs, 0, sizeof(struct tcp_rate_set));
402                 rs->rs_ifp = ifp;
403                 rs->rs_if_dunit = ifp->if_dunit;
404                 rs->rs_flags = RS_IS_DEFF;
405                 rs_number_alive++;
406                 sysctl_ctx_init(&rs->sysctl_ctx);
407                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
408                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
409                     OID_AUTO,
410                     rs->rs_ifp->if_xname,
411                     CTLFLAG_RW, 0,
412                     "");
413                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
414                 /* Unlock to allow the sysctl stuff to allocate */
415                 mtx_unlock(&rs_mtx);
416                 rl_add_syctl_entries(rl_sysctl_root, rs);
417                 /* re-lock for our caller */
418                 mtx_lock(&rs_mtx);
419                 return (rs);
420         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
421                 /* Mellanox most likely */
422                 rs->rs_ifp = ifp;
423                 rs->rs_if_dunit = ifp->if_dunit;
424                 rs->rs_rate_cnt = rl.number_of_rates;
425                 rs->rs_min_seg = rl.min_segment_burst;
426                 rs->rs_highest_valid = 0;
427                 rs->rs_flow_limit = rl.max_flows;
428                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
429                 rs->rs_disable = 0;
430                 rate_table_act = rl.rate_table;
431         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
432                 /* Chelsio */
433                 rs->rs_ifp = ifp;
434                 rs->rs_if_dunit = ifp->if_dunit;
435                 rs->rs_rate_cnt = rl.number_of_rates;
436                 rs->rs_min_seg = rl.min_segment_burst;
437                 rs->rs_disable = 0;
438                 rs->rs_flow_limit = rl.max_flows;
439                 rate_table_act = desired_rates;
440                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
441                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
442                         /*
443                          * Our desired table is not big
444                          * enough, do what we can.
445                          */
446                         rs->rs_rate_cnt = MAX_HDWR_RATES;
447                  }
448                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
449                         rs->rs_flags = RS_IS_INTF;
450                 else
451                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
452                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
453                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
454         } else {
455                 printf("Interface:%s unit:%d not one known to have rate-limits\n",
456                     ifp->if_dname,
457                     ifp->if_dunit);
458                 free(rs, M_TCPPACE);
459                 return (NULL);
460         }
461         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
462         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
463         if (rs->rs_rlt == NULL) {
464                 if (error)
465                         *error = ENOMEM;
466 bail:
467                 free(rs, M_TCPPACE);
468                 return (NULL);
469         }
470         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
471                 /*
472                  * The interface supports all
473                  * the rates we could possibly want.
474                  */
475                 uint64_t rat;
476
477                 rs->rs_rlt[0].rate = 12500;     /* 100k */
478                 rs->rs_rlt[1].rate = 25000;     /* 200k */
479                 rs->rs_rlt[2].rate = 62500;     /* 500k */
480                 /* Note 125000 == 1Megabit
481                  * populate 1Meg - 1000meg.
482                  */
483                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
484                         rs->rs_rlt[i].rate = rat;
485                         rat += 125000;
486                 }
487                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
488         } else if (rs->rs_flags & RS_INT_TBL) {
489                 /* We populate this in a special way */
490                 populate_canned_table(rs, rate_table_act);
491         } else {
492                 /*
493                  * Just copy in the rates from
494                  * the table, it is in order.
495                  */
496                 for (i=0; i<rs->rs_rate_cnt; i++) {
497                         rs->rs_rlt[i].rate = rate_table_act[i];
498                         rs->rs_rlt[i].time_between = 0;
499                         rs->rs_rlt[i].flags = 0;
500                 }
501         }
502         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
503                 /*
504                  * We go backwards through the list so that if we can't get
505                  * a rate and fail to init one, we have at least a chance of
506                  * getting the highest one.
507                  */
508                 rs->rs_rlt[i].ptbl = rs;
509                 rs->rs_rlt[i].tag = NULL;
510                 /*
511                  * Calculate the time between.
512                  */
513                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
514                 res = lentim / rs->rs_rlt[i].rate;
515                 if (res > 0)
516                         rs->rs_rlt[i].time_between = res;
517                 else
518                         rs->rs_rlt[i].time_between = 1;
519                 if (rs->rs_flags & RS_NO_PRE) {
520                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
521                         rs->rs_lowest_valid = i;
522                 } else {
523                         int err;
524 #ifdef RSS
525                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
526 #else
527                         hash_type = M_HASHTYPE_OPAQUE_HASH;
528 #endif
529                         err = rl_attach_txrtlmt(ifp,
530                             hash_type,
531                             (i + 1),
532                             rs->rs_rlt[i].rate,
533                             &rs->rs_rlt[i].tag);
534                         if (err) {
535                                 if (i == (rs->rs_rate_cnt - 1)) {
536                                         /*
537                                          * Huh - first rate and we can't get
538                                          * it?
539                                          */
540                                         free(rs->rs_rlt, M_TCPPACE);
541                                         if (error)
542                                                 *error = err;
543                                         goto bail;
544                                 } else {
545                                         if (error)
546                                                 *error = err;
547                                 }
548                                 break;
549                         } else {
550                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
551                                 rs->rs_lowest_valid = i;
552                         }
553                 }
554         }
555         /* Did we get at least 1 rate? */
556         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
557                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
558         else {
559                 free(rs->rs_rlt, M_TCPPACE);
560                 goto bail;
561         }
562         rs_number_alive++;
563         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
564         sysctl_ctx_init(&rs->sysctl_ctx);
565         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
566             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
567             OID_AUTO,
568             rs->rs_ifp->if_xname,
569             CTLFLAG_RW, 0,
570             "");
571         /* Unlock to allow the sysctl stuff to allocate */
572         mtx_unlock(&rs_mtx);
573         rl_add_syctl_entries(rl_sysctl_root, rs);
574         /* re-lock for our caller */
575         mtx_lock(&rs_mtx);
576         return (rs);
577 }
578
579 static const struct tcp_hwrate_limit_table *
580 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
581     uint64_t bytes_per_sec, uint32_t flags)
582 {
583         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
584         uint64_t mbits_per_sec, ind_calc;
585         int i;
586
587         mbits_per_sec = (bytes_per_sec * 8);
588         if (flags & RS_PACING_LT) {
589                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
590                     (rs->rs_lowest_valid <= 2)){
591                         /*
592                          * Smaller than 1Meg, only
593                          * 3 entries can match it.
594                          */
595                         for(i = rs->rs_lowest_valid; i < 3; i++) {
596                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
597                                         rte = &rs->rs_rlt[i];
598                                         break;
599                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
600                                         arte = &rs->rs_rlt[i];
601                                 }
602                         }
603                         goto done;
604                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
605                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
606                         /*
607                          * Larger than 1G (the majority of
608                          * our table.
609                          */
610                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
611                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
612                         else
613                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
614                         goto done;
615                 }
616                 /*
617                  * If we reach here its in our table (between 1Meg - 1000Meg),
618                  * just take the rounded down mbits per second, and add
619                  * 1Megabit to it, from this we can calculate
620                  * the index in the table.
621                  */
622                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
623                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
624                         ind_calc++;
625                 /* our table is offset by 3, we add 2 */
626                 ind_calc += 2;
627                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
628                         /* This should not happen */
629                         ind_calc = ALL_HARDWARE_RATES-1;
630                 }
631                 if ((ind_calc >= rs->rs_lowest_valid) &&
632                     (ind_calc <= rs->rs_highest_valid))
633                 rte = &rs->rs_rlt[ind_calc];
634         } else if (flags & RS_PACING_EXACT_MATCH) {
635                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
636                     (rs->rs_lowest_valid <= 2)){
637                         for(i = rs->rs_lowest_valid; i < 3; i++) {
638                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
639                                         rte = &rs->rs_rlt[i];
640                                         break;
641                                 }
642                         }
643                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
644                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
645                         /* > 1Gbps only one rate */
646                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
647                                 /* Its 10G wow */
648                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
649                         }
650                 } else {
651                         /* Ok it must be a exact meg (its between 1G and 1Meg) */
652                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
653                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
654                                 /* its an exact Mbps */
655                                 ind_calc += 2;
656                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
657                                         /* This should not happen */
658                                         ind_calc = ALL_HARDWARE_RATES-1;
659                                 }
660                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
661                                         rte = &rs->rs_rlt[ind_calc];
662                         }
663                 }
664         } else {
665                 /* we want greater than the requested rate */
666                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
667                     (rs->rs_lowest_valid <= 2)){
668                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
669                         for (i=2; i>=rs->rs_lowest_valid; i--) {
670                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
671                                         rte = &rs->rs_rlt[i];
672                                         break;
673                                 } else if ((flags & RS_PACING_GEQ) &&
674                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
675                                         rte = &rs->rs_rlt[i];
676                                         break;
677                                 } else {
678                                         arte = &rs->rs_rlt[i]; /* new alternate */
679                                 }
680                         }
681                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
682                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
683                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
684                                 /* Our top rate is larger than the request */
685                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
686                         } else if ((flags & RS_PACING_GEQ) &&
687                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
688                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
689                                 /* It matches our top rate */
690                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
691                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
692                                 /* The top rate is an alternative */
693                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
694                         }
695                 } else {
696                         /* Its in our range 1Meg - 1Gig */
697                         if (flags & RS_PACING_GEQ) {
698                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
699                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
700                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
701                                                 /* This should not happen */
702                                                 ind_calc = (ALL_HARDWARE_RATES-1);
703                                         }
704                                         rte = &rs->rs_rlt[ind_calc];
705                                 }
706                                 goto done;
707                         }
708                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
709                         ind_calc += 2;
710                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
711                                 /* This should not happen */
712                                 ind_calc = ALL_HARDWARE_RATES-1;
713                         }
714                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
715                                 rte = &rs->rs_rlt[ind_calc];
716                 }
717         }
718 done:
719         if ((rte == NULL) &&
720             (arte != NULL) &&
721             (flags & RS_PACING_SUB_OK)) {
722                 /* We can use the substitute */
723                 rte = arte;
724         }
725         return (rte);
726 }
727
728 static const struct tcp_hwrate_limit_table *
729 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
730 {
731         /**
732          * Hunt the rate table with the restrictions in flags and find a
733          * suitable rate if possible.
734          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
735          * RS_PACING_GT     - must be greater than.
736          * RS_PACING_GEQ    - must be greater than or equal.
737          * RS_PACING_LT     - must be less than.
738          * RS_PACING_SUB_OK - If we don't meet criteria a
739          *                    substitute is ok.
740          */
741         int i, matched;
742         struct tcp_hwrate_limit_table *rte = NULL;
743
744
745         if ((rs->rs_flags & RS_INT_TBL) &&
746             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
747                 /*
748                  * Here we don't want to paw thru
749                  * a big table, we have everything
750                  * from 1Meg - 1000Meg in 1Meg increments.
751                  * Use an alternate method to "lookup".
752                  */
753                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
754         }
755         if ((flags & RS_PACING_LT) ||
756             (flags & RS_PACING_EXACT_MATCH)) {
757                 /*
758                  * For exact and less than we go forward through the table.
759                  * This way when we find one larger we stop (exact was a
760                  * toss up).
761                  */
762                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
763                         if ((flags & RS_PACING_EXACT_MATCH) &&
764                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
765                                 rte = &rs->rs_rlt[i];
766                                 matched = 1;
767                                 break;
768                         } else if ((flags & RS_PACING_LT) &&
769                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
770                                 rte = &rs->rs_rlt[i];
771                                 matched = 1;
772                                 break;
773                         }
774                         if (bytes_per_sec > rs->rs_rlt[i].rate)
775                                 break;
776                 }
777                 if ((matched == 0) &&
778                     (flags & RS_PACING_LT) &&
779                     (flags & RS_PACING_SUB_OK)) {
780                         /* Kick in a substitute (the lowest) */
781                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
782                 }
783         } else {
784                 /*
785                  * Here we go backward through the table so that we can find
786                  * the one greater in theory faster (but its probably a
787                  * wash).
788                  */
789                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
790                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
791                                 /* A possible candidate */
792                                 rte = &rs->rs_rlt[i];
793                         }
794                         if ((flags & RS_PACING_GEQ) &&
795                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
796                                 /* An exact match and we want equal */
797                                 matched = 1;
798                                 rte = &rs->rs_rlt[i];
799                                 break;
800                         } else if (rte) {
801                                 /*
802                                  * Found one that is larger than but don't
803                                  * stop, there may be a more closer match.
804                                  */
805                                 matched = 1;
806                         }
807                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
808                                 /*
809                                  * We found a table entry that is smaller,
810                                  * stop there will be none greater or equal.
811                                  */
812                                 break;
813                         }
814                 }
815                 if ((matched == 0) &&
816                     (flags & RS_PACING_SUB_OK)) {
817                         /* Kick in a substitute (the highest) */
818                         rte = &rs->rs_rlt[rs->rs_highest_valid];
819                 }
820         }
821         return (rte);
822 }
823
824 static struct ifnet *
825 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
826 {
827         struct ifnet *tifp;
828         struct m_snd_tag *tag;
829         union if_snd_tag_alloc_params params = {
830                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
831                 .rate_limit.hdr.flowid = 1,
832                 .rate_limit.max_rate = COMMON_RATE,
833                 .rate_limit.flags = M_NOWAIT,
834         };
835         int err;
836 #ifdef RSS
837         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
838             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
839 #else
840         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
841 #endif
842         tag = NULL;
843         if (ifp->if_snd_tag_alloc) {
844                 if (error)
845                         *error = ENODEV;
846                 return (NULL);
847         }
848         err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
849         if (err) {
850                 /* Failed to setup a tag? */
851                 if (error)
852                         *error = err;
853                 return (NULL);
854         }
855         tifp = tag->ifp;
856         tifp->if_snd_tag_free(tag);
857         return (tifp);
858 }
859
860 static const struct tcp_hwrate_limit_table *
861 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
862     uint32_t flags, int *error)
863 {
864         /* First lets find the interface if it exists */
865         const struct tcp_hwrate_limit_table *rte;
866         struct tcp_rate_set *rs;
867         struct epoch_tracker et;
868         int err;
869
870         epoch_enter_preempt(net_epoch_preempt, &et);
871 use_real_interface:
872         CK_LIST_FOREACH(rs, &int_rs, next) {
873                 /*
874                  * Note we don't look with the lock since we either see a
875                  * new entry or will get one when we try to add it.
876                  */
877                 if (rs->rs_flags & RS_IS_DEAD) {
878                         /* The dead are not looked at */
879                         continue;
880                 }
881                 if ((rs->rs_ifp == ifp) &&
882                     (rs->rs_if_dunit == ifp->if_dunit)) {
883                         /* Ok we found it */
884                         break;
885                 }
886         }
887         if ((rs == NULL) ||
888             (rs->rs_flags & RS_INTF_NO_SUP) ||
889             (rs->rs_flags & RS_IS_DEAD)) {
890                 /*
891                  * This means we got a packet *before*
892                  * the IF-UP was processed below, <or>
893                  * while or after we already received an interface
894                  * departed event. In either case we really don't
895                  * want to do anything with pacing, in
896                  * the departing case the packet is not
897                  * going to go very far. The new case
898                  * might be arguable, but its impossible
899                  * to tell from the departing case.
900                  */
901                 if (rs->rs_disable && error)
902                         *error = ENODEV;
903                 epoch_exit_preempt(net_epoch_preempt, &et);
904                 return (NULL);
905         }
906
907         if ((rs == NULL) || (rs->rs_disable != 0)) {
908                 if (rs->rs_disable && error)
909                         *error = ENOSPC;
910                 epoch_exit_preempt(net_epoch_preempt, &et);
911                 return (NULL);
912         }
913         if (rs->rs_flags & RS_IS_DEFF) {
914                 /* We need to find the real interface */
915                 struct ifnet *tifp;
916
917                 tifp = rt_find_real_interface(ifp, inp, error);
918                 if (tifp == NULL) {
919                         if (rs->rs_disable && error)
920                                 *error = ENOTSUP;
921                         epoch_exit_preempt(net_epoch_preempt, &et);
922                         return (NULL);
923                 }
924                 goto use_real_interface;
925         }
926         if (rs->rs_flow_limit &&
927             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
928                 if (error)
929                         *error = ENOSPC;
930                 epoch_exit_preempt(net_epoch_preempt, &et);
931                 return (NULL);
932         }
933         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
934         if (rte) {
935                 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
936                     inp->inp_flowtype,
937                     inp->inp_flowid,
938                     rte->rate,
939                     &inp->inp_snd_tag);
940                 if (err) {
941                         /* Failed to attach */
942                         if (error)
943                                 *error = err;
944                         rte = NULL;
945                 }
946         }
947         if (rte) {
948                 /*
949                  * We use an atomic here for accounting so we don't have to
950                  * use locks when freeing.
951                  */
952                 atomic_add_64(&rs->rs_flows_using, 1);
953         }
954         epoch_exit_preempt(net_epoch_preempt, &et);
955         return (rte);
956 }
957
958 static void
959 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
960 {
961         int error;
962         struct tcp_rate_set *rs;
963
964         if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
965             (link_state != LINK_STATE_UP)) {
966                 /*
967                  * We only care on an interface going up that is rate-limit
968                  * capable.
969                  */
970                 return;
971         }
972         mtx_lock(&rs_mtx);
973         CK_LIST_FOREACH(rs, &int_rs, next) {
974                 if ((rs->rs_ifp == ifp) &&
975                     (rs->rs_if_dunit == ifp->if_dunit)) {
976                         /* We already have initialized this guy */
977                         mtx_unlock(&rs_mtx);
978                         return;
979                 }
980         }
981         rt_setup_new_rs(ifp, &error);
982         mtx_unlock(&rs_mtx);
983 }
984
985 static void
986 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
987 {
988         struct tcp_rate_set *rs, *nrs;
989         struct ifnet *tifp;
990         int i;
991
992         mtx_lock(&rs_mtx);
993         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
994                 if ((rs->rs_ifp == ifp) &&
995                     (rs->rs_if_dunit == ifp->if_dunit)) {
996                         CK_LIST_REMOVE(rs, next);
997                         rs_number_alive--;
998                         rs_number_dead++;
999                         rs->rs_flags |= RS_IS_DEAD;
1000                         for (i = 0; i < rs->rs_rate_cnt; i++) {
1001                                 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1002                                         tifp = rs->rs_rlt[i].tag->ifp;
1003                                         in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1004                                         rs->rs_rlt[i].tag = NULL;
1005                                 }
1006                                 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1007                         }
1008                         if (rs->rs_flows_using == 0) {
1009                                 /*
1010                                  * No references left, so we can schedule the
1011                                  * destruction after the epoch (with a caveat).
1012                                  */
1013                                 rs->rs_flags |= RS_FUNERAL_SCHD;
1014                                 epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1015                         }
1016                         break;
1017                 }
1018         }
1019         mtx_unlock(&rs_mtx);
1020 }
1021
1022 static void
1023 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1024 {
1025         struct tcp_rate_set *rs, *nrs;
1026         struct ifnet *tifp;
1027         int i;
1028
1029         mtx_lock(&rs_mtx);
1030         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1031                 CK_LIST_REMOVE(rs, next);
1032                 rs_number_alive--;
1033                 rs_number_dead++;
1034                 rs->rs_flags |= RS_IS_DEAD;
1035                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1036                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1037                                 tifp = rs->rs_rlt[i].tag->ifp;
1038                                 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1039                                 rs->rs_rlt[i].tag = NULL;
1040                         }
1041                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1042                 }
1043                 if (rs->rs_flows_using != 0) {
1044                         /*
1045                          * We dont hold a reference
1046                          * so we have nothing left to
1047                          * do.
1048                          */
1049                 } else {
1050                         /*
1051                          * No references left, so we can destroy it
1052                          * after the epoch.
1053                          */
1054                         rs->rs_flags |= RS_FUNERAL_SCHD;
1055                         epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1056                 }
1057         }
1058         mtx_unlock(&rs_mtx);
1059 }
1060
1061 const struct tcp_hwrate_limit_table *
1062 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1063     uint64_t bytes_per_sec, int flags, int *error)
1064 {
1065         const struct tcp_hwrate_limit_table *rte;
1066
1067         if (tp->t_inpcb->inp_snd_tag == NULL) {
1068                 /*
1069                  * We are setting up a rate for the first time.
1070                  */
1071                 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
1072                         /* Not supported by the egress */
1073                         if (error)
1074                                 *error = ENODEV;
1075                         return (NULL);
1076                 }
1077 #ifdef KERN_TLS
1078                 if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
1079                         /*
1080                          * We currently can't do both TLS and hardware
1081                          * pacing
1082                          */
1083                         if (error)
1084                                 *error = EINVAL;
1085                         return (NULL);
1086                 }
1087 #endif
1088                 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
1089         } else {
1090                 /*
1091                  * We are modifying a rate, wrong interface?
1092                  */
1093                 if (error)
1094                         *error = EINVAL;
1095                 rte = NULL;
1096         }
1097         return (rte);
1098 }
1099
1100 const struct tcp_hwrate_limit_table *
1101 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1102     struct tcpcb *tp, struct ifnet *ifp,
1103     uint64_t bytes_per_sec, int flags, int *error)
1104 {
1105         const struct tcp_hwrate_limit_table *nrte;
1106         const struct tcp_rate_set *rs;
1107         int is_indirect = 0;
1108         int err;
1109
1110
1111         if ((tp->t_inpcb->inp_snd_tag == NULL) ||
1112             (crte == NULL)) {
1113                 /* Wrong interface */
1114                 if (error)
1115                         *error = EINVAL;
1116                 return (NULL);
1117         }
1118         rs = crte->ptbl;
1119         if ((rs->rs_flags & RS_IS_DEAD) ||
1120             (crte->flags & HDWRPACE_IFPDEPARTED)) {
1121                 /* Release the rate, and try anew */
1122 re_rate:
1123                 tcp_rel_pacing_rate(crte, tp);
1124                 nrte = tcp_set_pacing_rate(tp, ifp,
1125                     bytes_per_sec, flags, error);
1126                 return (nrte);
1127         }
1128         if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
1129                 is_indirect = 1;
1130         else
1131                 is_indirect = 0;
1132         if ((is_indirect == 0) &&
1133             ((ifp != rs->rs_ifp) ||
1134             (ifp->if_dunit != rs->rs_if_dunit))) {
1135                 /*
1136                  * Something changed, the user is not pointing to the same
1137                  * ifp? Maybe a route updated on this guy?
1138                  */
1139                 goto re_rate;
1140         } else if (is_indirect) {
1141                 /*
1142                  * For indirect we have to dig in and find the real interface.
1143                  */
1144                 struct ifnet *rifp;
1145
1146                 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
1147                 if (rifp == NULL) {
1148                         /* Can't find it? */
1149                         goto re_rate;
1150                 }
1151                 if ((rifp != rs->rs_ifp) ||
1152                     (ifp->if_dunit != rs->rs_if_dunit)) {
1153                         goto re_rate;
1154                 }
1155         }
1156         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
1157         if (nrte == crte) {
1158                 /* No change */
1159                 if (error)
1160                         *error = 0;
1161                 return (crte);
1162         }
1163         if (nrte == NULL) {
1164                 /* Release the old rate */
1165                 tcp_rel_pacing_rate(crte, tp);
1166                 return (NULL);
1167         }
1168         /* Change rates to our new entry */
1169         err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
1170         if (err) {
1171                 if (error)
1172                         *error = err;
1173                 return (NULL);
1174         }
1175         if (error)
1176                 *error = 0;
1177         return (nrte);
1178 }
1179
1180 void
1181 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1182 {
1183         const struct tcp_rate_set *crs;
1184         struct tcp_rate_set *rs;
1185         uint64_t pre;
1186
1187         crs = crte->ptbl;
1188         /*
1189          * Now we must break the const
1190          * in order to release our refcount.
1191          */
1192         rs = __DECONST(struct tcp_rate_set *, crs);
1193         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1194         if (pre == 1) {
1195                 mtx_lock(&rs_mtx);
1196                 /*
1197                  * Is it dead?
1198                  */
1199                 if ((rs->rs_flags & RS_IS_DEAD) &&
1200                     ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
1201                         /*
1202                          * We were the last,
1203                          * and a funeral is not pending, so
1204                          * we must schedule it.
1205                          */
1206                         rs->rs_flags |= RS_FUNERAL_SCHD;
1207                         epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1208                 }
1209                 mtx_unlock(&rs_mtx);
1210         }
1211         in_pcbdetach_txrtlmt(tp->t_inpcb);
1212 }
1213
1214 static eventhandler_tag rl_ifnet_departs;
1215 static eventhandler_tag rl_ifnet_arrives;
1216 static eventhandler_tag rl_shutdown_start;
1217
1218 static void
1219 tcp_rs_init(void *st __unused)
1220 {
1221         CK_LIST_INIT(&int_rs);
1222         rs_number_alive = 0;
1223         rs_number_dead = 0;;
1224         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1225         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1226             tcp_rl_ifnet_departure,
1227             NULL, EVENTHANDLER_PRI_ANY);
1228         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1229             tcp_rl_ifnet_link,
1230             NULL, EVENTHANDLER_PRI_ANY);
1231         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1232             tcp_rl_shutdown, NULL,
1233             SHUTDOWN_PRI_FIRST);
1234         printf("TCP_ratelimit: Is now initialized\n");
1235 }
1236
1237 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
1238 #endif