1 /*-
2  *
3  * SPDX-License-Identifier: BSD-3-Clause
4  *
5  * Copyright (c) 2018-2019
6  *      Netflix Inc.
7  *      All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 /**
32  * Author: Randall Stewart <rrs@netflix.com>
33  */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 #include "opt_inet.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_tcpdebug.h"
41 #include "opt_ratelimit.h"
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #ifdef KERN_TLS
49 #include <sys/sockbuf_tls.h>
50 #endif
51 #include <sys/sysctl.h>
52 #include <sys/eventhandler.h>
53 #include <sys/mutex.h>
54 #include <sys/ck.h>
55 #define TCPSTATES               /* for logging */
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/tcp_var.h>
59 #ifdef INET6
60 #include <netinet6/tcp6_var.h>
61 #endif
62 #include <netinet/tcp_ratelimit.h>
63 #ifndef USECS_IN_SECOND
64 #define USECS_IN_SECOND 1000000
65 #endif
66 /*
67  * For the purposes of each send, the assumed
68  * size of an ethernet frame.
69  */
70 #ifndef ETHERNET_SEGMENT_SIZE
71 #define ETHERNET_SEGMENT_SIZE 1500
72 #endif
73 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
74 #ifdef RATELIMIT
75
76 #define COMMON_RATE 180500
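/*
 * Note: every rate below is in bytes per second (1Mbps == 125000
 * bytes/sec).  The first 16 entries form one ascending group and the
 * remaining entries a second ascending group; populate_canned_table()
 * merges the two groups into a single sorted table when a
 * selectable-rate NIC asks for more than RS_ORDERED_COUNT (but fewer
 * than ALL_HARDWARE_RATES) rates.
 */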
77 uint64_t desired_rates[] = {
78         62500,                  /* 500Kbps */
79         180500,                 /* 1.44Mbps */
80         375000,                 /* 3Mbps */
81         500000,                 /* 4Mbps */
82         625000,                 /* 5Mbps */
83         750000,                 /* 6Mbps */
84         1000000,                /* 8Mbps */
85         1250000,                /* 10Mbps */
86         2500000,                /* 20Mbps */
87         3750000,                /* 30Mbps */
88         5000000,                /* 40Mbps */
89         6250000,                /* 50Mbps */
90         12500000,               /* 100Mbps */
91         25000000,               /* 200Mbps */
92         50000000,               /* 400Mbps */
93         100000000,              /* 800Mbps */
94         12500,                  /* 100kbps */
95         25000,                  /* 200kbps */
96         875000,                 /* 7Mbps */
97         1125000,                /* 9Mbps */
98         1875000,                /* 15Mbps */
99         3125000,                /* 25Mbps */
100         8125000,                /* 65Mbps */
101         10000000,               /* 80Mbps */
102         18750000,               /* 150Mbps */
103         20000000,               /* 160Mbps */
104         37500000,               /* 300Mbps */
105         62500000,               /* 500Mbps */
106         78125000,               /* 625Mbps */
107         125000000,              /* 1Gbps */
108 };
109 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
110 #define RS_ORDERED_COUNT 16     /*
111                                  * Number that are in order
112                                  * at the beginning of the table,
113                                  * over this a sort is required.
114                                  */
115 #define RS_NEXT_ORDER_GROUP 16  /*
116                                  * The point in our table where
117                                  * a second ordered group begins
118                                  * (i.e. its starting index).
119                                  */
120 #define ALL_HARDWARE_RATES 1004 /*
121                                  * 1Meg - 1Gig in 1 Meg steps
122                                  * plus 100k, 200k and 500k and
123                                  * 10Gig
124                                  */
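                                /*
                                 * That count works out to 3 (100k, 200k
                                 * and 500k) + 1000 (1Meg..1000Meg in
                                 * 1Meg steps) + 1 (10Gig) = 1004.
                                 */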
125
126 #define RS_ONE_MEGABIT_PERSEC 1000000
127 #define RS_ONE_GIGABIT_PERSEC 1000000000
128 #define RS_TEN_GIGABIT_PERSEC 10000000000
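/* These three thresholds are in bits per second (the table rates are bytes/sec). */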
129
130 static struct head_tcp_rate_set int_rs;
131 static struct mtx rs_mtx;
132 uint32_t rs_number_alive;
133 uint32_t rs_number_dead;
134
135 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
136     "TCP Ratelimit stats");
137 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
138     &rs_number_alive, 0,
139     "Number of interfaces initialized for ratelimiting");
140 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
141     &rs_number_dead, 0,
142     "Number of interfaces departing from ratelimiting");
143
144 static void
145 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
146 {
147         /*
148          * Add sysctl entries for this interface.
149          */
150         if (rs->rs_flags & RS_INTF_NO_SUP) {
151                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
152                    SYSCTL_CHILDREN(rl_sysctl_root),
153                    OID_AUTO, "disable", CTLFLAG_RD,
154                    &rs->rs_disable, 0,
155                    "Disable this interface from new hdwr limiting?");
156         } else {
157                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
158                    SYSCTL_CHILDREN(rl_sysctl_root),
159                    OID_AUTO, "disable", CTLFLAG_RW,
160                    &rs->rs_disable, 0,
161                    "Disable this interface from new hdwr limiting?");
162         }
163         SYSCTL_ADD_S32(&rs->sysctl_ctx,
164             SYSCTL_CHILDREN(rl_sysctl_root),
165             OID_AUTO, "minseg", CTLFLAG_RW,
166             &rs->rs_min_seg, 0,
167             "What is the minimum we need to send on this interface?");
168         SYSCTL_ADD_U64(&rs->sysctl_ctx,
169             SYSCTL_CHILDREN(rl_sysctl_root),
170             OID_AUTO, "flow_limit", CTLFLAG_RW,
171             &rs->rs_flow_limit, 0,
172             "What is the limit for number of flows (0=unlimited)?");
173         SYSCTL_ADD_S32(&rs->sysctl_ctx,
174             SYSCTL_CHILDREN(rl_sysctl_root),
175             OID_AUTO, "highest", CTLFLAG_RD,
176             &rs->rs_highest_valid, 0,
177             "Highest valid rate");
178         SYSCTL_ADD_S32(&rs->sysctl_ctx,
179             SYSCTL_CHILDREN(rl_sysctl_root),
180             OID_AUTO, "lowest", CTLFLAG_RD,
181             &rs->rs_lowest_valid, 0,
182             "Lowest valid rate");
183         SYSCTL_ADD_S32(&rs->sysctl_ctx,
184             SYSCTL_CHILDREN(rl_sysctl_root),
185             OID_AUTO, "flags", CTLFLAG_RD,
186             &rs->rs_flags, 0,
187             "What lags are on the entry?");
188         SYSCTL_ADD_S32(&rs->sysctl_ctx,
189             SYSCTL_CHILDREN(rl_sysctl_root),
190             OID_AUTO, "numrates", CTLFLAG_RD,
191             &rs->rs_rate_cnt, 0,
192             "How many rates re there?");
193         SYSCTL_ADD_U64(&rs->sysctl_ctx,
194             SYSCTL_CHILDREN(rl_sysctl_root),
195             OID_AUTO, "flows_using", CTLFLAG_RD,
196             &rs->rs_flows_using, 0,
197             "How many flows are using this interface now?");
198 #ifdef DETAILED_RATELIMIT_SYSCTL
199         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
200                 /* Let's display the rates */
201                 int i;
202                 struct sysctl_oid *rl_rates;
203                 struct sysctl_oid *rl_rate_num;
204                 char rate_num[16];
205                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
206                                             SYSCTL_CHILDREN(rl_sysctl_root),
207                                             OID_AUTO,
208                                             "rate",
209                                             CTLFLAG_RW, 0,
210                                             "Ratelist");
211                 for( i = 0; i < rs->rs_rate_cnt; i++) {
212                         sprintf(rate_num, "%d", i);
213                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
214                                             SYSCTL_CHILDREN(rl_rates),
215                                             OID_AUTO,
216                                             rate_num,
217                                             CTLFLAG_RW, 0,
218                                             "Individual Rate");
219                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
220                                        SYSCTL_CHILDREN(rl_rate_num),
221                                        OID_AUTO, "flags", CTLFLAG_RD,
222                                        &rs->rs_rlt[i].flags, 0,
223                                        "Flags on this rate");
224                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
225                                        SYSCTL_CHILDREN(rl_rate_num),
226                                        OID_AUTO, "pacetime", CTLFLAG_RD,
227                                        &rs->rs_rlt[i].time_between, 0,
228                                        "Time hardware inserts between 1500 byte sends");
229                         SYSCTL_ADD_U64(&rs->sysctl_ctx,
230                                        SYSCTL_CHILDREN(rl_rate_num),
231                                        OID_AUTO, "rate", CTLFLAG_RD,
232                                        &rs->rs_rlt[i].rate, 0,
233                                        "Rate in bytes per second");
234                 }
235         }
236 #endif
237 }
238
239 static void
240 rs_destroy(epoch_context_t ctx)
241 {
242         struct tcp_rate_set *rs;
243
244         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
245         mtx_lock(&rs_mtx);
246         rs->rs_flags &= ~RS_FUNERAL_SCHD;
247         if (rs->rs_flows_using == 0) {
248                 /*
249                  * In theory it is possible (but unlikely)
250                  * that while the delete was occurring
251                  * and we were applying the DEAD flag
252                  * another thread slipped in and found the
253                  * interface in a lookup. Between the point
254                  * where we decided rs_flows_using was 0 and
255                  * scheduled the epoch_call, that other
256                  * thread may have incremented rs_flows_using.
257                  * This is because users hold a pointer and
258                  * we only manipulate rs_flows_using in an
259                  * atomic fashion, i.e. the other entities
260                  * are not otherwise protected. To be sure this
261                  * did not occur, we re-check rs_flows_using
262                  * here before deleting.
263                  */
264                 sysctl_ctx_free(&rs->sysctl_ctx);
265                 free(rs->rs_rlt, M_TCPPACE);
266                 free(rs, M_TCPPACE);
267                 rs_number_dead--;
268         }
269         mtx_unlock(&rs_mtx);
270
271 }
272
273 extern counter_u64_t rate_limit_set_ok;
274 extern counter_u64_t rate_limit_active;
275 extern counter_u64_t rate_limit_alloc_fail;
276
277 static int
278 rl_attach_txrtlmt(struct ifnet *ifp,
279     uint32_t flowtype,
280     int flowid,
281     uint64_t cfg_rate,
282     struct m_snd_tag **tag)
283 {
284         int error;
285         union if_snd_tag_alloc_params params = {
286                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
287                 .rate_limit.hdr.flowid = flowid,
288                 .rate_limit.hdr.flowtype = flowtype,
289                 .rate_limit.max_rate = cfg_rate,
290                 .rate_limit.flags = M_NOWAIT,
291         };
292
293         if (ifp->if_snd_tag_alloc == NULL) {
294                 error = EOPNOTSUPP;
295         } else {
296                 error = ifp->if_snd_tag_alloc(ifp, &params, tag);
297                 if (error == 0) {
298                         if_ref((*tag)->ifp);
299                         counter_u64_add(rate_limit_set_ok, 1);
300                         counter_u64_add(rate_limit_active, 1);
301                 } else
302                         counter_u64_add(rate_limit_alloc_fail, 1);
303         }
304         return (error);
305 }
306
307 static void
308 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
309 {
310         /*
311          * The internal table is "special", it
312          * is two separate ordered tables that
313          * must be merged. We get here when the
314          * adapter specifies a number of rates that
315          * covers both ranges in the table in some
316          * form.
317          */
318         int i, at_low, at_high;
319         uint8_t low_disabled = 0, high_disabled = 0;
320
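        /*
         * Two-way merge: at_low walks the first ordered group
         * (rate_table_act[0 .. RS_NEXT_ORDER_GROUP-1]) and at_high walks
         * the second group (rate_table_act[RS_NEXT_ORDER_GROUP ..
         * MAX_HDWR_RATES-1]); each pass copies in whichever candidate
         * rate is smaller, so rs_rlt[] ends up sorted ascending.
         */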
321         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
322                 rs->rs_rlt[i].flags = 0;
323                 rs->rs_rlt[i].time_between = 0;
324                 if ((low_disabled == 0) &&
325                     (high_disabled ||
326                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
327                         rs->rs_rlt[i].rate = rate_table_act[at_low];
328                         at_low++;
329                         if (at_low == RS_NEXT_ORDER_GROUP)
330                                 low_disabled = 1;
331                 } else if (high_disabled == 0) {
332                         rs->rs_rlt[i].rate = rate_table_act[at_high];
333                         at_high++;
334                         if (at_high == MAX_HDWR_RATES)
335                                 high_disabled = 1;
336                 }
337         }
338 }
339
340 static struct tcp_rate_set *
341 rt_setup_new_rs(struct ifnet *ifp, int *error)
342 {
343         struct tcp_rate_set *rs;
344         const uint64_t *rate_table_act;
345         uint64_t lentim, res;
346         size_t sz;
347         uint32_t hash_type;
348         int i;
349         struct if_ratelimit_query_results rl;
350         struct sysctl_oid *rl_sysctl_root;
351         /*
352          * We expect to enter with the 
353          * mutex locked.
354          */
355
356         if (ifp->if_ratelimit_query == NULL) {
357                 /*
358                  * We can do nothing if we cannot
359                  * get a query back from the driver.
360                  */
361                 return (NULL);
362         }
363         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
364         if (rs == NULL) {
365                 if (error)
366                         *error = ENOMEM;
367                 return (NULL);
368         }
369         rl.flags = RT_NOSUPPORT;
370         ifp->if_ratelimit_query(ifp, &rl);
371         if (rl.flags & RT_IS_UNUSABLE) {
372                 /* 
373                  * The interface does not really support
374                  * rate-limiting.
375                  */
376                 memset(rs, 0, sizeof(struct tcp_rate_set));
377                 rs->rs_ifp = ifp;
378                 rs->rs_if_dunit = ifp->if_dunit;
379                 rs->rs_flags = RS_INTF_NO_SUP;
380                 rs->rs_disable = 1;
381                 rs_number_alive++;
382                 sysctl_ctx_init(&rs->sysctl_ctx);
383                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
384                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
385                     OID_AUTO,
386                     rs->rs_ifp->if_xname,
387                     CTLFLAG_RW, 0,
388                     "");
389                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
390                 /* Unlock to allow the sysctl stuff to allocate */
391                 mtx_unlock(&rs_mtx);
392                 rl_add_syctl_entries(rl_sysctl_root, rs);
393                 /* re-lock for our caller */
394                 mtx_lock(&rs_mtx);
395                 return (rs);
396         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
397                 memset(rs, 0, sizeof(struct tcp_rate_set));
398                 rs->rs_ifp = ifp;
399                 rs->rs_if_dunit = ifp->if_dunit;
400                 rs->rs_flags = RS_IS_DEFF;
401                 rs_number_alive++;
402                 sysctl_ctx_init(&rs->sysctl_ctx);
403                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
404                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
405                     OID_AUTO,
406                     rs->rs_ifp->if_xname,
407                     CTLFLAG_RW, 0,
408                     "");
409                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
410                 /* Unlock to allow the sysctl stuff to allocate */
411                 mtx_unlock(&rs_mtx);
412                 rl_add_syctl_entries(rl_sysctl_root, rs);
413                 /* re-lock for our caller */
414                 mtx_lock(&rs_mtx);
415                 return (rs);
416         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
417                 /* Mellanox most likely */
418                 rs->rs_ifp = ifp;
419                 rs->rs_if_dunit = ifp->if_dunit;
420                 rs->rs_rate_cnt = rl.number_of_rates;
421                 rs->rs_min_seg = rl.min_segment_burst;
422                 rs->rs_highest_valid = 0;
423                 rs->rs_flow_limit = rl.max_flows;
424                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
425                 rs->rs_disable = 0;
426                 rate_table_act = rl.rate_table;
427         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
428                 /* Chelsio */
429                 rs->rs_ifp = ifp;
430                 rs->rs_if_dunit = ifp->if_dunit;
431                 rs->rs_rate_cnt = rl.number_of_rates;
432                 rs->rs_min_seg = rl.min_segment_burst;
433                 rs->rs_disable = 0;
434                 rs->rs_flow_limit = rl.max_flows;
435                 rate_table_act = desired_rates;
436                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
437                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
438                         /*
439                          * Our desired table is not big
440                          * enough, do what we can.
441                          */
442                         rs->rs_rate_cnt = MAX_HDWR_RATES;
443                  }
444                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
445                         rs->rs_flags = RS_IS_INTF;
446                 else
447                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
448                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
449                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
450         } else {
451                 printf("Interface:%s unit:%d not one known to have rate-limits\n",
452                     ifp->if_dname,
453                     ifp->if_dunit);
454                 free(rs, M_TCPPACE);
455                 return (NULL);
456         }
457         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
458         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
459         if (rs->rs_rlt == NULL) {
460                 if (error)
461                         *error = ENOMEM;
462 bail:
463                 free(rs, M_TCPPACE);
464                 return (NULL);
465         }
466         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
467                 /*
468                  * The interface supports all
469                  * the rates we could possibly want.
470                  */
471                 uint64_t rat;
472
473                 rs->rs_rlt[0].rate = 12500;     /* 100k */
474                 rs->rs_rlt[1].rate = 25000;     /* 200k */
475                 rs->rs_rlt[2].rate = 62500;     /* 500k */
476                 /* Note: 125000 bytes/sec == 1 Megabit;
477                  * populate 1Meg - 1000Meg.
478                  */
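                /*
                 * After this loop rs_rlt[i].rate == (i - 2) * 125000 for
                 * i in 3..(ALL_HARDWARE_RATES-2), i.e. index N+2 holds
                 * N Mbps; the lookup code below relies on that when it
                 * does "ind_calc += 2".
                 */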
479                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
480                         rs->rs_rlt[i].rate = rat;
481                         rat += 125000;
482                 }
483                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
484         } else if (rs->rs_flags & RS_INT_TBL) {
485                 /* We populate this in a special way */
486                 populate_canned_table(rs, rate_table_act);
487         } else {
488                 /*
489                  * Just copy in the rates from
490                  * the table, it is in order.
491                  */
492                 for (i=0; i<rs->rs_rate_cnt; i++) {
493                         rs->rs_rlt[i].rate = rate_table_act[i];
494                         rs->rs_rlt[i].time_between = 0;
495                         rs->rs_rlt[i].flags = 0;
496                 }
497         }
498         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
499                 /*
500                  * We go backwards through the list so that if we can't get
501                  * a rate and fail to init one, we have at least a chance of
502                  * getting the highest one.
503                  */
504                 rs->rs_rlt[i].ptbl = rs;
505                 rs->rs_rlt[i].tag = NULL;
506                 /*
507                  * Calculate the time between.
508                  */
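                /*
                 * time_between is in usecs, e.g. at 125000 bytes/sec
                 * (1Mbps) a 1500 byte frame works out to
                 * 1500 * 1000000 / 125000 = 12000 usecs, i.e. roughly
                 * 12ms between 1500 byte sends at that rate.
                 */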
509                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
510                 res = lentim / rs->rs_rlt[i].rate;
511                 if (res > 0)
512                         rs->rs_rlt[i].time_between = res;
513                 else
514                         rs->rs_rlt[i].time_between = 1;
515                 if (rs->rs_flags & RS_NO_PRE) {
516                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
517                         rs->rs_lowest_valid = i;
518                 } else {
519                         int err;
520 #ifdef RSS
521                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
522 #else
523                         hash_type = M_HASHTYPE_OPAQUE_HASH;
524 #endif
525                         err = rl_attach_txrtlmt(ifp,
526                             hash_type,
527                             (i + 1),
528                             rs->rs_rlt[i].rate,
529                             &rs->rs_rlt[i].tag);
530                         if (err) {
531                                 if (i == (rs->rs_rate_cnt - 1)) {
532                                         /*
533                                          * Huh - the first (highest) rate
534                                          * and we can't get it?
535                                          */
536                                         free(rs->rs_rlt, M_TCPPACE);
537                                         if (error)
538                                                 *error = err;
539                                         goto bail;
540                                 } else {
541                                         if (error)
542                                                 *error = err;
543                                 }
544                                 break;
545                         } else {
546                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
547                                 rs->rs_lowest_valid = i;
548                         }
549                 }
550         }
551         /* Did we get at least 1 rate? */
552         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
553                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
554         else {
555                 free(rs->rs_rlt, M_TCPPACE);
556                 goto bail;
557         }
558         rs_number_alive++;
559         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
560         sysctl_ctx_init(&rs->sysctl_ctx);
561         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
562             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
563             OID_AUTO,
564             rs->rs_ifp->if_xname,
565             CTLFLAG_RW, 0,
566             "");
567         /* Unlock to allow the sysctl stuff to allocate */
568         mtx_unlock(&rs_mtx);
569         rl_add_syctl_entries(rl_sysctl_root, rs);
570         /* re-lock for our caller */
571         mtx_lock(&rs_mtx);
572         return (rs);
573 }
574
575 static const struct tcp_hwrate_limit_table *
576 tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
577     uint64_t bytes_per_sec, uint32_t flags)
578 {
579         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
580         uint64_t mbits_per_sec, ind_calc;
581         int i;
582
583         mbits_per_sec = (bytes_per_sec * 8);
584         if (flags & RS_PACING_LT) {
585                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
586                     (rs->rs_lowest_valid <= 2)){
587                         /*
588                          * Smaller than 1Meg, only
589                          * 3 entries can match it.
590                          */
591                         for(i = rs->rs_lowest_valid; i < 3; i++) {
592                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
593                                         rte = &rs->rs_rlt[i];
594                                         break;
595                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
596                                         arte = &rs->rs_rlt[i];
597                                 }
598                         }
599                         goto done;
600                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
601                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
602                         /*
603                          * Larger than 1G (the majority of
604                          * our table).
605                          */
606                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
607                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
608                         else
609                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
610                         goto done;
611                 }
612                 /*
613                  * If we reach here it's in our table (between 1Meg - 1000Meg),
614                  * just take the rounded down mbits per second, and add
615                  * 1Megabit to it, from this we can calculate
616                  * the index in the table.
617                  */
618                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
619                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
620                         ind_calc++;
621                 /* indices 0-2 hold the sub-1Meg rates, so N Mbps sits at index N+2 */
622                 ind_calc += 2;
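                /*
                 * Index math example: a request of 625000 bytes/sec (5Mbps)
                 * gives mbits_per_sec = 5000000, ind_calc = 5, then
                 * 5 + 2 = 7 and rs_rlt[7].rate == 5 * 125000 == 625000.
                 */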
623                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
624                         /* This should not happen */
625                         ind_calc = ALL_HARDWARE_RATES-1;
626                 }
627                 if ((ind_calc >= rs->rs_lowest_valid) &&
628                     (ind_calc <= rs->rs_highest_valid))
629                 rte = &rs->rs_rlt[ind_calc];
630         } else if (flags & RS_PACING_EXACT_MATCH) {
631                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
632                     (rs->rs_lowest_valid <= 2)){
633                         for(i = rs->rs_lowest_valid; i < 3; i++) {
634                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
635                                         rte = &rs->rs_rlt[i];
636                                         break;
637                                 }
638                         }
639                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
640                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
641                         /* > 1Gbps only one rate */
642                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
643                                 /* It's 10G, wow */
644                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
645                         }
646                 } else {
647                         /* Ok it must be an exact meg (it's between 1Meg and 1G) */
648                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
649                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
650                                 /* its an exact Mbps */
651                                 ind_calc += 2;
652                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
653                                         /* This should not happen */
654                                         ind_calc = ALL_HARDWARE_RATES-1;
655                                 }
656                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
657                                         rte = &rs->rs_rlt[ind_calc];
658                         }
659                 }
660         } else {
661                 /* we want greater than the requested rate */
662                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
663                     (rs->rs_lowest_valid <= 2)){
664                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
665                         for (i=2; i>=rs->rs_lowest_valid; i--) {
666                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
667                                         rte = &rs->rs_rlt[i];
668                                         break;
669                                 } else if ((flags & RS_PACING_GEQ) &&
670                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
671                                         rte = &rs->rs_rlt[i];
672                                         break;
673                                 } else {
674                                         arte = &rs->rs_rlt[i]; /* new alternate */
675                                 }
676                         }
677                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
678                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
679                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
680                                 /* Our top rate is larger than the request */
681                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
682                         } else if ((flags & RS_PACING_GEQ) &&
683                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
684                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
685                                 /* It matches our top rate */
686                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
687                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
688                                 /* The top rate is an alternative */
689                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
690                         }
691                 } else {
692                         /* It's in our range 1Meg - 1Gig */
693                         if (flags & RS_PACING_GEQ) {
694                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
695                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
696                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
697                                                 /* This should not happen */
698                                                 ind_calc = (ALL_HARDWARE_RATES-1);
699                                         }
700                                         rte = &rs->rs_rlt[ind_calc];
701                                 }
702                                 goto done;
703                         }
704                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
705                         ind_calc += 2;
706                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
707                                 /* This should not happen */
708                                 ind_calc = ALL_HARDWARE_RATES-1;
709                         }
710                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
711                                 rte = &rs->rs_rlt[ind_calc];
712                 }
713         }
714 done:
715         if ((rte == NULL) &&
716             (arte != NULL) &&
717             (flags & RS_PACING_SUB_OK)) {
718                 /* We can use the substitute */
719                 rte = arte;
720         }
721         return (rte);
722 }
723
724 static const struct tcp_hwrate_limit_table *
725 tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
726 {
727         /**
728          * Hunt the rate table with the restrictions in flags and find a
729          * suitable rate if possible.
730          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
731          * RS_PACING_GT     - must be greater than.
732          * RS_PACING_GEQ    - must be greater than or equal.
733          * RS_PACING_LT     - must be less than.
734          * RS_PACING_SUB_OK - If we don't meet criteria a
735          *                    substitute is ok.
736          */
737         int i, matched;
738         struct tcp_hwrate_limit_table *rte = NULL;
739
740
741         if ((rs->rs_flags & RS_INT_TBL) &&
742             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
743                 /*
744                  * Here we don't want to paw thru
745                  * a big table, we have everything
746                  * from 1Meg - 1000Meg in 1Meg increments.
747                  * Use an alternate method to "lookup".
748                  */
749                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
750         }
751         if ((flags & RS_PACING_LT) ||
752             (flags & RS_PACING_EXACT_MATCH)) {
753                 /*
754                  * For exact and less than we go forward through the table.
755                  * This way when we find one larger we stop (exact was a
756                  * toss up).
757                  */
758                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
759                         if ((flags & RS_PACING_EXACT_MATCH) &&
760                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
761                                 rte = &rs->rs_rlt[i];
762                                 matched = 1;
763                                 break;
764                         } else if ((flags & RS_PACING_LT) &&
765                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
766                                 rte = &rs->rs_rlt[i];
767                                 matched = 1;
768                                 break;
769                         }
770                         if (bytes_per_sec > rs->rs_rlt[i].rate)
771                                 break;
772                 }
773                 if ((matched == 0) &&
774                     (flags & RS_PACING_LT) &&
775                     (flags & RS_PACING_SUB_OK)) {
776                         /* Kick in a substitute (the lowest) */
777                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
778                 }
779         } else {
780                 /*
781                  * Here we go backward through the table so that we can find
782                  * the one greater, in theory faster (but it's probably a
783                  * wash).
784                  */
785                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
786                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
787                                 /* A possible candidate */
788                                 rte = &rs->rs_rlt[i];
789                         }
790                         if ((flags & RS_PACING_GEQ) &&
791                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
792                                 /* An exact match and we want equal */
793                                 matched = 1;
794                                 rte = &rs->rs_rlt[i];
795                                 break;
796                         } else if (rte) {
797                                 /*
798                                  * Found one that is larger, but don't
799                                  * stop; there may be a closer match.
800                                  */
801                                 matched = 1;
802                         }
803                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
804                                 /*
805                                  * We found a table entry that is smaller,
806                                  * stop; there will be none greater or equal.
807                                  */
808                                 break;
809                         }
810                 }
811                 if ((matched == 0) &&
812                     (flags & RS_PACING_SUB_OK)) {
813                         /* Kick in a substitute (the highest) */
814                         rte = &rs->rs_rlt[rs->rs_highest_valid];
815                 }
816         }
817         return (rte);
818 }
819
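/*
 * For an indirect interface (flagged RS_IS_DEFF, typically a vlan or lagg
 * that forwards send-tag allocation to an underlying port), allocate a
 * throw-away tag at COMMON_RATE purely to discover which real ifp the tag
 * lands on, then free the tag and return that ifp.
 */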
820 static struct ifnet *
821 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
822 {
823         struct ifnet *tifp;
824         struct m_snd_tag *tag;
825         union if_snd_tag_alloc_params params = {
826                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
827                 .rate_limit.hdr.flowid = 1,
828                 .rate_limit.max_rate = COMMON_RATE,
829                 .rate_limit.flags = M_NOWAIT,
830         };
831         int err;
832 #ifdef RSS
833         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
834             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
835 #else
836         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
837 #endif
838         tag = NULL;
839         if (ifp->if_snd_tag_alloc == NULL) {
840                 if (error)
841                         *error = ENODEV;
842                 return (NULL);
843         }
844         err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
845         if (err) {
846                 /* Failed to setup a tag? */
847                 if (error)
848                         *error = err;
849                 return (NULL);
850         }
851         tifp = tag->ifp;
852         tifp->if_snd_tag_free(tag);
853         return (tifp);
854 }
855
856 static const struct tcp_hwrate_limit_table *
857 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
858     uint32_t flags, int *error)
859 {
860         /* First lets find the interface if it exists */
861         const struct tcp_hwrate_limit_table *rte;
862         struct tcp_rate_set *rs;
863         struct epoch_tracker et;
864         int err;
865
866         epoch_enter_preempt(net_epoch_preempt, &et);
867 use_real_interface:
868         CK_LIST_FOREACH(rs, &int_rs, next) {
869                 /*
870                  * Note we don't look with the lock since we either see a
871                  * new entry or will get one when we try to add it.
872                  */
873                 if (rs->rs_flags & RS_IS_DEAD) {
874                         /* The dead are not looked at */
875                         continue;
876                 }
877                 if ((rs->rs_ifp == ifp) &&
878                     (rs->rs_if_dunit == ifp->if_dunit)) {
879                         /* Ok we found it */
880                         break;
881                 }
882         }
883         if ((rs == NULL) ||
884             (rs->rs_flags & RS_INTF_NO_SUP) ||
885             (rs->rs_flags & RS_IS_DEAD)) {
886                 /*
887                  * the IF-UP was processed below, or
888                  * while/after we already received an
889                  * interface-departed event. In either case
890                  * we really don't want to do anything
891                  * with pacing; in the departing case the
892                  * packet is not going to go very far. The
893                  * new case might be arguable, but it's
894                  * impossible to tell apart from the
895                  * departing case.
896                  */
897                 if ((rs != NULL) && rs->rs_disable && error)
898                         *error = ENODEV;
899                 epoch_exit_preempt(net_epoch_preempt, &et);
900                 return (NULL);
901         }
902
903         if ((rs == NULL) || (rs->rs_disable != 0)) {
904                 if (rs->rs_disable && error)
905                         *error = ENOSPC;
906                 epoch_exit_preempt(net_epoch_preempt, &et);
907                 return (NULL);
908         }
909         if (rs->rs_flags & RS_IS_DEFF) {
910                 /* We need to find the real interface */
911                 struct ifnet *tifp;
912
913                 tifp = rt_find_real_interface(ifp, inp, error);
914                 if (tifp == NULL) {
915                         if (rs->rs_disable && error)
916                                 *error = ENOTSUP;
917                         epoch_exit_preempt(net_epoch_preempt, &et);
918                         return (NULL);
919                 }
920                 goto use_real_interface;
921         }
922         if (rs->rs_flow_limit &&
923             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
924                 if (error)
925                         *error = ENOSPC;
926                 epoch_exit_preempt(net_epoch_preempt, &et);
927                 return (NULL);
928         }
929         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
930         if (rte) {
931                 err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
932                     inp->inp_flowtype,
933                     inp->inp_flowid,
934                     rte->rate,
935                     &inp->inp_snd_tag);
936                 if (err) {
937                         /* Failed to attach */
938                         if (error)
939                                 *error = err;
940                         rte = NULL;
941                 }
942         }
943         if (rte) {
944                 /*
945                  * We use an atomic here for accounting so we don't have to
946                  * use locks when freeing.
947                  */
948                 atomic_add_64(&rs->rs_flows_using, 1);
949         }
950         epoch_exit_preempt(net_epoch_preempt, &et);
951         return (rte);
952 }
953
954 static void
955 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
956 {
957         int error;
958         struct tcp_rate_set *rs;
959
960         if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
961             (link_state != LINK_STATE_UP)) {
962                 /*
963                  * We only care about an interface going up that is rate-limit
964                  * capable.
965                  */
966                 return;
967         }
968         mtx_lock(&rs_mtx);
969         CK_LIST_FOREACH(rs, &int_rs, next) {
970                 if ((rs->rs_ifp == ifp) &&
971                     (rs->rs_if_dunit == ifp->if_dunit)) {
972                         /* We already have initialized this guy */
973                         mtx_unlock(&rs_mtx);
974                         return;
975                 }
976         }
977         rt_setup_new_rs(ifp, &error);
978         mtx_unlock(&rs_mtx);
979 }
980
981 static void
982 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
983 {
984         struct tcp_rate_set *rs, *nrs;
985         struct ifnet *tifp;
986         int i;
987
988         mtx_lock(&rs_mtx);
989         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
990                 if ((rs->rs_ifp == ifp) &&
991                     (rs->rs_if_dunit == ifp->if_dunit)) {
992                         CK_LIST_REMOVE(rs, next);
993                         rs_number_alive--;
994                         rs_number_dead++;
995                         rs->rs_flags |= RS_IS_DEAD;
996                         for (i = 0; i < rs->rs_rate_cnt; i++) {
997                                 if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
998                                         tifp = rs->rs_rlt[i].tag->ifp;
999                                         in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1000                                         rs->rs_rlt[i].tag = NULL;
1001                                 }
1002                                 rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1003                         }
1004                         if (rs->rs_flows_using == 0) {
1005                                 /*
1006                                  * No references left, so we can schedule the
1007                                  * destruction after the epoch (with a caveat).
1008                                  */
1009                                 rs->rs_flags |= RS_FUNERAL_SCHD;
1010                                 epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1011                         }
1012                         break;
1013                 }
1014         }
1015         mtx_unlock(&rs_mtx);
1016 }
1017
1018 static void
1019 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1020 {
1021         struct tcp_rate_set *rs, *nrs;
1022         struct ifnet *tifp;
1023         int i;
1024
1025         mtx_lock(&rs_mtx);
1026         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1027                 CK_LIST_REMOVE(rs, next);
1028                 rs_number_alive--;
1029                 rs_number_dead++;
1030                 rs->rs_flags |= RS_IS_DEAD;
1031                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1032                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1033                                 tifp = rs->rs_rlt[i].tag->ifp;
1034                                 in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
1035                                 rs->rs_rlt[i].tag = NULL;
1036                         }
1037                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1038                 }
1039                 if (rs->rs_flows_using != 0) {
1040                         /*
1041                          * We don't hold a reference
1042                          * so we have nothing left to
1043                          * do.
1044                          */
1045                 } else {
1046                         /*
1047                          * No references left, so we can destroy it
1048                          * after the epoch.
1049                          */
1050                         rs->rs_flags |= RS_FUNERAL_SCHD;
1051                         epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1052                 }
1053         }
1054         mtx_unlock(&rs_mtx);
1055 }
1056
1057 const struct tcp_hwrate_limit_table *
1058 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1059     uint64_t bytes_per_sec, int flags, int *error)
1060 {
1061         const struct tcp_hwrate_limit_table *rte;
1062
1063         if (tp->t_inpcb->inp_snd_tag == NULL) {
1064                 /*
1065                  * We are setting up a rate for the first time.
1066                  */
1067                 if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
1068                         /* Not supported by the egress */
1069                         if (error)
1070                                 *error = ENODEV;
1071                         return (NULL);
1072                 }
1073 #ifdef KERN_TLS
1074                 if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
1075                         /*
1076                          * We currently can't do both TLS and hardware
1077                          * pacing
1078                          */
1079                         if (error)
1080                                 *error = EINVAL;
1081                         return (NULL);
1082                 }
1083 #endif
1084                 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
1085         } else {
1086                 /*
1087                  * We are modifying a rate, wrong interface?
1088                  */
1089                 if (error)
1090                         *error = EINVAL;
1091                 rte = NULL;
1092         }
1093         return (rte);
1094 }
1095
1096 const struct tcp_hwrate_limit_table *
1097 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1098     struct tcpcb *tp, struct ifnet *ifp,
1099     uint64_t bytes_per_sec, int flags, int *error)
1100 {
1101         const struct tcp_hwrate_limit_table *nrte;
1102         const struct tcp_rate_set *rs;
1103         int is_indirect = 0;
1104         int err;
1105
1106
1107         if ((tp->t_inpcb->inp_snd_tag == NULL) ||
1108             (crte == NULL)) {
1109                 /* Wrong interface */
1110                 if (error)
1111                         *error = EINVAL;
1112                 return (NULL);
1113         }
1114         rs = crte->ptbl;
1115         if ((rs->rs_flags & RS_IS_DEAD) ||
1116             (crte->flags & HDWRPACE_IFPDEPARTED)) {
1117                 /* Release the rate, and try anew */
1118 re_rate:
1119                 tcp_rel_pacing_rate(crte, tp);
1120                 nrte = tcp_set_pacing_rate(tp, ifp,
1121                     bytes_per_sec, flags, error);
1122                 return (nrte);
1123         }
1124         if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
1125                 is_indirect = 1;
1126         else
1127                 is_indirect = 0;
1128         if ((is_indirect == 0) &&
1129             ((ifp != rs->rs_ifp) ||
1130             (ifp->if_dunit != rs->rs_if_dunit))) {
1131                 /*
1132                  * Something changed, the user is not pointing to the same
1133                  * ifp? Maybe a route updated on this guy?
1134                  */
1135                 goto re_rate;
1136         } else if (is_indirect) {
1137                 /*
1138                  * For indirect we have to dig in and find the real interface.
1139                  */
1140                 struct ifnet *rifp;
1141
1142                 rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
1143                 if (rifp == NULL) {
1144                         /* Can't find it? */
1145                         goto re_rate;
1146                 }
1147                 if ((rifp != rs->rs_ifp) ||
1148                     (ifp->if_dunit != rs->rs_if_dunit)) {
1149                         goto re_rate;
1150                 }
1151         }
1152         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
1153         if (nrte == crte) {
1154                 /* No change */
1155                 if (error)
1156                         *error = 0;
1157                 return (crte);
1158         }
1159         if (nrte == NULL) {
1160                 /* Release the old rate */
1161                 tcp_rel_pacing_rate(crte, tp);
1162                 return (NULL);
1163         }
1164         /* Change rates to our new entry */
1165         err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
1166         if (err) {
1167                 if (error)
1168                         *error = err;
1169                 return (NULL);
1170         }
1171         if (error)
1172                 *error = 0;
1173         return (nrte);
1174 }
1175
1176 void
1177 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1178 {
1179         const struct tcp_rate_set *crs;
1180         struct tcp_rate_set *rs;
1181         uint64_t pre;
1182
1183         crs = crte->ptbl;
1184         /*
1185          * Now we must break the const
1186          * in order to release our refcount.
1187          */
1188         rs = __DECONST(struct tcp_rate_set *, crs);
1189         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1190         if (pre == 1) {
1191                 mtx_lock(&rs_mtx);
1192                 /*
1193                  * Is it dead?
1194                  */
1195                 if ((rs->rs_flags & RS_IS_DEAD) &&
1196                     ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
1197                         /*
1198                          * We were the last,
1199                          * and a funeral is not pending, so
1200                          * we must schedule it.
1201                          */
1202                         rs->rs_flags |= RS_FUNERAL_SCHD;
1203                         epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
1204                 }
1205                 mtx_unlock(&rs_mtx);
1206         }
1207         in_pcbdetach_txrtlmt(tp->t_inpcb);
1208 }
1209
1210 static eventhandler_tag rl_ifnet_departs;
1211 static eventhandler_tag rl_ifnet_arrives;
1212 static eventhandler_tag rl_shutdown_start;
1213
1214 static void
1215 tcp_rs_init(void *st __unused)
1216 {
1217         CK_LIST_INIT(&int_rs);
1218         rs_number_alive = 0;
1219         rs_number_dead = 0;
1220         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1221         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1222             tcp_rl_ifnet_departure,
1223             NULL, EVENTHANDLER_PRI_ANY);
1224         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1225             tcp_rl_ifnet_link,
1226             NULL, EVENTHANDLER_PRI_ANY);
1227         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1228             tcp_rl_shutdown, NULL,
1229             SHUTDOWN_PRI_FIRST);
1230         printf("TCP_ratelimit: Is now initialized\n");
1231 }
1232
1233 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
1234 #endif