/*-
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2018-2019
 *      Netflix Inc.
 *      All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/sockbuf_tls.h>
#endif
#include <sys/sysctl.h>
#include <sys/eventhandler.h>
#include <sys/mutex.h>
#include <sys/ck.h>
#define TCPSTATES               /* for logging */
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ratelimit.h>
#ifndef USECS_IN_SECOND
#define USECS_IN_SECOND 1000000
#endif
/*
 * For the purposes of each send, what is the size
 * of an ethernet frame.
 */
#ifndef ETHERNET_SEGMENT_SIZE
#define ETHERNET_SEGMENT_SIZE 1500
#endif
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT

#define COMMON_RATE 180500
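/*
 * The rates below are expressed in bytes per second; for example
 * 125000000 bytes/sec corresponds to 1Gbps.  The first RS_ORDERED_COUNT
 * entries form one ascending sequence and the remaining entries form a
 * second ascending sequence; populate_canned_table() merges the two
 * groups into a single ordered table when both are needed.
 */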
uint64_t desired_rates[] = {
        62500,                  /* 500Kbps */
        180500,                 /* 1.44Mbps */
        375000,                 /* 3Mbps */
        500000,                 /* 4Mbps */
        625000,                 /* 5Mbps */
        750000,                 /* 6Mbps */
        1000000,                /* 8Mbps */
        1250000,                /* 10Mbps */
        2500000,                /* 20Mbps */
        3750000,                /* 30Mbps */
        5000000,                /* 40Mbps */
        6250000,                /* 50Mbps */
        12500000,               /* 100Mbps */
        25000000,               /* 200Mbps */
        50000000,               /* 400Mbps */
        100000000,              /* 800Mbps */
        12500,                  /* 100kbps */
        25000,                  /* 200kbps */
        875000,                 /* 7Mbps */
        1125000,                /* 9Mbps */
        1875000,                /* 15Mbps */
        3125000,                /* 25Mbps */
        8125000,                /* 65Mbps */
        10000000,               /* 80Mbps */
        18750000,               /* 150Mbps */
        20000000,               /* 160Mbps */
        37500000,               /* 300Mbps */
        62500000,               /* 500Mbps */
        78125000,               /* 625Mbps */
        125000000,              /* 1Gbps */
};
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16     /*
                                 * Number of rates that are in order
                                 * at the beginning of the table;
                                 * above this a sort is required.
                                 */
#define RS_NEXT_ORDER_GROUP 16  /*
                                 * The index in our table where the
                                 * second ordered group begins.
                                 */
#define ALL_HARDWARE_RATES 1004 /*
                                 * 1Meg - 1Gig in 1 Meg steps
                                 * plus 100k, 200k and 500k and
                                 * 10Gig
                                 */

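/*
 * Note that the RS_xxx_PERSEC constants below are in bits per second,
 * while the rate table entries are kept in bytes per second; callers
 * convert by multiplying by 8 (see tcp_int_find_suitable_rate()).
 */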
#define RS_ONE_MEGABIT_PERSEC 1000000
#define RS_ONE_GIGABIT_PERSEC 1000000000
#define RS_TEN_GIGABIT_PERSEC 10000000000

static struct head_tcp_rate_set int_rs;
static struct mtx rs_mtx;
uint32_t rs_number_alive;
uint32_t rs_number_dead;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
    "TCP Ratelimit stats");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
    &rs_number_alive, 0,
    "Number of interfaces initialized for ratelimiting");
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
    &rs_number_dead, 0,
    "Number of interfaces departing from ratelimiting");

static void
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
{
        /*
         * Add sysctl entries for this interface.
         */
        if (rs->rs_flags & RS_INTF_NO_SUP) {
                SYSCTL_ADD_S32(&rs->sysctl_ctx,
                   SYSCTL_CHILDREN(rl_sysctl_root),
                   OID_AUTO, "disable", CTLFLAG_RD,
                   &rs->rs_disable, 0,
                   "Disable this interface from new hdwr limiting?");
        } else {
                SYSCTL_ADD_S32(&rs->sysctl_ctx,
                   SYSCTL_CHILDREN(rl_sysctl_root),
                   OID_AUTO, "disable", CTLFLAG_RW,
                   &rs->rs_disable, 0,
                   "Disable this interface from new hdwr limiting?");
        }
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "minseg", CTLFLAG_RW,
            &rs->rs_min_seg, 0,
            "What is the minimum we need to send on this interface?");
        SYSCTL_ADD_U64(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "flow_limit", CTLFLAG_RW,
            &rs->rs_flow_limit, 0,
            "What is the limit for number of flows (0=unlimited)?");
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "highest", CTLFLAG_RD,
            &rs->rs_highest_valid, 0,
            "Highest valid rate");
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "lowest", CTLFLAG_RD,
            &rs->rs_lowest_valid, 0,
            "Lowest valid rate");
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "flags", CTLFLAG_RD,
            &rs->rs_flags, 0,
            "What flags are on the entry?");
        SYSCTL_ADD_S32(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "numrates", CTLFLAG_RD,
            &rs->rs_rate_cnt, 0,
            "How many rates are there?");
        SYSCTL_ADD_U64(&rs->sysctl_ctx,
            SYSCTL_CHILDREN(rl_sysctl_root),
            OID_AUTO, "flows_using", CTLFLAG_RD,
            &rs->rs_flows_using, 0,
            "How many flows are using this interface now?");
#ifdef DETAILED_RATELIMIT_SYSCTL
        if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
                /* Lets display the rates */
                int i;
                struct sysctl_oid *rl_rates;
                struct sysctl_oid *rl_rate_num;
                char rate_num[16];
                rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
                                            SYSCTL_CHILDREN(rl_sysctl_root),
                                            OID_AUTO,
                                            "rate",
                                            CTLFLAG_RW, 0,
                                            "Ratelist");
                for (i = 0; i < rs->rs_rate_cnt; i++) {
                        sprintf(rate_num, "%d", i);
                        rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
                                            SYSCTL_CHILDREN(rl_rates),
                                            OID_AUTO,
                                            rate_num,
                                            CTLFLAG_RW, 0,
                                            "Individual Rate");
                        SYSCTL_ADD_U32(&rs->sysctl_ctx,
                                       SYSCTL_CHILDREN(rl_rate_num),
                                       OID_AUTO, "flags", CTLFLAG_RD,
                                       &rs->rs_rlt[i].flags, 0,
                                       "Flags on this rate");
                        SYSCTL_ADD_U32(&rs->sysctl_ctx,
                                       SYSCTL_CHILDREN(rl_rate_num),
                                       OID_AUTO, "pacetime", CTLFLAG_RD,
                                       &rs->rs_rlt[i].time_between, 0,
                                       "Time hardware inserts between 1500 byte sends");
                        SYSCTL_ADD_U64(&rs->sysctl_ctx,
                                       SYSCTL_CHILDREN(rl_rate_num),
                                       OID_AUTO, "rate", CTLFLAG_RD,
                                       &rs->rs_rlt[i].rate, 0,
                                       "Rate in bytes per second");
                }
        }
#endif
}

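/*
 * Epoch callback that performs the actual free of a rate set once no
 * epoch readers can still be holding a pointer to it.
 */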
static void
rs_destroy(epoch_context_t ctx)
{
        struct tcp_rate_set *rs;

        rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
        mtx_lock(&rs_mtx);
        rs->rs_flags &= ~RS_FUNERAL_SCHD;
        if (rs->rs_flows_using == 0) {
                /*
                 * In theory it's possible (but unlikely)
                 * that while the delete was occurring
                 * and we were applying the DEAD flag
                 * someone slipped in and found the
                 * interface in a lookup. While we
                 * decided rs_flows_using was 0 and
                 * were scheduling the epoch_call, the
                 * other thread incremented rs_flows_using.
                 * This is because users have a pointer and
                 * we only use the rs_flows_using in an
                 * atomic fashion, i.e. the other entities
                 * are not protected. To assure this did
                 * not occur, we check rs_flows_using here
                 * before deleting.
                 */
                sysctl_ctx_free(&rs->sysctl_ctx);
                free(rs->rs_rlt, M_TCPPACE);
                free(rs, M_TCPPACE);
                rs_number_dead--;
        }
        mtx_unlock(&rs_mtx);

}

#ifdef INET
extern counter_u64_t rate_limit_set_ok;
extern counter_u64_t rate_limit_active;
extern counter_u64_t rate_limit_alloc_fail;
#endif

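/*
 * Ask the interface driver to allocate a rate-limit send tag for the
 * given flow at cfg_rate (taken from the rate table, in bytes per
 * second).  On success the tag is returned through *tag.
 */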
static int
rl_attach_txrtlmt(struct ifnet *ifp,
    uint32_t flowtype,
    int flowid,
    uint64_t cfg_rate,
    struct m_snd_tag **tag)
{
        int error;
        union if_snd_tag_alloc_params params = {
                .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
                .rate_limit.hdr.flowid = flowid,
                .rate_limit.hdr.flowtype = flowtype,
                .rate_limit.max_rate = cfg_rate,
                .rate_limit.flags = M_NOWAIT,
        };

        if (ifp->if_snd_tag_alloc == NULL) {
                error = EOPNOTSUPP;
        } else {
                error = ifp->if_snd_tag_alloc(ifp, &params, tag);
#ifdef INET
                if (error == 0) {
                        if_ref((*tag)->ifp);
                        counter_u64_add(rate_limit_set_ok, 1);
                        counter_u64_add(rate_limit_active, 1);
                } else
                        counter_u64_add(rate_limit_alloc_fail, 1);
#endif
        }
        return (error);
}

static void
populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
{
        /*
         * The internal table is "special", it
         * is two separate ordered tables that
         * must be merged. We get here when the
         * adapter specifies a number of rates that
         * covers both ranges in the table in some
         * form.
         */
        int i, at_low, at_high;
        uint8_t low_disabled = 0, high_disabled = 0;

        for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
                rs->rs_rlt[i].flags = 0;
                rs->rs_rlt[i].time_between = 0;
                if ((low_disabled == 0) &&
                    (high_disabled ||
                     (rate_table_act[at_low] < rate_table_act[at_high]))) {
                        rs->rs_rlt[i].rate = rate_table_act[at_low];
                        at_low++;
                        if (at_low == RS_NEXT_ORDER_GROUP)
                                low_disabled = 1;
                } else if (high_disabled == 0) {
                        rs->rs_rlt[i].rate = rate_table_act[at_high];
                        at_high++;
                        if (at_high == MAX_HDWR_RATES)
                                high_disabled = 1;
                }
        }
}

static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
        struct tcp_rate_set *rs;
        const uint64_t *rate_table_act;
        uint64_t lentim, res;
        size_t sz;
        uint32_t hash_type;
        int i;
        struct if_ratelimit_query_results rl;
        struct sysctl_oid *rl_sysctl_root;
        /*
         * We expect to enter with the
         * mutex locked.
         */

        if (ifp->if_ratelimit_query == NULL) {
                /*
                 * We can do nothing if we cannot
                 * get a query back from the driver.
                 */
                return (NULL);
        }
        rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
        if (rs == NULL) {
                if (error)
                        *error = ENOMEM;
                return (NULL);
        }
        rl.flags = RT_NOSUPPORT;
        ifp->if_ratelimit_query(ifp, &rl);
        if (rl.flags & RT_IS_UNUSABLE) {
                /*
                 * The interface does not really support
                 * the rate-limiting.
                 */
                memset(rs, 0, sizeof(struct tcp_rate_set));
                rs->rs_ifp = ifp;
                rs->rs_if_dunit = ifp->if_dunit;
                rs->rs_flags = RS_INTF_NO_SUP;
                rs->rs_disable = 1;
                rs_number_alive++;
                sysctl_ctx_init(&rs->sysctl_ctx);
                rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
                    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
                    OID_AUTO,
                    rs->rs_ifp->if_xname,
                    CTLFLAG_RW, 0,
                    "");
                rl_add_syctl_entries(rl_sysctl_root, rs);
                mtx_lock(&rs_mtx);
                CK_LIST_INSERT_HEAD(&int_rs, rs, next);
                mtx_unlock(&rs_mtx);
                return (rs);
        } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
                memset(rs, 0, sizeof(struct tcp_rate_set));
                rs->rs_ifp = ifp;
                rs->rs_if_dunit = ifp->if_dunit;
                rs->rs_flags = RS_IS_DEFF;
                rs_number_alive++;
                sysctl_ctx_init(&rs->sysctl_ctx);
                rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
                    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
                    OID_AUTO,
                    rs->rs_ifp->if_xname,
                    CTLFLAG_RW, 0,
                    "");
                rl_add_syctl_entries(rl_sysctl_root, rs);
                mtx_lock(&rs_mtx);
                CK_LIST_INSERT_HEAD(&int_rs, rs, next);
                mtx_unlock(&rs_mtx);
                return (rs);
        } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
                /* Mellanox most likely */
                rs->rs_ifp = ifp;
                rs->rs_if_dunit = ifp->if_dunit;
                rs->rs_rate_cnt = rl.number_of_rates;
                rs->rs_min_seg = rl.min_segment_burst;
                rs->rs_highest_valid = 0;
                rs->rs_flow_limit = rl.max_flows;
                rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
                rs->rs_disable = 0;
                rate_table_act = rl.rate_table;
        } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
                /* Chelsio */
                rs->rs_ifp = ifp;
                rs->rs_if_dunit = ifp->if_dunit;
                rs->rs_rate_cnt = rl.number_of_rates;
                rs->rs_min_seg = rl.min_segment_burst;
                rs->rs_disable = 0;
                rs->rs_flow_limit = rl.max_flows;
                rate_table_act = desired_rates;
                if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
                    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
                        /*
                         * Our desired table is not big
                         * enough, do what we can.
                         */
                        rs->rs_rate_cnt = MAX_HDWR_RATES;
                }
                if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
                        rs->rs_flags = RS_IS_INTF;
                else
                        rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
                if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
                        rs->rs_rate_cnt = ALL_HARDWARE_RATES;
        } else {
                printf("Interface:%s unit:%d not one known to have rate-limits\n",
                    ifp->if_dname,
                    ifp->if_dunit);
                free(rs, M_TCPPACE);
                return (NULL);
        }
        sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
        rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
        if (rs->rs_rlt == NULL) {
                if (error)
                        *error = ENOMEM;
bail:
                free(rs, M_TCPPACE);
                return (NULL);
        }
        if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
                /*
                 * The interface supports all
                 * the rates we could possibly want.
                 */
                uint64_t rat;

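                /*
                 * Layout of the full table: index 0 is 100kbps, 1 is
                 * 200kbps, 2 is 500kbps, indexes 3 through 1002 are
                 * 1Mbps - 1Gbps in 1Mbps steps, and the last index
                 * (ALL_HARDWARE_RATES - 1) is 10Gbps.
                 */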
                rs->rs_rlt[0].rate = 12500;     /* 100k */
                rs->rs_rlt[1].rate = 25000;     /* 200k */
                rs->rs_rlt[2].rate = 62500;     /* 500k */
                /*
                 * Note 125000 == 1Megabit.
                 * Populate 1Meg - 1000meg.
                 */
                for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
                        rs->rs_rlt[i].rate = rat;
                        rat += 125000;
                }
                rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
        } else if (rs->rs_flags & RS_INT_TBL) {
                /* We populate this in a special way */
                populate_canned_table(rs, rate_table_act);
        } else {
                /*
                 * Just copy in the rates from
                 * the table, it is in order.
                 */
                for (i = 0; i < rs->rs_rate_cnt; i++) {
                        rs->rs_rlt[i].rate = rate_table_act[i];
                        rs->rs_rlt[i].time_between = 0;
                        rs->rs_rlt[i].flags = 0;
                }
        }
        for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
                /*
                 * We go backwards through the list so that if we can't get
                 * a rate and fail to init one, we have at least a chance of
                 * getting the highest one.
                 */
                rs->rs_rlt[i].ptbl = rs;
                rs->rs_rlt[i].tag = NULL;
                /*
                 * Calculate the time between 1500 byte sends
                 * needed to achieve this rate, in microseconds.
                 */
                lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
                res = lentim / rs->rs_rlt[i].rate;
                if (res > 0)
                        rs->rs_rlt[i].time_between = res;
                else
                        rs->rs_rlt[i].time_between = 1;
                if (rs->rs_flags & RS_NO_PRE) {
                        rs->rs_rlt[i].flags = HDWRPACE_INITED;
                        rs->rs_lowest_valid = i;
                } else {
                        int err;
#ifdef RSS
                        hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
                        hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
                        err = rl_attach_txrtlmt(ifp,
                            hash_type,
                            (i + 1),
                            rs->rs_rlt[i].rate,
                            &rs->rs_rlt[i].tag);
                        if (err) {
                                if (i == (rs->rs_rate_cnt - 1)) {
                                        /*
                                         * Huh - first rate and we can't get
                                         * it?
                                         */
                                        free(rs->rs_rlt, M_TCPPACE);
                                        if (error)
                                                *error = err;
                                        goto bail;
                                } else {
                                        if (error)
                                                *error = err;
                                }
                                break;
                        } else {
                                rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
                                rs->rs_lowest_valid = i;
                        }
                }
        }
        /* Did we get at least 1 rate? */
        if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
                rs->rs_highest_valid = rs->rs_rate_cnt - 1;
        else {
                free(rs->rs_rlt, M_TCPPACE);
                goto bail;
        }
        rs_number_alive++;
        sysctl_ctx_init(&rs->sysctl_ctx);
        rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
            SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
            OID_AUTO,
            rs->rs_ifp->if_xname,
            CTLFLAG_RW, 0,
            "");
        rl_add_syctl_entries(rl_sysctl_root, rs);
        mtx_lock(&rs_mtx);
        CK_LIST_INSERT_HEAD(&int_rs, rs, next);
        mtx_unlock(&rs_mtx);
        return (rs);
}

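/*
 * Lookup helper used when the rate set holds the full ALL_HARDWARE_RATES
 * table: instead of scanning, the requested rate is converted into a
 * table index directly (whole megabits plus an offset of 2, since the
 * first three slots hold the sub-1Mbps rates).
 */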
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags)
{
        struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
        uint64_t mbits_per_sec, ind_calc;
        int i;

        mbits_per_sec = (bytes_per_sec * 8);
        if (flags & RS_PACING_LT) {
                if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
                    (rs->rs_lowest_valid <= 2)){
                        /*
                         * Smaller than 1Meg, only
                         * 3 entries can match it.
                         */
                        for (i = rs->rs_lowest_valid; i < 3; i++) {
                                if (bytes_per_sec <= rs->rs_rlt[i].rate) {
                                        rte = &rs->rs_rlt[i];
                                        break;
                                } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
                                        arte = &rs->rs_rlt[i];
                                }
                        }
                        goto done;
                } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
                           (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
                        /*
                         * Larger than 1G (the majority of
                         * our table).
                         */
                        if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
                                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        else
                                arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        goto done;
                }
                /*
                 * If we reach here it's in our table (between 1Meg - 1000Meg),
                 * just take the rounded down mbits per second, and add
                 * 1Megabit to it, from this we can calculate
                 * the index in the table.
                 */
                ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
                if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
                        ind_calc++;
                /* our table is offset by 3, we add 2 */
                ind_calc += 2;
                if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                        /* This should not happen */
                        ind_calc = ALL_HARDWARE_RATES-1;
                }
                if ((ind_calc >= rs->rs_lowest_valid) &&
                    (ind_calc <= rs->rs_highest_valid))
                        rte = &rs->rs_rlt[ind_calc];
        } else if (flags & RS_PACING_EXACT_MATCH) {
                if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
                    (rs->rs_lowest_valid <= 2)){
                        for (i = rs->rs_lowest_valid; i < 3; i++) {
                                if (bytes_per_sec == rs->rs_rlt[i].rate) {
                                        rte = &rs->rs_rlt[i];
                                        break;
                                }
                        }
                } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
                           (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
                        /* > 1Gbps only one rate */
                        if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
                                /* It's 10G, wow */
                                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        }
                } else {
                        /* Ok it must be an exact meg (it's between 1Meg and 1G) */
                        ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
                        if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
                                /* it's an exact Mbps */
                                ind_calc += 2;
                                if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                                        /* This should not happen */
                                        ind_calc = ALL_HARDWARE_RATES-1;
                                }
                                if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
                                        rte = &rs->rs_rlt[ind_calc];
                        }
                }
        } else {
                /* we want greater than the requested rate */
                if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
                    (rs->rs_lowest_valid <= 2)){
                        arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
                        for (i = 2; i >= rs->rs_lowest_valid; i--) {
                                if (bytes_per_sec < rs->rs_rlt[i].rate) {
                                        rte = &rs->rs_rlt[i];
                                        break;
                                } else if ((flags & RS_PACING_GEQ) &&
                                           (bytes_per_sec == rs->rs_rlt[i].rate)) {
                                        rte = &rs->rs_rlt[i];
                                        break;
                                } else {
                                        arte = &rs->rs_rlt[i]; /* new alternate */
                                }
                        }
                } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
                        if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
                                /* Our top rate is larger than the request */
                                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        } else if ((flags & RS_PACING_GEQ) &&
                                   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
                                   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
                                /* It matches our top rate */
                                rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
                                /* The top rate is an alternative */
                                arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
                        }
                } else {
                        /* It's in our range 1Meg - 1Gig */
                        if (flags & RS_PACING_GEQ) {
                                ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
                                if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
                                        /* our table is offset by 3, we add 2 */
                                        ind_calc += 2;
                                        if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                                                /* This should not happen */
                                                ind_calc = (ALL_HARDWARE_RATES-1);
                                        }
                                        rte = &rs->rs_rlt[ind_calc];
                                }
                                goto done;
                        }
                        ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
                        ind_calc += 2;
                        if (ind_calc > (ALL_HARDWARE_RATES-1)) {
                                /* This should not happen */
                                ind_calc = ALL_HARDWARE_RATES-1;
                        }
                        if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
                                rte = &rs->rs_rlt[ind_calc];
                }
        }
done:
        if ((rte == NULL) &&
            (arte != NULL) &&
            (flags & RS_PACING_SUB_OK)) {
                /* We can use the substitute */
                rte = arte;
        }
        return (rte);
}

static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
{
        /**
         * Hunt the rate table with the restrictions in flags and find a
         * suitable rate if possible.
         * RS_PACING_EXACT_MATCH - look for an exact match to rate.
         * RS_PACING_GT     - must be greater than.
         * RS_PACING_GEQ    - must be greater than or equal.
         * RS_PACING_LT     - must be less than.
         * RS_PACING_SUB_OK - If we don't meet criteria a
         *                    substitute is ok.
         */
        int i, matched;
        struct tcp_hwrate_limit_table *rte = NULL;

        if ((rs->rs_flags & RS_INT_TBL) &&
            (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
                /*
                 * Here we don't want to paw through
                 * a big table, we have everything
                 * from 1Meg - 1000Meg in 1Meg increments.
                 * Use an alternate method to "lookup".
                 */
                return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
        }
        if ((flags & RS_PACING_LT) ||
            (flags & RS_PACING_EXACT_MATCH)) {
                /*
                 * For exact and less than we go forward through the table.
                 * This way when we find one larger we stop (exact was a
                 * toss up).
                 */
                for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
                        if ((flags & RS_PACING_EXACT_MATCH) &&
                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
                                rte = &rs->rs_rlt[i];
                                matched = 1;
                                break;
                        } else if ((flags & RS_PACING_LT) &&
                            (bytes_per_sec <= rs->rs_rlt[i].rate)) {
                                rte = &rs->rs_rlt[i];
                                matched = 1;
                                break;
                        }
                        if (bytes_per_sec > rs->rs_rlt[i].rate)
                                break;
                }
                if ((matched == 0) &&
                    (flags & RS_PACING_LT) &&
                    (flags & RS_PACING_SUB_OK)) {
                        /* Kick in a substitute (the lowest) */
                        rte = &rs->rs_rlt[rs->rs_lowest_valid];
                }
        } else {
                /*
                 * Here we go backward through the table so that we can find
                 * the one greater in theory faster (but it's probably a
                 * wash).
                 */
                for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
                        if (rs->rs_rlt[i].rate > bytes_per_sec) {
                                /* A possible candidate */
                                rte = &rs->rs_rlt[i];
                        }
                        if ((flags & RS_PACING_GEQ) &&
                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
                                /* An exact match and we want equal */
                                matched = 1;
                                rte = &rs->rs_rlt[i];
                                break;
                        } else if (rte) {
                                /*
                                 * Found one that is larger, but don't
                                 * stop; there may be a closer match.
                                 */
                                matched = 1;
                        }
                        if (rs->rs_rlt[i].rate < bytes_per_sec) {
                                /*
                                 * We found a table entry that is smaller;
                                 * stop, there will be none greater or equal.
                                 */
                                break;
                        }
                }
                if ((matched == 0) &&
                    (flags & RS_PACING_SUB_OK)) {
                        /* Kick in a substitute (the highest) */
                        rte = &rs->rs_rlt[rs->rs_highest_valid];
                }
        }
        return (rte);
}

static struct ifnet *
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
{
        struct ifnet *tifp;
        struct m_snd_tag *tag;
        union if_snd_tag_alloc_params params = {
                .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
                .rate_limit.hdr.flowid = 1,
                .rate_limit.max_rate = COMMON_RATE,
                .rate_limit.flags = M_NOWAIT,
        };
        int err;
#ifdef RSS
        params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
            M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
#else
        params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
#endif
        tag = NULL;
        if (ifp->if_snd_tag_alloc == NULL) {
                if (error)
                        *error = ENODEV;
                return (NULL);
        }
        err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
        if (err) {
                /* Failed to setup a tag? */
                if (error)
                        *error = err;
                return (NULL);
        }
        tifp = tag->ifp;
        tifp->if_snd_tag_free(tag);
        return (tifp);
}

static const struct tcp_hwrate_limit_table *
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
    uint32_t flags, int *error)
{
        /* First lets find the interface if it exists */
        const struct tcp_hwrate_limit_table *rte;
        struct tcp_rate_set *rs;
        struct epoch_tracker et;
        int err;

        epoch_enter_preempt(net_epoch_preempt, &et);
use_real_interface:
        CK_LIST_FOREACH(rs, &int_rs, next) {
                /*
                 * Note we don't look with the lock since we either see a
                 * new entry or will get one when we try to add it.
                 */
                if (rs->rs_flags & RS_IS_DEAD) {
                        /* The dead are not looked at */
                        continue;
                }
                if ((rs->rs_ifp == ifp) &&
                    (rs->rs_if_dunit == ifp->if_dunit)) {
                        /* Ok we found it */
                        break;
                }
        }
        if ((rs == NULL) ||
            (rs->rs_flags & RS_INTF_NO_SUP) ||
            (rs->rs_flags & RS_IS_DEAD)) {
                /*
                 * This means we got a packet *before*
                 * the IF-UP was processed below, <or>
                 * while or after we already received an interface
                 * departed event. In either case we really don't
                 * want to do anything with pacing, in
                 * the departing case the packet is not
                 * going to go very far. The new case
                 * might be arguable, but it's impossible
                 * to tell from the departing case.
                 */
                if (error)
                        *error = ENODEV;
                epoch_exit_preempt(net_epoch_preempt, &et);
                return (NULL);
        }

        if (rs->rs_disable != 0) {
                if (error)
                        *error = ENOSPC;
                epoch_exit_preempt(net_epoch_preempt, &et);
                return (NULL);
        }
        if (rs->rs_flags & RS_IS_DEFF) {
                /* We need to find the real interface */
                struct ifnet *tifp;

                tifp = rt_find_real_interface(ifp, inp, error);
                if (tifp == NULL) {
                        if (rs->rs_disable && error)
                                *error = ENOTSUP;
                        epoch_exit_preempt(net_epoch_preempt, &et);
                        return (NULL);
                }
                goto use_real_interface;
        }
        if (rs->rs_flow_limit &&
            ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
                if (error)
                        *error = ENOSPC;
                epoch_exit_preempt(net_epoch_preempt, &et);
                return (NULL);
        }
        rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
        if (rte) {
                err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
                    inp->inp_flowtype,
                    inp->inp_flowid,
                    rte->rate,
                    &inp->inp_snd_tag);
                if (err) {
                        /* Failed to attach */
                        if (error)
                                *error = err;
                        rte = NULL;
                }
        }
        if (rte) {
                /*
                 * We use an atomic here for accounting so we don't have to
                 * use locks when freeing.
                 */
                atomic_add_64(&rs->rs_flows_using, 1);
        }
        epoch_exit_preempt(net_epoch_preempt, &et);
        return (rte);
}

static void
tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
{
        int error;
        struct tcp_rate_set *rs;

        if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
            (link_state != LINK_STATE_UP)) {
                /*
                 * We only care on an interface going up that is rate-limit
                 * capable.
                 */
                return;
        }
        mtx_lock(&rs_mtx);
        CK_LIST_FOREACH(rs, &int_rs, next) {
                if ((rs->rs_ifp == ifp) &&
                    (rs->rs_if_dunit == ifp->if_dunit)) {
                        /* We already have initialized this guy */
                        mtx_unlock(&rs_mtx);
                        return;
                }
        }
        mtx_unlock(&rs_mtx);
        rt_setup_new_rs(ifp, &error);
}

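/*
 * Interface departure handler: detach every send tag we allocated on
 * the interface, mark the rate set dead, and (if no flows still hold
 * a reference) schedule its destruction after the current epoch.
 */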
static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
        struct tcp_rate_set *rs, *nrs;
        struct ifnet *tifp;
        int i;

        mtx_lock(&rs_mtx);
        CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
                if ((rs->rs_ifp == ifp) &&
                    (rs->rs_if_dunit == ifp->if_dunit)) {
                        CK_LIST_REMOVE(rs, next);
                        rs_number_alive--;
                        rs_number_dead++;
                        rs->rs_flags |= RS_IS_DEAD;
                        for (i = 0; i < rs->rs_rate_cnt; i++) {
                                if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
                                        tifp = rs->rs_rlt[i].tag->ifp;
                                        in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
                                        rs->rs_rlt[i].tag = NULL;
                                }
                                rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
                        }
                        if (rs->rs_flows_using == 0) {
                                /*
                                 * No references left, so we can schedule the
                                 * destruction after the epoch (with a caveat).
                                 */
                                rs->rs_flags |= RS_FUNERAL_SCHD;
                                epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
                        }
                        break;
                }
        }
        mtx_unlock(&rs_mtx);
}

static void
tcp_rl_shutdown(void *arg __unused, int howto __unused)
{
        struct tcp_rate_set *rs, *nrs;
        struct ifnet *tifp;
        int i;

        mtx_lock(&rs_mtx);
        CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
                CK_LIST_REMOVE(rs, next);
                rs_number_alive--;
                rs_number_dead++;
                rs->rs_flags |= RS_IS_DEAD;
                for (i = 0; i < rs->rs_rate_cnt; i++) {
                        if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
                                tifp = rs->rs_rlt[i].tag->ifp;
                                in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
                                rs->rs_rlt[i].tag = NULL;
                        }
                        rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
                }
                if (rs->rs_flows_using != 0) {
                        /*
                         * We don't hold a reference
                         * so we have nothing left to
                         * do.
                         */
                } else {
                        /*
                         * No references left, so we can destroy it
                         * after the epoch.
                         */
                        rs->rs_flags |= RS_FUNERAL_SCHD;
                        epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
                }
        }
        mtx_unlock(&rs_mtx);
}

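/*
 * Set up a hardware pacing rate for this connection on the given
 * interface for the first time.  tcp_chg_pacing_rate() and
 * tcp_rel_pacing_rate() below change and release a rate obtained here.
 */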
const struct tcp_hwrate_limit_table *
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error)
{
        const struct tcp_hwrate_limit_table *rte;

        if (tp->t_inpcb->inp_snd_tag == NULL) {
                /*
                 * We are setting up a rate for the first time.
                 */
                if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
                        /* Not supported by the egress */
                        if (error)
                                *error = ENODEV;
                        return (NULL);
                }
#ifdef KERN_TLS
                if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
                        /*
                         * We currently can't do both TLS and hardware
                         * pacing
                         */
                        if (error)
                                *error = EINVAL;
                        return (NULL);
                }
#endif
                rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
        } else {
                /*
                 * We are modifying a rate, wrong interface?
                 */
                if (error)
                        *error = EINVAL;
                rte = NULL;
        }
        return (rte);
}

const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error)
{
        const struct tcp_hwrate_limit_table *nrte;
        const struct tcp_rate_set *rs;
        int is_indirect = 0;
        int err;

        if ((tp->t_inpcb->inp_snd_tag == NULL) ||
            (crte == NULL)) {
                /* Wrong interface */
                if (error)
                        *error = EINVAL;
                return (NULL);
        }
        rs = crte->ptbl;
        if ((rs->rs_flags & RS_IS_DEAD) ||
            (crte->flags & HDWRPACE_IFPDEPARTED)) {
                /* Release the rate, and try anew */
re_rate:
                tcp_rel_pacing_rate(crte, tp);
                nrte = tcp_set_pacing_rate(tp, ifp,
                    bytes_per_sec, flags, error);
                return (nrte);
        }
        if ((rs->rs_flags & RT_IS_INDIRECT) == RT_IS_INDIRECT)
                is_indirect = 1;
        else
                is_indirect = 0;
        if ((is_indirect == 0) &&
            ((ifp != rs->rs_ifp) ||
            (ifp->if_dunit != rs->rs_if_dunit))) {
                /*
                 * Something changed, the user is not pointing to the same
                 * ifp? Maybe a route updated on this guy?
                 */
                goto re_rate;
        } else if (is_indirect) {
                /*
                 * For indirect we have to dig in and find the real interface.
                 */
                struct ifnet *rifp;

                rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
                if (rifp == NULL) {
                        /* Can't find it? */
                        goto re_rate;
                }
                if ((rifp != rs->rs_ifp) ||
                    (ifp->if_dunit != rs->rs_if_dunit)) {
                        goto re_rate;
                }
        }
        nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
        if (nrte == crte) {
                /* No change */
                if (error)
                        *error = 0;
                return (crte);
        }
        if (nrte == NULL) {
                /* Release the old rate */
                tcp_rel_pacing_rate(crte, tp);
                return (NULL);
        }
        /* Change rates to our new entry */
        err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
        if (err) {
                if (error)
                        *error = err;
                return (NULL);
        }
        if (error)
                *error = 0;
        return (nrte);
}

void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
        const struct tcp_rate_set *crs;
        struct tcp_rate_set *rs;
        uint64_t pre;

        crs = crte->ptbl;
        /*
         * Now we must break the const
         * in order to release our refcount.
         */
        rs = __DECONST(struct tcp_rate_set *, crs);
        pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
        if (pre == 1) {
                mtx_lock(&rs_mtx);
                /*
                 * Is it dead?
                 */
                if ((rs->rs_flags & RS_IS_DEAD) &&
                    ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
                        /*
                         * We were the last,
                         * and a funeral is not pending, so
                         * we must schedule it.
                         */
                        rs->rs_flags |= RS_FUNERAL_SCHD;
                        epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
                }
                mtx_unlock(&rs_mtx);
        }
        in_pcbdetach_txrtlmt(tp->t_inpcb);
}

static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;

static void
tcp_rs_init(void *st __unused)
{
        CK_LIST_INIT(&int_rs);
        rs_number_alive = 0;
        rs_number_dead = 0;
        mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
        rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
            tcp_rl_ifnet_departure,
            NULL, EVENTHANDLER_PRI_ANY);
        rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
            tcp_rl_ifnet_link,
            NULL, EVENTHANDLER_PRI_ANY);
        rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
            tcp_rl_shutdown, NULL,
            SHUTDOWN_PRI_FIRST);
        printf("TCP_ratelimit: Is now initialized\n");
}

SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
#endif