sys/netinet/tcp_ratelimit.c

   1 /*-
   2  *
   3  * SPDX-License-Identifier: BSD-3-Clause
   4  *
   5  * Copyright (c) 2018-2020
   6  *      Netflix Inc.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  */
  30 /**
  31  * Author: Randall Stewart <rrs@netflix.com>
  32  */
  33
  34 #include <sys/cdefs.h>
  35 #include "opt_inet.h"
  36 #include "opt_inet6.h"
  37 #include "opt_ipsec.h"
  38 #include "opt_tcpdebug.h"
  39 #include "opt_ratelimit.h"
  40 #include <sys/param.h>
  41 #include <sys/kernel.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mbuf.h>
  44 #include <sys/socket.h>
  45 #include <sys/socketvar.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/eventhandler.h>
  48 #include <sys/mutex.h>
  49 #include <sys/ck.h>
  50 #include <net/if.h>
  51 #include <net/if_var.h>
  52 #include <netinet/in.h>
  53 #include <netinet/in_pcb.h>
  54 #define TCPSTATES               /* for logging */
  55 #include <netinet/tcp_var.h>
  56 #ifdef INET6
  57 #include <netinet6/tcp6_var.h>
  58 #endif
  59 #include <netinet/tcp_hpts.h>
  60 #include <netinet/tcp_log_buf.h>
  61 #include <netinet/tcp_ratelimit.h>
  62 #ifndef USECS_IN_SECOND
  63 #define USECS_IN_SECOND 1000000
  64 #endif
  65 /*
  66  * For the purposes of each send, what is the size
  67  * of an ethernet frame.
  68  */
  69 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
  70 #ifdef RATELIMIT
  71
  72 /*
  73  * The following preferred table will seem weird to
  74  * the casual viewer. Why do we not have any rates below
  75  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
  76  * Why do the rates cluster in the 1-100Mbps range more
  77  * than others? Why does the table jump around at the beginnign
  78  * and then be more consistently raising?
  79  *
  80  * Let me try to answer those questions. A lot of
  81  * this is dependant on the hardware. We have three basic
  82  * supporters of rate limiting
  83  *
  84  * Chelsio - Supporting 16 configurable rates.
  85  * Mlx  - c4 supporting 13 fixed rates.
  86  * Mlx  - c5 & c6 supporting 127 configurable rates.
  87  *
  88  * The c4 is why we have a common rate that is available
  89  * in all rate tables. This is a selected rate from the
  90  * c4 table and we assure its available in all ratelimit
  91  * tables. This way the tcp_ratelimit code has an assured
  92  * rate it should always be able to get. This answers a
  93  * couple of the questions above.
  94  *
  95  * So what about the rest, well the table is built to
  96  * try to get the most out of a joint hardware/software
  97  * pacing system.  The software pacer will always pick
  98  * a rate higher than the b/w that it is estimating
  99  *
 100  * on the path. This is done for two reasons.
 101  * a) So we can discover more b/w
 102  * and
 103  * b) So we can send a block of MSS's down and then
 104  *    have the software timer go off after the previous
 105  *    send is completely out of the hardware.
 106  *
 107  * But when we do <b> we don't want to have the delay
 108  * between the last packet sent by the hardware be
 109  * excessively long (to reach our desired rate).
 110  *
 111  * So let me give an example for clarity.
 112  *
 113  * Lets assume that the tcp stack sees that 29,110,000 bps is
 114  * what the bw of the path is. The stack would select the
 115  * rate 31Mbps. 31Mbps means that each send that is done
 116  * by the hardware will cause a 390 micro-second gap between
 117  * the packets sent at that rate. For 29,110,000 bps we
 118  * would need 416 micro-seconds gap between each send.
 119  *
 120  * Note that are calculating a complete time for pacing
 121  * which includes the ethernet, IP and TCP overhead. So
 122  * a full 1514 bytes is used for the above calculations.
 123  * My testing has shown that both cards are also using this
 124  * as their basis i.e. full payload size of the ethernet frame.
 125  * The TCP stack caller needs to be aware of this and make the
 126  * appropriate overhead calculations be included in its choices.
 127  *
 128  * Now, continuing our example, we pick a MSS size based on the
 129  * delta between the two rates (416 - 390) divided into the rate
 130  * we really wish to send at rounded up.  That results in a MSS
 131  * send of 17 mss's at once. The hardware then will
 132  * run out of data in a single 17MSS send in 6,630 micro-seconds.
 133  *
 134  * On the other hand the software pacer will send more data
 135  * in 7,072 micro-seconds. This means that we will refill
 136  * the hardware 52 microseconds after it would have sent
 137  * next if it had not ran out of data. This is a win since we are
 138  * only sending every 7ms or so and yet all the packets are spaced on
 139  * the wire with 94% of what they should be and only
 140  * the last packet is delayed extra to make up for the
 141  * difference.
 142  *
 143  * Note that the above formula has two important caveat.
 144  * If we are above (b/w wise) over 100Mbps we double the result
 145  * of the MSS calculation. The second caveat is if we are 500Mbps
 146  * or more we just send the maximum MSS at once i.e. 45MSS. At
 147  * the higher b/w's even the cards have limits to what times (timer granularity)
 148  * they can insert between packets and start to send more than one
 149  * packet at a time on the wire.
 150  *
 151  */
 152 #define COMMON_RATE 180500
 153 const uint64_t desired_rates[] = {
 154         122500,                 /* 1Mbps  - rate 1 */
 155         180500,                 /* 1.44Mpbs - rate 2  common rate */
 156         375000,                 /* 3Mbps    - rate 3 */
 157         625000,                 /* 5Mbps    - rate 4 */
 158         1250000,                /* 10Mbps   - rate 5 */
 159         1875000,                /* 15Mbps   - rate 6 */
 160         2500000,                /* 20Mbps   - rate 7 */
 161         3125000,                /* 25Mbps   - rate 8 */
 162         3750000,                /* 30Mbps   - rate 9 */
 163         4375000,                /* 35Mbps   - rate 10 */
 164         5000000,                /* 40Meg    - rate 11 */
 165         6250000,                /* 50Mbps   - rate 12 */
 166         12500000,               /* 100Mbps  - rate 13 */
 167         25000000,               /* 200Mbps  - rate 14 */
 168         50000000,               /* 400Mbps  - rate 15 */
 169         100000000,              /* 800Mbps  - rate 16 */
 170         5625000,                /* 45Mbps   - rate 17 */
 171         6875000,                /* 55Mbps   - rate 19 */
 172         7500000,                /* 60Mbps   - rate 20 */
 173         8125000,                /* 65Mbps   - rate 21 */
 174         8750000,                /* 70Mbps   - rate 22 */
 175         9375000,                /* 75Mbps   - rate 23 */
 176         10000000,               /* 80Mbps   - rate 24 */
 177         10625000,               /* 85Mbps   - rate 25 */
 178         11250000,               /* 90Mbps   - rate 26 */
 179         11875000,               /* 95Mbps   - rate 27 */
 180         12500000,               /* 100Mbps  - rate 28 */
 181         13750000,               /* 110Mbps  - rate 29 */
 182         15000000,               /* 120Mbps  - rate 30 */
 183         16250000,               /* 130Mbps  - rate 31 */
 184         17500000,               /* 140Mbps  - rate 32 */
 185         18750000,               /* 150Mbps  - rate 33 */
 186         20000000,               /* 160Mbps  - rate 34 */
 187         21250000,               /* 170Mbps  - rate 35 */
 188         22500000,               /* 180Mbps  - rate 36 */
 189         23750000,               /* 190Mbps  - rate 37 */
 190         26250000,               /* 210Mbps  - rate 38 */
 191         27500000,               /* 220Mbps  - rate 39 */
 192         28750000,               /* 230Mbps  - rate 40 */
 193         30000000,               /* 240Mbps  - rate 41 */
 194         31250000,               /* 250Mbps  - rate 42 */
 195         34375000,               /* 275Mbps  - rate 43 */
 196         37500000,               /* 300Mbps  - rate 44 */
 197         40625000,               /* 325Mbps  - rate 45 */
 198         43750000,               /* 350Mbps  - rate 46 */
 199         46875000,               /* 375Mbps  - rate 47 */
 200         53125000,               /* 425Mbps  - rate 48 */
 201         56250000,               /* 450Mbps  - rate 49 */
 202         59375000,               /* 475Mbps  - rate 50 */
 203         62500000,               /* 500Mbps  - rate 51 */
 204         68750000,               /* 550Mbps  - rate 52 */
 205         75000000,               /* 600Mbps  - rate 53 */
 206         81250000,               /* 650Mbps  - rate 54 */
 207         87500000,               /* 700Mbps  - rate 55 */
 208         93750000,               /* 750Mbps  - rate 56 */
 209         106250000,              /* 850Mbps  - rate 57 */
 210         112500000,              /* 900Mbps  - rate 58 */
 211         125000000,              /* 1Gbps    - rate 59 */
 212         156250000,              /* 1.25Gps  - rate 60 */
 213         187500000,              /* 1.5Gps   - rate 61 */
 214         218750000,              /* 1.75Gps  - rate 62 */
 215         250000000,              /* 2Gbps    - rate 63 */
 216         281250000,              /* 2.25Gps  - rate 64 */
 217         312500000,              /* 2.5Gbps  - rate 65 */
 218         343750000,              /* 2.75Gbps - rate 66 */
 219         375000000,              /* 3Gbps    - rate 67 */
 220         500000000,              /* 4Gbps    - rate 68 */
 221         625000000,              /* 5Gbps    - rate 69 */
 222         750000000,              /* 6Gbps    - rate 70 */
 223         875000000,              /* 7Gbps    - rate 71 */
 224         1000000000,             /* 8Gbps    - rate 72 */
 225         1125000000,             /* 9Gbps    - rate 73 */
 226         1250000000,             /* 10Gbps   - rate 74 */
 227         1875000000,             /* 15Gbps   - rate 75 */
 228         2500000000              /* 20Gbps   - rate 76 */
 229 };
 230
 231 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
 232 #define RS_ORDERED_COUNT 16     /*
 233                                  * Number that are in order
 234                                  * at the beginning of the table,
 235                                  * over this a sort is required.
 236                                  */
 237 #define RS_NEXT_ORDER_GROUP 16  /*
 238                                  * The point in our table where
 239                                  * we come fill in a second ordered
 240                                  * group (index wise means -1).
 241                                  */
 242 #define ALL_HARDWARE_RATES 1004 /*
 243                                  * 1Meg - 1Gig in 1 Meg steps
 244                                  * plus 100, 200k  and 500k and
 245                                  * 10Gig
 246                                  */
 247
 248 #define RS_ONE_MEGABIT_PERSEC 1000000
 249 #define RS_ONE_GIGABIT_PERSEC 1000000000
 250 #define RS_TEN_GIGABIT_PERSEC 10000000000
 251
 252 static struct head_tcp_rate_set int_rs;
 253 static struct mtx rs_mtx;
 254 uint32_t rs_number_alive;
 255 uint32_t rs_number_dead;
 256 static uint32_t rs_floor_mss = 0;
 257 static uint32_t wait_time_floor = 8000; /* 8 ms */
 258 static uint32_t rs_hw_floor_mss = 16;
 259 static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */
 260
 261 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 262     "TCP Ratelimit stats");
 263 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
 264     &rs_number_alive, 0,
 265     "Number of interfaces initialized for ratelimiting");
 266 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
 267     &rs_number_dead, 0,
 268     "Number of interfaces departing from ratelimiting");
 269 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
 270     &rs_floor_mss, 0,
 271     "Number of MSS that will override the normal minimums (0 means don't enforce)");
 272 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
 273     &wait_time_floor, 2000,
 274     "Has b/w increases what is the wait floor we are willing to wait at the end?");
 275 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
 276     &num_of_waits_allowed, 1,
 277     "How many time blocks on the end should software pacing be willing to wait?");
 278
 279 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
 280     &rs_hw_floor_mss, 16,
 281     "Number of mss that are a minum for hardware pacing?");
 282
 283
 284 static void
 285 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
 286 {
 287         /*
 288          * Add sysctl entries for thus interface.
 289          */
 290         if (rs->rs_flags & RS_INTF_NO_SUP) {
 291                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
 292                    SYSCTL_CHILDREN(rl_sysctl_root),
 293                    OID_AUTO, "disable", CTLFLAG_RD,
 294                    &rs->rs_disable, 0,
 295                    "Disable this interface from new hdwr limiting?");
 296         } else {
 297                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
 298                    SYSCTL_CHILDREN(rl_sysctl_root),
 299                    OID_AUTO, "disable", CTLFLAG_RW,
 300                    &rs->rs_disable, 0,
 301                    "Disable this interface from new hdwr limiting?");
 302         }
 303         SYSCTL_ADD_S32(&rs->sysctl_ctx,
 304             SYSCTL_CHILDREN(rl_sysctl_root),
 305             OID_AUTO, "minseg", CTLFLAG_RW,
 306             &rs->rs_min_seg, 0,
 307             "What is the minimum we need to send on this interface?");
 308         SYSCTL_ADD_U64(&rs->sysctl_ctx,
 309             SYSCTL_CHILDREN(rl_sysctl_root),
 310             OID_AUTO, "flow_limit", CTLFLAG_RW,
 311             &rs->rs_flow_limit, 0,
 312             "What is the limit for number of flows (0=unlimited)?");
 313         SYSCTL_ADD_S32(&rs->sysctl_ctx,
 314             SYSCTL_CHILDREN(rl_sysctl_root),
 315             OID_AUTO, "highest", CTLFLAG_RD,
 316             &rs->rs_highest_valid, 0,
 317             "Highest valid rate");
 318         SYSCTL_ADD_S32(&rs->sysctl_ctx,
 319             SYSCTL_CHILDREN(rl_sysctl_root),
 320             OID_AUTO, "lowest", CTLFLAG_RD,
 321             &rs->rs_lowest_valid, 0,
 322             "Lowest valid rate");
 323         SYSCTL_ADD_S32(&rs->sysctl_ctx,
 324             SYSCTL_CHILDREN(rl_sysctl_root),
 325             OID_AUTO, "flags", CTLFLAG_RD,
 326             &rs->rs_flags, 0,
 327             "What lags are on the entry?");
 328         SYSCTL_ADD_S32(&rs->sysctl_ctx,
 329             SYSCTL_CHILDREN(rl_sysctl_root),
 330             OID_AUTO, "numrates", CTLFLAG_RD,
 331             &rs->rs_rate_cnt, 0,
 332             "How many rates re there?");
 333         SYSCTL_ADD_U64(&rs->sysctl_ctx,
 334             SYSCTL_CHILDREN(rl_sysctl_root),
 335             OID_AUTO, "flows_using", CTLFLAG_RD,
 336             &rs->rs_flows_using, 0,
 337             "How many flows are using this interface now?");
 338 #ifdef DETAILED_RATELIMIT_SYSCTL
 339         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
 340                 /*  Lets display the rates */
 341                 int i;
 342                 struct sysctl_oid *rl_rates;
 343                 struct sysctl_oid *rl_rate_num;
 344                 char rate_num[16];
 345                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 346                                             SYSCTL_CHILDREN(rl_sysctl_root),
 347                                             OID_AUTO,
 348                                             "rate",
 349                                             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 350                                             "Ratelist");
 351                 for( i = 0; i < rs->rs_rate_cnt; i++) {
 352                         sprintf(rate_num, "%d", i);
 353                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 354                                             SYSCTL_CHILDREN(rl_rates),
 355                                             OID_AUTO,
 356                                             rate_num,
 357                                             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 358                                             "Individual Rate");
 359                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
 360                                        SYSCTL_CHILDREN(rl_rate_num),
 361                                        OID_AUTO, "flags", CTLFLAG_RD,
 362                                        &rs->rs_rlt[i].flags, 0,
 363                                        "Flags on this rate");
 364                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
 365                                        SYSCTL_CHILDREN(rl_rate_num),
 366                                        OID_AUTO, "pacetime", CTLFLAG_RD,
 367                                        &rs->rs_rlt[i].time_between, 0,
 368                                        "Time hardware inserts between 1500 byte sends");
 369                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 370                                        SYSCTL_CHILDREN(rl_rate_num),
 371                                        OID_AUTO, "rate", CTLFLAG_RD,
 372                                        &rs->rs_rlt[i].rate,
 373                                        "Rate in bytes per second");
 374                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 375                                        SYSCTL_CHILDREN(rl_rate_num),
 376                                        OID_AUTO, "using", CTLFLAG_RD,
 377                                        &rs->rs_rlt[i].using,
 378                                        "Number of flows using");
 379                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 380                                        SYSCTL_CHILDREN(rl_rate_num),
 381                                        OID_AUTO, "enobufs", CTLFLAG_RD,
 382                                        &rs->rs_rlt[i].rs_num_enobufs,
 383                                        "Number of enobufs logged on this rate");
 384
 385                 }
 386         }
 387 #endif
 388 }
 389
 390 static void
 391 rs_destroy(epoch_context_t ctx)
 392 {
 393         struct tcp_rate_set *rs;
 394         bool do_free_rs;
 395
 396         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
 397
 398         mtx_lock(&rs_mtx);
 399         rs->rs_flags &= ~RS_FUNERAL_SCHD;
 400         /*
 401          * In theory its possible (but unlikely)
 402          * that while the delete was occuring
 403          * and we were applying the DEAD flag
 404          * someone slipped in and found the
 405          * interface in a lookup. While we
 406          * decided rs_flows_using were 0 and
 407          * scheduling the epoch_call, the other
 408          * thread incremented rs_flow_using. This
 409          * is because users have a pointer and
 410          * we only use the rs_flows_using in an
 411          * atomic fashion, i.e. the other entities
 412          * are not protected. To assure this did
 413          * not occur, we check rs_flows_using here
 414          * before deleting.
 415          */
 416         do_free_rs = (rs->rs_flows_using == 0);
 417         rs_number_dead--;
 418         mtx_unlock(&rs_mtx);
 419
 420         if (do_free_rs) {
 421                 sysctl_ctx_free(&rs->sysctl_ctx);
 422                 free(rs->rs_rlt, M_TCPPACE);
 423                 free(rs, M_TCPPACE);
 424         }
 425 }
 426
 427 static void
 428 rs_defer_destroy(struct tcp_rate_set *rs)
 429 {
 430
 431         mtx_assert(&rs_mtx, MA_OWNED);
 432
 433         /* Check if already pending. */
 434         if (rs->rs_flags & RS_FUNERAL_SCHD)
 435                 return;
 436
 437         rs_number_dead++;
 438
 439         /* Set flag to only defer once. */
 440         rs->rs_flags |= RS_FUNERAL_SCHD;
 441         NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
 442 }
 443
 444 #ifdef INET
 445 extern counter_u64_t rate_limit_new;
 446 extern counter_u64_t rate_limit_chg;
 447 extern counter_u64_t rate_limit_set_ok;
 448 extern counter_u64_t rate_limit_active;
 449 extern counter_u64_t rate_limit_alloc_fail;
 450 #endif
 451
 452 static int
 453 rl_attach_txrtlmt(struct ifnet *ifp,
 454     uint32_t flowtype,
 455     int flowid,
 456     uint64_t cfg_rate,
 457     struct m_snd_tag **tag)
 458 {
 459         int error;
 460         union if_snd_tag_alloc_params params = {
 461                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 462                 .rate_limit.hdr.flowid = flowid,
 463                 .rate_limit.hdr.flowtype = flowtype,
 464                 .rate_limit.max_rate = cfg_rate,
 465                 .rate_limit.flags = M_NOWAIT,
 466         };
 467
 468         error = m_snd_tag_alloc(ifp, &params, tag);
 469 #ifdef INET
 470         if (error == 0) {
 471                 counter_u64_add(rate_limit_set_ok, 1);
 472                 counter_u64_add(rate_limit_active, 1);
 473         } else if (error != EOPNOTSUPP)
 474                 counter_u64_add(rate_limit_alloc_fail, 1);
 475 #endif
 476         return (error);
 477 }
 478
 479 static void
 480 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
 481 {
 482         /*
 483          * The internal table is "special", it
 484          * is two seperate ordered tables that
 485          * must be merged. We get here when the
 486          * adapter specifies a number of rates that
 487          * covers both ranges in the table in some
 488          * form.
 489          */
 490         int i, at_low, at_high;
 491         uint8_t low_disabled = 0, high_disabled = 0;
 492
 493         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
 494                 rs->rs_rlt[i].flags = 0;
 495                 rs->rs_rlt[i].time_between = 0;
 496                 if ((low_disabled == 0) &&
 497                     (high_disabled ||
 498                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
 499                         rs->rs_rlt[i].rate = rate_table_act[at_low];
 500                         at_low++;
 501                         if (at_low == RS_NEXT_ORDER_GROUP)
 502                                 low_disabled = 1;
 503                 } else if (high_disabled == 0) {
 504                         rs->rs_rlt[i].rate = rate_table_act[at_high];
 505                         at_high++;
 506                         if (at_high == MAX_HDWR_RATES)
 507                                 high_disabled = 1;
 508                 }
 509         }
 510 }
 511
 512 static struct tcp_rate_set *
 513 rt_setup_new_rs(struct ifnet *ifp, int *error)
 514 {
 515         struct tcp_rate_set *rs;
 516         const uint64_t *rate_table_act;
 517         uint64_t lentim, res;
 518         size_t sz;
 519         uint32_t hash_type;
 520         int i;
 521         struct if_ratelimit_query_results rl;
 522         struct sysctl_oid *rl_sysctl_root;
 523         struct epoch_tracker et;
 524         /*
 525          * We expect to enter with the
 526          * mutex locked.
 527          */
 528
 529         if (ifp->if_ratelimit_query == NULL) {
 530                 /*
 531                  * We can do nothing if we cannot
 532                  * get a query back from the driver.
 533                  */
 534                 printf("Warning:No query functions for %s:%d-- failed\n",
 535                        ifp->if_dname, ifp->if_dunit);
 536                 return (NULL);
 537         }
 538         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
 539         if (rs == NULL) {
 540                 if (error)
 541                         *error = ENOMEM;
 542                 printf("Warning:No memory for malloc of tcp_rate_set\n");
 543                 return (NULL);
 544         }
 545         memset(&rl, 0, sizeof(rl));
 546         rl.flags = RT_NOSUPPORT;
 547         ifp->if_ratelimit_query(ifp, &rl);
 548         if (rl.flags & RT_IS_UNUSABLE) {
 549                 /*
 550                  * The interface does not really support
 551                  * the rate-limiting.
 552                  */
 553                 memset(rs, 0, sizeof(struct tcp_rate_set));
 554                 rs->rs_ifp = ifp;
 555                 rs->rs_if_dunit = ifp->if_dunit;
 556                 rs->rs_flags = RS_INTF_NO_SUP;
 557                 rs->rs_disable = 1;
 558                 rs_number_alive++;
 559                 sysctl_ctx_init(&rs->sysctl_ctx);
 560                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 561                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 562                     OID_AUTO,
 563                     rs->rs_ifp->if_xname,
 564                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 565                     "");
 566                 rl_add_syctl_entries(rl_sysctl_root, rs);
 567                 NET_EPOCH_ENTER(et);
 568                 mtx_lock(&rs_mtx);
 569                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 570                 mtx_unlock(&rs_mtx);
 571                 NET_EPOCH_EXIT(et);
 572                 return (rs);
 573         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
 574                 memset(rs, 0, sizeof(struct tcp_rate_set));
 575                 rs->rs_ifp = ifp;
 576                 rs->rs_if_dunit = ifp->if_dunit;
 577                 rs->rs_flags = RS_IS_DEFF;
 578                 rs_number_alive++;
 579                 sysctl_ctx_init(&rs->sysctl_ctx);
 580                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 581                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 582                     OID_AUTO,
 583                     rs->rs_ifp->if_xname,
 584                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 585                     "");
 586                 rl_add_syctl_entries(rl_sysctl_root, rs);
 587                 NET_EPOCH_ENTER(et);
 588                 mtx_lock(&rs_mtx);
 589                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 590                 mtx_unlock(&rs_mtx);
 591                 NET_EPOCH_EXIT(et);
 592                 return (rs);
 593         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
 594                 /* Mellanox C4 likely */
 595                 rs->rs_ifp = ifp;
 596                 rs->rs_if_dunit = ifp->if_dunit;
 597                 rs->rs_rate_cnt = rl.number_of_rates;
 598                 rs->rs_min_seg = rl.min_segment_burst;
 599                 rs->rs_highest_valid = 0;
 600                 rs->rs_flow_limit = rl.max_flows;
 601                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
 602                 rs->rs_disable = 0;
 603                 rate_table_act = rl.rate_table;
 604         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
 605                 /* Chelsio, C5 and C6 of Mellanox? */
 606                 rs->rs_ifp = ifp;
 607                 rs->rs_if_dunit = ifp->if_dunit;
 608                 rs->rs_rate_cnt = rl.number_of_rates;
 609                 rs->rs_min_seg = rl.min_segment_burst;
 610                 rs->rs_disable = 0;
 611                 rs->rs_flow_limit = rl.max_flows;
 612                 rate_table_act = desired_rates;
 613                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
 614                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
 615                         /*
 616                          * Our desired table is not big
 617                          * enough, do what we can.
 618                          */
 619                         rs->rs_rate_cnt = MAX_HDWR_RATES;
 620                  }
 621                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
 622                         rs->rs_flags = RS_IS_INTF;
 623                 else
 624                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
 625                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
 626                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
 627         } else {
 628                 free(rs, M_TCPPACE);
 629                 return (NULL);
 630         }
 631         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
 632         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
 633         if (rs->rs_rlt == NULL) {
 634                 if (error)
 635                         *error = ENOMEM;
 636 bail:
 637                 free(rs, M_TCPPACE);
 638                 return (NULL);
 639         }
 640         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
 641                 /*
 642                  * The interface supports all
 643                  * the rates we could possibly want.
 644                  */
 645                 uint64_t rat;
 646
 647                 rs->rs_rlt[0].rate = 12500;     /* 100k */
 648                 rs->rs_rlt[1].rate = 25000;     /* 200k */
 649                 rs->rs_rlt[2].rate = 62500;     /* 500k */
 650                 /* Note 125000 == 1Megabit
 651                  * populate 1Meg - 1000meg.
 652                  */
 653                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
 654                         rs->rs_rlt[i].rate = rat;
 655                         rat += 125000;
 656                 }
 657                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
 658         } else if (rs->rs_flags & RS_INT_TBL) {
 659                 /* We populate this in a special way */
 660                 populate_canned_table(rs, rate_table_act);
 661         } else {
 662                 /*
 663                  * Just copy in the rates from
 664                  * the table, it is in order.
 665                  */
 666                 for (i=0; i<rs->rs_rate_cnt; i++) {
 667                         rs->rs_rlt[i].rate = rate_table_act[i];
 668                         rs->rs_rlt[i].time_between = 0;
 669                         rs->rs_rlt[i].flags = 0;
 670                 }
 671         }
 672         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
 673                 /*
 674                  * We go backwards through the list so that if we can't get
 675                  * a rate and fail to init one, we have at least a chance of
 676                  * getting the highest one.
 677                  */
 678                 rs->rs_rlt[i].ptbl = rs;
 679                 rs->rs_rlt[i].tag = NULL;
 680                 rs->rs_rlt[i].using = 0;
 681                 rs->rs_rlt[i].rs_num_enobufs = 0;
 682                 /*
 683                  * Calculate the time between.
 684                  */
 685                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 686                 res = lentim / rs->rs_rlt[i].rate;
 687                 if (res > 0)
 688                         rs->rs_rlt[i].time_between = res;
 689                 else
 690                         rs->rs_rlt[i].time_between = 1;
 691                 if (rs->rs_flags & RS_NO_PRE) {
 692                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
 693                         rs->rs_lowest_valid = i;
 694                 } else {
 695                         int err;
 696
 697                         if ((rl.flags & RT_IS_SETUP_REQ)  &&
 698                             (ifp->if_ratelimit_query)) {
 699                                 err = ifp->if_ratelimit_setup(ifp,
 700                                          rs->rs_rlt[i].rate, i);
 701                                 if (err)
 702                                         goto handle_err;
 703                         }
 704 #ifdef RSS
 705                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 706 #else
 707                         hash_type = M_HASHTYPE_OPAQUE_HASH;
 708 #endif
 709                         err = rl_attach_txrtlmt(ifp,
 710                             hash_type,
 711                             (i + 1),
 712                             rs->rs_rlt[i].rate,
 713                             &rs->rs_rlt[i].tag);
 714                         if (err) {
 715 handle_err:
 716                                 if (i == (rs->rs_rate_cnt - 1)) {
 717                                         /*
 718                                          * Huh - first rate and we can't get
 719                                          * it?
 720                                          */
 721                                         free(rs->rs_rlt, M_TCPPACE);
 722                                         if (error)
 723                                                 *error = err;
 724                                         goto bail;
 725                                 } else {
 726                                         if (error)
 727                                                 *error = err;
 728                                 }
 729                                 break;
 730                         } else {
 731                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
 732                                 rs->rs_lowest_valid = i;
 733                         }
 734                 }
 735         }
 736         /* Did we get at least 1 rate? */
 737         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
 738                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
 739         else {
 740                 free(rs->rs_rlt, M_TCPPACE);
 741                 goto bail;
 742         }
 743         rs_number_alive++;
 744         sysctl_ctx_init(&rs->sysctl_ctx);
 745         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 746             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 747             OID_AUTO,
 748             rs->rs_ifp->if_xname,
 749             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 750             "");
 751         rl_add_syctl_entries(rl_sysctl_root, rs);
 752         NET_EPOCH_ENTER(et);
 753         mtx_lock(&rs_mtx);
 754         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 755         mtx_unlock(&rs_mtx);
 756         NET_EPOCH_EXIT(et);
 757         return (rs);
 758 }
 759
 760 /*
 761  * For an explanation of why the argument is volatile please
 762  * look at the comments around rt_setup_rate().
 763  */
 764 static const struct tcp_hwrate_limit_table *
 765 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
 766     uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 767 {
 768         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
 769         uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
 770         int i;
 771
 772         mbits_per_sec = (bytes_per_sec * 8);
 773         if (flags & RS_PACING_LT) {
 774                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 775                     (rs->rs_lowest_valid <= 2)){
 776                         /*
 777                          * Smaller than 1Meg, only
 778                          * 3 entries can match it.
 779                          */
 780                         previous_rate = 0;
 781                         for(i = rs->rs_lowest_valid; i < 3; i++) {
 782                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
 783                                         rte = &rs->rs_rlt[i];
 784                                         break;
 785                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
 786                                         arte = &rs->rs_rlt[i];
 787                                 }
 788                                 previous_rate = rs->rs_rlt[i].rate;
 789                         }
 790                         goto done;
 791                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 792                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 793                         /*
 794                          * Larger than 1G (the majority of
 795                          * our table.
 796                          */
 797                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
 798                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 799                         else
 800                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 801                         previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 802                         goto done;
 803                 }
 804                 /*
 805                  * If we reach here its in our table (between 1Meg - 1000Meg),
 806                  * just take the rounded down mbits per second, and add
 807                  * 1Megabit to it, from this we can calculate
 808                  * the index in the table.
 809                  */
 810                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 811                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
 812                         ind_calc++;
 813                 /* our table is offset by 3, we add 2 */
 814                 ind_calc += 2;
 815                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 816                         /* This should not happen */
 817                         ind_calc = ALL_HARDWARE_RATES-1;
 818                 }
 819                 if ((ind_calc >= rs->rs_lowest_valid) &&
 820                     (ind_calc <= rs->rs_highest_valid)) {
 821                         rte = &rs->rs_rlt[ind_calc];
 822                         if (ind_calc >= 1)
 823                                 previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 824                 }
 825         } else if (flags & RS_PACING_EXACT_MATCH) {
 826                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 827                     (rs->rs_lowest_valid <= 2)){
 828                         for(i = rs->rs_lowest_valid; i < 3; i++) {
 829                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
 830                                         rte = &rs->rs_rlt[i];
 831                                         break;
 832                                 }
 833                         }
 834                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 835                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 836                         /* > 1Gbps only one rate */
 837                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
 838                                 /* Its 10G wow */
 839                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 840                         }
 841                 } else {
 842                         /* Ok it must be a exact meg (its between 1G and 1Meg) */
 843                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 844                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 845                                 /* its an exact Mbps */
 846                                 ind_calc += 2;
 847                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 848                                         /* This should not happen */
 849                                         ind_calc = ALL_HARDWARE_RATES-1;
 850                                 }
 851                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
 852                                         rte = &rs->rs_rlt[ind_calc];
 853                         }
 854                 }
 855         } else {
 856                 /* we want greater than the requested rate */
 857                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 858                     (rs->rs_lowest_valid <= 2)){
 859                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
 860                         for (i=2; i>=rs->rs_lowest_valid; i--) {
 861                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
 862                                         rte = &rs->rs_rlt[i];
 863                                         if (i >= 1) {
 864                                                 previous_rate = rs->rs_rlt[(i-1)].rate;
 865                                         }
 866                                         break;
 867                                 } else if ((flags & RS_PACING_GEQ) &&
 868                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
 869                                         rte = &rs->rs_rlt[i];
 870                                         if (i >= 1) {
 871                                                 previous_rate = rs->rs_rlt[(i-1)].rate;
 872                                         }
 873                                         break;
 874                                 } else {
 875                                         arte = &rs->rs_rlt[i]; /* new alternate */
 876                                 }
 877                         }
 878                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
 879                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 880                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 881                                 /* Our top rate is larger than the request */
 882                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 883                         } else if ((flags & RS_PACING_GEQ) &&
 884                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 885                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 886                                 /* It matches our top rate */
 887                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 888                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
 889                                 /* The top rate is an alternative */
 890                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 891                         }
 892                         previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 893                 } else {
 894                         /* Its in our range 1Meg - 1Gig */
 895                         if (flags & RS_PACING_GEQ) {
 896                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 897                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 898                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 899                                                 /* This should not happen */
 900                                                 ind_calc = (ALL_HARDWARE_RATES-1);
 901                                         }
 902                                         rte = &rs->rs_rlt[ind_calc];
 903                                         if (ind_calc >= 1)
 904                                                 previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 905                                 }
 906                                 goto done;
 907                         }
 908                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
 909                         ind_calc += 2;
 910                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 911                                 /* This should not happen */
 912                                 ind_calc = ALL_HARDWARE_RATES-1;
 913                         }
 914                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
 915                                 rte = &rs->rs_rlt[ind_calc];
 916                                 if (ind_calc >= 1)
 917                                         previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 918                         }
 919                 }
 920         }
 921 done:
 922         if ((rte == NULL) &&
 923             (arte != NULL) &&
 924             (flags & RS_PACING_SUB_OK)) {
 925                 /* We can use the substitute */
 926                 rte = arte;
 927         }
 928         if (lower_rate)
 929                 *lower_rate = previous_rate;
 930         return (rte);
 931 }
 932
 933 /*
 934  * For an explanation of why the argument is volatile please
 935  * look at the comments around rt_setup_rate().
 936  */
 937 static const struct tcp_hwrate_limit_table *
 938 tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 939 {
 940         /**
 941          * Hunt the rate table with the restrictions in flags and find a
 942          * suitable rate if possible.
 943          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
 944          * RS_PACING_GT     - must be greater than.
 945          * RS_PACING_GEQ    - must be greater than or equal.
 946          * RS_PACING_LT     - must be less than.
 947          * RS_PACING_SUB_OK - If we don't meet criteria a
 948          *                    substitute is ok.
 949          */
 950         int i, matched;
 951         struct tcp_hwrate_limit_table *rte = NULL;
 952         uint64_t previous_rate = 0;
 953
 954         if ((rs->rs_flags & RS_INT_TBL) &&
 955             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
 956                 /*
 957                  * Here we don't want to paw thru
 958                  * a big table, we have everything
 959                  * from 1Meg - 1000Meg in 1Meg increments.
 960                  * Use an alternate method to "lookup".
 961                  */
 962                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
 963         }
 964         if ((flags & RS_PACING_LT) ||
 965             (flags & RS_PACING_EXACT_MATCH)) {
 966                 /*
 967                  * For exact and less than we go forward through the table.
 968                  * This way when we find one larger we stop (exact was a
 969                  * toss up).
 970                  */
 971                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
 972                         if ((flags & RS_PACING_EXACT_MATCH) &&
 973                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
 974                                 rte = &rs->rs_rlt[i];
 975                                 matched = 1;
 976                                 if (lower_rate != NULL)
 977                                         *lower_rate = previous_rate;
 978                                 break;
 979                         } else if ((flags & RS_PACING_LT) &&
 980                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
 981                                 rte = &rs->rs_rlt[i];
 982                                 matched = 1;
 983                                 if (lower_rate != NULL)
 984                                         *lower_rate = previous_rate;
 985                                 break;
 986                         }
 987                         previous_rate = rs->rs_rlt[i].rate;
 988                         if (bytes_per_sec > rs->rs_rlt[i].rate)
 989                                 break;
 990                 }
 991                 if ((matched == 0) &&
 992                     (flags & RS_PACING_LT) &&
 993                     (flags & RS_PACING_SUB_OK)) {
 994                         /* Kick in a substitute (the lowest) */
 995                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
 996                 }
 997         } else {
 998                 /*
 999                  * Here we go backward through the table so that we can find
1000                  * the one greater in theory faster (but its probably a
1001                  * wash).
1002                  */
1003                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
1004                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
1005                                 /* A possible candidate */
1006                                 rte = &rs->rs_rlt[i];
1007                         }
1008                         if ((flags & RS_PACING_GEQ) &&
1009                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
1010                                 /* An exact match and we want equal */
1011                                 matched = 1;
1012                                 rte = &rs->rs_rlt[i];
1013                                 break;
1014                         } else if (rte) {
1015                                 /*
1016                                  * Found one that is larger than but don't
1017                                  * stop, there may be a more closer match.
1018                                  */
1019                                 matched = 1;
1020                         }
1021                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
1022                                 /*
1023                                  * We found a table entry that is smaller,
1024                                  * stop there will be none greater or equal.
1025                                  */
1026                                 if (lower_rate != NULL)
1027                                         *lower_rate = rs->rs_rlt[i].rate;
1028                                 break;
1029                         }
1030                 }
1031                 if ((matched == 0) &&
1032                     (flags & RS_PACING_SUB_OK)) {
1033                         /* Kick in a substitute (the highest) */
1034                         rte = &rs->rs_rlt[rs->rs_highest_valid];
1035                 }
1036         }
1037         return (rte);
1038 }
1039
1040 static struct ifnet *
1041 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
1042 {
1043         struct ifnet *tifp;
1044         struct m_snd_tag *tag, *ntag;
1045         union if_snd_tag_alloc_params params = {
1046                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
1047                 .rate_limit.hdr.flowid = inp->inp_flowid,
1048                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
1049                 .rate_limit.max_rate = COMMON_RATE,
1050                 .rate_limit.flags = M_NOWAIT,
1051         };
1052         int err;
1053 #ifdef RSS
1054         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
1055             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
1056 #else
1057         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
1058 #endif
1059         err = m_snd_tag_alloc(ifp, &params, &tag);
1060         if (err) {
1061                 /* Failed to setup a tag? */
1062                 if (error)
1063                         *error = err;
1064                 return (NULL);
1065         }
1066         ntag = tag;
1067         while(ntag->ifp->if_next_snd_tag != NULL) {
1068                 ntag = ntag->ifp->if_next_snd_tag(ntag);
1069         }
1070         tifp = ntag->ifp;
1071         m_snd_tag_rele(tag);
1072         return (tifp);
1073 }
1074
1075 static void
1076 rl_increment_using(const struct tcp_hwrate_limit_table *rte)
1077 {
1078         struct tcp_hwrate_limit_table *decon_rte;
1079
1080         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
1081         atomic_add_long(&decon_rte->using, 1);
1082 }
1083
1084 static void
1085 rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
1086 {
1087         struct tcp_hwrate_limit_table *decon_rte;
1088
1089         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
1090         atomic_subtract_long(&decon_rte->using, 1);
1091 }
1092
1093 void
1094 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
1095 {
1096         struct tcp_hwrate_limit_table *decon_rte;
1097
1098         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
1099         atomic_add_long(&decon_rte->rs_num_enobufs, 1);
1100 }
1101
1102 /*
1103  * Do NOT take the __noinline out of the
1104  * find_rs_for_ifp() function. If you do the inline
1105  * of it for the rt_setup_rate() will show you a
1106  * compiler bug. For some reason the compiler thinks
1107  * the list can never be empty. The consequence of
1108  * this will be a crash when we dereference NULL
1109  * if an ifp is removed just has a hw rate limit
1110  * is attempted. If you are working on the compiler
1111  * and want to "test" this go ahead and take the noinline
1112  * out otherwise let sleeping dogs ly until such time
1113  * as we get a compiler fix 10/2/20 -- RRS
1114  */
1115 static __noinline struct tcp_rate_set *
1116 find_rs_for_ifp(struct ifnet *ifp)
1117 {
1118         struct tcp_rate_set *rs;
1119
1120         CK_LIST_FOREACH(rs, &int_rs, next) {
1121                 if ((rs->rs_ifp == ifp) &&
1122                     (rs->rs_if_dunit == ifp->if_dunit)) {
1123                         /* Ok we found it */
1124                         return (rs);
1125                 }
1126         }
1127         return (NULL);
1128 }
1129
1130
1131 static const struct tcp_hwrate_limit_table *
1132 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
1133     uint32_t flags, int *error, uint64_t *lower_rate)
1134 {
1135         /* First lets find the interface if it exists */
1136         const struct tcp_hwrate_limit_table *rte;
1137         /*
1138          * So why is rs volatile? This is to defeat a
1139          * compiler bug where in the compiler is convinced
1140          * that rs can never be NULL (which is not true). Because
1141          * of its conviction it nicely optimizes out the if ((rs == NULL
1142          * below which means if you get a NULL back you dereference it.
1143          */
1144         volatile struct tcp_rate_set *rs;
1145         struct epoch_tracker et;
1146         struct ifnet *oifp = ifp;
1147         int err;
1148
1149         NET_EPOCH_ENTER(et);
1150 use_real_interface:
1151         rs = find_rs_for_ifp(ifp);
1152         if ((rs == NULL) ||
1153             (rs->rs_flags & RS_INTF_NO_SUP) ||
1154             (rs->rs_flags & RS_IS_DEAD)) {
1155                 /*
1156                  * This means we got a packet *before*
1157                  * the IF-UP was processed below, <or>
1158                  * while or after we already received an interface
1159                  * departed event. In either case we really don't
1160                  * want to do anything with pacing, in
1161                  * the departing case the packet is not
1162                  * going to go very far. The new case
1163                  * might be arguable, but its impossible
1164                  * to tell from the departing case.
1165                  */
1166                 if (error)
1167                         *error = ENODEV;
1168                 NET_EPOCH_EXIT(et);
1169                 return (NULL);
1170         }
1171
1172         if ((rs == NULL) || (rs->rs_disable != 0)) {
1173                 if (error)
1174                         *error = ENOSPC;
1175                 NET_EPOCH_EXIT(et);
1176                 return (NULL);
1177         }
1178         if (rs->rs_flags & RS_IS_DEFF) {
1179                 /* We need to find the real interface */
1180                 struct ifnet *tifp;
1181
1182                 tifp = rt_find_real_interface(ifp, inp, error);
1183                 if (tifp == NULL) {
1184                         if (rs->rs_disable && error)
1185                                 *error = ENOTSUP;
1186                         NET_EPOCH_EXIT(et);
1187                         return (NULL);
1188                 }
1189                 KASSERT((tifp != ifp),
1190                         ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
1191                          ifp, inp, tifp));
1192                 ifp = tifp;
1193                 goto use_real_interface;
1194         }
1195         if (rs->rs_flow_limit &&
1196             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
1197                 if (error)
1198                         *error = ENOSPC;
1199                 NET_EPOCH_EXIT(et);
1200                 return (NULL);
1201         }
1202         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
1203         if (rte) {
1204                 err = in_pcbattach_txrtlmt(inp, oifp,
1205                     inp->inp_flowtype,
1206                     inp->inp_flowid,
1207                     rte->rate,
1208                     &inp->inp_snd_tag);
1209                 if (err) {
1210                         /* Failed to attach */
1211                         if (error)
1212                                 *error = err;
1213                         rte = NULL;
1214                 } else {
1215                         KASSERT((inp->inp_snd_tag != NULL) ,
1216                                 ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
1217                                  inp, rte, (unsigned long long)rte->rate, rs));
1218 #ifdef INET
1219                         counter_u64_add(rate_limit_new, 1);
1220 #endif
1221                 }
1222         }
1223         if (rte) {
1224                 /*
1225                  * We use an atomic here for accounting so we don't have to
1226                  * use locks when freeing.
1227                  */
1228                 atomic_add_64(&rs->rs_flows_using, 1);
1229         }
1230         NET_EPOCH_EXIT(et);
1231         return (rte);
1232 }
1233
1234 static void
1235 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
1236 {
1237         int error;
1238         struct tcp_rate_set *rs;
1239         struct epoch_tracker et;
1240
1241         if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
1242             (link_state != LINK_STATE_UP)) {
1243                 /*
1244                  * We only care on an interface going up that is rate-limit
1245                  * capable.
1246                  */
1247                 return;
1248         }
1249         NET_EPOCH_ENTER(et);
1250         mtx_lock(&rs_mtx);
1251         rs = find_rs_for_ifp(ifp);
1252         if (rs) {
1253                 /* We already have initialized this guy */
1254                 mtx_unlock(&rs_mtx);
1255                 NET_EPOCH_EXIT(et);
1256                 return;
1257         }
1258         mtx_unlock(&rs_mtx);
1259         NET_EPOCH_EXIT(et);
1260         rt_setup_new_rs(ifp, &error);
1261 }
1262
1263 static void
1264 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
1265 {
1266         struct tcp_rate_set *rs;
1267         struct epoch_tracker et;
1268         int i;
1269
1270         NET_EPOCH_ENTER(et);
1271         mtx_lock(&rs_mtx);
1272         rs = find_rs_for_ifp(ifp);
1273         if (rs) {
1274                 CK_LIST_REMOVE(rs, next);
1275                 rs_number_alive--;
1276                 rs->rs_flags |= RS_IS_DEAD;
1277                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1278                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1279                                 in_pcbdetach_tag(rs->rs_rlt[i].tag);
1280                                 rs->rs_rlt[i].tag = NULL;
1281                         }
1282                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1283                 }
1284                 if (rs->rs_flows_using == 0)
1285                         rs_defer_destroy(rs);
1286         }
1287         mtx_unlock(&rs_mtx);
1288         NET_EPOCH_EXIT(et);
1289 }
1290
1291 static void
1292 tcp_rl_shutdown(void *arg __unused, int howto __unused)
1293 {
1294         struct tcp_rate_set *rs, *nrs;
1295         struct epoch_tracker et;
1296         int i;
1297
1298         NET_EPOCH_ENTER(et);
1299         mtx_lock(&rs_mtx);
1300         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
1301                 CK_LIST_REMOVE(rs, next);
1302                 rs_number_alive--;
1303                 rs->rs_flags |= RS_IS_DEAD;
1304                 for (i = 0; i < rs->rs_rate_cnt; i++) {
1305                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
1306                                 in_pcbdetach_tag(rs->rs_rlt[i].tag);
1307                                 rs->rs_rlt[i].tag = NULL;
1308                         }
1309                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
1310                 }
1311                 if (rs->rs_flows_using == 0)
1312                         rs_defer_destroy(rs);
1313         }
1314         mtx_unlock(&rs_mtx);
1315         NET_EPOCH_EXIT(et);
1316 }
1317
1318 const struct tcp_hwrate_limit_table *
1319 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
1320     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
1321 {
1322         const struct tcp_hwrate_limit_table *rte;
1323 #ifdef KERN_TLS
1324         struct ktls_session *tls;
1325 #endif
1326
1327         INP_WLOCK_ASSERT(tp->t_inpcb);
1328
1329         if (tp->t_inpcb->inp_snd_tag == NULL) {
1330                 /*
1331                  * We are setting up a rate for the first time.
1332                  */
1333                 if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
1334                         /* Not supported by the egress */
1335                         if (error)
1336                                 *error = ENODEV;
1337                         return (NULL);
1338                 }
1339 #ifdef KERN_TLS
1340                 tls = NULL;
1341                 if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
1342                         tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
1343
1344                         if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
1345                             tls->mode != TCP_TLS_MODE_IFNET) {
1346                                 if (error)
1347                                         *error = ENODEV;
1348                                 return (NULL);
1349                         }
1350                 }
1351 #endif
1352                 rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate);
1353                 if (rte)
1354                         rl_increment_using(rte);
1355 #ifdef KERN_TLS
1356                 if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
1357                         /*
1358                          * Fake a route change error to reset the TLS
1359                          * send tag.  This will convert the existing
1360                          * tag to a TLS ratelimit tag.
1361                          */
1362                         MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS);
1363                         ktls_output_eagain(tp->t_inpcb, tls);
1364                 }
1365 #endif
1366         } else {
1367                 /*
1368                  * We are modifying a rate, wrong interface?
1369                  */
1370                 if (error)
1371                         *error = EINVAL;
1372                 rte = NULL;
1373         }
1374         if (rte != NULL) {
1375                 tp->t_pacing_rate = rte->rate;
1376                 *error = 0;
1377         }
1378         return (rte);
1379 }
1380
1381 const struct tcp_hwrate_limit_table *
1382 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
1383     struct tcpcb *tp, struct ifnet *ifp,
1384     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
1385 {
1386         const struct tcp_hwrate_limit_table *nrte;
1387         const struct tcp_rate_set *rs;
1388 #ifdef KERN_TLS
1389         struct ktls_session *tls = NULL;
1390 #endif
1391         int err;
1392
1393         INP_WLOCK_ASSERT(tp->t_inpcb);
1394
1395         if (crte == NULL) {
1396                 /* Wrong interface */
1397                 if (error)
1398                         *error = EINVAL;
1399                 return (NULL);
1400         }
1401
1402 #ifdef KERN_TLS
1403         if (tp->t_inpcb->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
1404                 tls = tp->t_inpcb->inp_socket->so_snd.sb_tls_info;
1405                 MPASS(tls->mode == TCP_TLS_MODE_IFNET);
1406                 if (tls->snd_tag != NULL &&
1407                     tls->snd_tag->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
1408                         /*
1409                          * NIC probably doesn't support ratelimit TLS
1410                          * tags if it didn't allocate one when an
1411                          * existing rate was present, so ignore.
1412                          */
1413                         if (error)
1414                                 *error = EOPNOTSUPP;
1415                         return (NULL);
1416                 }
1417         }
1418 #endif
1419         if (tp->t_inpcb->inp_snd_tag == NULL) {
1420                 /* Wrong interface */
1421                 if (error)
1422                         *error = EINVAL;
1423                 return (NULL);
1424         }
1425         rs = crte->ptbl;
1426         if ((rs->rs_flags & RS_IS_DEAD) ||
1427             (crte->flags & HDWRPACE_IFPDEPARTED)) {
1428                 /* Release the rate, and try anew */
1429
1430                 tcp_rel_pacing_rate(crte, tp);
1431                 nrte = tcp_set_pacing_rate(tp, ifp,
1432                     bytes_per_sec, flags, error, lower_rate);
1433                 return (nrte);
1434         }
1435         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
1436         if (nrte == crte) {
1437                 /* No change */
1438                 if (error)
1439                         *error = 0;
1440                 return (crte);
1441         }
1442         if (nrte == NULL) {
1443                 /* Release the old rate */
1444                 if (error)
1445                         *error = ENOENT;
1446                 tcp_rel_pacing_rate(crte, tp);
1447                 return (NULL);
1448         }
1449         rl_decrement_using(crte);
1450         rl_increment_using(nrte);
1451         /* Change rates to our new entry */
1452 #ifdef KERN_TLS
1453         if (tls != NULL)
1454                 err = ktls_modify_txrtlmt(tls, nrte->rate);
1455         else
1456 #endif
1457                 err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
1458         if (err) {
1459                 rl_decrement_using(nrte);
1460                 /* Do we still have a snd-tag attached? */
1461                 if (tp->t_inpcb->inp_snd_tag)
1462                         in_pcbdetach_txrtlmt(tp->t_inpcb);
1463                 if (error)
1464                         *error = err;
1465                 return (NULL);
1466         } else {
1467 #ifdef INET
1468                 counter_u64_add(rate_limit_chg, 1);
1469 #endif
1470         }
1471         if (error)
1472                 *error = 0;
1473         tp->t_pacing_rate = nrte->rate;
1474         return (nrte);
1475 }
1476
1477 void
1478 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
1479 {
1480         const struct tcp_rate_set *crs;
1481         struct tcp_rate_set *rs;
1482         uint64_t pre;
1483
1484         INP_WLOCK_ASSERT(tp->t_inpcb);
1485
1486         tp->t_pacing_rate = -1;
1487         crs = crte->ptbl;
1488         /*
1489          * Now we must break the const
1490          * in order to release our refcount.
1491          */
1492         rs = __DECONST(struct tcp_rate_set *, crs);
1493         rl_decrement_using(crte);
1494         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
1495         if (pre == 1) {
1496                 struct epoch_tracker et;
1497
1498                 NET_EPOCH_ENTER(et);
1499                 mtx_lock(&rs_mtx);
1500                 /*
1501                  * Is it dead?
1502                  */
1503                 if (rs->rs_flags & RS_IS_DEAD)
1504                         rs_defer_destroy(rs);
1505                 mtx_unlock(&rs_mtx);
1506                 NET_EPOCH_EXIT(et);
1507         }
1508
1509         /*
1510          * XXX: If this connection is using ifnet TLS, should we
1511          * switch it to using an unlimited rate, or perhaps use
1512          * ktls_output_eagain() to reset the send tag to a plain
1513          * TLS tag?
1514          */
1515         in_pcbdetach_txrtlmt(tp->t_inpcb);
1516 }
1517
1518 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */
1519 #define ONE_HUNDRED_MBPS 12500000       /* 100Mbps in bytes per second */
1520 #define FIVE_HUNDRED_MBPS 62500000      /* 500Mbps in bytes per second */
1521 #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */
1522
1523 static void
1524 tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
1525                     uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
1526                     uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
1527 {
1528         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1529                 union tcp_log_stackspecific log;
1530                 struct timeval tv;
1531                 uint32_t cts;
1532
1533                 memset(&log, 0, sizeof(log));
1534                 cts = tcp_get_usecs(&tv);
1535                 log.u_bbr.flex1 = segsiz;
1536                 log.u_bbr.flex2 = new_tso;
1537                 log.u_bbr.flex3 = time_between;
1538                 log.u_bbr.flex4 = calc_time_between;
1539                 log.u_bbr.flex5 = segs;
1540                 log.u_bbr.flex6 = res_div;
1541                 log.u_bbr.flex7 = mult;
1542                 log.u_bbr.flex8 = mod;
1543                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1544                 log.u_bbr.cur_del_rate = bw;
1545                 log.u_bbr.delRate = hw_rate;
1546                 TCP_LOG_EVENTP(tp, NULL,
1547                     &tp->t_inpcb->inp_socket->so_rcv,
1548                     &tp->t_inpcb->inp_socket->so_snd,
1549                     TCP_HDWR_PACE_SIZE, 0,
1550                     0, &log, false, &tv);
1551         }
1552 }
1553
1554 uint32_t
1555 tcp_get_pacing_burst_size (struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
1556    const struct tcp_hwrate_limit_table *te, int *err)
1557 {
1558         /*
1559          * We use the google formula to calculate the
1560          * TSO size. I.E.
1561          * bw < 24Meg
1562          *   tso = 2mss
1563          * else
1564          *   tso = min(bw/1000, 64k)
1565          *
1566          * Note for these calculations we ignore the
1567          * packet overhead (enet hdr, ip hdr and tcp hdr).
1568          */
1569         uint64_t lentim, res, bytes;
1570         uint32_t new_tso, min_tso_segs;
1571
1572         bytes = bw / 1000;
1573         if (bytes > (64 * 1000))
1574                 bytes = 64 * 1000;
1575         /* Round up */
1576         new_tso = (bytes + segsiz - 1) / segsiz;
1577         if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
1578                 min_tso_segs = 1;
1579         else
1580                 min_tso_segs = 2;
1581         if (rs_floor_mss && (new_tso < rs_floor_mss))
1582                 new_tso = rs_floor_mss;
1583         else if (new_tso < min_tso_segs)
1584                 new_tso = min_tso_segs;
1585         if (new_tso > MAX_MSS_SENT)
1586                 new_tso = MAX_MSS_SENT;
1587         new_tso *= segsiz;
1588         tcp_log_pacing_size(tp, bw, segsiz, new_tso,
1589                             0, 0, 0, 0, 0, 0, 1);
1590         /*
1591          * If we are not doing hardware pacing
1592          * then we are done.
1593          */
1594         if (te == NULL) {
1595                 if (err)
1596                         *err = 0;
1597                 return(new_tso);
1598         }
1599         /*
1600          * For hardware pacing we look at the
1601          * rate you are sending at and compare
1602          * that to the rate you have in hardware.
1603          *
1604          * If the hardware rate is slower than your
1605          * software rate then you are in error and
1606          * we will build a queue in our hardware whic
1607          * is probably not desired, in such a case
1608          * just return the non-hardware TSO size.
1609          *
1610          * If the rate in hardware is faster (which
1611          * it should be) then look at how long it
1612          * takes to send one ethernet segment size at
1613          * your b/w and compare that to the time it
1614          * takes to send at the rate you had selected.
1615          *
1616          * If your time is greater (which we hope it is)
1617          * we get the delta between the two, and then
1618          * divide that into your pacing time. This tells
1619          * us how many MSS you can send down at once (rounded up).
1620          *
1621          * Note we also double this value if the b/w is over
1622          * 100Mbps. If its over 500meg we just set you to the
1623          * max (43 segments).
1624          */
1625         if (te->rate > FIVE_HUNDRED_MBPS)
1626                 goto max;
1627         if (te->rate == bw) {
1628                 /* We are pacing at exactly the hdwr rate */
1629 max:
1630                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
1631                                     te->rate, te->time_between, (uint32_t)0,
1632                                     (segsiz * MAX_MSS_SENT), 0, 0, 3);
1633                 return (segsiz * MAX_MSS_SENT);
1634         }
1635         lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
1636         res = lentim / bw;
1637         if (res > te->time_between) {
1638                 uint32_t delta, segs, res_div;
1639
1640                 res_div = ((res * num_of_waits_allowed) + wait_time_floor);
1641                 delta = res - te->time_between;
1642                 segs = (res_div + delta - 1)/delta;
1643                 if (segs < min_tso_segs)
1644                         segs = min_tso_segs;
1645                 if (segs < rs_hw_floor_mss)
1646                         segs = rs_hw_floor_mss;
1647                 if (segs > MAX_MSS_SENT)
1648                         segs = MAX_MSS_SENT;
1649                 segs *= segsiz;
1650                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
1651                                     te->rate, te->time_between, (uint32_t)res,
1652                                     segs, res_div, 1, 3);
1653                 if (err)
1654                         *err = 0;
1655                 if (segs < new_tso) {
1656                         /* unexpected ? */
1657                         return(new_tso);
1658                 } else {
1659                         return (segs);
1660                 }
1661         } else {
1662                 /*
1663                  * Your time is smaller which means
1664                  * we will grow a queue on our
1665                  * hardware. Send back the non-hardware
1666                  * rate.
1667                  */
1668                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
1669                                     te->rate, te->time_between, (uint32_t)res,
1670                                     0, 0, 0, 4);
1671                 if (err)
1672                         *err = -1;
1673                 return (new_tso);
1674         }
1675 }
1676
1677 uint64_t
1678 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
1679 {
1680         struct epoch_tracker et;
1681         struct tcp_rate_set *rs;
1682         uint64_t rate_ret;
1683
1684         NET_EPOCH_ENTER(et);
1685 use_next_interface:
1686         rs = find_rs_for_ifp(ifp);
1687         if (rs == NULL) {
1688                 /* This interface does not do ratelimiting */
1689                 rate_ret = 0;
1690         } else if (rs->rs_flags & RS_IS_DEFF) {
1691                 /* We need to find the real interface */
1692                 struct ifnet *tifp;
1693
1694                 tifp = rt_find_real_interface(ifp, inp, NULL);
1695                 if (tifp == NULL) {
1696                         NET_EPOCH_EXIT(et);
1697                         return (0);
1698                 }
1699                 ifp = tifp;
1700                 goto use_next_interface;
1701         } else {
1702                 /* Lets return the highest rate this guy has */
1703                 rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
1704         }
1705         NET_EPOCH_EXIT(et);
1706         return(rate_ret);
1707 }
1708
1709 static eventhandler_tag rl_ifnet_departs;
1710 static eventhandler_tag rl_ifnet_arrives;
1711 static eventhandler_tag rl_shutdown_start;
1712
1713 static void
1714 tcp_rs_init(void *st __unused)
1715 {
1716         CK_LIST_INIT(&int_rs);
1717         rs_number_alive = 0;
1718         rs_number_dead = 0;
1719         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
1720         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
1721             tcp_rl_ifnet_departure,
1722             NULL, EVENTHANDLER_PRI_ANY);
1723         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
1724             tcp_rl_ifnet_link,
1725             NULL, EVENTHANDLER_PRI_ANY);
1726         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1727             tcp_rl_shutdown, NULL,
1728             SHUTDOWN_PRI_FIRST);
1729         printf("TCP_ratelimit: Is now initialized\n");
1730 }
1731
1732 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
1733 #endif