]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_stacks/rack.c
tcp_hpts: use queue(9) STAILQ for the input queue
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_stacks / rack.c
1 /*-
2  * Copyright (c) 2016-2020 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_ratelimit.h"
34 #include "opt_kern_tls.h"
35 #if defined(INET) || defined(INET6)
36 #include <sys/param.h>
37 #include <sys/arb.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
40 #ifdef TCP_HHOOK
41 #include <sys/hhook.h>
42 #endif
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/mbuf.h>
48 #include <sys/proc.h>           /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #ifdef STATS
54 #include <sys/qmath.h>
55 #include <sys/tree.h>
56 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
57 #else
58 #include <sys/tree.h>
59 #endif
60 #include <sys/refcount.h>
61 #include <sys/queue.h>
62 #include <sys/tim_filter.h>
63 #include <sys/smp.h>
64 #include <sys/kthread.h>
65 #include <sys/kern_prefetch.h>
66 #include <sys/protosw.h>
67 #ifdef TCP_ACCOUNTING
68 #include <sys/sched.h>
69 #include <machine/cpu.h>
70 #endif
71 #include <vm/uma.h>
72
73 #include <net/route.h>
74 #include <net/route/nhop.h>
75 #include <net/vnet.h>
76
77 #define TCPSTATES               /* for logging */
78
79 #include <netinet/in.h>
80 #include <netinet/in_kdtrace.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/ip.h>
83 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
84 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
85 #include <netinet/ip_var.h>
86 #include <netinet/ip6.h>
87 #include <netinet6/in6_pcb.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet/tcp.h>
90 #define TCPOUTFLAGS
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_seq.h>
93 #include <netinet/tcp_timer.h>
94 #include <netinet/tcp_var.h>
95 #include <netinet/tcp_log_buf.h>
96 #include <netinet/tcp_syncache.h>
97 #include <netinet/tcp_hpts.h>
98 #include <netinet/tcp_ratelimit.h>
99 #include <netinet/tcp_accounting.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/cc/cc.h>
102 #include <netinet/cc/cc_newreno.h>
103 #include <netinet/tcp_fastopen.h>
104 #include <netinet/tcp_lro.h>
105 #ifdef NETFLIX_SHARED_CWND
106 #include <netinet/tcp_shared_cwnd.h>
107 #endif
108 #ifdef TCP_OFFLOAD
109 #include <netinet/tcp_offload.h>
110 #endif
111 #ifdef INET6
112 #include <netinet6/tcp6_var.h>
113 #endif
114 #include <netinet/tcp_ecn.h>
115
116 #include <netipsec/ipsec_support.h>
117
118 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
119 #include <netipsec/ipsec.h>
120 #include <netipsec/ipsec6.h>
121 #endif                          /* IPSEC */
122
123 #include <netinet/udp.h>
124 #include <netinet/udp_var.h>
125 #include <machine/in_cksum.h>
126
127 #ifdef MAC
128 #include <security/mac/mac_framework.h>
129 #endif
130 #include "sack_filter.h"
131 #include "tcp_rack.h"
132 #include "tailq_hash.h"
133 #include "rack_bbr_common.h"
134
135 uma_zone_t rack_zone;
136 uma_zone_t rack_pcb_zone;
137
138 #ifndef TICKS2SBT
139 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
140 #endif
141
142 VNET_DECLARE(uint32_t, newreno_beta);
143 VNET_DECLARE(uint32_t, newreno_beta_ecn);
144 #define V_newreno_beta VNET(newreno_beta)
145 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
146
147
148 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
149 MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
150
151 struct sysctl_ctx_list rack_sysctl_ctx;
152 struct sysctl_oid *rack_sysctl_root;
153
154 #define CUM_ACKED 1
155 #define SACKED 2
156
157 /*
158  * The RACK module incorporates a number of
159  * TCP ideas that have been put out into the IETF
160  * over the last few years:
161  * - Matt Mathis's Rate Halving which slowly drops
162  *    the congestion window so that the ack clock can
163  *    be maintained during a recovery.
164  * - Yuchung Cheng's RACK TCP (for which its named) that
165  *    will stop us using the number of dup acks and instead
166  *    use time as the gage of when we retransmit.
167  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
168  *    of Dukkipati et.al.
169  * RACK depends on SACK, so if an endpoint arrives that
170  * cannot do SACK the state machine below will shuttle the
171  * connection back to using the "default" TCP stack that is
172  * in FreeBSD.
173  *
174  * To implement RACK the original TCP stack was first decomposed
175  * into a functional state machine with individual states
176  * for each of the possible TCP connection states. The do_segment
177  * functions role in life is to mandate the connection supports SACK
178  * initially and then assure that the RACK state matches the conenction
179  * state before calling the states do_segment function. Each
180  * state is simplified due to the fact that the original do_segment
181  * has been decomposed and we *know* what state we are in (no
182  * switches on the state) and all tests for SACK are gone. This
183  * greatly simplifies what each state does.
184  *
185  * TCP output is also over-written with a new version since it
186  * must maintain the new rack scoreboard.
187  *
188  */
189 static int32_t rack_tlp_thresh = 1;
190 static int32_t rack_tlp_limit = 2;      /* No more than 2 TLPs w-out new data */
191 static int32_t rack_tlp_use_greater = 1;
192 static int32_t rack_reorder_thresh = 2;
193 static int32_t rack_reorder_fade = 60000000;    /* 0 - never fade, def 60,000,000
194                                                  * - 60 seconds */
195 static uint32_t rack_clamp_ss_upper = 110;
196 static uint32_t rack_clamp_ca_upper = 105;
197 static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */
198 static uint32_t rack_unclamp_round_thresh = 100;        /* number of perfect rounds before we unclamp */
199 static uint32_t rack_unclamp_rxt_thresh = 5;    /* .5%  and under */
200 static uint64_t rack_rxt_clamp_thresh = 0;      /* Do we do the rxt clamp thing */
201 static int32_t rack_dnd_default = 0;            /* For rr_conf = 3, what is the default for dnd */
202 static int32_t rack_rxt_controls = 0;
203 static int32_t rack_fill_cw_state = 0;
204 static uint8_t rack_req_measurements = 1;
205 /* Attack threshold detections */
206 static uint32_t rack_highest_sack_thresh_seen = 0;
207 static uint32_t rack_highest_move_thresh_seen = 0;
208 static uint32_t rack_merge_out_sacks_on_attack = 0;
209 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
210 static int32_t rack_hw_pace_extra_slots = 0;    /* 2 extra MSS time betweens */
211 static int32_t rack_hw_rate_caps = 0; /* 1; */
212 static int32_t rack_hw_rate_cap_per = 0;        /* 0 -- off  */
213 static int32_t rack_hw_rate_min = 0; /* 1500000;*/
214 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
215 static int32_t rack_hw_up_only = 0;
216 static int32_t rack_stats_gets_ms_rtt = 1;
217 static int32_t rack_prr_addbackmax = 2;
218 static int32_t rack_do_hystart = 0;
219 static int32_t rack_apply_rtt_with_reduced_conf = 0;
220 static int32_t rack_hibeta_setting = 0;
221 static int32_t rack_default_pacing_divisor = 250;
222 static int32_t rack_uses_full_dgp_in_rec = 1;
223 static uint16_t rack_pacing_min_seg = 0;
224
225
226 static uint32_t sad_seg_size_per = 800; /* 80.0 % */
227 static int32_t rack_pkt_delay = 1000;
228 static int32_t rack_send_a_lot_in_prr = 1;
229 static int32_t rack_min_to = 1000;      /* Number of microsecond  min timeout */
230 static int32_t rack_verbose_logging = 0;
231 static int32_t rack_ignore_data_after_close = 1;
232 static int32_t rack_enable_shared_cwnd = 1;
233 static int32_t rack_use_cmp_acks = 1;
234 static int32_t rack_use_fsb = 1;
235 static int32_t rack_use_rfo = 1;
236 static int32_t rack_use_rsm_rfo = 1;
237 static int32_t rack_max_abc_post_recovery = 2;
238 static int32_t rack_client_low_buf = 0;
239 static int32_t rack_dsack_std_based = 0x3;      /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
240 static int32_t rack_bw_multipler = 2;           /* Limit on fill cw's jump up to be this x gp_est */
241 #ifdef TCP_ACCOUNTING
242 static int32_t rack_tcp_accounting = 0;
243 #endif
244 static int32_t rack_limits_scwnd = 1;
245 static int32_t rack_enable_mqueue_for_nonpaced = 0;
246 static int32_t rack_hybrid_allow_set_maxseg = 0;
247 static int32_t rack_disable_prr = 0;
248 static int32_t use_rack_rr = 1;
249 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
250 static int32_t rack_persist_min = 250000;       /* 250usec */
251 static int32_t rack_persist_max = 2000000;      /* 2 Second in usec's */
252 static int32_t rack_sack_not_required = 1;      /* set to one to allow non-sack to use rack */
253 static int32_t rack_default_init_window = 0;    /* Use system default */
254 static int32_t rack_limit_time_with_srtt = 0;
255 static int32_t rack_autosndbuf_inc = 20;        /* In percentage form */
256 static int32_t rack_enobuf_hw_boost_mult = 0;   /* How many times the hw rate we boost slot using time_between */
257 static int32_t rack_enobuf_hw_max = 12000;      /* 12 ms in usecs */
258 static int32_t rack_enobuf_hw_min = 10000;      /* 10 ms in usecs */
259 static int32_t rack_hw_rwnd_factor = 2;         /* How many max_segs the rwnd must be before we hold off sending */
260 static int32_t rack_hw_check_queue = 0;         /* Do we always pre-check queue depth of a hw queue */
261 static int32_t rack_full_buffer_discount = 10;
262 /*
263  * Currently regular tcp has a rto_min of 30ms
264  * the backoff goes 12 times so that ends up
265  * being a total of 122.850 seconds before a
266  * connection is killed.
267  */
268 static uint32_t rack_def_data_window = 20;
269 static uint32_t rack_goal_bdp = 2;
270 static uint32_t rack_min_srtts = 1;
271 static uint32_t rack_min_measure_usec = 0;
272 static int32_t rack_tlp_min = 10000;    /* 10ms */
273 static int32_t rack_rto_min = 30000;    /* 30,000 usec same as main freebsd */
274 static int32_t rack_rto_max = 4000000;  /* 4 seconds in usec's */
275 static const int32_t rack_free_cache = 2;
276 static int32_t rack_hptsi_segments = 40;
277 static int32_t rack_rate_sample_method = USE_RTT_LOW;
278 static int32_t rack_pace_every_seg = 0;
279 static int32_t rack_delayed_ack_time = 40000;   /* 40ms in usecs */
280 static int32_t rack_slot_reduction = 4;
281 static int32_t rack_wma_divisor = 8;            /* For WMA calculation */
282 static int32_t rack_cwnd_block_ends_measure = 0;
283 static int32_t rack_rwnd_block_ends_measure = 0;
284 static int32_t rack_def_profile = 0;
285
286 static int32_t rack_lower_cwnd_at_tlp = 0;
287 static int32_t rack_limited_retran = 0;
288 static int32_t rack_always_send_oldest = 0;
289 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
290
291 static uint16_t rack_per_of_gp_ss = 250;        /* 250 % slow-start */
292 static uint16_t rack_per_of_gp_ca = 200;        /* 200 % congestion-avoidance */
293 static uint16_t rack_per_of_gp_rec = 200;       /* 200 % of bw */
294
295 /* Probertt */
296 static uint16_t rack_per_of_gp_probertt = 60;   /* 60% of bw */
297 static uint16_t rack_per_of_gp_lowthresh = 40;  /* 40% is bottom */
298 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
299 static uint16_t rack_atexit_prtt_hbp = 130;     /* Clamp to 130% on exit prtt if highly buffered path */
300 static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */
301
302 static uint32_t rack_max_drain_wait = 2;        /* How man gp srtt's before we give up draining */
303 static uint32_t rack_must_drain = 1;            /* How many GP srtt's we *must* wait */
304 static uint32_t rack_probertt_use_min_rtt_entry = 1;    /* Use the min to calculate the goal else gp_srtt */
305 static uint32_t rack_probertt_use_min_rtt_exit = 0;
306 static uint32_t rack_probe_rtt_sets_cwnd = 0;
307 static uint32_t rack_probe_rtt_safety_val = 2000000;    /* No more than 2 sec in probe-rtt */
308 static uint32_t rack_time_between_probertt = 9600000;   /* 9.6 sec in usecs */
309 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;       /* How many srtt periods does probe-rtt last top fraction */
310 static uint32_t rack_probertt_gpsrtt_cnt_div = 0;       /* How many srtt periods does probe-rtt last bottom fraction */
311 static uint32_t rack_min_probertt_hold = 40000;         /* Equal to delayed ack time */
312 static uint32_t rack_probertt_filter_life = 10000000;
313 static uint32_t rack_probertt_lower_within = 10;
314 static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds)  to count as a lowering */
315 static int32_t rack_pace_one_seg = 0;           /* Shall we pace for less than 1.4Meg 1MSS at a time */
316 static int32_t rack_probertt_clear_is = 1;
317 static int32_t rack_max_drain_hbp = 1;          /* Extra drain times gpsrtt for highly buffered paths */
318 static int32_t rack_hbp_thresh = 3;             /* what is the divisor max_rtt/min_rtt to decided a hbp */
319
320 /* Part of pacing */
321 static int32_t rack_max_per_above = 30;         /* When we go to increment stop if above 100+this% */
322
323 /* Timely information */
324 /* Combine these two gives the range of 'no change' to bw */
325 /* ie the up/down provide the upper and lower bound */
326 static int32_t rack_gp_per_bw_mul_up = 2;       /* 2% */
327 static int32_t rack_gp_per_bw_mul_down = 4;     /* 4% */
328 static int32_t rack_gp_rtt_maxmul = 3;          /* 3 x maxmin */
329 static int32_t rack_gp_rtt_minmul = 1;          /* minrtt + (minrtt/mindiv) is lower rtt */
330 static int32_t rack_gp_rtt_mindiv = 4;          /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
331 static int32_t rack_gp_decrease_per = 20;       /* 20% decrease in multiplier */
332 static int32_t rack_gp_increase_per = 2;        /* 2% increase in multiplier */
333 static int32_t rack_per_lower_bound = 50;       /* Don't allow to drop below this multiplier */
334 static int32_t rack_per_upper_bound_ss = 0;     /* Don't allow SS to grow above this */
335 static int32_t rack_per_upper_bound_ca = 0;     /* Don't allow CA to grow above this */
336 static int32_t rack_do_dyn_mul = 0;             /* Are the rack gp multipliers dynamic */
337 static int32_t rack_gp_no_rec_chg = 1;          /* Prohibit recovery from reducing it's multiplier */
338 static int32_t rack_timely_dec_clear = 6;       /* Do we clear decrement count at a value (6)? */
339 static int32_t rack_timely_max_push_rise = 3;   /* One round of pushing */
340 static int32_t rack_timely_max_push_drop = 3;   /* Three round of pushing */
341 static int32_t rack_timely_min_segs = 4;        /* 4 segment minimum */
342 static int32_t rack_use_max_for_nobackoff = 0;
343 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
344 static int32_t rack_timely_no_stopping = 0;
345 static int32_t rack_down_raise_thresh = 100;
346 static int32_t rack_req_segs = 1;
347 static uint64_t rack_bw_rate_cap = 0;
348
349
350 /* Rack specific counters */
351 counter_u64_t rack_saw_enobuf;
352 counter_u64_t rack_saw_enobuf_hw;
353 counter_u64_t rack_saw_enetunreach;
354 counter_u64_t rack_persists_sends;
355 counter_u64_t rack_persists_acks;
356 counter_u64_t rack_persists_loss;
357 counter_u64_t rack_persists_lost_ends;
358 counter_u64_t rack_total_bytes;
359 #ifdef INVARIANTS
360 counter_u64_t rack_adjust_map_bw;
361 #endif
362 /* Tail loss probe counters */
363 counter_u64_t rack_tlp_tot;
364 counter_u64_t rack_tlp_newdata;
365 counter_u64_t rack_tlp_retran;
366 counter_u64_t rack_tlp_retran_bytes;
367 counter_u64_t rack_to_tot;
368 counter_u64_t rack_hot_alloc;
369 counter_u64_t rack_to_alloc;
370 counter_u64_t rack_to_alloc_hard;
371 counter_u64_t rack_to_alloc_emerg;
372 counter_u64_t rack_to_alloc_limited;
373 counter_u64_t rack_alloc_limited_conns;
374 counter_u64_t rack_split_limited;
375 counter_u64_t rack_rxt_clamps_cwnd;
376 counter_u64_t rack_rxt_clamps_cwnd_uniq;
377
378 counter_u64_t rack_multi_single_eq;
379 counter_u64_t rack_proc_non_comp_ack;
380
381 counter_u64_t rack_fto_send;
382 counter_u64_t rack_fto_rsm_send;
383 counter_u64_t rack_nfto_resend;
384 counter_u64_t rack_non_fto_send;
385 counter_u64_t rack_extended_rfo;
386
387 counter_u64_t rack_sack_proc_all;
388 counter_u64_t rack_sack_proc_short;
389 counter_u64_t rack_sack_proc_restart;
390 counter_u64_t rack_sack_attacks_detected;
391 counter_u64_t rack_sack_attacks_reversed;
392 counter_u64_t rack_sack_attacks_suspect;
393 counter_u64_t rack_sack_used_next_merge;
394 counter_u64_t rack_sack_splits;
395 counter_u64_t rack_sack_used_prev_merge;
396 counter_u64_t rack_sack_skipped_acked;
397 counter_u64_t rack_ack_total;
398 counter_u64_t rack_express_sack;
399 counter_u64_t rack_sack_total;
400 counter_u64_t rack_move_none;
401 counter_u64_t rack_move_some;
402
403 counter_u64_t rack_input_idle_reduces;
404 counter_u64_t rack_collapsed_win;
405 counter_u64_t rack_collapsed_win_seen;
406 counter_u64_t rack_collapsed_win_rxt;
407 counter_u64_t rack_collapsed_win_rxt_bytes;
408 counter_u64_t rack_try_scwnd;
409 counter_u64_t rack_hw_pace_init_fail;
410 counter_u64_t rack_hw_pace_lost;
411
412 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
413 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
414
415
416 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
417
418 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {  \
419         (tv) = (value) + slop;   \
420         if ((u_long)(tv) < (u_long)(tvmin)) \
421                 (tv) = (tvmin); \
422         if ((u_long)(tv) > (u_long)(tvmax)) \
423                 (tv) = (tvmax); \
424 } while (0)
425
426 static void
427 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
428
429 static int
430 rack_process_ack(struct mbuf *m, struct tcphdr *th,
431     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
432     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
433 static int
434 rack_process_data(struct mbuf *m, struct tcphdr *th,
435     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
436     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
437 static void
438 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
439    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
440 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
441 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
442     uint8_t limit_type);
443 static struct rack_sendmap *
444 rack_check_recovery_mode(struct tcpcb *tp,
445     uint32_t tsused);
446 static void
447 rack_cong_signal(struct tcpcb *tp,
448                  uint32_t type, uint32_t ack, int );
449 static void rack_counter_destroy(void);
450 static int
451 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
452 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
453 static void
454 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
455 static void
456 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
457     int32_t drop_hdrlen, int32_t tlen, uint8_t iptos);
458 static void rack_dtor(void *mem, int32_t size, void *arg);
459 static void
460 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
461     uint32_t flex1, uint32_t flex2,
462     uint32_t flex3, uint32_t flex4,
463     uint32_t flex5, uint32_t flex6,
464     uint16_t flex7, uint8_t mod);
465
466 static void
467 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
468    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
469    struct rack_sendmap *rsm, uint8_t quality);
470 static struct rack_sendmap *
471 rack_find_high_nonack(struct tcp_rack *rack,
472     struct rack_sendmap *rsm);
473 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
474 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
475 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
476 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt);
477 static void
478 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
479                             tcp_seq th_ack, int line, uint8_t quality);
480 static void
481 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm);
482
483 static uint32_t
484 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
485 static int32_t rack_handoff_ok(struct tcpcb *tp);
486 static int32_t rack_init(struct tcpcb *tp, void **ptr);
487 static void rack_init_sysctls(void);
488
489 static void
490 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
491     struct tcphdr *th, int entered_rec, int dup_ack_struck,
492     int *dsack_seen, int *sacks_seen);
493 static void
494 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
495     uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
496     struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
497
498 static uint64_t rack_get_gp_est(struct tcp_rack *rack);
499
500 static void
501 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
502     struct rack_sendmap *rsm);
503 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
504 static int32_t rack_output(struct tcpcb *tp);
505
506 static uint32_t
507 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
508     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
509     uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz);
510 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
511 static void rack_remxt_tmr(struct tcpcb *tp);
512 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt);
513 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
514 static int32_t rack_stopall(struct tcpcb *tp);
515 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
516 static uint32_t
517 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
518     struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz);
519 static void
520 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
521     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz);
522 static int
523 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
524     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
525 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
526 static int
527 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
528     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
529     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
530 static int
531 rack_do_closing(struct mbuf *m, struct tcphdr *th,
532     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
533     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
534 static int
535 rack_do_established(struct mbuf *m, struct tcphdr *th,
536     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
537     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
538 static int
539 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
540     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
541     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
542 static int
543 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
544     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
545     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
546 static int
547 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
548     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
549     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
550 static int
551 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
552     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
553     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
554 static int
555 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
556     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
557     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
558 static int
559 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
560     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
561     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
562 static void rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts);
563 struct rack_sendmap *
564 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
565     uint32_t tsused);
566 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
567     uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
568 static void
569      tcp_rack_partialack(struct tcpcb *tp);
570 static int
571 rack_set_profile(struct tcp_rack *rack, int prof);
572 static void
573 rack_apply_deferred_options(struct tcp_rack *rack);
574
575 int32_t rack_clear_counter=0;
576
577 static uint64_t
578 rack_get_lt_bw(struct tcp_rack *rack)
579 {
580         struct timeval tv;
581         uint64_t tim, bytes;
582
583         tim = rack->r_ctl.lt_bw_time;
584         bytes = rack->r_ctl.lt_bw_bytes;
585         if (rack->lt_bw_up) {
586                 /* Include all the current bytes too */
587                 microuptime(&tv);
588                 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
589                 tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
590         }
591         if ((bytes != 0) && (tim != 0))
592                 return ((bytes * (uint64_t)1000000) / tim);
593         else
594                 return (0);
595 }
596
597 static void
598 rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
599 {
600         struct sockopt sopt;
601         struct cc_newreno_opts opt;
602         struct newreno old;
603         struct tcpcb *tp;
604         int error, failed = 0;
605
606         tp = rack->rc_tp;
607         if (tp->t_cc == NULL) {
608                 /* Tcb is leaving */
609                 return;
610         }
611         rack->rc_pacing_cc_set = 1;
612         if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
613                 /* Not new-reno we can't play games with beta! */
614                 failed = 1;
615                 goto out;
616
617         }
618         if (CC_ALGO(tp)->ctl_output == NULL)  {
619                 /* Huh, not using new-reno so no swaps.? */
620                 failed = 2;
621                 goto out;
622         }
623         /* Get the current values out */
624         sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
625         sopt.sopt_dir = SOPT_GET;
626         opt.name = CC_NEWRENO_BETA;
627         error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
628         if (error)  {
629                 failed = 3;
630                 goto out;
631         }
632         old.beta = opt.val;
633         opt.name = CC_NEWRENO_BETA_ECN;
634         error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
635         if (error)  {
636                 failed = 4;
637                 goto out;
638         }
639         old.beta_ecn = opt.val;
640
641         /* Now lets set in the values we have stored */
642         sopt.sopt_dir = SOPT_SET;
643         opt.name = CC_NEWRENO_BETA;
644         opt.val = rack->r_ctl.rc_saved_beta.beta;
645         error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
646         if (error)  {
647                 failed = 5;
648                 goto out;
649         }
650         opt.name = CC_NEWRENO_BETA_ECN;
651         opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
652         error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
653         if (error) {
654                 failed = 6;
655                 goto out;
656         }
657         /* Save off the values for restoral */
658         memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
659 out:
660         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
661                 union tcp_log_stackspecific log;
662                 struct timeval tv;
663                 struct newreno *ptr;
664
665                 ptr = ((struct newreno *)tp->t_ccv.cc_data);
666                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
667                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
668                 log.u_bbr.flex1 = ptr->beta;
669                 log.u_bbr.flex2 = ptr->beta_ecn;
670                 log.u_bbr.flex3 = ptr->newreno_flags;
671                 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
672                 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
673                 log.u_bbr.flex6 = failed;
674                 log.u_bbr.flex7 = rack->gp_ready;
675                 log.u_bbr.flex7 <<= 1;
676                 log.u_bbr.flex7 |= rack->use_fixed_rate;
677                 log.u_bbr.flex7 <<= 1;
678                 log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
679                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
680                 log.u_bbr.flex8 = flex8;
681                 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
682                                0, &log, false, NULL, NULL, 0, &tv);
683         }
684 }
685
686 static void
687 rack_set_cc_pacing(struct tcp_rack *rack)
688 {
689         if (rack->rc_pacing_cc_set)
690                 return;
691         /*
692          * Use the swap utility placing in 3 for flex8 to id a
693          * set of a new set of values.
694          */
695         rack->rc_pacing_cc_set = 1;
696         rack_swap_beta_values(rack, 3);
697 }
698
699 static void
700 rack_undo_cc_pacing(struct tcp_rack *rack)
701 {
702         if (rack->rc_pacing_cc_set == 0)
703                 return;
704         /*
705          * Use the swap utility placing in 4 for flex8 to id a
706          * restoral of the old values.
707          */
708         rack->rc_pacing_cc_set = 0;
709         rack_swap_beta_values(rack, 4);
710 }
711
712 static void
713 rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
714                uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
715 {
716         if (tcp_bblogging_on(rack->rc_tp)) {
717                 union tcp_log_stackspecific log;
718                 struct timeval tv;
719
720                 memset(&log, 0, sizeof(log));
721                 log.u_bbr.flex1 = seq_end;
722                 log.u_bbr.flex2 = rack->rc_tp->gput_seq;
723                 log.u_bbr.flex3 = ack_end_t;
724                 log.u_bbr.flex4 = rack->rc_tp->gput_ts;
725                 log.u_bbr.flex5 = send_end_t;
726                 log.u_bbr.flex6 = rack->rc_tp->gput_ack;
727                 log.u_bbr.flex7 = mode;
728                 log.u_bbr.flex8 = 69;
729                 log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts;
730                 log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts;
731                 log.u_bbr.pkts_out = line;
732                 log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
733                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
734                 if (rsm != NULL) {
735                         log.u_bbr.applimited = rsm->r_start;
736                         log.u_bbr.delivered = rsm->r_end;
737                         log.u_bbr.epoch = rsm->r_flags;
738                 }
739                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
740                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
741                     &rack->rc_inp->inp_socket->so_rcv,
742                     &rack->rc_inp->inp_socket->so_snd,
743                     BBR_LOG_HPTSI_CALC, 0,
744                     0, &log, false, &tv);
745         }
746 }
747
748 static int
749 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
750 {
751         uint32_t stat;
752         int32_t error;
753
754         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
755         if (error || req->newptr == NULL)
756                 return error;
757
758         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
759         if (error)
760                 return (error);
761         if (stat == 1) {
762 #ifdef INVARIANTS
763                 printf("Clearing RACK counters\n");
764 #endif
765                 counter_u64_zero(rack_tlp_tot);
766                 counter_u64_zero(rack_tlp_newdata);
767                 counter_u64_zero(rack_tlp_retran);
768                 counter_u64_zero(rack_tlp_retran_bytes);
769                 counter_u64_zero(rack_to_tot);
770                 counter_u64_zero(rack_saw_enobuf);
771                 counter_u64_zero(rack_saw_enobuf_hw);
772                 counter_u64_zero(rack_saw_enetunreach);
773                 counter_u64_zero(rack_persists_sends);
774                 counter_u64_zero(rack_total_bytes);
775                 counter_u64_zero(rack_persists_acks);
776                 counter_u64_zero(rack_persists_loss);
777                 counter_u64_zero(rack_persists_lost_ends);
778 #ifdef INVARIANTS
779                 counter_u64_zero(rack_adjust_map_bw);
780 #endif
781                 counter_u64_zero(rack_to_alloc_hard);
782                 counter_u64_zero(rack_to_alloc_emerg);
783                 counter_u64_zero(rack_sack_proc_all);
784                 counter_u64_zero(rack_fto_send);
785                 counter_u64_zero(rack_fto_rsm_send);
786                 counter_u64_zero(rack_extended_rfo);
787                 counter_u64_zero(rack_hw_pace_init_fail);
788                 counter_u64_zero(rack_hw_pace_lost);
789                 counter_u64_zero(rack_non_fto_send);
790                 counter_u64_zero(rack_nfto_resend);
791                 counter_u64_zero(rack_sack_proc_short);
792                 counter_u64_zero(rack_sack_proc_restart);
793                 counter_u64_zero(rack_to_alloc);
794                 counter_u64_zero(rack_to_alloc_limited);
795                 counter_u64_zero(rack_alloc_limited_conns);
796                 counter_u64_zero(rack_split_limited);
797                 counter_u64_zero(rack_rxt_clamps_cwnd);
798                 counter_u64_zero(rack_rxt_clamps_cwnd_uniq);
799                 counter_u64_zero(rack_multi_single_eq);
800                 counter_u64_zero(rack_proc_non_comp_ack);
801                 counter_u64_zero(rack_sack_attacks_detected);
802                 counter_u64_zero(rack_sack_attacks_reversed);
803                 counter_u64_zero(rack_sack_attacks_suspect);
804                 counter_u64_zero(rack_sack_used_next_merge);
805                 counter_u64_zero(rack_sack_used_prev_merge);
806                 counter_u64_zero(rack_sack_splits);
807                 counter_u64_zero(rack_sack_skipped_acked);
808                 counter_u64_zero(rack_ack_total);
809                 counter_u64_zero(rack_express_sack);
810                 counter_u64_zero(rack_sack_total);
811                 counter_u64_zero(rack_move_none);
812                 counter_u64_zero(rack_move_some);
813                 counter_u64_zero(rack_try_scwnd);
814                 counter_u64_zero(rack_collapsed_win);
815                 counter_u64_zero(rack_collapsed_win_rxt);
816                 counter_u64_zero(rack_collapsed_win_seen);
817                 counter_u64_zero(rack_collapsed_win_rxt_bytes);
818         } else if (stat == 2) {
819 #ifdef INVARIANTS
820                 printf("Clearing RACK option array\n");
821 #endif
822                 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE);
823         } else if (stat == 3) {
824                 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n");
825         } else if (stat == 4) {
826 #ifdef INVARIANTS
827                 printf("Clearing RACK out size array\n");
828 #endif
829                 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE);
830         }
831         rack_clear_counter = 0;
832         return (0);
833 }
834
835 static void
836 rack_init_sysctls(void)
837 {
838         struct sysctl_oid *rack_counters;
839         struct sysctl_oid *rack_attack;
840         struct sysctl_oid *rack_pacing;
841         struct sysctl_oid *rack_timely;
842         struct sysctl_oid *rack_timers;
843         struct sysctl_oid *rack_tlp;
844         struct sysctl_oid *rack_misc;
845         struct sysctl_oid *rack_features;
846         struct sysctl_oid *rack_measure;
847         struct sysctl_oid *rack_probertt;
848         struct sysctl_oid *rack_hw_pacing;
849
850         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
851             SYSCTL_CHILDREN(rack_sysctl_root),
852             OID_AUTO,
853             "sack_attack",
854             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
855             "Rack Sack Attack Counters and Controls");
856         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
857             SYSCTL_CHILDREN(rack_sysctl_root),
858             OID_AUTO,
859             "stats",
860             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
861             "Rack Counters");
862         SYSCTL_ADD_S32(&rack_sysctl_ctx,
863             SYSCTL_CHILDREN(rack_sysctl_root),
864             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
865             &rack_rate_sample_method , USE_RTT_LOW,
866             "What method should we use for rate sampling 0=high, 1=low ");
867         /* Probe rtt related controls */
868         rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
869             SYSCTL_CHILDREN(rack_sysctl_root),
870             OID_AUTO,
871             "probertt",
872             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
873             "ProbeRTT related Controls");
874         SYSCTL_ADD_U16(&rack_sysctl_ctx,
875             SYSCTL_CHILDREN(rack_probertt),
876             OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
877             &rack_atexit_prtt_hbp, 130,
878             "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
879         SYSCTL_ADD_U16(&rack_sysctl_ctx,
880             SYSCTL_CHILDREN(rack_probertt),
881             OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
882             &rack_atexit_prtt, 130,
883             "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
884         SYSCTL_ADD_U16(&rack_sysctl_ctx,
885             SYSCTL_CHILDREN(rack_probertt),
886             OID_AUTO, "gp_per_mul", CTLFLAG_RW,
887             &rack_per_of_gp_probertt, 60,
888             "What percentage of goodput do we pace at in probertt");
889         SYSCTL_ADD_U16(&rack_sysctl_ctx,
890             SYSCTL_CHILDREN(rack_probertt),
891             OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
892             &rack_per_of_gp_probertt_reduce, 10,
893             "What percentage of goodput do we reduce every gp_srtt");
894         SYSCTL_ADD_U16(&rack_sysctl_ctx,
895             SYSCTL_CHILDREN(rack_probertt),
896             OID_AUTO, "gp_per_low", CTLFLAG_RW,
897             &rack_per_of_gp_lowthresh, 40,
898             "What percentage of goodput do we allow the multiplier to fall to");
899         SYSCTL_ADD_U32(&rack_sysctl_ctx,
900             SYSCTL_CHILDREN(rack_probertt),
901             OID_AUTO, "time_between", CTLFLAG_RW,
902             & rack_time_between_probertt, 96000000,
903             "How many useconds between the lowest rtt falling must past before we enter probertt");
904         SYSCTL_ADD_U32(&rack_sysctl_ctx,
905             SYSCTL_CHILDREN(rack_probertt),
906             OID_AUTO, "safety", CTLFLAG_RW,
907             &rack_probe_rtt_safety_val, 2000000,
908             "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
909         SYSCTL_ADD_U32(&rack_sysctl_ctx,
910             SYSCTL_CHILDREN(rack_probertt),
911             OID_AUTO, "sets_cwnd", CTLFLAG_RW,
912             &rack_probe_rtt_sets_cwnd, 0,
913             "Do we set the cwnd too (if always_lower is on)");
914         SYSCTL_ADD_U32(&rack_sysctl_ctx,
915             SYSCTL_CHILDREN(rack_probertt),
916             OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
917             &rack_max_drain_wait, 2,
918             "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
919         SYSCTL_ADD_U32(&rack_sysctl_ctx,
920             SYSCTL_CHILDREN(rack_probertt),
921             OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
922             &rack_must_drain, 1,
923             "We must drain this many gp_srtt's waiting for flight to reach goal");
924         SYSCTL_ADD_U32(&rack_sysctl_ctx,
925             SYSCTL_CHILDREN(rack_probertt),
926             OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
927             &rack_probertt_use_min_rtt_entry, 1,
928             "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
929         SYSCTL_ADD_U32(&rack_sysctl_ctx,
930             SYSCTL_CHILDREN(rack_probertt),
931             OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
932             &rack_probertt_use_min_rtt_exit, 0,
933             "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
934         SYSCTL_ADD_U32(&rack_sysctl_ctx,
935             SYSCTL_CHILDREN(rack_probertt),
936             OID_AUTO, "length_div", CTLFLAG_RW,
937             &rack_probertt_gpsrtt_cnt_div, 0,
938             "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
939         SYSCTL_ADD_U32(&rack_sysctl_ctx,
940             SYSCTL_CHILDREN(rack_probertt),
941             OID_AUTO, "length_mul", CTLFLAG_RW,
942             &rack_probertt_gpsrtt_cnt_mul, 0,
943             "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
944         SYSCTL_ADD_U32(&rack_sysctl_ctx,
945             SYSCTL_CHILDREN(rack_probertt),
946             OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
947             &rack_min_probertt_hold, 200000,
948             "What is the minimum time we hold probertt at target");
949         SYSCTL_ADD_U32(&rack_sysctl_ctx,
950             SYSCTL_CHILDREN(rack_probertt),
951             OID_AUTO, "filter_life", CTLFLAG_RW,
952             &rack_probertt_filter_life, 10000000,
953             "What is the time for the filters life in useconds");
954         SYSCTL_ADD_U32(&rack_sysctl_ctx,
955             SYSCTL_CHILDREN(rack_probertt),
956             OID_AUTO, "lower_within", CTLFLAG_RW,
957             &rack_probertt_lower_within, 10,
958             "If the rtt goes lower within this percentage of the time, go into probe-rtt");
959         SYSCTL_ADD_U32(&rack_sysctl_ctx,
960             SYSCTL_CHILDREN(rack_probertt),
961             OID_AUTO, "must_move", CTLFLAG_RW,
962             &rack_min_rtt_movement, 250,
963             "How much is the minimum movement in rtt to count as a drop for probertt purposes");
964         SYSCTL_ADD_U32(&rack_sysctl_ctx,
965             SYSCTL_CHILDREN(rack_probertt),
966             OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
967             &rack_probertt_clear_is, 1,
968             "Do we clear I/S counts on exiting probe-rtt");
969         SYSCTL_ADD_S32(&rack_sysctl_ctx,
970             SYSCTL_CHILDREN(rack_probertt),
971             OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
972             &rack_max_drain_hbp, 1,
973             "How many extra drain gpsrtt's do we get in highly buffered paths");
974         SYSCTL_ADD_S32(&rack_sysctl_ctx,
975             SYSCTL_CHILDREN(rack_probertt),
976             OID_AUTO, "hbp_threshold", CTLFLAG_RW,
977             &rack_hbp_thresh, 3,
978             "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
979         /* Pacing related sysctls */
980         rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
981             SYSCTL_CHILDREN(rack_sysctl_root),
982             OID_AUTO,
983             "pacing",
984             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
985             "Pacing related Controls");
986         SYSCTL_ADD_S32(&rack_sysctl_ctx,
987             SYSCTL_CHILDREN(rack_pacing),
988             OID_AUTO, "fulldgpinrec", CTLFLAG_RW,
989             &rack_uses_full_dgp_in_rec, 1,
990             "Do we use all DGP features in recovery (fillcw, timely et.al.)?");
991         SYSCTL_ADD_S32(&rack_sysctl_ctx,
992             SYSCTL_CHILDREN(rack_pacing),
993             OID_AUTO, "fullbufdisc", CTLFLAG_RW,
994             &rack_full_buffer_discount, 10,
995             "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?");
996         SYSCTL_ADD_S32(&rack_sysctl_ctx,
997             SYSCTL_CHILDREN(rack_pacing),
998             OID_AUTO, "fillcw", CTLFLAG_RW,
999             &rack_fill_cw_state, 0,
1000             "Enable fillcw on new connections (default=0 off)?");
1001         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1002             SYSCTL_CHILDREN(rack_pacing),
1003             OID_AUTO, "min_burst", CTLFLAG_RW,
1004             &rack_pacing_min_seg, 0,
1005             "What is the min burst size for pacing (0 disables)?");
1006         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1007             SYSCTL_CHILDREN(rack_pacing),
1008             OID_AUTO, "divisor", CTLFLAG_RW,
1009             &rack_default_pacing_divisor, 4,
1010             "What is the default divisor given to the rl code?");
1011         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1012             SYSCTL_CHILDREN(rack_pacing),
1013             OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
1014             &rack_bw_multipler, 2,
1015             "What is the multiplier of the current gp_est that fillcw can increase the b/w too?");
1016         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1017             SYSCTL_CHILDREN(rack_pacing),
1018             OID_AUTO, "max_pace_over", CTLFLAG_RW,
1019             &rack_max_per_above, 30,
1020             "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
1021         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1022             SYSCTL_CHILDREN(rack_pacing),
1023             OID_AUTO, "allow1mss", CTLFLAG_RW,
1024             &rack_pace_one_seg, 0,
1025             "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?");
1026         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1027             SYSCTL_CHILDREN(rack_pacing),
1028             OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
1029             &rack_limit_time_with_srtt, 0,
1030             "Do we limit pacing time based on srtt");
1031         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1032             SYSCTL_CHILDREN(rack_pacing),
1033             OID_AUTO, "init_win", CTLFLAG_RW,
1034             &rack_default_init_window, 0,
1035             "Do we have a rack initial window 0 = system default");
1036         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1037             SYSCTL_CHILDREN(rack_pacing),
1038             OID_AUTO, "gp_per_ss", CTLFLAG_RW,
1039             &rack_per_of_gp_ss, 250,
1040             "If non zero, what percentage of goodput to pace at in slow start");
1041         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1042             SYSCTL_CHILDREN(rack_pacing),
1043             OID_AUTO, "gp_per_ca", CTLFLAG_RW,
1044             &rack_per_of_gp_ca, 150,
1045             "If non zero, what percentage of goodput to pace at in congestion avoidance");
1046         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1047             SYSCTL_CHILDREN(rack_pacing),
1048             OID_AUTO, "gp_per_rec", CTLFLAG_RW,
1049             &rack_per_of_gp_rec, 200,
1050             "If non zero, what percentage of goodput to pace at in recovery");
1051         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1052             SYSCTL_CHILDREN(rack_pacing),
1053             OID_AUTO, "pace_max_seg", CTLFLAG_RW,
1054             &rack_hptsi_segments, 40,
1055             "What size is the max for TSO segments in pacing and burst mitigation");
1056         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1057             SYSCTL_CHILDREN(rack_pacing),
1058             OID_AUTO, "burst_reduces", CTLFLAG_RW,
1059             &rack_slot_reduction, 4,
1060             "When doing only burst mitigation what is the reduce divisor");
1061         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1062             SYSCTL_CHILDREN(rack_sysctl_root),
1063             OID_AUTO, "use_pacing", CTLFLAG_RW,
1064             &rack_pace_every_seg, 0,
1065             "If set we use pacing, if clear we use only the original burst mitigation");
1066         SYSCTL_ADD_U64(&rack_sysctl_ctx,
1067             SYSCTL_CHILDREN(rack_pacing),
1068             OID_AUTO, "rate_cap", CTLFLAG_RW,
1069             &rack_bw_rate_cap, 0,
1070             "If set we apply this value to the absolute rate cap used by pacing");
1071         SYSCTL_ADD_U8(&rack_sysctl_ctx,
1072             SYSCTL_CHILDREN(rack_sysctl_root),
1073             OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
1074             &rack_req_measurements, 1,
1075             "If doing dynamic pacing, how many measurements must be in before we start pacing?");
1076         /* Hardware pacing */
1077         rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1078             SYSCTL_CHILDREN(rack_sysctl_root),
1079             OID_AUTO,
1080             "hdwr_pacing",
1081             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1082             "Pacing related Controls");
1083         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1084             SYSCTL_CHILDREN(rack_hw_pacing),
1085             OID_AUTO, "rwnd_factor", CTLFLAG_RW,
1086             &rack_hw_rwnd_factor, 2,
1087             "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
1088         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1089             SYSCTL_CHILDREN(rack_hw_pacing),
1090             OID_AUTO, "precheck", CTLFLAG_RW,
1091             &rack_hw_check_queue, 0,
1092             "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?");
1093         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1094             SYSCTL_CHILDREN(rack_hw_pacing),
1095             OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
1096             &rack_enobuf_hw_boost_mult, 0,
1097             "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
1098         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1099             SYSCTL_CHILDREN(rack_hw_pacing),
1100             OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
1101             &rack_enobuf_hw_max, 2,
1102             "What is the max boost the pacing time if we see a ENOBUFS?");
1103         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1104             SYSCTL_CHILDREN(rack_hw_pacing),
1105             OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
1106             &rack_enobuf_hw_min, 2,
1107             "What is the min boost the pacing time if we see a ENOBUFS?");
1108         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1109             SYSCTL_CHILDREN(rack_hw_pacing),
1110             OID_AUTO, "enable", CTLFLAG_RW,
1111             &rack_enable_hw_pacing, 0,
1112             "Should RACK attempt to use hw pacing?");
1113         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1114             SYSCTL_CHILDREN(rack_hw_pacing),
1115             OID_AUTO, "rate_cap", CTLFLAG_RW,
1116             &rack_hw_rate_caps, 0,
1117             "Does the highest hardware pacing rate cap the rate we will send at??");
1118         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1119             SYSCTL_CHILDREN(rack_hw_pacing),
1120             OID_AUTO, "uncap_per", CTLFLAG_RW,
1121             &rack_hw_rate_cap_per, 0,
1122             "If you go over b/w by this amount you will be uncapped (0 = never)");
1123         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1124             SYSCTL_CHILDREN(rack_hw_pacing),
1125             OID_AUTO, "rate_min", CTLFLAG_RW,
1126             &rack_hw_rate_min, 0,
1127             "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
1128         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1129             SYSCTL_CHILDREN(rack_hw_pacing),
1130             OID_AUTO, "rate_to_low", CTLFLAG_RW,
1131             &rack_hw_rate_to_low, 0,
1132             "If we fall below this rate, dis-engage hw pacing?");
1133         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1134             SYSCTL_CHILDREN(rack_hw_pacing),
1135             OID_AUTO, "up_only", CTLFLAG_RW,
1136             &rack_hw_up_only, 0,
1137             "Do we allow hw pacing to lower the rate selected?");
1138         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1139             SYSCTL_CHILDREN(rack_hw_pacing),
1140             OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
1141             &rack_hw_pace_extra_slots, 0,
1142             "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
1143         rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1144             SYSCTL_CHILDREN(rack_sysctl_root),
1145             OID_AUTO,
1146             "timely",
1147             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1148             "Rack Timely RTT Controls");
1149         /* Timely based GP dynmics */
1150         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1151             SYSCTL_CHILDREN(rack_timely),
1152             OID_AUTO, "upper", CTLFLAG_RW,
1153             &rack_gp_per_bw_mul_up, 2,
1154             "Rack timely upper range for equal b/w (in percentage)");
1155         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1156             SYSCTL_CHILDREN(rack_timely),
1157             OID_AUTO, "lower", CTLFLAG_RW,
1158             &rack_gp_per_bw_mul_down, 4,
1159             "Rack timely lower range for equal b/w (in percentage)");
1160         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1161             SYSCTL_CHILDREN(rack_timely),
1162             OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
1163             &rack_gp_rtt_maxmul, 3,
1164             "Rack timely multiplier of lowest rtt for rtt_max");
1165         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1166             SYSCTL_CHILDREN(rack_timely),
1167             OID_AUTO, "rtt_min_div", CTLFLAG_RW,
1168             &rack_gp_rtt_mindiv, 4,
1169             "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
1170         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1171             SYSCTL_CHILDREN(rack_timely),
1172             OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
1173             &rack_gp_rtt_minmul, 1,
1174             "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
1175         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1176             SYSCTL_CHILDREN(rack_timely),
1177             OID_AUTO, "decrease", CTLFLAG_RW,
1178             &rack_gp_decrease_per, 20,
1179             "Rack timely decrease percentage of our GP multiplication factor");
1180         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1181             SYSCTL_CHILDREN(rack_timely),
1182             OID_AUTO, "increase", CTLFLAG_RW,
1183             &rack_gp_increase_per, 2,
1184             "Rack timely increase percentage of our GP multiplication factor");
1185         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1186             SYSCTL_CHILDREN(rack_timely),
1187             OID_AUTO, "lowerbound", CTLFLAG_RW,
1188             &rack_per_lower_bound, 50,
1189             "Rack timely lowest percentage we allow GP multiplier to fall to");
1190         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1191             SYSCTL_CHILDREN(rack_timely),
1192             OID_AUTO, "upperboundss", CTLFLAG_RW,
1193             &rack_per_upper_bound_ss, 0,
1194             "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
1195         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1196             SYSCTL_CHILDREN(rack_timely),
1197             OID_AUTO, "upperboundca", CTLFLAG_RW,
1198             &rack_per_upper_bound_ca, 0,
1199             "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
1200         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1201             SYSCTL_CHILDREN(rack_timely),
1202             OID_AUTO, "dynamicgp", CTLFLAG_RW,
1203             &rack_do_dyn_mul, 0,
1204             "Rack timely do we enable dynamic timely goodput by default");
1205         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1206             SYSCTL_CHILDREN(rack_timely),
1207             OID_AUTO, "no_rec_red", CTLFLAG_RW,
1208             &rack_gp_no_rec_chg, 1,
1209             "Rack timely do we prohibit the recovery multiplier from being lowered");
1210         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1211             SYSCTL_CHILDREN(rack_timely),
1212             OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
1213             &rack_timely_dec_clear, 6,
1214             "Rack timely what threshold do we count to before another boost during b/w descent");
1215         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1216             SYSCTL_CHILDREN(rack_timely),
1217             OID_AUTO, "max_push_rise", CTLFLAG_RW,
1218             &rack_timely_max_push_rise, 3,
1219             "Rack timely how many times do we push up with b/w increase");
1220         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1221             SYSCTL_CHILDREN(rack_timely),
1222             OID_AUTO, "max_push_drop", CTLFLAG_RW,
1223             &rack_timely_max_push_drop, 3,
1224             "Rack timely how many times do we push back on b/w descent");
1225         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1226             SYSCTL_CHILDREN(rack_timely),
1227             OID_AUTO, "min_segs", CTLFLAG_RW,
1228             &rack_timely_min_segs, 4,
1229             "Rack timely when setting the cwnd what is the min num segments");
1230         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1231             SYSCTL_CHILDREN(rack_timely),
1232             OID_AUTO, "noback_max", CTLFLAG_RW,
1233             &rack_use_max_for_nobackoff, 0,
1234             "Rack timely when deciding whether to backoff on a loss, do we use under max rtt else min");
1235         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1236             SYSCTL_CHILDREN(rack_timely),
1237             OID_AUTO, "interim_timely_only", CTLFLAG_RW,
1238             &rack_timely_int_timely_only, 0,
1239             "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
1240         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1241             SYSCTL_CHILDREN(rack_timely),
1242             OID_AUTO, "nonstop", CTLFLAG_RW,
1243             &rack_timely_no_stopping, 0,
1244             "Rack timely don't stop increase");
1245         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1246             SYSCTL_CHILDREN(rack_timely),
1247             OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
1248             &rack_down_raise_thresh, 100,
1249             "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
1250         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1251             SYSCTL_CHILDREN(rack_timely),
1252             OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
1253             &rack_req_segs, 1,
1254             "Bottom dragging if not this many segments outstanding and room");
1255
1256         /* TLP and Rack related parameters */
1257         rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1258             SYSCTL_CHILDREN(rack_sysctl_root),
1259             OID_AUTO,
1260             "tlp",
1261             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1262             "TLP and Rack related Controls");
1263         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1264             SYSCTL_CHILDREN(rack_tlp),
1265             OID_AUTO, "use_rrr", CTLFLAG_RW,
1266             &use_rack_rr, 1,
1267             "Do we use Rack Rapid Recovery");
1268         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1269             SYSCTL_CHILDREN(rack_tlp),
1270             OID_AUTO, "post_rec_labc", CTLFLAG_RW,
1271             &rack_max_abc_post_recovery, 2,
1272             "Since we do early recovery, do we override the l_abc to a value, if so what?");
1273         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1274             SYSCTL_CHILDREN(rack_tlp),
1275             OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
1276             &rack_non_rxt_use_cr, 0,
1277             "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
1278         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1279             SYSCTL_CHILDREN(rack_tlp),
1280             OID_AUTO, "tlpmethod", CTLFLAG_RW,
1281             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
1282             "What method do we use for TLP time calc: 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
1283         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1284             SYSCTL_CHILDREN(rack_tlp),
1285             OID_AUTO, "limit", CTLFLAG_RW,
1286             &rack_tlp_limit, 2,
1287             "How many TLP's can be sent without sending new data");
1288         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1289             SYSCTL_CHILDREN(rack_tlp),
1290             OID_AUTO, "use_greater", CTLFLAG_RW,
1291             &rack_tlp_use_greater, 1,
1292             "Should we use the rack_rtt time if it's greater than srtt");
1293         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1294             SYSCTL_CHILDREN(rack_tlp),
1295             OID_AUTO, "tlpminto", CTLFLAG_RW,
1296             &rack_tlp_min, 10000,
1297             "TLP minimum timeout per the specification (in microseconds)");
1298         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1299             SYSCTL_CHILDREN(rack_tlp),
1300             OID_AUTO, "send_oldest", CTLFLAG_RW,
1301             &rack_always_send_oldest, 0,
1302             "Should we always send the oldest TLP and RACK-TLP");
1303         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1304             SYSCTL_CHILDREN(rack_tlp),
1305             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
1306             &rack_limited_retran, 0,
1307             "How many times can a rack timeout drive out sends");
1308         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1309             SYSCTL_CHILDREN(rack_tlp),
1310             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
1311             &rack_lower_cwnd_at_tlp, 0,
1312             "When a TLP completes a retran should we enter recovery");
1313         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1314             SYSCTL_CHILDREN(rack_tlp),
1315             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1316             &rack_reorder_thresh, 2,
1317             "What factor for rack will be added when seeing reordering (shift right)");
1318         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1319             SYSCTL_CHILDREN(rack_tlp),
1320             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1321             &rack_tlp_thresh, 1,
1322             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1323         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1324             SYSCTL_CHILDREN(rack_tlp),
1325             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1326             &rack_reorder_fade, 60000000,
1327             "Does reorder detection fade, if so how many microseconds (0 means never)");
1328         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1329             SYSCTL_CHILDREN(rack_tlp),
1330             OID_AUTO, "pktdelay", CTLFLAG_RW,
1331             &rack_pkt_delay, 1000,
1332             "Extra RACK time (in microseconds) besides reordering thresh");
1333
1334         /* Timer related controls */
1335         rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1336             SYSCTL_CHILDREN(rack_sysctl_root),
1337             OID_AUTO,
1338             "timers",
1339             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1340             "Timer related controls");
1341         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1342             SYSCTL_CHILDREN(rack_timers),
1343             OID_AUTO, "persmin", CTLFLAG_RW,
1344             &rack_persist_min, 250000,
1345             "What is the minimum time in microseconds between persists");
1346         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1347             SYSCTL_CHILDREN(rack_timers),
1348             OID_AUTO, "persmax", CTLFLAG_RW,
1349             &rack_persist_max, 2000000,
1350             "What is the largest delay in microseconds between persists");
1351         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1352             SYSCTL_CHILDREN(rack_timers),
1353             OID_AUTO, "delayed_ack", CTLFLAG_RW,
1354             &rack_delayed_ack_time, 40000,
1355             "Delayed ack time (40ms in microseconds)");
1356         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1357             SYSCTL_CHILDREN(rack_timers),
1358             OID_AUTO, "minrto", CTLFLAG_RW,
1359             &rack_rto_min, 30000,
1360             "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
1361         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1362             SYSCTL_CHILDREN(rack_timers),
1363             OID_AUTO, "maxrto", CTLFLAG_RW,
1364             &rack_rto_max, 4000000,
1365             "Maximum RTO in microseconds -- should be at least as large as min_rto");
1366         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1367             SYSCTL_CHILDREN(rack_timers),
1368             OID_AUTO, "minto", CTLFLAG_RW,
1369             &rack_min_to, 1000,
1370             "Minimum rack timeout in microseconds");
1371         /* Measure controls */
1372         rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1373             SYSCTL_CHILDREN(rack_sysctl_root),
1374             OID_AUTO,
1375             "measure",
1376             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1377             "Measure related controls");
1378         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1379             SYSCTL_CHILDREN(rack_measure),
1380             OID_AUTO, "wma_divisor", CTLFLAG_RW,
1381             &rack_wma_divisor, 8,
1382             "When doing b/w calculation what is the divisor for the WMA");
1383         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1384             SYSCTL_CHILDREN(rack_measure),
1385             OID_AUTO, "end_cwnd", CTLFLAG_RW,
1386             &rack_cwnd_block_ends_measure, 0,
1387             "Does a cwnd just-return end the measurement window (app limited)");
1388         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1389             SYSCTL_CHILDREN(rack_measure),
1390             OID_AUTO, "end_rwnd", CTLFLAG_RW,
1391             &rack_rwnd_block_ends_measure, 0,
1392             "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1393         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1394             SYSCTL_CHILDREN(rack_measure),
1395             OID_AUTO, "min_target", CTLFLAG_RW,
1396             &rack_def_data_window, 20,
1397             "What is the minimum target window (in mss) for a GP measurement");
1398         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1399             SYSCTL_CHILDREN(rack_measure),
1400             OID_AUTO, "goal_bdp", CTLFLAG_RW,
1401             &rack_goal_bdp, 2,
1402             "What is the goal BDP to measure");
1403         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1404             SYSCTL_CHILDREN(rack_measure),
1405             OID_AUTO, "min_srtts", CTLFLAG_RW,
1406             &rack_min_srtts, 1,
1407             "What is the minimum number of SRTTs required for a GP measurement");
1408         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1409             SYSCTL_CHILDREN(rack_measure),
1410             OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1411             &rack_min_measure_usec, 0,
1412             "What is the minimum time (in microseconds) for a measurement; if 0, this is off");
1413         /* Features */
1414         rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1415             SYSCTL_CHILDREN(rack_sysctl_root),
1416             OID_AUTO,
1417             "features",
1418             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1419             "Feature controls");
1420         SYSCTL_ADD_U64(&rack_sysctl_ctx,
1421             SYSCTL_CHILDREN(rack_features),
1422             OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW,
1423             &rack_rxt_clamp_thresh, 0,
1424             "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP");
1425         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1426             SYSCTL_CHILDREN(rack_features),
1427             OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
1428             &rack_hybrid_allow_set_maxseg, 0,
1429             "Should hybrid pacing allow the setmss command");
1430         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1431             SYSCTL_CHILDREN(rack_features),
1432             OID_AUTO, "cmpack", CTLFLAG_RW,
1433             &rack_use_cmp_acks, 1,
1434             "Should RACK have LRO send compressed acks");
1435         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1436             SYSCTL_CHILDREN(rack_features),
1437             OID_AUTO, "fsb", CTLFLAG_RW,
1438             &rack_use_fsb, 1,
1439             "Should RACK use the fast send block?");
1440         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1441             SYSCTL_CHILDREN(rack_features),
1442             OID_AUTO, "rfo", CTLFLAG_RW,
1443             &rack_use_rfo, 1,
1444             "Should RACK use rack_fast_output()?");
1445         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1446             SYSCTL_CHILDREN(rack_features),
1447             OID_AUTO, "rsmrfo", CTLFLAG_RW,
1448             &rack_use_rsm_rfo, 1,
1449             "Should RACK use rack_fast_rsm_output()?");
1450         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1451             SYSCTL_CHILDREN(rack_features),
1452             OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1453             &rack_enable_mqueue_for_nonpaced, 0,
1454             "Should RACK use mbuf queuing for non-paced connections");
1455         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1456             SYSCTL_CHILDREN(rack_features),
1457             OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1458             &rack_do_hystart, 0,
1459             "Should RACK enable HyStart++ on connections?");
1460         /* Misc rack controls */
1461         rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1462             SYSCTL_CHILDREN(rack_sysctl_root),
1463             OID_AUTO,
1464             "misc",
1465             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1466             "Misc related controls");
1467 #ifdef TCP_ACCOUNTING
1468         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1469             SYSCTL_CHILDREN(rack_misc),
1470             OID_AUTO, "tcp_acct", CTLFLAG_RW,
1471             &rack_tcp_accounting, 0,
1472             "Should we turn on TCP accounting for all rack sessions?");
1473 #endif
1474         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1475             SYSCTL_CHILDREN(rack_misc),
1476             OID_AUTO, "dnd", CTLFLAG_RW,
1477             &rack_dnd_default, 0,
1478             "Do not disturb default for rack_rrr = 3");
1479         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1480             SYSCTL_CHILDREN(rack_misc),
1481             OID_AUTO, "sad_seg_per", CTLFLAG_RW,
1482             &sad_seg_size_per, 800,
1483             "Percentage of segment size needed in a sack (800 = 80.0)");
1484         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1485             SYSCTL_CHILDREN(rack_misc),
1486             OID_AUTO, "rxt_controls", CTLFLAG_RW,
1487             &rack_rxt_controls, 0,
1488             "Retransmit sending size controls (valid values 0, 1, 2; default=1)?");
1489         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1490             SYSCTL_CHILDREN(rack_misc),
1491             OID_AUTO, "rack_hibeta", CTLFLAG_RW,
1492             &rack_hibeta_setting, 0,
1493             "Do we use a high beta (80 instead of 50)?");
1494         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1495             SYSCTL_CHILDREN(rack_misc),
1496             OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
1497             &rack_apply_rtt_with_reduced_conf, 0,
1498             "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
1499         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1500             SYSCTL_CHILDREN(rack_misc),
1501             OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1502             &rack_dsack_std_based, 3,
1503             "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1504         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1505             SYSCTL_CHILDREN(rack_misc),
1506             OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1507             &rack_prr_addbackmax, 2,
1508             "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1509         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1510             SYSCTL_CHILDREN(rack_misc),
1511             OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1512             &rack_stats_gets_ms_rtt, 1,
1513             "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1514         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1515             SYSCTL_CHILDREN(rack_misc),
1516             OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1517             &rack_client_low_buf, 0,
1518             "Client low buffer level, below which we are more aggressive in DGP exiting recovery (0 = off)?");
1519         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1520             SYSCTL_CHILDREN(rack_misc),
1521             OID_AUTO, "defprofile", CTLFLAG_RW,
1522             &rack_def_profile, 0,
1523             "Should RACK use a default profile (0=no, num == profile num)?");
1524         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1525             SYSCTL_CHILDREN(rack_misc),
1526             OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1527             &rack_enable_shared_cwnd, 1,
1528             "Should RACK try to use the shared cwnd on connections where allowed");
1529         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1530             SYSCTL_CHILDREN(rack_misc),
1531             OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1532             &rack_limits_scwnd, 1,
1533             "Should RACK place low end time limits on the shared cwnd feature");
1534         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1535             SYSCTL_CHILDREN(rack_misc),
1536             OID_AUTO, "no_prr", CTLFLAG_RW,
1537             &rack_disable_prr, 0,
1538             "Should RACK not use prr and only pace (must have pacing on)");
1539         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1540             SYSCTL_CHILDREN(rack_misc),
1541             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1542             &rack_verbose_logging, 0,
1543             "Should RACK black box logging be verbose");
1544         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1545             SYSCTL_CHILDREN(rack_misc),
1546             OID_AUTO, "data_after_close", CTLFLAG_RW,
1547             &rack_ignore_data_after_close, 1,
1548             "Do we hold off sending a RST until all pending data is ack'd");
1549         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1550             SYSCTL_CHILDREN(rack_misc),
1551             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1552             &rack_sack_not_required, 1,
1553             "Do we allow rack to run on connections not supporting SACK");
1554         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1555             SYSCTL_CHILDREN(rack_misc),
1556             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1557             &rack_send_a_lot_in_prr, 1,
1558             "Send a lot in prr");
1559         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1560             SYSCTL_CHILDREN(rack_misc),
1561             OID_AUTO, "autoscale", CTLFLAG_RW,
1562             &rack_autosndbuf_inc, 20,
1563             "What percentage should rack scale up its snd buffer by?");
1564         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1565             SYSCTL_CHILDREN(rack_misc),
1566             OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW,
1567             &rack_rxt_min_rnds, 10,
1568             "Number of rounds needed between rxt clamps due to high loss rates");
1569         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1570             SYSCTL_CHILDREN(rack_misc),
1571             OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW,
1572             &rack_unclamp_round_thresh, 100,
1573             "Number of rounds needed with no loss to unclamp");
1574         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1575             SYSCTL_CHILDREN(rack_misc),
1576             OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW,
1577             &rack_unclamp_rxt_thresh, 5,
1578             "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)");
1579         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1580             SYSCTL_CHILDREN(rack_misc),
1581             OID_AUTO, "clamp_ss_upper", CTLFLAG_RW,
1582             &rack_clamp_ss_upper, 110,
1583             "Clamp percentage ceiling in SS?");
1584         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1585             SYSCTL_CHILDREN(rack_misc),
1586             OID_AUTO, "clamp_ca_upper", CTLFLAG_RW,
1587             &rack_clamp_ca_upper, 110,
1588             "Clamp percentage ceiling in CA?");
1589         /* Sack Attacker detection stuff */
1590         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1591             SYSCTL_CHILDREN(rack_attack),
1592             OID_AUTO, "merge_out", CTLFLAG_RW,
1593             &rack_merge_out_sacks_on_attack, 0,
1594             "Do we merge the sendmap when we decide we are being attacked?");
1595
1596         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1597             SYSCTL_CHILDREN(rack_attack),
1598             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
1599             &rack_highest_sack_thresh_seen, 0,
1600             "Highest sack to ack ratio seen");
1601         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1602             SYSCTL_CHILDREN(rack_attack),
1603             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
1604             &rack_highest_move_thresh_seen, 0,
1605             "Highest move to non-move ratio seen");
1606         rack_ack_total = counter_u64_alloc(M_WAITOK);
1607         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1608             SYSCTL_CHILDREN(rack_attack),
1609             OID_AUTO, "acktotal", CTLFLAG_RD,
1610             &rack_ack_total,
1611             "Total number of Ack's");
1612         rack_express_sack = counter_u64_alloc(M_WAITOK);
1613         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1614             SYSCTL_CHILDREN(rack_attack),
1615             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
1616             &rack_express_sack,
1617             "Total number of express SACKs");
1618         rack_sack_total = counter_u64_alloc(M_WAITOK);
1619         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1620             SYSCTL_CHILDREN(rack_attack),
1621             OID_AUTO, "sacktotal", CTLFLAG_RD,
1622             &rack_sack_total,
1623             "Total number of SACKs");
1624         rack_move_none = counter_u64_alloc(M_WAITOK);
1625         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1626             SYSCTL_CHILDREN(rack_attack),
1627             OID_AUTO, "move_none", CTLFLAG_RD,
1628             &rack_move_none,
1629             "Total number of SACK index reuse of positions under threshold");
1630         rack_move_some = counter_u64_alloc(M_WAITOK);
1631         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1632             SYSCTL_CHILDREN(rack_attack),
1633             OID_AUTO, "move_some", CTLFLAG_RD,
1634             &rack_move_some,
1635             "Total number of SACK index reuse of positions over threshold");
1636         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
1637         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1638             SYSCTL_CHILDREN(rack_attack),
1639             OID_AUTO, "attacks", CTLFLAG_RD,
1640             &rack_sack_attacks_detected,
1641             "Total number of SACK attackers that had sack disabled");
1642         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
1643         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1644             SYSCTL_CHILDREN(rack_attack),
1645             OID_AUTO, "reversed", CTLFLAG_RD,
1646             &rack_sack_attacks_reversed,
1647             "Total number of SACK attackers that were later determined false positive");
1648         rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK);
1649         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1650             SYSCTL_CHILDREN(rack_attack),
1651             OID_AUTO, "suspect", CTLFLAG_RD,
1652             &rack_sack_attacks_suspect,
1653             "Total number of SACKs that triggered early detection");
1654
1655         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
1656         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1657             SYSCTL_CHILDREN(rack_attack),
1658             OID_AUTO, "nextmerge", CTLFLAG_RD,
1659             &rack_sack_used_next_merge,
1660             "Total number of times we used the next merge");
1661         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
1662         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1663             SYSCTL_CHILDREN(rack_attack),
1664             OID_AUTO, "prevmerge", CTLFLAG_RD,
1665             &rack_sack_used_prev_merge,
1666             "Total number of times we used the prev merge");
1667         /* Counters */
1668         rack_total_bytes = counter_u64_alloc(M_WAITOK);
1669         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1670             SYSCTL_CHILDREN(rack_counters),
1671             OID_AUTO, "totalbytes", CTLFLAG_RD,
1672             &rack_total_bytes,
1673             "Total number of bytes sent");
1674         rack_fto_send = counter_u64_alloc(M_WAITOK);
1675         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1676             SYSCTL_CHILDREN(rack_counters),
1677             OID_AUTO, "fto_send", CTLFLAG_RD,
1678             &rack_fto_send, "Total number of rack_fast_output sends");
1679         rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1680         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1681             SYSCTL_CHILDREN(rack_counters),
1682             OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1683             &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1684         rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1685         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1686             SYSCTL_CHILDREN(rack_counters),
1687             OID_AUTO, "nfto_resend", CTLFLAG_RD,
1688             &rack_nfto_resend, "Total number of rack_output retransmissions");
1689         rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1690         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1691             SYSCTL_CHILDREN(rack_counters),
1692             OID_AUTO, "nfto_send", CTLFLAG_RD,
1693             &rack_non_fto_send, "Total number of rack_output first sends");
1694         rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1695         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1696             SYSCTL_CHILDREN(rack_counters),
1697             OID_AUTO, "rfo_extended", CTLFLAG_RD,
1698             &rack_extended_rfo, "Total number of times we extended rfo");
1699
1700         rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1701         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1702             SYSCTL_CHILDREN(rack_counters),
1703             OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1704             &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1705         rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1706
1707         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1708             SYSCTL_CHILDREN(rack_counters),
1709             OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1710             &rack_hw_pace_lost, "Total number of times we lost hw pacing after it had been set up");
1711         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1712         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1713             SYSCTL_CHILDREN(rack_counters),
1714             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1715             &rack_tlp_tot,
1716             "Total number of tail loss probe expirations");
1717         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1718         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1719             SYSCTL_CHILDREN(rack_counters),
1720             OID_AUTO, "tlp_new", CTLFLAG_RD,
1721             &rack_tlp_newdata,
1722             "Total number of tail loss probe sending new data");
1723         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1724         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1725             SYSCTL_CHILDREN(rack_counters),
1726             OID_AUTO, "tlp_retran", CTLFLAG_RD,
1727             &rack_tlp_retran,
1728             "Total number of tail loss probe sending retransmitted data");
1729         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1730         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1731             SYSCTL_CHILDREN(rack_counters),
1732             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1733             &rack_tlp_retran_bytes,
1734             "Total bytes of tail loss probe sending retransmitted data");
1735         rack_to_tot = counter_u64_alloc(M_WAITOK);
1736         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1737             SYSCTL_CHILDREN(rack_counters),
1738             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1739             &rack_to_tot,
1740             "Total number of times the rack timeout expired");
1741         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1742         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1743             SYSCTL_CHILDREN(rack_counters),
1744             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1745             &rack_saw_enobuf,
1746             "Total number of times a send returned enobuf for non-hdwr paced connections");
1747         rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1748         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1749             SYSCTL_CHILDREN(rack_counters),
1750             OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1751             &rack_saw_enobuf_hw,
1752             "Total number of times a send returned enobuf for hdwr paced connections");
1753         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1754         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1755             SYSCTL_CHILDREN(rack_counters),
1756             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1757             &rack_saw_enetunreach,
1758             "Total number of times a send received an enetunreachable");
1759         rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1760         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1761             SYSCTL_CHILDREN(rack_counters),
1762             OID_AUTO, "alloc_hot", CTLFLAG_RD,
1763             &rack_hot_alloc,
1764             "Total allocations from the top of our list");
1765         rack_to_alloc = counter_u64_alloc(M_WAITOK);
1766         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1767             SYSCTL_CHILDREN(rack_counters),
1768             OID_AUTO, "allocs", CTLFLAG_RD,
1769             &rack_to_alloc,
1770             "Total allocations of tracking structures");
1771         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1772         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1773             SYSCTL_CHILDREN(rack_counters),
1774             OID_AUTO, "allochard", CTLFLAG_RD,
1775             &rack_to_alloc_hard,
1776             "Total allocations done with sleeping the hard way");
1777         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1778         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1779             SYSCTL_CHILDREN(rack_counters),
1780             OID_AUTO, "allocemerg", CTLFLAG_RD,
1781             &rack_to_alloc_emerg,
1782             "Total allocations done from emergency cache");
1783         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1784         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1785             SYSCTL_CHILDREN(rack_counters),
1786             OID_AUTO, "alloc_limited", CTLFLAG_RD,
1787             &rack_to_alloc_limited,
1788             "Total allocations dropped due to limit");
1789         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1790         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1791             SYSCTL_CHILDREN(rack_counters),
1792             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1793             &rack_alloc_limited_conns,
1794             "Connections with allocations dropped due to limit");
1795         rack_split_limited = counter_u64_alloc(M_WAITOK);
1796         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1797             SYSCTL_CHILDREN(rack_counters),
1798             OID_AUTO, "split_limited", CTLFLAG_RD,
1799             &rack_split_limited,
1800             "Split allocations dropped due to limit");
1801         rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK);
1802         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1803             SYSCTL_CHILDREN(rack_counters),
1804             OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD,
1805             &rack_rxt_clamps_cwnd,
1806             "Number of times that excessive rxt clamped the cwnd down");
1807         rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK);
1808         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1809             SYSCTL_CHILDREN(rack_counters),
1810             OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD,
1811             &rack_rxt_clamps_cwnd_uniq,
1812             "Number of connections that have had the cwnd clamped down due to excessive rxt");
1813         rack_persists_sends = counter_u64_alloc(M_WAITOK);
1814         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1815             SYSCTL_CHILDREN(rack_counters),
1816             OID_AUTO, "persist_sends", CTLFLAG_RD,
1817             &rack_persists_sends,
1818             "Number of times we sent a persist probe");
1819         rack_persists_acks = counter_u64_alloc(M_WAITOK);
1820         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1821             SYSCTL_CHILDREN(rack_counters),
1822             OID_AUTO, "persist_acks", CTLFLAG_RD,
1823             &rack_persists_acks,
1824             "Number of times a persist probe was acked");
1825         rack_persists_loss = counter_u64_alloc(M_WAITOK);
1826         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1827             SYSCTL_CHILDREN(rack_counters),
1828             OID_AUTO, "persist_loss", CTLFLAG_RD,
1829             &rack_persists_loss,
1830             "Number of times we detected a lost persist probe (no ack)");
1831         rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
1832         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1833             SYSCTL_CHILDREN(rack_counters),
1834             OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
1835             &rack_persists_lost_ends,
1836             "Number of lost persist probes (no ack) where the connection ended with a PERSIST abort");
1837 #ifdef INVARIANTS
1838         rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1839         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1840             SYSCTL_CHILDREN(rack_counters),
1841             OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1842             &rack_adjust_map_bw,
1843             "Number of times we hit the case where the sb went up and down on a sendmap entry");
1844 #endif
1845         rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1846         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1847             SYSCTL_CHILDREN(rack_counters),
1848             OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1849             &rack_multi_single_eq,
1850             "Total number of acks represented by compressed acks");
1851         rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1852         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1853             SYSCTL_CHILDREN(rack_counters),
1854             OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1855             &rack_proc_non_comp_ack,
1856             "Number of non-compressed acks that we processed");
1857
1858
1859         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1860         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1861             SYSCTL_CHILDREN(rack_counters),
1862             OID_AUTO, "sack_long", CTLFLAG_RD,
1863             &rack_sack_proc_all,
1864             "Total times we had to walk whole list for sack processing");
1865         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1866         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1867             SYSCTL_CHILDREN(rack_counters),
1868             OID_AUTO, "sack_restart", CTLFLAG_RD,
1869             &rack_sack_proc_restart,
1870             "Total times we had to walk whole list due to a restart");
1871         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1872         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1873             SYSCTL_CHILDREN(rack_counters),
1874             OID_AUTO, "sack_short", CTLFLAG_RD,
1875             &rack_sack_proc_short,
1876             "Total times we took shortcut for sack processing");
1877         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
1878         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1879             SYSCTL_CHILDREN(rack_attack),
1880             OID_AUTO, "skipacked", CTLFLAG_RD,
1881             &rack_sack_skipped_acked,
1882             "Total number of times we skipped previously sacked segments");
1883         rack_sack_splits = counter_u64_alloc(M_WAITOK);
1884         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1885             SYSCTL_CHILDREN(rack_attack),
1886             OID_AUTO, "ofsplit", CTLFLAG_RD,
1887             &rack_sack_splits,
1888             "Total number of times we did the old fashioned tree split");
1889         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1890         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1891             SYSCTL_CHILDREN(rack_counters),
1892             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1893             &rack_input_idle_reduces,
1894             "Total number of idle reductions on input");
1895         rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
1896         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1897             SYSCTL_CHILDREN(rack_counters),
1898             OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
1899             &rack_collapsed_win_seen,
1900             "Total number of collapsed window events seen (where our window shrinks)");
1901
1902         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1903         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1904             SYSCTL_CHILDREN(rack_counters),
1905             OID_AUTO, "collapsed_win", CTLFLAG_RD,
1906             &rack_collapsed_win,
1907             "Total number of collapsed window events where we mark packets");
1908         rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
1909         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1910             SYSCTL_CHILDREN(rack_counters),
1911             OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
1912             &rack_collapsed_win_rxt,
1913             "Total number of packets that were retransmitted due to a collapsed window");
1914         rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
1915         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1916             SYSCTL_CHILDREN(rack_counters),
1917             OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
1918             &rack_collapsed_win_rxt_bytes,
1919             "Total number of bytes that were retransmitted due to a collapsed window");
1920         rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1921         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1922             SYSCTL_CHILDREN(rack_counters),
1923             OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1924             &rack_try_scwnd,
1925             "Total number of scwnd attempts");
1926         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1927         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1928             OID_AUTO, "outsize", CTLFLAG_RD,
1929             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1930         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1931         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1932             OID_AUTO, "opts", CTLFLAG_RD,
1933             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1934         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1935             SYSCTL_CHILDREN(rack_sysctl_root),
1936             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1937             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1938 }
1939
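/*
 * Return the initial congestion window in bytes: either the user
 * configured number of segments (rc_init_win) times the fixed maximum
 * segment size, or, when nothing was set by the user, the system stack
 * default computed by tcp_compute_initwnd().
 */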
1940 static uint32_t
1941 rc_init_window(struct tcp_rack *rack)
1942 {
1943         uint32_t win;
1944
1945         if (rack->rc_init_win == 0) {
1946                 /*
1947                  * Nothing set by the user, use the system stack
1948                  * default.
1949                  */
1950                 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1951         }
1952         win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
1953         return (win);
1954 }
1955
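/*
 * Select the user supplied fixed pacing rate for the current phase of
 * the connection: the recovery rate while in fast recovery, the
 * slow-start rate while cwnd_to_use is below ssthresh, and the
 * congestion-avoidance rate otherwise.
 */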
1956 static uint64_t
1957 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1958 {
1959         if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
1960                 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1961         else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1962                 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1963         else
1964                 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1965 }
1966
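/*
 * Emit a black-box log record for hybrid pacing decisions. The 64-bit
 * values taken from the sendfile tracking entry (start, end,
 * first_send, localtime) and lt_bw do not fit in a single BB log
 * field, so each is split across a pair of 32-bit fields as noted in
 * the inline comments below.
 */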
1967 static void
1968 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim,
1969         uint64_t data, uint8_t mod, uint16_t aux,
1970         struct http_sendfile_track *cur)
1971 {
1972 #ifdef TCP_REQUEST_TRK
1973         int do_log = 0;
1974
1975         /*
1976          * The rate cap one is noisy and only should come out when normal BB logging
1977          * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out
1978          * once per chunk and make up the BBpoint that can be turned on by the client.
1979          */
1980         if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
1981                 if (rack_verbose_logging != 0)
1982                         do_log = tcp_bblogging_on(rack->rc_tp);
1983                 else
1984                         do_log = 0;
1985         } else
1986                 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING);
1987
1988         if (do_log) {
1989                 union tcp_log_stackspecific log;
1990                 struct timeval tv;
1991                 uint64_t lt_bw;
1992
1993                 /* Convert our ms to a microsecond */
1994                 memset(&log, 0, sizeof(log));
1995
1996                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1997                 log.u_bbr.rttProp = tim;
1998                 log.u_bbr.bw_inuse = cbw;
1999                 log.u_bbr.delRate = rack_get_gp_est(rack);
2000                 lt_bw = rack_get_lt_bw(rack);
2001                 log.u_bbr.flex1 = seq;
2002                 log.u_bbr.pacing_gain = aux;
2003                 /* lt_bw = < flex3 | flex2 > */
2004                 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff);
2005                 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff);
2006                 /* Record the last obtained us rtt in inflight */
2007                 if (cur == NULL) {
2008                         /* Make sure we are looking at the right log if an override comes in */
2009                         cur = rack->r_ctl.rc_last_sft;
2010                 }
2011                 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY)
2012                         log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt;
2013                 else {
2014                         /* Use the last known rtt i.e. the rack-rtt */
2015                         log.u_bbr.inflight = rack->rc_rack_rtt;
2016                 }
2017                 if (cur != NULL) {
2018                         uint64_t off;
2019
2020                         log.u_bbr.cur_del_rate = cur->deadline;
2021                         if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
2022                                 /* start = < lost | pkt_epoch > */
2023                                 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
2024                                 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
2025                                 log.u_bbr.flex6 = cur->start_seq;
2026                                 log.u_bbr.pkts_out = cur->end_seq;
2027                         } else {
2028                                 /* start = < lost | pkt_epoch > */
2029                                 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
2030                                 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
2031                                 /* end = < pkts_out | flex6 > */
2032                                 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff);
2033                                 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
2034                         }
2035                         /* first_send = <lt_epoch | epoch> */
2036                         log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff);
2037                         log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff);
2038                         /* localtime = <delivered | applimited>*/
2039                         log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
2040                         log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
2041                         off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]);
2042                         log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track));
2043                         log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
2044                         log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
2045                         log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
2046                 } else {
2047                         log.u_bbr.flex7 = 0xffff;
2048                         log.u_bbr.cur_del_rate = 0xffffffffffffffff;
2049                 }
2050                 /*
2051                  * Compose bbr_state to be a bitwise 0000ADHF
2052                  * where A is the always_pace flag
2053                  * where D is the dgp_on flag
2054                  * where H is the hybrid_mode on flag
2055                  * where F is the use_fixed_rate flag.
2056                  */
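                /* e.g. always_pace=1, dgp_on=0, hybrid_mode=1, use_fixed_rate=0 -> 0b1010 (0xa) */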
2057                 log.u_bbr.bbr_state = rack->rc_always_pace;
2058                 log.u_bbr.bbr_state <<= 1;
2059                 log.u_bbr.bbr_state |= rack->dgp_on;
2060                 log.u_bbr.bbr_state <<= 1;
2061                 log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
2062                 log.u_bbr.bbr_state <<= 1;
2063                 log.u_bbr.bbr_state |= rack->use_fixed_rate;
2064                 log.u_bbr.flex8 = mod;
2065                 tcp_log_event(rack->rc_tp, NULL,
2066                     &rack->rc_inp->inp_socket->so_rcv,
2067                     &rack->rc_inp->inp_socket->so_snd,
2068                     TCP_HYBRID_PACING_LOG, 0,
2069                     0, &log, false, NULL, __func__, __LINE__, &tv);
2070
2071         }
2072 #endif
2073 }
2074
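/*
 * Scale a goodput-based b/w estimate up to an approximate wire rate by
 * charging each segment for its TCP, IP/IPv6 and 14 byte Ethernet
 * header overhead. As an illustration, a 1448 byte segment over IPv4
 * occupies roughly 1448 + 20 + 20 + 14 = 1502 bytes on the wire, so
 * the returned rate is about 4% higher than the passed in bw.
 */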
2075 static inline uint64_t
2076 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw)
2077 {
2078         uint64_t ret_bw, ether;
2079         uint64_t u_segsiz;
2080
2081         ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr);
2082         if (rack->r_is_v6){
2083 #ifdef INET6
2084                 ether += sizeof(struct ip6_hdr);
2085 #endif
2086                 ether += 14;    /* eheader size 6+6+2 */
2087         } else {
2088 #ifdef INET
2089                 ether += sizeof(struct ip);
2090 #endif
2091                 ether += 14;    /* eheader size 6+6+2 */
2092         }
2093         u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs);
2094         ret_bw = bw;
2095         ret_bw *= ether;
2096         ret_bw /= u_segsiz;
2097         return (ret_bw);
2098 }
2099
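/*
 * Apply any bandwidth cap to *bw. In hybrid catch-up mode the cap is
 * recomputed from the bytes still owed on the current sendfile request
 * and the time remaining until its deadline; otherwise the static
 * bw_rate_cap is used. *capped is set whenever *bw is lowered.
 */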
2100 static void
2101 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped)
2102 {
2103 #ifdef TCP_REQUEST_TRK
2104         struct timeval tv;
2105         uint64_t timenow, timeleft, lenleft, lengone, calcbw;
2106 #endif
2107
2108         if (rack->r_ctl.bw_rate_cap == 0)
2109                 return;
2110 #ifdef TCP_REQUEST_TRK
2111         if (rack->rc_catch_up && rack->rc_hybrid_mode &&
2112             (rack->r_ctl.rc_last_sft != NULL)) {
2113                 /*
2114                  * We have a dynamic cap. The original target
2115                  * is in bw_rate_cap, but we need to look at
2116                  * how long it is until we hit the deadline.
2117                  */
2118                 struct http_sendfile_track *ent;
2119
2120                 ent = rack->r_ctl.rc_last_sft;
2121                 microuptime(&tv);
2122                 timenow = tcp_tv_to_lusectick(&tv);
2123                 if (timenow >= ent->deadline) {
2124                         /* No time left we do DGP only */
2125                         rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2126                                            0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent);
2127                         rack->r_ctl.bw_rate_cap = 0;
2128                         return;
2129                 }
2130                 /* We have the time */
2131                 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow;
2132                 if (timeleft < HPTS_MSEC_IN_SEC) {
2133                         /* If there is less than a ms left just use DGPs rate */
2134                         rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2135                                            0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent);
2136                         rack->r_ctl.bw_rate_cap = 0;
2137                         return;
2138                 }
2139                 /*
2140                  * Now lets find the amount of data left to send.
2141                  *
2142                  * Now ideally we want to use the end_seq to figure out how much more
2143                  * but that is only possible if TCP_HTTP_TRACK_FLG_COMP is set on the entry.
2144                  */
2145                 if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
2146                         if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una))
2147                                 lenleft = ent->end_seq - rack->rc_tp->snd_una;
2148                         else {
2149                                 /* TSNH, we should catch it at the send */
2150                                 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2151                                                    0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent);
2152                                 rack->r_ctl.bw_rate_cap = 0;
2153                                 return;
2154                         }
2155                 } else {
2156                         /*
2157                          * The hard way, figure out how much is gone and then
2158                          * take that away from the total the client asked for
2159                          * (that's off by TLS overhead if this is TLS).
2160                          */
2161                         if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq))
2162                                 lengone = rack->rc_tp->snd_una - ent->start_seq;
2163                         else
2164                                 lengone = 0;
2165                         if (lengone < (ent->end - ent->start))
2166                                 lenleft = (ent->end - ent->start) - lengone;
2167                         else {
2168                                 /* TSNH, we should catch it at the send */
2169                                 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2170                                                    0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent);
2171                                 rack->r_ctl.bw_rate_cap = 0;
2172                                 return;
2173                         }
2174                 }
2175                 if (lenleft == 0) {
2176                         /* We have it all sent */
2177                         rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2178                                            0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent);
2179                         if (rack->r_ctl.bw_rate_cap)
2180                                 goto normal_ratecap;
2181                         else
2182                                 return;
2183                 }
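                /*
                 * The rate needed to finish on time: bytes left, scaled
                 * to a per-second value, divided by the microseconds
                 * remaining until the deadline.
                 */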
2184                 calcbw = lenleft * HPTS_USEC_IN_SEC;
2185                 calcbw /= timeleft;
2186                 /* Now we must compensate for IP/TCP overhead */
2187                 calcbw = rack_compensate_for_linerate(rack, calcbw);
2188                 /* Update the bit rate cap */
2189                 rack->r_ctl.bw_rate_cap = calcbw;
2190                 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
2191                     (rack_hybrid_allow_set_maxseg == 1) &&
2192                     ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
2193                         /* Lets set in a smaller mss possibly here to match our rate-cap */
2194                         uint32_t orig_max;
2195
2196                         orig_max = rack->r_ctl.rc_pace_max_segs;
2197                         rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
2198                         rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp));
2199                         rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
2200                 }
2201                 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2202                                    calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent);
2203                 if ((calcbw > 0) && (*bw > calcbw)) {
2204                         rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2205                                            *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent);
2206                         *capped = 1;
2207                         *bw = calcbw;
2208                 }
2209                 return;
2210         }
2211 normal_ratecap:
2212 #endif
2213         if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) {
2214 #ifdef TCP_REQUEST_TRK
2215                 if (rack->rc_hybrid_mode &&
2216                     rack->rc_catch_up &&
2217                     (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
2218                     (rack_hybrid_allow_set_maxseg == 1) &&
2219                     ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
2220                         /* Lets set in a smaller mss possibly here to match our rate-cap */
2221                         uint32_t orig_max;
2222
2223                         orig_max = rack->r_ctl.rc_pace_max_segs;
2224                         rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
2225                         rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp));
2226                         rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
2227                 }
2228 #endif
2229                 *capped = 1;
2230                 *bw = rack->r_ctl.bw_rate_cap;
2231                 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2232                                    *bw, 0, 0,
2233                                    HYBRID_LOG_RATE_CAP, 1, NULL);
2234         }
2235 }
2236
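/*
 * Return the goodput b/w estimate used for pacing. When no goodput
 * measurement exists yet we fall back, in order, to the long-term b/w,
 * the user supplied init_rate, or a guess built from the initial TCP
 * window paced over the SRTT; with no SRTT at all we return 0 so only
 * burst mitigation is done. Measured and guessed rates are compensated
 * for line-rate overhead before being returned (init_rate is not).
 */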
2237 static uint64_t
2238 rack_get_gp_est(struct tcp_rack *rack)
2239 {
2240         uint64_t bw, lt_bw, ret_bw;
2241
2242         if (rack->rc_gp_filled == 0) {
2243                 /*
2244                  * We have no b/w measurement yet,
2245                  * if we have a user set initial bw
2246                  * return it. If we don't have that and
2247                  * we have an srtt, use the tcp IW (10) to
2248                  * calculate a fictional b/w over the SRTT
2249                  * which is more or less a guess. Note
2250                  * we don't use our IW from rack on purpose
2251                  * so if we have like IW=30, we are not
2252                  * calculating a "huge" b/w.
2253                  */
2254                 uint64_t srtt;
2255
2256                 lt_bw = rack_get_lt_bw(rack);
2257                 if (lt_bw) {
2258                         /*
2259                          * No goodput b/w, but a long-term b/w does exist;
2260                          * let's use that.
2261                          */
2262                         ret_bw = lt_bw;
2263                         goto compensate;
2264                 }
2265                 if (rack->r_ctl.init_rate)
2266                         return (rack->r_ctl.init_rate);
2267
2268                 /* Ok, let's come up with the IW guess, if we have an srtt */
2269                 if (rack->rc_tp->t_srtt == 0) {
2270                         /*
2271                          * Go with old pacing method
2272                          * i.e. burst mitigation only.
2273                          */
2274                         return (0);
2275                 }
2276                 /* Ok, let's get the initial TCP win (not rack's) */
2277                 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
2278                 srtt = (uint64_t)rack->rc_tp->t_srtt;
2279                 bw *= (uint64_t)USECS_IN_SECOND;
2280                 bw /= srtt;
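                /*
                 * Illustrative example only (hypothetical numbers): assuming
                 * the initial window works out to 10 * 1460 = 14600 bytes and
                 * the measured srtt is 100000 usec (100 ms), the guess above is
                 *   14600 * 1000000 / 100000 = 146000 bytes/sec,
                 * i.e. roughly 1.17 Mbit/s before line-rate compensation.
                 */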
2281                 ret_bw = bw;
2282                 goto compensate;
2283
2284         }
2285         if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
2286                 /* Averaging is done, we can return the value */
2287                 bw = rack->r_ctl.gp_bw;
2288         } else {
2289                 /* Still doing the initial average, must calculate */
2290                 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
2291         }
2292         lt_bw = rack_get_lt_bw(rack);
2293         if (lt_bw == 0) {
2294                 /* If we don't have one then equate it to the gp_bw */
2295                 lt_bw = rack->r_ctl.gp_bw;
2296         }
2297         if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){
2298                 /*  if clamped take the lowest */
2299                 if (lt_bw < bw)
2300                         ret_bw = lt_bw;
2301                 else
2302                         ret_bw = bw;
2303         } else {
2304                 /* If not set so that clamped gets the lowest, take the highest */
2305                 if (lt_bw > bw)
2306                         ret_bw = lt_bw;
2307                 else
2308                         ret_bw = bw;
2309         }
2310         /*
2311          * Now let's compensate for the TCP/IP overhead. Our
2312          * Goodput estimate does not include this so we must pace out
2313          * a bit faster since our pacing calculations do. The pacing
2314          * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz
2315          * we are using to do this, so we do that here in the opposite
2316          * direction as well. This means that if we are tunneled and the
2317          * segsiz is, say, 1200 bytes we will get quite a boost, but it's
2318          * compensated for in the pacing time in the opposite direction.
2319          */
2320 compensate:
2321         ret_bw = rack_compensate_for_linerate(rack, ret_bw);
2322         return(ret_bw);
2323 }
2324
2325
2326 static uint64_t
2327 rack_get_bw(struct tcp_rack *rack)
2328 {
2329         uint64_t bw;
2330
2331         if (rack->use_fixed_rate) {
2332                 /* Return the fixed pacing rate */
2333                 return (rack_get_fixed_pacing_bw(rack));
2334         }
2335         bw = rack_get_gp_est(rack);
2336         return (bw);
2337 }
2338
2339 static uint16_t
2340 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
2341 {
2342         if (rack->use_fixed_rate) {
2343                 return (100);
2344         } else if (rack->in_probe_rtt && (rsm == NULL))
2345                 return (rack->r_ctl.rack_per_of_gp_probertt);
2346         else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
2347                   rack->r_ctl.rack_per_of_gp_rec)) {
2348                 if (rsm) {
2349                         /* a retransmission always uses the recovery rate */
2350                         return (rack->r_ctl.rack_per_of_gp_rec);
2351                 } else if (rack->rack_rec_nonrxt_use_cr) {
2352                         /* Directed to use the configured rate */
2353                         goto configured_rate;
2354                 } else if (rack->rack_no_prr &&
2355                            (rack->r_ctl.rack_per_of_gp_rec > 100)) {
2356                         /* No PRR, let's just use the b/w estimate */
2357                         return (100);
2358                 } else {
2359                         /*
2360                          * Here we may have a non-retransmit but we
2361                          * have no overrides, so just use the recovery
2362                          * rate (prr is in effect).
2363                          */
2364                         return (rack->r_ctl.rack_per_of_gp_rec);
2365                 }
2366         }
2367 configured_rate:
2368         /* For the configured rate we look at our cwnd vs the ssthresh */
2369         if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2370                 return (rack->r_ctl.rack_per_of_gp_ss);
2371         else
2372                 return (rack->r_ctl.rack_per_of_gp_ca);
2373 }
2374
2375 static void
2376 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
2377 {
2378         /*
2379          * Types of logs (mod value)
2380          * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
2381          * 2 = a dsack round begins, persist is reset to 16.
2382          * 3 = a dsack round ends
2383          * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
2384          * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
2385          * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
2386          */
2387         if (tcp_bblogging_on(rack->rc_tp)) {
2388                 union tcp_log_stackspecific log;
2389                 struct timeval tv;
2390
2391                 memset(&log, 0, sizeof(log));
2392                 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
2393                 log.u_bbr.flex1 <<= 1;
2394                 log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
2395                 log.u_bbr.flex1 <<= 1;
2396                 log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
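                /*
                 * After the shifts above, flex1 holds, from bit 2 down to
                 * bit 0: rc_rack_tmr_std_based, rc_rack_use_dsack,
                 * rc_dsack_round_seen.
                 */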
2397                 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
2398                 log.u_bbr.flex3 = rack->r_ctl.num_dsack;
2399                 log.u_bbr.flex4 = flex4;
2400                 log.u_bbr.flex5 = flex5;
2401                 log.u_bbr.flex6 = flex6;
2402                 log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
2403                 log.u_bbr.flex8 = mod;
2404                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2405                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2406                     &rack->rc_inp->inp_socket->so_rcv,
2407                     &rack->rc_inp->inp_socket->so_snd,
2408                     RACK_DSACK_HANDLING, 0,
2409                     0, &log, false, &tv);
2410         }
2411 }
2412
2413 static void
2414 rack_log_hdwr_pacing(struct tcp_rack *rack,
2415                      uint64_t rate, uint64_t hw_rate, int line,
2416                      int error, uint16_t mod)
2417 {
2418         if (tcp_bblogging_on(rack->rc_tp)) {
2419                 union tcp_log_stackspecific log;
2420                 struct timeval tv;
2421                 const struct ifnet *ifp;
2422
2423                 memset(&log, 0, sizeof(log));
2424                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2425                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2426                 if (rack->r_ctl.crte) {
2427                         ifp = rack->r_ctl.crte->ptbl->rs_ifp;
2428                 } else if (rack->rc_inp->inp_route.ro_nh &&
2429                            rack->rc_inp->inp_route.ro_nh->nh_ifp) {
2430                         ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
2431                 } else
2432                         ifp = NULL;
2433                 if (ifp) {
2434                         log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
2435                         log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2436                 }
2437                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2438                 log.u_bbr.bw_inuse = rate;
2439                 log.u_bbr.flex5 = line;
2440                 log.u_bbr.flex6 = error;
2441                 log.u_bbr.flex7 = mod;
2442                 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
2443                 log.u_bbr.flex8 = rack->use_fixed_rate;
2444                 log.u_bbr.flex8 <<= 1;
2445                 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
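                /* flex8 now packs use_fixed_rate in bit 1 and rack_hdrw_pacing in bit 0 */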
2446                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
2447                 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
2448                 if (rack->r_ctl.crte)
2449                         log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
2450                 else
2451                         log.u_bbr.cur_del_rate = 0;
2452                 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
2453                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2454                     &rack->rc_inp->inp_socket->so_rcv,
2455                     &rack->rc_inp->inp_socket->so_snd,
2456                     BBR_LOG_HDWR_PACE, 0,
2457                     0, &log, false, &tv);
2458         }
2459 }
2460
2461 static uint64_t
2462 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2463 {
2464         /*
2465          * We allow rack_per_of_gp_xx to dictate the b/w rate we want.
2466          */
2467         uint64_t bw_est, high_rate;
2468         uint64_t gain;
2469
2470         if ((rack->r_pacing_discount == 0) ||
2471             (rack_full_buffer_discount == 0)) {
2472                  * No discount based on the client's buffer level
2473                  * is enabled, or the feature is disabled.
2474                  * level is enabled or the feature is disabled.
2475                  */
2476                 gain = (uint64_t)rack_get_output_gain(rack, rsm);
2477                 bw_est = bw * gain;
2478                 bw_est /= (uint64_t)100;
2479         } else {
2480                 /*
2481                  * We have a discount in place; apply it with
2482                  * just a 100% gain (we get no boost if the buffer
2483                  * is full).
2484                  */
2485                 uint64_t discount;
2486
2487                 discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm);
2488                 discount /= 100;
2489                 /* What %% of the b/w do we discount */
2490                 bw_est = bw - discount;
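                /*
                 * Illustrative example only (hypothetical values): with
                 * rack_full_buffer_discount = 10 and a pacing_discount_amm
                 * of 2, discount = bw * 20 / 100, so we would pace at 80%
                 * of the estimated b/w.
                 */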
2491         }
2492         /* Never fall below the minimum (def 64kbps) */
2493         if (bw_est < RACK_MIN_BW)
2494                 bw_est = RACK_MIN_BW;
2495         if (rack->r_rack_hw_rate_caps) {
2496                 /* Rate caps are in place */
2497                 if (rack->r_ctl.crte != NULL) {
2498                         /* We have a hdwr rate already */
2499                         high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2500                         if (bw_est >= high_rate) {
2501                                 /* We are capping bw at the highest rate table entry */
2502                                 if (rack_hw_rate_cap_per &&
2503                                     (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) {
2504                                         rack->r_rack_hw_rate_caps = 0;
2505                                         goto done;
2506                                 }
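                                /*
                                 * Illustrative note (hypothetical numbers): with
                                 * rack_hw_rate_cap_per = 10 and a highest table
                                 * entry of 1,000,000 bytes/sec, a bw_est above
                                 * 1,100,000 takes the check above, disabling the
                                 * rate caps and returning without clamping to
                                 * high_rate below.
                                 */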
2507                                 rack_log_hdwr_pacing(rack,
2508                                                      bw_est, high_rate, __LINE__,
2509                                                      0, 3);
2510                                 bw_est = high_rate;
2511                                 if (capped)
2512                                         *capped = 1;
2513                         }
2514                 } else if ((rack->rack_hdrw_pacing == 0) &&
2515                            (rack->rack_hdw_pace_ena) &&
2516                            (rack->rack_attempt_hdwr_pace == 0) &&
2517                            (rack->rc_inp->inp_route.ro_nh != NULL) &&
2518                            (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2519                         /*
2520                          * Special case: we have not yet attempted hardware
2521                          * pacing, and when we do we may find out that we are
2522                          * above the highest rate. We need to know the maxbw for the interface
2523                          * in question (if it supports ratelimiting). We get back
2524                          * 0 if the interface is not found in the RL lists.
2525                          */
2526                         high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2527                         if (high_rate) {
2528                         /* Yep, we have a rate; is it above this rate? */
2529                                 if (bw_est > high_rate) {
2530                                         bw_est = high_rate;
2531                                         if (capped)
2532                                                 *capped = 1;
2533                                 }
2534                         }
2535                 }
2536         }
2537 done:
2538         return (bw_est);
2539 }
2540
2541 static void
2542 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
2543 {
2544         if (tcp_bblogging_on(rack->rc_tp)) {
2545                 union tcp_log_stackspecific log;
2546                 struct timeval tv;
2547
2548                 if (rack->sack_attack_disable > 0)
2549                         goto log_anyway;
2550                 if ((mod != 1) && (rack_verbose_logging == 0))  {
2551                         /*
2552                          * We get 3 values currently for mod
2553                          * 1 - We are retransmitting and this tells the reason.
2554                          * 2 - We are clearing a dup-ack count.
2555                          * 3 - We are incrementing a dup-ack count.
2556                          *
2557                          * The clear/increment are only logged
2558                          * if you have BBverbose on.
2559                          */
2560                         return;
2561                 }
2562 log_anyway:
2563                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2564                 log.u_bbr.flex1 = tsused;
2565                 log.u_bbr.flex2 = thresh;
2566                 log.u_bbr.flex3 = rsm->r_flags;
2567                 log.u_bbr.flex4 = rsm->r_dupack;
2568                 log.u_bbr.flex5 = rsm->r_start;
2569                 log.u_bbr.flex6 = rsm->r_end;
2570                 log.u_bbr.flex8 = mod;
2571                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2572                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2573                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2574                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2575                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2576                 log.u_bbr.pacing_gain = rack->r_must_retran;
2577                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2578                     &rack->rc_inp->inp_socket->so_rcv,
2579                     &rack->rc_inp->inp_socket->so_snd,
2580                     BBR_LOG_SETTINGS_CHG, 0,
2581                     0, &log, false, &tv);
2582         }
2583 }
2584
2585 static void
2586 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2587 {
2588         if (tcp_bblogging_on(rack->rc_tp)) {
2589                 union tcp_log_stackspecific log;
2590                 struct timeval tv;
2591
2592                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2593                 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2594                 log.u_bbr.flex2 = to;
2595                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2596                 log.u_bbr.flex4 = slot;
2597                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
2598                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2599                 log.u_bbr.flex7 = rack->rc_in_persist;
2600                 log.u_bbr.flex8 = which;
2601                 if (rack->rack_no_prr)
2602                         log.u_bbr.pkts_out = 0;
2603                 else
2604                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2605                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2606                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2607                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2608                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2609                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2610                 log.u_bbr.pacing_gain = rack->r_must_retran;
2611                 log.u_bbr.cwnd_gain = rack->rack_deferred_inited;
2612                 log.u_bbr.pkt_epoch = rack->rc_has_collapsed;
2613                 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2614                 log.u_bbr.lost = rack_rto_min;
2615                 log.u_bbr.epoch = rack->r_ctl.roundends;
2616                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2617                     &rack->rc_inp->inp_socket->so_rcv,
2618                     &rack->rc_inp->inp_socket->so_snd,
2619                     BBR_LOG_TIMERSTAR, 0,
2620                     0, &log, false, &tv);
2621         }
2622 }
2623
2624 static void
2625 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2626 {
2627         if (tcp_bblogging_on(rack->rc_tp)) {
2628                 union tcp_log_stackspecific log;
2629                 struct timeval tv;
2630
2631                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2632                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2633                 log.u_bbr.flex8 = to_num;
2634                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2635                 log.u_bbr.flex2 = rack->rc_rack_rtt;
2636                 if (rsm == NULL)
2637                         log.u_bbr.flex3 = 0;
2638                 else
2639                         log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2640                 if (rack->rack_no_prr)
2641                         log.u_bbr.flex5 = 0;
2642                 else
2643                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2644                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2645                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2646                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2647                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2648                 log.u_bbr.pacing_gain = rack->r_must_retran;
2649                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2650                     &rack->rc_inp->inp_socket->so_rcv,
2651                     &rack->rc_inp->inp_socket->so_snd,
2652                     BBR_LOG_RTO, 0,
2653                     0, &log, false, &tv);
2654         }
2655 }
2656
2657 static void
2658 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
2659                  struct rack_sendmap *prev,
2660                  struct rack_sendmap *rsm,
2661                  struct rack_sendmap *next,
2662                  int flag, uint32_t th_ack, int line)
2663 {
2664         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2665                 union tcp_log_stackspecific log;
2666                 struct timeval tv;
2667
2668                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2669                 log.u_bbr.flex8 = flag;
2670                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2671                 log.u_bbr.cur_del_rate = (uint64_t)prev;
2672                 log.u_bbr.delRate = (uint64_t)rsm;
2673                 log.u_bbr.rttProp = (uint64_t)next;
2674                 log.u_bbr.flex7 = 0;
2675                 if (prev) {
2676                         log.u_bbr.flex1 = prev->r_start;
2677                         log.u_bbr.flex2 = prev->r_end;
2678                         log.u_bbr.flex7 |= 0x4;
2679                 }
2680                 if (rsm) {
2681                         log.u_bbr.flex3 = rsm->r_start;
2682                         log.u_bbr.flex4 = rsm->r_end;
2683                         log.u_bbr.flex7 |= 0x2;
2684                 }
2685                 if (next) {
2686                         log.u_bbr.flex5 = next->r_start;
2687                         log.u_bbr.flex6 = next->r_end;
2688                         log.u_bbr.flex7 |= 0x1;
2689                 }
2690                 log.u_bbr.applimited = line;
2691                 log.u_bbr.pkts_out = th_ack;
2692                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2693                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2694                 if (rack->rack_no_prr)
2695                         log.u_bbr.lost = 0;
2696                 else
2697                         log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
2698                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2699                     &rack->rc_inp->inp_socket->so_rcv,
2700                     &rack->rc_inp->inp_socket->so_snd,
2701                     TCP_LOG_MAPCHG, 0,
2702                     0, &log, false, &tv);
2703         }
2704 }
2705
2706 static void
2707 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
2708                  struct rack_sendmap *rsm, int conf)
2709 {
2710         if (tcp_bblogging_on(tp)) {
2711                 union tcp_log_stackspecific log;
2712                 struct timeval tv;
2713                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2714                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2715                 log.u_bbr.flex1 = t;
2716                 log.u_bbr.flex2 = len;
2717                 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
2718                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
2719                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
2720                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2721                 log.u_bbr.flex7 = conf;
2722                 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
2723                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
2724                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2725                 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2726                 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
2727                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2728                 if (rsm) {
2729                         log.u_bbr.pkt_epoch = rsm->r_start;
2730                         log.u_bbr.lost = rsm->r_end;
2731                         log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
2732                         /* We lose any upper bits of the 24 (only 16 fit) */
2733                         log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
2734                 } else {
2735                         /* It's a SYN */
2736                         log.u_bbr.pkt_epoch = rack->rc_tp->iss;
2737                         log.u_bbr.lost = 0;
2738                         log.u_bbr.cwnd_gain = 0;
2739                         log.u_bbr.pacing_gain = 0;
2740                 }
2741                 /* Write out general bits of interest rrs here */
2742                 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
2743                 log.u_bbr.use_lt_bw <<= 1;
2744                 log.u_bbr.use_lt_bw |= rack->forced_ack;
2745                 log.u_bbr.use_lt_bw <<= 1;
2746                 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
2747                 log.u_bbr.use_lt_bw <<= 1;
2748                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
2749                 log.u_bbr.use_lt_bw <<= 1;
2750                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
2751                 log.u_bbr.use_lt_bw <<= 1;
2752                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
2753                 log.u_bbr.use_lt_bw <<= 1;
2754                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
2755                 log.u_bbr.use_lt_bw <<= 1;
2756                 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
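                /*
                 * use_lt_bw now packs, from bit 7 down to bit 0:
                 * rc_highly_buffered, forced_ack, rc_gp_dyn_mul,
                 * in_probe_rtt, measure_saw_probe_rtt,
                 * app_limited_needs_set, rc_gp_filled, rc_dragged_bottom.
                 */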
2757                 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
2758                 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
2759                 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
2760                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
2761                 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
2762                 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
2763                 log.u_bbr.bw_inuse <<= 32;
2764                 if (rsm)
2765                         log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
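                /*
                 * bw_inuse carries the receive time (in usec ticks) in its
                 * upper 32 bits and, when an rsm is present, the last send
                 * time of that rsm in the lower 32 bits.
                 */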
2766                 TCP_LOG_EVENTP(tp, NULL,
2767                     &rack->rc_inp->inp_socket->so_rcv,
2768                     &rack->rc_inp->inp_socket->so_snd,
2769                     BBR_LOG_BBRRTT, 0,
2770                     0, &log, false, &tv);
2771
2772
2773         }
2774 }
2775
2776 static void
2777 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
2778 {
2779         /*
2780          * Log the rtt sample we are
2781          * applying to the srtt algorithm in
2782          * useconds.
2783          */
2784         if (tcp_bblogging_on(rack->rc_tp)) {
2785                 union tcp_log_stackspecific log;
2786                 struct timeval tv;
2787
2788                 /* Convert our ms to a microsecond */
2789                 memset(&log, 0, sizeof(log));
2790                 log.u_bbr.flex1 = rtt;
2791                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2792                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
2793                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2794                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
2795                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2796                 log.u_bbr.flex7 = 1;
2797                 log.u_bbr.flex8 = rack->sack_attack_disable;
2798                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2799                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2800                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2801                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2802                 log.u_bbr.pacing_gain = rack->r_must_retran;
2803                 /*
2804                  * We capture in delRate the upper 32 bits as
2805                  * the confidence level we had declared, and the
2806                  * lower 32 bits as the actual RTT using the arrival
2807                  * timestamp.
2808                  */
2809                 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
2810                 log.u_bbr.delRate <<= 32;
2811                 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
2812                 /* Let's capture all the things that make up t_rxtcur */
2813                 log.u_bbr.applimited = rack_rto_min;
2814                 log.u_bbr.epoch = rack_rto_max;
2815                 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
2816                 log.u_bbr.lost = rack_rto_min;
2817                 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
2818                 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
2819                 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
2820                 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
2821                 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
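                /* i.e. the arrival timeval flattened into microseconds */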
2822                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2823                     &rack->rc_inp->inp_socket->so_rcv,
2824                     &rack->rc_inp->inp_socket->so_snd,
2825                     TCP_LOG_RTT, 0,
2826                     0, &log, false, &tv);
2827         }
2828 }
2829
2830 static void
2831 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
2832 {
2833         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2834                 union tcp_log_stackspecific log;
2835                 struct timeval tv;
2836
2837                 /* Convert our ms to a microsecond */
2838                 memset(&log, 0, sizeof(log));
2839                 log.u_bbr.flex1 = rtt;
2840                 log.u_bbr.flex2 = send_time;
2841                 log.u_bbr.flex3 = ack_time;
2842                 log.u_bbr.flex4 = where;
2843                 log.u_bbr.flex7 = 2;
2844                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2845                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2846                     &rack->rc_inp->inp_socket->so_rcv,
2847                     &rack->rc_inp->inp_socket->so_snd,
2848                     TCP_LOG_RTT, 0,
2849                     0, &log, false, &tv);
2850         }
2851 }
2852
2853
2854 static void
2855 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
2856 {
2857         if (tcp_bblogging_on(rack->rc_tp)) {
2858                 union tcp_log_stackspecific log;
2859                 struct timeval tv;
2860
2861                 /* Convert our ms to a microsecond */
2862                 memset(&log, 0, sizeof(log));
2863                 log.u_bbr.flex1 = idx;
2864                 log.u_bbr.flex2 = rack_ts_to_msec(tsv);
2865                 log.u_bbr.flex3 = tsecho;
2866                 log.u_bbr.flex7 = 3;
2867                 log.u_bbr.rttProp = tsv;
2868                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2869                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2870                     &rack->rc_inp->inp_socket->so_rcv,
2871                     &rack->rc_inp->inp_socket->so_snd,
2872                     TCP_LOG_RTT, 0,
2873                     0, &log, false, &tv);
2874         }
2875 }
2876
2877
2878 static inline void
2879 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
2880 {
2881         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2882                 union tcp_log_stackspecific log;
2883                 struct timeval tv;
2884
2885                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2886                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2887                 log.u_bbr.flex1 = line;
2888                 log.u_bbr.flex2 = tick;
2889                 log.u_bbr.flex3 = tp->t_maxunacktime;
2890                 log.u_bbr.flex4 = tp->t_acktime;
2891                 log.u_bbr.flex8 = event;
2892                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2893                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2894                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2895                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2896                 log.u_bbr.pacing_gain = rack->r_must_retran;
2897                 TCP_LOG_EVENTP(tp, NULL,
2898                     &rack->rc_inp->inp_socket->so_rcv,
2899                     &rack->rc_inp->inp_socket->so_snd,
2900                     BBR_LOG_PROGRESS, 0,
2901                     0, &log, false, &tv);
2902         }
2903 }
2904
2905 static void
2906 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line)
2907 {
2908         if (tcp_bblogging_on(rack->rc_tp)) {
2909                 union tcp_log_stackspecific log;
2910
2911                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2912                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2913                 log.u_bbr.flex1 = slot;
2914                 if (rack->rack_no_prr)
2915                         log.u_bbr.flex2 = 0;
2916                 else
2917                         log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
2918                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2919                 log.u_bbr.flex5 = rack->r_ctl.ack_during_sd;
2920                 log.u_bbr.flex6 = line;
2921                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
2922                 log.u_bbr.flex8 = rack->rc_in_persist;
2923                 log.u_bbr.timeStamp = cts;
2924                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2925                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2926                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2927                 log.u_bbr.pacing_gain = rack->r_must_retran;
2928                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2929                     &rack->rc_inp->inp_socket->so_rcv,
2930                     &rack->rc_inp->inp_socket->so_snd,
2931                     BBR_LOG_BBRSND, 0,
2932                     0, &log, false, tv);
2933         }
2934 }
2935
2936 static void
2937 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
2938 {
2939         if (tcp_bblogging_on(rack->rc_tp)) {
2940                 union tcp_log_stackspecific log;
2941                 struct timeval tv;
2942
2943                 memset(&log, 0, sizeof(log));
2944                 log.u_bbr.flex1 = did_out;
2945                 log.u_bbr.flex2 = nxt_pkt;
2946                 log.u_bbr.flex3 = way_out;
2947                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2948                 if (rack->rack_no_prr)
2949                         log.u_bbr.flex5 = 0;
2950                 else
2951                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2952                 log.u_bbr.flex6 = nsegs;
2953                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
2954                 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;        /* Do we have ack-can-send set */
2955                 log.u_bbr.flex7 <<= 1;
2956                 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */
2957                 log.u_bbr.flex7 <<= 1;
2958                 log.u_bbr.flex7 |= rack->r_wanted_output;       /* Do we want output */
2959                 log.u_bbr.flex8 = rack->rc_in_persist;
2960                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2961                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2962                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2963                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2964                 log.u_bbr.use_lt_bw <<= 1;
2965                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2966                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2967                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2968                 log.u_bbr.pacing_gain = rack->r_must_retran;
2969                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2970                     &rack->rc_inp->inp_socket->so_rcv,
2971                     &rack->rc_inp->inp_socket->so_snd,
2972                     BBR_LOG_DOSEG_DONE, 0,
2973                     0, &log, false, &tv);
2974         }
2975 }
2976
2977 static void
2978 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
2979 {
2980         if (tcp_bblogging_on(rack->rc_tp)) {
2981                 union tcp_log_stackspecific log;
2982                 struct timeval tv;
2983
2984                 memset(&log, 0, sizeof(log));
2985                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
2986                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
2987                 log.u_bbr.flex4 = arg1;
2988                 log.u_bbr.flex5 = arg2;
2989                 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs;
2990                 log.u_bbr.flex6 = arg3;
2991                 log.u_bbr.flex8 = frm;
2992                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2993                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2994                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2995                 log.u_bbr.applimited = rack->r_ctl.rc_sacked;
2996                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2997                 log.u_bbr.pacing_gain = rack->r_must_retran;
2998                 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
2999                     &tptosocket(tp)->so_snd,
3000                     TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv);
3001         }
3002 }
3003
3004 static void
3005 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
3006                           uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
3007 {
3008         if (tcp_bblogging_on(rack->rc_tp)) {
3009                 union tcp_log_stackspecific log;
3010                 struct timeval tv;
3011
3012                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3013                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
3014                 log.u_bbr.flex1 = slot;
3015                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
3016                 log.u_bbr.flex4 = reason;
3017                 if (rack->rack_no_prr)
3018                         log.u_bbr.flex5 = 0;
3019                 else
3020                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
3021                 log.u_bbr.flex7 = hpts_calling;
3022                 log.u_bbr.flex8 = rack->rc_in_persist;
3023                 log.u_bbr.lt_epoch = cwnd_to_use;
3024                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3025                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3026                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3027                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3028                 log.u_bbr.pacing_gain = rack->r_must_retran;
3029                 log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
3030                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3031                     &rack->rc_inp->inp_socket->so_rcv,
3032                     &rack->rc_inp->inp_socket->so_snd,
3033                     BBR_LOG_JUSTRET, 0,
3034                     tlen, &log, false, &tv);
3035         }
3036 }
3037
3038 static void
3039 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
3040                    struct timeval *tv, uint32_t flags_on_entry)
3041 {
3042         if (tcp_bblogging_on(rack->rc_tp)) {
3043                 union tcp_log_stackspecific log;
3044
3045                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3046                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
3047                 log.u_bbr.flex1 = line;
3048                 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
3049                 log.u_bbr.flex3 = flags_on_entry;
3050                 log.u_bbr.flex4 = us_cts;
3051                 if (rack->rack_no_prr)
3052                         log.u_bbr.flex5 = 0;
3053                 else
3054                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
3055                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
3056                 log.u_bbr.flex7 = hpts_removed;
3057                 log.u_bbr.flex8 = 1;
3058                 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
3059                 log.u_bbr.timeStamp = us_cts;
3060                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3061                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3062                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3063                 log.u_bbr.pacing_gain = rack->r_must_retran;
3064                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3065                     &rack->rc_inp->inp_socket->so_rcv,
3066                     &rack->rc_inp->inp_socket->so_snd,
3067                     BBR_LOG_TIMERCANC, 0,
3068                     0, &log, false, tv);
3069         }
3070 }
3071
3072 static void
3073 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
3074                           uint32_t flex1, uint32_t flex2,
3075                           uint32_t flex3, uint32_t flex4,
3076                           uint32_t flex5, uint32_t flex6,
3077                           uint16_t flex7, uint8_t mod)
3078 {
3079         if (tcp_bblogging_on(rack->rc_tp)) {
3080                 union tcp_log_stackspecific log;
3081                 struct timeval tv;
3082
3083                 if (mod == 1) {
3084                         /* No, you can't use 1, it's for the real to cancel */
3085                         return;
3086                 }
3087                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3088                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3089                 log.u_bbr.flex1 = flex1;
3090                 log.u_bbr.flex2 = flex2;
3091                 log.u_bbr.flex3 = flex3;
3092                 log.u_bbr.flex4 = flex4;
3093                 log.u_bbr.flex5 = flex5;
3094                 log.u_bbr.flex6 = flex6;
3095                 log.u_bbr.flex7 = flex7;
3096                 log.u_bbr.flex8 = mod;
3097                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3098                     &rack->rc_inp->inp_socket->so_rcv,
3099                     &rack->rc_inp->inp_socket->so_snd,
3100                     BBR_LOG_TIMERCANC, 0,
3101                     0, &log, false, &tv);
3102         }
3103 }
3104
3105 static void
3106 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
3107 {
3108         if (tcp_bblogging_on(rack->rc_tp)) {
3109                 union tcp_log_stackspecific log;
3110                 struct timeval tv;
3111
3112                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3113                 log.u_bbr.flex1 = timers;
3114                 log.u_bbr.flex2 = ret;
3115                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
3116                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
3117                 log.u_bbr.flex5 = cts;
3118                 if (rack->rack_no_prr)
3119                         log.u_bbr.flex6 = 0;
3120                 else
3121                         log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
3122                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3123                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3124                 log.u_bbr.pacing_gain = rack->r_must_retran;
3125                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3126                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3127                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3128                     &rack->rc_inp->inp_socket->so_rcv,
3129                     &rack->rc_inp->inp_socket->so_snd,
3130                     BBR_LOG_TO_PROCESS, 0,
3131                     0, &log, false, &tv);
3132         }
3133 }
3134
3135 static void
3136 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
3137 {
3138         if (tcp_bblogging_on(rack->rc_tp)) {
3139                 union tcp_log_stackspecific log;
3140                 struct timeval tv;
3141
3142                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3143                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
3144                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
3145                 if (rack->rack_no_prr)
3146                         log.u_bbr.flex3 = 0;
3147                 else
3148                         log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
3149                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
3150                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
3151                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
3152                 log.u_bbr.flex7 = line;
3153                 log.u_bbr.flex8 = frm;
3154                 log.u_bbr.pkts_out = orig_cwnd;
3155                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3156                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3157                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
3158                 log.u_bbr.use_lt_bw <<= 1;
3159                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
3160                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3161                     &rack->rc_inp->inp_socket->so_rcv,
3162                     &rack->rc_inp->inp_socket->so_snd,
3163                     BBR_LOG_BBRUPD, 0,
3164                     0, &log, false, &tv);
3165         }
3166 }
3167
3168 #ifdef TCP_SAD_DETECTION
3169 static void
3170 rack_log_sad(struct tcp_rack *rack, int event)
3171 {
3172         if (tcp_bblogging_on(rack->rc_tp)) {
3173                 union tcp_log_stackspecific log;
3174                 struct timeval tv;
3175
3176                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3177                 log.u_bbr.flex1 = rack->r_ctl.sack_count;
3178                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
3179                 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
3180                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
3181                 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
3182                 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
3183                 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
3184                 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
3185                 log.u_bbr.lt_epoch |= rack->do_detection;
3186                 log.u_bbr.applimited = tcp_map_minimum;
3187                 log.u_bbr.flex7 = rack->sack_attack_disable;
3188                 log.u_bbr.flex8 = event;
3189                 log.u_bbr.bbr_state = rack->rc_suspicious;
3190                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3191                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3192                 log.u_bbr.delivered = tcp_sad_decay_val;
3193                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3194                     &rack->rc_inp->inp_socket->so_rcv,
3195                     &rack->rc_inp->inp_socket->so_snd,
3196                     TCP_SAD_DETECT, 0,
3197                     0, &log, false, &tv);
3198         }
3199 }
3200 #endif
3201
3202 static void
3203 rack_counter_destroy(void)
3204 {
3205         counter_u64_free(rack_total_bytes);
3206         counter_u64_free(rack_fto_send);
3207         counter_u64_free(rack_fto_rsm_send);
3208         counter_u64_free(rack_nfto_resend);
3209         counter_u64_free(rack_hw_pace_init_fail);
3210         counter_u64_free(rack_hw_pace_lost);
3211         counter_u64_free(rack_non_fto_send);
3212         counter_u64_free(rack_extended_rfo);
3213         counter_u64_free(rack_ack_total);
3214         counter_u64_free(rack_express_sack);
3215         counter_u64_free(rack_sack_total);
3216         counter_u64_free(rack_move_none);
3217         counter_u64_free(rack_move_some);
3218         counter_u64_free(rack_sack_attacks_detected);
3219         counter_u64_free(rack_sack_attacks_reversed);
3220         counter_u64_free(rack_sack_attacks_suspect);
3221         counter_u64_free(rack_sack_used_next_merge);
3222         counter_u64_free(rack_sack_used_prev_merge);
3223         counter_u64_free(rack_tlp_tot);
3224         counter_u64_free(rack_tlp_newdata);
3225         counter_u64_free(rack_tlp_retran);
3226         counter_u64_free(rack_tlp_retran_bytes);
3227         counter_u64_free(rack_to_tot);
3228         counter_u64_free(rack_saw_enobuf);
3229         counter_u64_free(rack_saw_enobuf_hw);
3230         counter_u64_free(rack_saw_enetunreach);
3231         counter_u64_free(rack_hot_alloc);
3232         counter_u64_free(rack_to_alloc);
3233         counter_u64_free(rack_to_alloc_hard);
3234         counter_u64_free(rack_to_alloc_emerg);
3235         counter_u64_free(rack_to_alloc_limited);
3236         counter_u64_free(rack_alloc_limited_conns);
3237         counter_u64_free(rack_split_limited);
3238         counter_u64_free(rack_multi_single_eq);
3239         counter_u64_free(rack_rxt_clamps_cwnd);
3240         counter_u64_free(rack_rxt_clamps_cwnd_uniq);
3241         counter_u64_free(rack_proc_non_comp_ack);
3242         counter_u64_free(rack_sack_proc_all);
3243         counter_u64_free(rack_sack_proc_restart);
3244         counter_u64_free(rack_sack_proc_short);
3245         counter_u64_free(rack_sack_skipped_acked);
3246         counter_u64_free(rack_sack_splits);
3247         counter_u64_free(rack_input_idle_reduces);
3248         counter_u64_free(rack_collapsed_win);
3249         counter_u64_free(rack_collapsed_win_rxt);
3250         counter_u64_free(rack_collapsed_win_rxt_bytes);
3251         counter_u64_free(rack_collapsed_win_seen);
3252         counter_u64_free(rack_try_scwnd);
3253         counter_u64_free(rack_persists_sends);
3254         counter_u64_free(rack_persists_acks);
3255         counter_u64_free(rack_persists_loss);
3256         counter_u64_free(rack_persists_lost_ends);
3257 #ifdef INVARIANTS
3258         counter_u64_free(rack_adjust_map_bw);
3259 #endif
3260         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
3261         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
3262 }
3263
3264 static struct rack_sendmap *
3265 rack_alloc(struct tcp_rack *rack)
3266 {
3267         struct rack_sendmap *rsm;
3268
3269         /*
3270          * First get the top of the list; in theory
3271          * it is the "hottest" rsm we have,
3272          * possibly just freed by ack processing.
3273          */
3274         if (rack->rc_free_cnt > rack_free_cache) {
3275                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3276                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3277                 counter_u64_add(rack_hot_alloc, 1);
3278                 rack->rc_free_cnt--;
3279                 return (rsm);
3280         }
3281         /*
3282          * Once we get under our free cache we probably
3283          * no longer have a "hot" one available. Let's
3284          * get one from UMA.
3285          */
3286         rsm = uma_zalloc(rack_zone, M_NOWAIT);
3287         if (rsm) {
3288                 rack->r_ctl.rc_num_maps_alloced++;
3289                 counter_u64_add(rack_to_alloc, 1);
3290                 return (rsm);
3291         }
3292         /*
3293          * Dig into our aux rsm's (the last two) since
3294          * UMA failed to get us one.
3295          */
3296         if (rack->rc_free_cnt) {
3297                 counter_u64_add(rack_to_alloc_emerg, 1);
3298                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3299                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3300                 rack->rc_free_cnt--;
3301                 return (rsm);
3302         }
3303         return (NULL);
3304 }
3305
3306 static struct rack_sendmap *
3307 rack_alloc_full_limit(struct tcp_rack *rack)
3308 {
3309         if ((V_tcp_map_entries_limit > 0) &&
3310             (rack->do_detection == 0) &&
3311             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
3312                 counter_u64_add(rack_to_alloc_limited, 1);
3313                 if (!rack->alloc_limit_reported) {
3314                         rack->alloc_limit_reported = 1;
3315                         counter_u64_add(rack_alloc_limited_conns, 1);
3316                 }
3317                 return (NULL);
3318         }
3319         return (rack_alloc(rack));
3320 }
3321
3322 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3323 static struct rack_sendmap *
3324 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
3325 {
3326         struct rack_sendmap *rsm;
3327
3328         if (limit_type) {
3329                 /* currently there is only one limit type */
3330                 if (rack->r_ctl.rc_split_limit > 0 &&
3331                     (rack->do_detection == 0) &&
3332                     rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) {
3333                         counter_u64_add(rack_split_limited, 1);
3334                         if (!rack->alloc_limit_reported) {
3335                                 rack->alloc_limit_reported = 1;
3336                                 counter_u64_add(rack_alloc_limited_conns, 1);
3337                         }
3338                         return (NULL);
3339 #ifdef TCP_SAD_DETECTION
3340                 } else if ((tcp_sad_limit != 0) &&
3341                            (rack->do_detection == 1) &&
3342                            (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) {
3343                         counter_u64_add(rack_split_limited, 1);
3344                         if (!rack->alloc_limit_reported) {
3345                                 rack->alloc_limit_reported = 1;
3346                                 counter_u64_add(rack_alloc_limited_conns, 1);
3347                         }
3348                         return (NULL);
3349 #endif
3350                 }
3351         }
3352
3353         /* allocate and mark in the limit type, if set */
3354         rsm = rack_alloc(rack);
3355         if (rsm != NULL && limit_type) {
3356                 rsm->r_limit_type = limit_type;
3357                 rack->r_ctl.rc_num_split_allocs++;
3358         }
3359         return (rsm);
3360 }
3361
3362 static void
3363 rack_free_trim(struct tcp_rack *rack)
3364 {
3365         struct rack_sendmap *rsm;
3366
3367         /*
3368          * Free up all the tail entries until
3369          * we get our list down to the limit.
3370          */
3371         while (rack->rc_free_cnt > rack_free_cache) {
3372                 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
3373                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3374                 rack->rc_free_cnt--;
3375                 rack->r_ctl.rc_num_maps_alloced--;
3376                 uma_zfree(rack_zone, rsm);
3377         }
3378 }
3379
3380 static void
3381 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
3382 {
3383         if (rsm->r_flags & RACK_APP_LIMITED) {
3384                 if (rack->r_ctl.rc_app_limited_cnt > 0) {
3385                         rack->r_ctl.rc_app_limited_cnt--;
3386                 }
3387         }
3388         if (rsm->r_limit_type) {
3389                 /* currently there is only one limit type */
3390                 rack->r_ctl.rc_num_split_allocs--;
3391         }
3392         if (rsm == rack->r_ctl.rc_first_appl) {
3393                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3394                         rack->r_ctl.rc_first_appl = NULL;
3395                 else
3396                         rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl);
3397         }
3398         if (rsm == rack->r_ctl.rc_resend)
3399                 rack->r_ctl.rc_resend = NULL;
3400         if (rsm == rack->r_ctl.rc_end_appl)
3401                 rack->r_ctl.rc_end_appl = NULL;
3402         if (rack->r_ctl.rc_tlpsend == rsm)
3403                 rack->r_ctl.rc_tlpsend = NULL;
3404         if (rack->r_ctl.rc_sacklast == rsm)
3405                 rack->r_ctl.rc_sacklast = NULL;
3406         memset(rsm, 0, sizeof(struct rack_sendmap));
3407         /* Make sure we are not going to overrun our count limit of 0xff */
3408         if ((rack->rc_free_cnt + 1) > 0xff) {
3409                 rack_free_trim(rack);
3410         }
3411         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
3412         rack->rc_free_cnt++;
3413 }
3414
3415 static uint32_t
3416 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
3417 {
3418         uint64_t srtt, bw, len, tim;
3419         uint32_t segsiz, def_len, minl;
3420
3421         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3422         def_len = rack_def_data_window * segsiz;
3423         if (rack->rc_gp_filled == 0) {
3424                 /*
3425                  * We have no measurement (IW is in flight?) so
3426                  * we can only guess using our data_window sysctl
3427                  * value (usually 20MSS).
3428                  */
3429                 return (def_len);
3430         }
3431         /*
3432          * Now we have a number of factors to consider.
3433          *
3434          * 1) We have a desired BDP which is usually
3435          *    at least 2.
3436          * 2) We have a minimum number of rtt's, usually 1 SRTT,
3437          *    but we allow it to be more.
3438          * 3) We want to make sure a measurement lasts N useconds (if
3439          *    we have set rack_min_measure_usec).
3440          *
3441          * We handle the first concern here by trying to create a data
3442          * window of max(rack_def_data_window, DesiredBDP). The
3443          * second concern we handle in not letting the measurement
3444          * window end normally until at least the required SRTT's
3445          * have gone by which is done further below in
3446          * rack_enough_for_measurement(). Finally the third concern
3447          * we also handle here by calculating how long that time
3448          * would take at the current BW and then return the
3449          * max of our first calculation and that length. Note
3450          * that if rack_min_measure_usec is 0, we don't deal
3451          * with concern 3. Also for both Concern 1 and 3 an
3452          * application limited period could end the measurement
3453          * earlier.
3454          *
3455          * So let's calculate the BDP with the "known" b/w using
3456          * the SRTT as our rtt and then multiply it by the
3457          * goal.
3458          */
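        /*
         * Worked example (illustrative numbers only): with a measured
         * b/w of 12,500,000 bytes/sec (100 Mbps), an SRTT of 20,000 us
         * and a rack_goal_bdp of 2, len = 12,500,000 * 20,000 / 1,000,000
         * = 250,000 bytes, doubled to 500,000 and then rounded up to the
         * next multiple of segsiz before the minimum-length and
         * small-window checks below are applied.
         */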
3459         bw = rack_get_bw(rack);
3460         srtt = (uint64_t)tp->t_srtt;
3461         len = bw * srtt;
3462         len /= (uint64_t)HPTS_USEC_IN_SEC;
3463         len *= max(1, rack_goal_bdp);
3464         /* Now we need to round up to the nearest MSS */
3465         len = roundup(len, segsiz);
3466         if (rack_min_measure_usec) {
3467                 /* Now calculate our min length for this b/w */
3468                 tim = rack_min_measure_usec;
3469                 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
3470                 if (minl == 0)
3471                         minl = 1;
3472                 minl = roundup(minl, segsiz);
3473                 if (len < minl)
3474                         len = minl;
3475         }
3476         /*
3477          * Now if we have a very small window we want
3478          * to keep the measurement window as small
3479          * as possible. This happens on
3480          * low b/w connections and we don't want to
3481          * span huge numbers of rtt's between measurements.
3482          *
3483          * We basically include 2 over our "MIN window" so
3484          * that the measurement can be shortened (possibly) by
3485          * an ack'ed packet.
3486          */
3487         if (len < def_len)
3488                 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
3489         else
3490                 return (max((uint32_t)len, def_len));
3491
3492 }
3493
3494 static int
3495 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
3496 {
3497         uint32_t tim, srtts, segsiz;
3498
3499         /*
3500          * Has enough time passed for the GP measurement to be valid?
3501          */
3502         if (SEQ_LT(th_ack, tp->gput_seq)) {
3503                 /* Not enough bytes yet */
3504                 return (0);
3505         }
3506         if ((tp->snd_max == tp->snd_una) ||
3507             (th_ack == tp->snd_max)){
3508                 /*
3509                  * All is acked. The quality of an all-acked measurement is
3510                  * usually low or medium, but we in theory could split
3511                  * all acked into two cases: where you got
3512                  * a significant amount of your window and
3513                  * where you did not. For now we leave it,
3514                  * but it is something to contemplate in the
3515                  * future. The danger here is that delayed ack
3516                  * is affecting the last byte (which is a 50:50 chance).
3517                  */
3518                 *quality = RACK_QUALITY_ALLACKED;
3519                 return (1);
3520         }
3521         if (SEQ_GEQ(th_ack,  tp->gput_ack)) {
3522                 /*
3523                  * We obtained the entire window of data we wanted,
3524                  * no matter if we are in recovery or not, so
3525                  * it's ok since expanding the window does not
3526                  * make things fuzzy (or at least not as much).
3527                  */
3528                 *quality = RACK_QUALITY_HIGH;
3529                 return (1);
3530         }
3531         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3532         if (SEQ_LT(th_ack, tp->gput_ack) &&
3533             ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
3534                 /* Not enough bytes yet */
3535                 return (0);
3536         }
3537         if (rack->r_ctl.rc_first_appl &&
3538             (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
3539                 /*
3540                  * We are up to the app limited send point, so
3541                  * we have to measure irrespective of the time.
3542                  */
3543                 *quality = RACK_QUALITY_APPLIMITED;
3544                 return (1);
3545         }
3546         /* Now what about time? */
3547         srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
3548         tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
3549         if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
3550                 /*
3551                  * We do not allow a measurement if we are in recovery
3552                  * that would shrink the goodput window we wanted.
3553                  * This is to prevent cloudiness of when the last send
3554                  * was actually made.
3555                  */
3556                 *quality = RACK_QUALITY_HIGH;
3557                 return (1);
3558         }
3559         /* Nope not even a full SRTT has passed */
3560         return (0);
3561 }
3562
3563 static void
3564 rack_log_timely(struct tcp_rack *rack,
3565                 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
3566                 uint64_t up_bnd, int line, uint8_t method)
3567 {
3568         if (tcp_bblogging_on(rack->rc_tp)) {
3569                 union tcp_log_stackspecific log;
3570                 struct timeval tv;
3571
3572                 memset(&log, 0, sizeof(log));
3573                 log.u_bbr.flex1 = logged;
3574                 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
3575                 log.u_bbr.flex2 <<= 4;
3576                 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
3577                 log.u_bbr.flex2 <<= 4;
3578                 log.u_bbr.flex2 |= rack->rc_gp_incr;
3579                 log.u_bbr.flex2 <<= 4;
3580                 log.u_bbr.flex2 |= rack->rc_gp_bwred;
3581                 log.u_bbr.flex3 = rack->rc_gp_incr;
3582                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3583                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
3584                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
3585                 log.u_bbr.flex7 = rack->rc_gp_bwred;
3586                 log.u_bbr.flex8 = method;
3587                 log.u_bbr.cur_del_rate = cur_bw;
3588                 log.u_bbr.delRate = low_bnd;
3589                 log.u_bbr.bw_inuse = up_bnd;
3590                 log.u_bbr.rttProp = rack_get_bw(rack);
3591                 log.u_bbr.pkt_epoch = line;
3592                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3593                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3594                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3595                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3596                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3597                 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
3598                 log.u_bbr.cwnd_gain <<= 1;
3599                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
3600                 log.u_bbr.cwnd_gain <<= 1;
3601                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
3602                 log.u_bbr.cwnd_gain <<= 1;
3603                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
3604                 log.u_bbr.lost = rack->r_ctl.rc_loss_count;
3605                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3606                     &rack->rc_inp->inp_socket->so_rcv,
3607                     &rack->rc_inp->inp_socket->so_snd,
3608                     TCP_TIMELY_WORK, 0,
3609                     0, &log, false, &tv);
3610         }
3611 }
3612
3613 static int
3614 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
3615 {
3616         /*
3617          * Before we increase we need to know if
3618          * the estimate just made was less than
3619          * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
3620          *
3621          * If we already are pacing at a fast enough
3622          * rate to push us faster, there is no sense in
3623          * increasing.
3624          *
3625          * We first calculate our actual pacing rate (ss or ca multiplier
3626          * times our cur_bw).
3627          *
3628          * Then we take the last measured rate and multiply by our
3629          * maximum pacing overage to give us a max allowable rate.
3630          *
3631          * If our act_rate is smaller than our max_allowable rate
3632          * then we should increase. Else we should hold steady.
3633          *
3634          */
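        /*
         * Worked example (illustrative numbers only): if cur_bw is
         * 1,000,000 bytes/sec and mult is 150, act_rate is 1,500,000.
         * With last_bw_est of 1,400,000 and rack_max_per_above at 10,
         * max_allow_rate is 1,540,000, so act_rate < max_allow_rate and
         * we allow the increase; had last_bw_est been 1,300,000
         * (max_allow_rate 1,430,000) we would hold steady instead.
         */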
3635         uint64_t act_rate, max_allow_rate;
3636
3637         if (rack_timely_no_stopping)
3638                 return (1);
3639
3640         if ((cur_bw == 0) || (last_bw_est == 0)) {
3641                 /*
3642                  * Initial startup case or
3643                  * everything is acked case.
3644                  */
3645                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3646                                 __LINE__, 9);
3647                 return (1);
3648         }
3649         if (mult <= 100) {
3650                 /*
3651                  * We can always pace at or slightly above our rate.
3652                  */
3653                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3654                                 __LINE__, 9);
3655                 return (1);
3656         }
3657         act_rate = cur_bw * (uint64_t)mult;
3658         act_rate /= 100;
3659         max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
3660         max_allow_rate /= 100;
3661         if (act_rate < max_allow_rate) {
3662                 /*
3663                  * Here the rate we are actually pacing at
3664                  * is smaller than 10% above our last measurement.
3665                  * This means we are pacing below what we would
3666                  * like to try to achieve (plus some wiggle room).
3667                  */
3668                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3669                                 __LINE__, 9);
3670                 return (1);
3671         } else {
3672                 /*
3673                  * Here we are already pacing at least rack_max_per_above (10%)
3674                  * above what we are getting back. This indicates most likely
3675                  * that we are being limited (cwnd/rwnd/app) and can't
3676                  * get any more b/w. There is no sense in trying to
3677                  * raise the pacing rate; it's not speeding us up
3678                  * and we already are pacing faster than we are getting.
3679                  */
3680                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3681                                 __LINE__, 8);
3682                 return (0);
3683         }
3684 }
3685
3686 static void
3687 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
3688 {
3689         /*
3690          * When we drag bottom, we want to ensure
3691          * that no multiplier is below 1.0; if one is,
3692          * we want to restore it to at least that.
3693          */
3694         if (rack->r_ctl.rack_per_of_gp_rec  < 100) {
3695                 /* This is unlikely; we usually do not touch recovery */
3696                 rack->r_ctl.rack_per_of_gp_rec = 100;
3697         }
3698         if (rack->r_ctl.rack_per_of_gp_ca < 100) {
3699                 rack->r_ctl.rack_per_of_gp_ca = 100;
3700         }
3701         if (rack->r_ctl.rack_per_of_gp_ss < 100) {
3702                 rack->r_ctl.rack_per_of_gp_ss = 100;
3703         }
3704 }
3705
3706 static void
3707 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
3708 {
3709         if (rack->r_ctl.rack_per_of_gp_ca > 100) {
3710                 rack->r_ctl.rack_per_of_gp_ca = 100;
3711         }
3712         if (rack->r_ctl.rack_per_of_gp_ss > 100) {
3713                 rack->r_ctl.rack_per_of_gp_ss = 100;
3714         }
3715 }
3716
3717 static void
3718 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
3719 {
3720         int32_t  calc, logged, plus;
3721
3722         logged = 0;
3723
3724         if (override) {
3725                 /*
3726                  * override is passed when we are
3727                  * losing b/w and making one last
3728                  * gasp at trying to not lose out
3729                  * to a new-reno flow.
3730                  */
3731                 goto extra_boost;
3732         }
3733         /* In classic timely we boost by 5x if we have 5 increases in a row, let's not */
3734         if (rack->rc_gp_incr &&
3735             ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
3736                 /*
3737                  * Reset and get 5 strokes more before the boost. Note
3738                  * that the count is 0 based so we have to add one.
3739                  */
3740 extra_boost:
3741                 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
3742                 rack->rc_gp_timely_inc_cnt = 0;
3743         } else
3744                 plus = (uint32_t)rack_gp_increase_per;
3745         /* Must be at least 1% increase for true timely increases */
3746         if ((plus < 1) &&
3747             ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
3748                 plus = 1;
3749         if (rack->rc_gp_saw_rec &&
3750             (rack->rc_gp_no_rec_chg == 0) &&
3751             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3752                                   rack->r_ctl.rack_per_of_gp_rec)) {
3753                 /* We have been in recovery ding it too */
3754                 calc = rack->r_ctl.rack_per_of_gp_rec + plus;
3755                 if (calc > 0xffff)
3756                         calc = 0xffff;
3757                 logged |= 1;
3758                 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
3759                 if (rack->r_ctl.rack_per_upper_bound_ca &&
3760                     (rack->rc_dragged_bottom == 0) &&
3761                     (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca))
3762                         rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca;
3763         }
3764         if (rack->rc_gp_saw_ca &&
3765             (rack->rc_gp_saw_ss == 0) &&
3766             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3767                                   rack->r_ctl.rack_per_of_gp_ca)) {
3768                 /* In CA */
3769                 calc = rack->r_ctl.rack_per_of_gp_ca + plus;
3770                 if (calc > 0xffff)
3771                         calc = 0xffff;
3772                 logged |= 2;
3773                 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
3774                 if (rack->r_ctl.rack_per_upper_bound_ca &&
3775                     (rack->rc_dragged_bottom == 0) &&
3776                     (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca))
3777                         rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca;
3778         }
3779         if (rack->rc_gp_saw_ss &&
3780             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3781                                   rack->r_ctl.rack_per_of_gp_ss)) {
3782                 /* In SS */
3783                 calc = rack->r_ctl.rack_per_of_gp_ss + plus;
3784                 if (calc > 0xffff)
3785                         calc = 0xffff;
3786                 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
3787                 if (rack->r_ctl.rack_per_upper_bound_ss &&
3788                     (rack->rc_dragged_bottom == 0) &&
3789                     (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss))
3790                         rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss;
3791                 logged |= 4;
3792         }
3793         if (logged &&
3794             (rack->rc_gp_incr == 0)){
3795                 /* Go into increment mode */
3796                 rack->rc_gp_incr = 1;
3797                 rack->rc_gp_timely_inc_cnt = 0;
3798         }
3799         if (rack->rc_gp_incr &&
3800             logged &&
3801             (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
3802                 rack->rc_gp_timely_inc_cnt++;
3803         }
3804         rack_log_timely(rack,  logged, plus, 0, 0,
3805                         __LINE__, 1);
3806 }
3807
3808 static uint32_t
3809 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
3810 {
3811         /*
3812          * norm_grad = rtt_diff / minrtt;
3813          * new_per = curper * (1 - B * norm_grad)
3814          *
3815          * B = rack_gp_decrease_per (default 10%)
3816          * rtt_diff = input var current rtt-diff
3817          * curper = input var current percentage
3818          * minrtt = from rack filter
3819          *
3820          */
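        /*
         * Worked example (illustrative numbers only): with curper = 200,
         * rtt_diff = 5,000 us, a filter min rtt of 20,000 us and B at its
         * 10% default, norm_grad = 0.25 and the fixed point math below
         * yields perf = 200 * (1,000,000 - 25,000) / 1,000,000 = 195.
         */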
3821         uint64_t perf;
3822
3823         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3824                     ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
3825                      (((uint64_t)rtt_diff * (uint64_t)1000000)/
3826                       (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
3827                      (uint64_t)1000000)) /
3828                 (uint64_t)1000000);
3829         if (perf > curper) {
3830                 /* TSNH */
3831                 perf = curper - 1;
3832         }
3833         return ((uint32_t)perf);
3834 }
3835
3836 static uint32_t
3837 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
3838 {
3839         /*
3840          *                                    highrttthresh
3841          * result = curper * (1 - (B * (1 -   -------------  )))
3842          *                                       gp_srtt
3843          *
3844          * B = rack_gp_decrease_per (default 10%)
3845          * highrttthresh = filter_min * rack_gp_rtt_maxmul
3846          */
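        /*
         * Worked example (illustrative numbers only, assuming a
         * rack_gp_rtt_maxmul of 3): with a filter min of 10,000 us,
         * highrttthresh is 30,000 us. If rtt (the gp srtt) is 60,000 us
         * and B is the 10% default, the factor is 1 - 0.10 * (1 - 0.5)
         * = 0.95, so curper = 150 becomes 142 after the integer math below.
         */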
3847         uint64_t perf;
3848         uint32_t highrttthresh;
3849
3850         highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3851
3852         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3853                                      ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
3854                                         ((uint64_t)highrttthresh * (uint64_t)1000000) /
3855                                                     (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
3856         return (perf);
3857 }
3858
3859 static void
3860 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
3861 {
3862         uint64_t logvar, logvar2, logvar3;
3863         uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
3864
3865         if (rack->rc_gp_incr) {
3866                 /* Turn off increment counting */
3867                 rack->rc_gp_incr = 0;
3868                 rack->rc_gp_timely_inc_cnt = 0;
3869         }
3870         ss_red = ca_red = rec_red = 0;
3871         logged = 0;
3872         /* Calculate the reduction value */
3873         if (rtt_diff < 0) {
3874                 rtt_diff *= -1;
3875         }
3876         /* Must be at least 1% reduction */
3877         if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
3878                 /* We have been in recovery ding it too */
3879                 if (timely_says == 2) {
3880                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
3881                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3882                         if (alt < new_per)
3883                                 val = alt;
3884                         else
3885                                 val = new_per;
3886                 } else
3887                          val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3888                 if (rack->r_ctl.rack_per_of_gp_rec > val) {
3889                         rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
3890                         rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
3891                 } else {
3892                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3893                         rec_red = 0;
3894                 }
3895                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
3896                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3897                 logged |= 1;
3898         }
3899         if (rack->rc_gp_saw_ss) {
3900                 /* Sent in SS */
3901                 if (timely_says == 2) {
3902                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
3903                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3904                         if (alt < new_per)
3905                                 val = alt;
3906                         else
3907                                 val = new_per;
3908                 } else
3909                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3910                 if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
3911                         ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
3912                         rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
3913                 } else {
3914                         ss_red = new_per;
3915                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3916                         logvar = new_per;
3917                         logvar <<= 32;
3918                         logvar |= alt;
3919                         logvar2 = (uint32_t)rtt;
3920                         logvar2 <<= 32;
3921                         logvar2 |= (uint32_t)rtt_diff;
3922                         logvar3 = rack_gp_rtt_maxmul;
3923                         logvar3 <<= 32;
3924                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3925                         rack_log_timely(rack, timely_says,
3926                                         logvar2, logvar3,
3927                                         logvar, __LINE__, 10);
3928                 }
3929                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
3930                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3931                 logged |= 4;
3932         } else if (rack->rc_gp_saw_ca) {
3933                 /* Sent in CA */
3934                 if (timely_says == 2) {
3935                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
3936                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3937                         if (alt < new_per)
3938                                 val = alt;
3939                         else
3940                                 val = new_per;
3941                 } else
3942                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3943                 if (rack->r_ctl.rack_per_of_gp_ca > val) {
3944                         ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
3945                         rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
3946                 } else {
3947                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3948                         ca_red = 0;
3949                         logvar = new_per;
3950                         logvar <<= 32;
3951                         logvar |= alt;
3952                         logvar2 = (uint32_t)rtt;
3953                         logvar2 <<= 32;
3954                         logvar2 |= (uint32_t)rtt_diff;
3955                         logvar3 = rack_gp_rtt_maxmul;
3956                         logvar3 <<= 32;
3957                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3958                         rack_log_timely(rack, timely_says,
3959                                         logvar2, logvar3,
3960                                         logvar, __LINE__, 10);
3961                 }
3962                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
3963                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3964                 logged |= 2;
3965         }
3966         if (rack->rc_gp_timely_dec_cnt < 0x7) {
3967                 rack->rc_gp_timely_dec_cnt++;
3968                 if (rack_timely_dec_clear &&
3969                     (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
3970                         rack->rc_gp_timely_dec_cnt = 0;
3971         }
3972         logvar = ss_red;
3973         logvar <<= 32;
3974         logvar |= ca_red;
3975         rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
3976                         __LINE__, 2);
3977 }
3978
3979 static void
3980 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
3981                      uint32_t rtt, uint32_t line, uint8_t reas)
3982 {
3983         if (tcp_bblogging_on(rack->rc_tp)) {
3984                 union tcp_log_stackspecific log;
3985                 struct timeval tv;
3986
3987                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3988                 log.u_bbr.flex1 = line;
3989                 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
3990                 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
3991                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3992                 log.u_bbr.flex5 = rtt;
3993                 log.u_bbr.flex6 = rack->rc_highly_buffered;
3994                 log.u_bbr.flex6 <<= 1;
3995                 log.u_bbr.flex6 |= rack->forced_ack;
3996                 log.u_bbr.flex6 <<= 1;
3997                 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
3998                 log.u_bbr.flex6 <<= 1;
3999                 log.u_bbr.flex6 |= rack->in_probe_rtt;
4000                 log.u_bbr.flex6 <<= 1;
4001                 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
4002                 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
4003                 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
4004                 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
4005                 log.u_bbr.flex8 = reas;
4006                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4007                 log.u_bbr.delRate = rack_get_bw(rack);
4008                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
4009                 log.u_bbr.cur_del_rate <<= 32;
4010                 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
4011                 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
4012                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
4013                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
4014                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
4015                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
4016                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
4017                 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
4018                 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
4019                 log.u_bbr.rttProp = us_cts;
4020                 log.u_bbr.rttProp <<= 32;
4021                 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
4022                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
4023                     &rack->rc_inp->inp_socket->so_rcv,
4024                     &rack->rc_inp->inp_socket->so_snd,
4025                     BBR_LOG_RTT_SHRINKS, 0,
4026                     0, &log, false, &rack->r_ctl.act_rcv_time);
4027         }
4028 }
4029
4030 static void
4031 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
4032 {
4033         uint64_t bwdp;
4034
4035         bwdp = rack_get_bw(rack);
4036         bwdp *= (uint64_t)rtt;
4037         bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
4038         rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
4039         if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
4040                 /*
4041                  * A window protocol must be able to have 4 packets
4042                  * outstanding as the floor in order to function
4043                  * (especially considering delayed ack :D).
4044                  */
4045                 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
4046         }
4047 }
4048
4049 static void
4050 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
4051 {
4052         /**
4053          * ProbeRTT is a bit different in rack_pacing than in
4054          * BBR. It is like BBR in that it uses the lowering of
4055          * the RTT as a signal that we saw something new and
4056          * counts from there for how long to go between probes. But it is
4057          * different in that it is quite simple. It does not
4058          * play with the cwnd, waiting until we get down
4059          * to N segments outstanding and holding that for
4060          * 200ms. Instead it just sets the pacing reduction
4061          * rate to a set percentage (70 by default) and holds
4062          * that for a number of recent GP Srtt's.
4063          */
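        /*
         * For illustration (assumed numbers): if goodput has been running
         * at 10,000,000 bytes/sec, entering ProbeRTT with the default 70%
         * setting paces at roughly 7,000,000 bytes/sec until the hold
         * period checked in rack_check_probe_rtt() expires or we reach
         * the target in-flight amount.
         */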
4064         uint32_t segsiz;
4065
4066         if (rack->rc_gp_dyn_mul == 0)
4067                 return;
4068
4069         if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
4070                 /* We are idle */
4071                 return;
4072         }
4073         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
4074             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
4075                 /*
4076                  * Stop the goodput now, the idea here is
4077                  * that future measurements with in_probe_rtt
4078                  * won't register if they are not greater so
4079                  * we want to get what info (if any) is available
4080                  * now.
4081                  */
4082                 rack_do_goodput_measurement(rack->rc_tp, rack,
4083                                             rack->rc_tp->snd_una, __LINE__,
4084                                             RACK_QUALITY_PROBERTT);
4085         }
4086         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
4087         rack->r_ctl.rc_time_probertt_entered = us_cts;
4088         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
4089                      rack->r_ctl.rc_pace_min_segs);
4090         rack->in_probe_rtt = 1;
4091         rack->measure_saw_probe_rtt = 1;
4092         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
4093         rack->r_ctl.rc_time_probertt_starts = 0;
4094         rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
4095         if (rack_probertt_use_min_rtt_entry)
4096                 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
4097         else
4098                 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
4099         rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4100                              __LINE__, RACK_RTTS_ENTERPROBE);
4101 }
4102
4103 static void
4104 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
4105 {
4106         struct rack_sendmap *rsm;
4107         uint32_t segsiz;
4108
4109         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
4110                      rack->r_ctl.rc_pace_min_segs);
4111         rack->in_probe_rtt = 0;
4112         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
4113             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
4114                 /*
4115                  * Stop the goodput now, the idea here is
4116                  * that future measurements with in_probe_rtt
4117                  * won't register if they are not greater so
4118                  * we want to get what info (if any) is available
4119                  * now.
4120                  */
4121                 rack_do_goodput_measurement(rack->rc_tp, rack,
4122                                             rack->rc_tp->snd_una, __LINE__,
4123                                             RACK_QUALITY_PROBERTT);
4124         } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
4125                 /*
4126                  * We don't have enough data to make a measurement.
4127                  * So let's just stop and start here after exiting
4128                  * probe-rtt. We probably are not interested in
4129                  * the results anyway.
4130                  */
4131                 rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
4132         }
4133         /*
4134          * Measurements through the current snd_max are going
4135          * to be limited by the slower pacing rate.
4136          *
4137          * We need to mark these as app-limited so we
4138          * don't collapse the b/w.
4139          */
4140         rsm = tqhash_max(rack->r_ctl.tqh);
4141         if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
4142                 if (rack->r_ctl.rc_app_limited_cnt == 0)
4143                         rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
4144                 else {
4145                         /*
4146                          * Go out to the end app limited and mark
4147                          * this new one as next and move the end_appl up
4148                          * to this guy.
4149                          */
4150                         if (rack->r_ctl.rc_end_appl)
4151                                 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
4152                         rack->r_ctl.rc_end_appl = rsm;
4153                 }
4154                 rsm->r_flags |= RACK_APP_LIMITED;
4155                 rack->r_ctl.rc_app_limited_cnt++;
4156         }
4157         /*
4158          * Now, we need to examine our pacing rate multipliers.
4159          * If one is under 100%, we need to kick it back up to
4160          * 100%. We also don't let it be over our "max" above
4161          * the actual rate, i.e. 100% + rack_clamp_atexit_prtt.
4162          * Note setting clamp_atexit_prtt to 0 has the effect
4163          * of setting CA/SS to 100% always at exit (which is
4164          * the default behavior).
4165          */
4166         if (rack_probertt_clear_is) {
4167                 rack->rc_gp_incr = 0;
4168                 rack->rc_gp_bwred = 0;
4169                 rack->rc_gp_timely_inc_cnt = 0;
4170                 rack->rc_gp_timely_dec_cnt = 0;
4171         }
4172         /* Do we do any clamping at exit? */
4173         if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
4174                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
4175                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
4176         }
4177         if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
4178                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
4179                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
4180         }
4181         /*
4182          * Let's set rtt_diff to 0, so that we will get a "boost"
4183          * after exiting.
4184          */
4185         rack->r_ctl.rc_rtt_diff = 0;
4186
4187         /* Clear all flags so we start fresh */
4188         rack->rc_tp->t_bytes_acked = 0;
4189         rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
4190         /*
4191          * If configured to, set the cwnd and ssthresh to
4192          * our targets.
4193          */
4194         if (rack_probe_rtt_sets_cwnd) {
4195                 uint64_t ebdp;
4196                 uint32_t setto;
4197
4198                 /* Set ssthresh so we get into CA once we hit our target */
4199                 if (rack_probertt_use_min_rtt_exit == 1) {
4200                         /* Set to min rtt */
4201                         rack_set_prtt_target(rack, segsiz,
4202                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
4203                 } else if (rack_probertt_use_min_rtt_exit == 2) {
4204                         /* Set to current gp rtt */
4205                         rack_set_prtt_target(rack, segsiz,
4206                                              rack->r_ctl.rc_gp_srtt);
4207                 } else if (rack_probertt_use_min_rtt_exit == 3) {
4208                         /* Set to entry gp rtt */
4209                         rack_set_prtt_target(rack, segsiz,
4210                                              rack->r_ctl.rc_entry_gp_rtt);
4211                 } else {
4212                         uint64_t sum;
4213                         uint32_t setval;
4214
4215                         sum = rack->r_ctl.rc_entry_gp_rtt;
4216                         sum *= 10;
4217                         sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
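                        /*
                         * Illustrative example (assumed values): if the
                         * gp rtt at entry was 80,000 us and the current
                         * gp srtt is 20,000 us, sum = 80,000 * 10 / 20,000
                         * = 40, so we treat the path as highly buffered
                         * and size the target off the entry rtt below.
                         */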
4218                         if (sum >= 20) {
4219                                 /*
4220                                  * A highly buffered path needs
4221                                  * cwnd space for timely to work.
4222                                  * Lets set things up as if
4224                                  * Let's set things up as if
4224                                  */
4225                                 setval = rack->r_ctl.rc_entry_gp_rtt;
4226                         } else if (sum >= 15) {
4227                                 /*
4228                                  * Let's take the smaller of the
4229                                  * two since we are just somewhat
4230                                  * buffered.
4231                                  */
4232                                 setval = rack->r_ctl.rc_gp_srtt;
4233                                 if (setval > rack->r_ctl.rc_entry_gp_rtt)
4234                                         setval = rack->r_ctl.rc_entry_gp_rtt;
4235                         } else {
4236                                 /*
4237                                  * Here we are not highly buffered
4238                                  * and should pick the min we can to
4239                                  * keep from causing loss.
4240                                  */
4241                                 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
4242                         }
4243                         rack_set_prtt_target(rack, segsiz,
4244                                              setval);
4245                 }
4246                 if (rack_probe_rtt_sets_cwnd > 1) {
4247                         /* There is a percentage here to boost */
4248                         ebdp = rack->r_ctl.rc_target_probertt_flight;
4249                         ebdp *= rack_probe_rtt_sets_cwnd;
4250                         ebdp /= 100;
4251                         setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
4252                 } else
4253                         setto = rack->r_ctl.rc_target_probertt_flight;
4254                 rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
4255                 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
4256                         /* Enforce a min */
4257                         rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
4258                 }
4259                 /* If we set in the cwnd also set the ssthresh point so we are in CA */
4260                 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
4261         }
4262         rack_log_rtt_shrinks(rack,  us_cts,
4263                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4264                              __LINE__, RACK_RTTS_EXITPROBE);
4265         /* Clear times last so log has all the info */
4266         rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
4267         rack->r_ctl.rc_time_probertt_entered = us_cts;
4268         rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
4269         rack->r_ctl.rc_time_of_last_probertt = us_cts;
4270 }
4271
4272 static void
4273 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
4274 {
4275         /* Check in on probe-rtt */
4276         if (rack->rc_gp_filled == 0) {
4277                 /* We do not do p-rtt unless we have gp measurements */
4278                 return;
4279         }
4280         if (rack->in_probe_rtt) {
4281                 uint64_t no_overflow;
4282                 uint32_t endtime, must_stay;
4283
4284                 if (rack->r_ctl.rc_went_idle_time &&
4285                     ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
4286                         /*
4287                          * We went idle during prtt, just exit now.
4288                          */
4289                         rack_exit_probertt(rack, us_cts);
4290                 } else if (rack_probe_rtt_safety_val &&
4291                     TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
4292                     ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
4293                         /*
4294                          * Probe RTT safety value triggered!
4295                          */
4296                         rack_log_rtt_shrinks(rack,  us_cts,
4297                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4298                                              __LINE__, RACK_RTTS_SAFETY);
4299                         rack_exit_probertt(rack, us_cts);
4300                 }
4301                 /* Calculate the max we will wait */
4302                 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
4303                 if (rack->rc_highly_buffered)
4304                         endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
4305                 /* Calculate the min we must wait */
4306                 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
4307                 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
4308                     TSTMP_LT(us_cts, endtime)) {
4309                         uint32_t calc;
4310                         /* Do we lower more? */
4311 no_exit:
4312                         if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
4313                                 calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
4314                         else
4315                                 calc = 0;
4316                         calc /= max(rack->r_ctl.rc_gp_srtt, 1);
4317                         if (calc) {
4318                                 /* Maybe */
4319                                 calc *= rack_per_of_gp_probertt_reduce;
4320                                 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
4321                                 /* Limit it too */
4322                                 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
4323                                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
4324                         }
4325                         /* We must reach target or the time set */
4326                         return;
4327                 }
4328                 if (rack->r_ctl.rc_time_probertt_starts == 0) {
4329                         if ((TSTMP_LT(us_cts, must_stay) &&
4330                              rack->rc_highly_buffered) ||
4331                              (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
4332                               rack->r_ctl.rc_target_probertt_flight)) {
4333                                 /* We are not past the must_stay time */
4334                                 goto no_exit;
4335                         }
4336                         rack_log_rtt_shrinks(rack,  us_cts,
4337                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4338                                              __LINE__, RACK_RTTS_REACHTARGET);
4339                         rack->r_ctl.rc_time_probertt_starts = us_cts;
4340                         if (rack->r_ctl.rc_time_probertt_starts == 0)
4341                                 rack->r_ctl.rc_time_probertt_starts = 1;
4342                         /* Restore back to our rate we want to pace at in prtt */
4343                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
4344                 }
4345                 /*
4346                  * Set up our end time: some number of gp_srtts plus 200ms.
4347                  */
4348                 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
4349                                (uint64_t)rack_probertt_gpsrtt_cnt_mul);
4350                 if (rack_probertt_gpsrtt_cnt_div)
4351                         endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
4352                 else
4353                         endtime = 0;
4354                 endtime += rack_min_probertt_hold;
4355                 endtime += rack->r_ctl.rc_time_probertt_starts;
4356                 if (TSTMP_GEQ(us_cts,  endtime)) {
4357                         /* yes, exit probertt */
4358                         rack_exit_probertt(rack, us_cts);
4359                 }
4360
4361         } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
4362                 /* Go into probertt, it's been too long since we went lower */
4363                 rack_enter_probertt(rack, us_cts);
4364         }
4365 }
4366
4367 static void
4368 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
4369                        uint32_t rtt, int32_t rtt_diff)
4370 {
4371         uint64_t cur_bw, up_bnd, low_bnd, subfr;
4372         uint32_t losses;
4373
4374         if ((rack->rc_gp_dyn_mul == 0) ||
4375             (rack->use_fixed_rate) ||
4376             (rack->in_probe_rtt) ||
4377             (rack->rc_always_pace == 0)) {
4378                 /* No dynamic GP multiplier in play */
4379                 return;
4380         }
4381         losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
4382         cur_bw = rack_get_bw(rack);
4383         /* Calculate our up and down range */
4384         up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
4385         up_bnd /= 100;
4386         up_bnd += rack->r_ctl.last_gp_comp_bw;
4387
4388         subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
4389         subfr /= 100;
4390         low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
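        /*
         * Illustrative example (assumed sysctl values): with
         * last_gp_comp_bw = 1,000,000 bytes/sec, rack_gp_per_bw_mul_up = 2
         * and rack_gp_per_bw_mul_down = 4, the bands are up_bnd = 1,020,000
         * and low_bnd = 960,000; estimates outside that range drive the
         * increase/decrease logic below, estimates inside defer to timely.
         */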
4391         if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
4392                 /*
4393                  * This is the case where our RTT is above
4394                  * the max target and we have been configured
4395                  * to just do timely with no bonus up stuff in that case.
4396                  *
4397                  * There are two configurations: set to 1, and we
4398                  * just do timely if we are over our max. If it is
4399                  * set above 1 then we slam the multipliers down
4400                  * to 100 and then decrement per timely.
4401                  */
4402                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4403                                 __LINE__, 3);
4404                 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
4405                         rack_validate_multipliers_at_or_below_100(rack);
4406                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4407         } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) {
4408                 /*
4409                  * We are decreasing. This is a bit complicated: it
4410                  * means we are losing ground. This could be
4411                  * because another flow entered and we are competing
4412                  * for b/w with it. This will push the RTT up which
4413                  * makes timely unusable unless we want to get shoved
4414                  * into a corner and just be backed off (the age
4415                  * old problem with delay based CC).
4416                  *
4417                  * On the other hand if it was a route change we
4418                  * would like to stay somewhat contained and not
4419                  * blow out the buffers.
4420                  */
4421                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4422                                 __LINE__, 3);
4423                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4424                 if (rack->rc_gp_bwred == 0) {
4425                         /* Go into reduction counting */
4426                         rack->rc_gp_bwred = 1;
4427                         rack->rc_gp_timely_dec_cnt = 0;
4428                 }
4429                 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) {
4430                         /*
4431                          * Push another time with a faster pacing
4432                          * to try to gain back (we include override to
4433                          * get a full raise factor).
4434                          */
4435                         if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
4436                             (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
4437                             (timely_says == 0) ||
4438                             (rack_down_raise_thresh == 0)) {
4439                                 /*
4440                                  * Do an override up in b/w if we were
4441                                  * below the threshold, or if the threshold
4442                                  * is zero we always do the raise.
4443                                  */
4444                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
4445                         } else {
4446                                 /* Log it stays the same */
4447                                 rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
4448                                                 __LINE__, 11);
4449                         }
4450                         rack->rc_gp_timely_dec_cnt++;
4451                         /* We are not incrementing really no-count */
4452                         rack->rc_gp_incr = 0;
4453                         rack->rc_gp_timely_inc_cnt = 0;
4454                 } else {
4455                         /*
4456                          * Let's just use the RTT
4457                          * information and give up
4458                          * pushing.
4459                          */
4460                         goto use_timely;
4461                 }
4462         } else if ((timely_says != 2) &&
4463                     !losses &&
4464                     (last_bw_est > up_bnd)) {
4465                 /*
4466                  * We are increasing b/w, so let's keep going, updating
4467                  * our b/w and ignoring any timely input, unless
4468                  * of course we are at our max raise (if there is one).
4469                  */
4470
4471                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4472                                 __LINE__, 3);
4473                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4474                 if (rack->rc_gp_saw_ss &&
4475                     rack->r_ctl.rack_per_upper_bound_ss &&
4476                      (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) {
4477                             /*
4478                              * In cases where we can't go higher
4479                              * we should just use timely.
4480                              */
4481                             goto use_timely;
4482                 }
4483                 if (rack->rc_gp_saw_ca &&
4484                     rack->r_ctl.rack_per_upper_bound_ca &&
4485                     (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) {
4486                             /*
4487                              * In cases where we can't go higher
4488                              * we should just use timely.
4489                              */
4490                             goto use_timely;
4491                 }
4492                 rack->rc_gp_bwred = 0;
4493                 rack->rc_gp_timely_dec_cnt = 0;
4494                 /* You get a set number of pushes if timely is trying to reduce */
4495                 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
4496                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4497                 } else {
4498                         /* Log it stays the same */
4499                         rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
4500                             __LINE__, 12);
4501                 }
4502                 return;
4503         } else {
4504                 /*
4505                  * We are staying between the lower and upper range bounds
4506                  * so use timely to decide.
4507                  */
4508                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4509                                 __LINE__, 3);
4510 use_timely:
4511                 if (timely_says) {
4512                         rack->rc_gp_incr = 0;
4513                         rack->rc_gp_timely_inc_cnt = 0;
4514                         if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
4515                             !losses &&
4516                             (last_bw_est < low_bnd)) {
4517                                 /* We are losing ground */
4518                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4519                                 rack->rc_gp_timely_dec_cnt++;
4520                                 /* We are not incrementing really no-count */
4521                                 rack->rc_gp_incr = 0;
4522                                 rack->rc_gp_timely_inc_cnt = 0;
4523                         } else
4524                                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4525                 } else {
4526                         rack->rc_gp_bwred = 0;
4527                         rack->rc_gp_timely_dec_cnt = 0;
4528                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4529                 }
4530         }
4531 }
4532
4533 static int32_t
4534 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4535 {
4536         int32_t timely_says;
4537         uint64_t log_mult, log_rtt_a_diff;
4538
4539         log_rtt_a_diff = rtt;
4540         log_rtt_a_diff <<= 32;
4541         log_rtt_a_diff |= (uint32_t)rtt_diff;
4542         if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4543                     rack_gp_rtt_maxmul)) {
4544                 /* Reduce the b/w multiplier */
4545                 timely_says = 2;
4546                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4547                 log_mult <<= 32;
4548                 log_mult |= prev_rtt;
4549                 rack_log_timely(rack,  timely_says, log_mult,
4550                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4551                                 log_rtt_a_diff, __LINE__, 4);
4552         } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4553                            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4554                             max(rack_gp_rtt_mindiv , 1)))) {
4555                 /* Increase the b/w multiplier */
4556                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4557                         ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4558                          max(rack_gp_rtt_mindiv , 1));
4559                 log_mult <<= 32;
4560                 log_mult |= prev_rtt;
4561                 timely_says = 0;
4562                 rack_log_timely(rack,  timely_says, log_mult ,
4563                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4564                                 log_rtt_a_diff, __LINE__, 5);
4565         } else {
4566                 /*
4567                  * Use a gradient. The timely gradient
4568                  * is:
4569                  * grad = rc_rtt_diff / min_rtt;
4570                  *
4571                  * Anything below or equal to 0 is an
4572                  * increase indication. Anything above
4573                  * zero is a decrease. Note we take care
4574                  * of the actual gradient calculation
4575                  * in the reduction (it's not needed for
4576                  * an increase).
4577                  */
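                     /*
                      * For illustration (hypothetical numbers): with a min_rtt
                      * of 10,000us, an rtt_diff of +2,000us is a positive
                      * gradient (0.2) and asks for a decrease (timely_says = 1),
                      * while an rtt_diff of -1,000us (or 0) asks for an
                      * increase (timely_says = 0).
                      */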
4578                 log_mult = prev_rtt;
4579                 if (rtt_diff <= 0) {
4580                         /*
4581                          * Rttdiff is zero or negative, so increase
4582                          * the b/w multiplier.
4583                          */
4584                         timely_says = 0;
4585                         rack_log_timely(rack,  timely_says, log_mult,
4586                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4587                 } else {
4588                         /* Reduce the b/w multiplier */
4589                         timely_says = 1;
4590                         rack_log_timely(rack,  timely_says, log_mult,
4591                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4592                 }
4593         }
4594         return (timely_says);
4595 }
4596
4597 static __inline int
4598 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)
4599 {
4600         if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4601             SEQ_LEQ(rsm->r_end, tp->gput_ack)) {
4602                 /**
4603                  * This covers the case that the
4604                  * resend is completely inside
4605                  * the gp range or up to it.
4606                  *      |----------------|
4607                  *      |-----| <or>
4608                  *            |----|
4609                  *            <or>   |---|
4610                  */
4611                 return (1);
4612         } else if (SEQ_LT(rsm->r_start, tp->gput_seq) &&
4613                    SEQ_GT(rsm->r_end, tp->gput_seq)){
4614                 /**
4615                  * This covers the case of
4616                  *      |--------------|
4617                  *  |-------->|
4618                  */
4619                 return (1);
4620         } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4621                    SEQ_LT(rsm->r_start, tp->gput_ack) &&
4622                    SEQ_GEQ(rsm->r_end, tp->gput_ack)) {
4623
4624                 /**
4625                  * This covers the case of
4626                  *      |--------------|
4627                  *              |-------->|
4628                  */
4629                 return (1);
4630         }
4631         return (0);
4632 }
4633
4634 static __inline void
4635 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)
4636 {
4637
4638         if ((tp->t_flags & TF_GPUTINPROG) == 0)
4639                 return;
4640         /*
4641          * We have a Goodput measurement in progress. Mark
4642          * the send if it's within the window. If it's not
4643          * in the window, make sure it does not have the mark.
4644          */
4645         if (rack_in_gp_window(tp, rsm))
4646                 rsm->r_flags |= RACK_IN_GP_WIN;
4647         else
4648                 rsm->r_flags &= ~RACK_IN_GP_WIN;
4649 }
4650
4651 static __inline void
4652 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
4653 {
4654         /* A GP measurement is ending, clear all marks on the send map */
4655         struct rack_sendmap *rsm = NULL;
4656
4657         rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
4658         if (rsm == NULL) {
4659                 rsm = tqhash_min(rack->r_ctl.tqh);
4660         }
4661         /* Nothing left? */
4662         while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){
4663                 rsm->r_flags &= ~RACK_IN_GP_WIN;
4664                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4665         }
4666 }
4667
4668
4669 static __inline void
4670 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
4671 {
4672         struct rack_sendmap *rsm = NULL;
4673
4674         if (tp->snd_una == tp->snd_max) {
4675                 /* Nothing outstanding yet, nothing to do here */
4676                 return;
4677         }
4678         if (SEQ_GT(tp->gput_seq, tp->snd_una)) {
4679                 /*
4680                  * We are measuring ahead of some outstanding
4681                  * data. We need to walk through, up until we get
4682                  * to gput_seq, marking so that no rsm is set incorrectly
4683                  * with RACK_IN_GP_WIN.
4684                  */
4685                 rsm = tqhash_min(rack->r_ctl.tqh);
4686                 while (rsm != NULL) {
4687                         rack_mark_in_gp_win(tp, rsm);
4688                         if (SEQ_GEQ(rsm->r_end, tp->gput_seq))
4689                                 break;
4690                         rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4691                 }
4692         }
4693         if (rsm == NULL) {
4694                 /*
4695                  * Need to find the GP seq, if rsm is
4696                  * set we stopped as we hit it.
4697                  */
4698                 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
4699                 if (rsm == NULL)
4700                         return;
4701                 rack_mark_in_gp_win(tp, rsm);
4702         }
4703         /*
4704          * Now we may need to mark already sent rsm, ahead of
4705          * gput_seq in the window since they may have been sent
4706          * *before* we started our measurement. The rsm, if non-NULL,
4707          * has been marked (note if rsm had been NULL we would have
4708          * returned in the previous block). So we go to the next, and continue
4709          * until we run out of entries or we exceed the gput_ack value.
4710          */
4711         rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4712         while (rsm) {
4713                 rack_mark_in_gp_win(tp, rsm);
4714                 if (SEQ_GT(rsm->r_end, tp->gput_ack))
4715                         break;
4716                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4717         }
4718 }
4719
4720 static void
4721 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4722                             tcp_seq th_ack, int line, uint8_t quality)
4723 {
4724         uint64_t tim, bytes_ps, stim, utim;
4725         uint32_t segsiz, bytes, reqbytes, us_cts;
4726         int32_t gput, new_rtt_diff, timely_says;
4727         uint64_t  resid_bw, subpart = 0, addpart = 0, srtt;
4728         int did_add = 0;
4729
4730         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4731         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4732         if (TSTMP_GEQ(us_cts, tp->gput_ts))
4733                 tim = us_cts - tp->gput_ts;
4734         else
4735                 tim = 0;
4736         if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4737                 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4738         else
4739                 stim = 0;
4740         /*
4741          * Use the larger of the send time or ack time. This prevents us
4742          * from being influenced by ack artifacts to come up with too
4743          * high of a measurement. Note that since we are spanning over many more
4744          * bytes in most of our measurements hopefully that is less likely to
4745          * occur.
4746          */
4747         if (tim > stim)
4748                 utim = max(tim, 1);
4749         else
4750                 utim = max(stim, 1);
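             /*
              * Example of the above (hypothetical numbers): if the ack-clocked
              * time (tim) was 12,000us and the send-clocked time (stim) was
              * 15,000us, utim becomes 15,000us and the b/w math below divides
              * the acked bytes by that larger value.
              */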
4751         reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4752         rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL);
4753         if ((tim == 0) && (stim == 0)) {
4754                 /*
4755                  * Invalid measurement time, maybe
4756                  * all on one ack/one send?
4757                  */
4758                 bytes = 0;
4759                 bytes_ps = 0;
4760                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4761                                            0, 0, 0, 10, __LINE__, NULL, quality);
4762                 goto skip_measurement;
4763         }
4764         if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4765                 /* We never made a us_rtt measurement? */
4766                 bytes = 0;
4767                 bytes_ps = 0;
4768                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4769                                            0, 0, 0, 10, __LINE__, NULL, quality);
4770                 goto skip_measurement;
4771         }
4772         /*
4773          * Calculate the maximum possible b/w this connection
4774          * could have. We base our calculation on the lowest
4775          * rtt we have seen during the measurement and the
4776          * largest rwnd the client has given us in that time. This
4777          * forms a BDP that is the maximum that we could ever
4778          * get to the client. Anything larger is not valid.
4779          *
4780          * I originally had code here that rejected measurements
4781          * where the time was less than 1/2 the latest us_rtt.
4782          * But after thinking on that I realized it's wrong since,
4783          * say, you had a 150Mbps or even 1Gbps link, and you
4784          * were a long way away. For example, I am in Europe (100ms rtt)
4785          * talking to my 1Gbps link in S.C. Measuring, say, 150,000
4786          * bytes my time would be 1.2ms, and yet that rule would say
4787          * the measurement was invalid because the time was < 50ms. The
4788          * same thing is true for 150Mb (8ms of time).
4789          *
4790          * A better way I realized is to look at what the maximum
4791          * the connection could possibly do. This is gated on
4792          * the lowest RTT we have seen and the highest rwnd.
4793          * We should in theory never exceed that, if we are
4794          * then something on the path is storing up packets
4795          * and then feeding them all at once to our endpoint
4796          * messing up our measurement.
4797          */
4798         rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4799         rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4800         rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
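             /*
              * For illustration (hypothetical numbers): a highest rwnd of
              * 1,000,000 bytes and a lowest rtt of 20,000us cap the rate at
              * 1,000,000 * 1,000,000 / 20,000 = 50,000,000 bytes/sec
              * (about 400Mbps). Any measurement above this is clipped to
              * last_max_bw further down.
              */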
4801         if (SEQ_LT(th_ack, tp->gput_seq)) {
4802                 /* No measurement can be made */
4803                 bytes = 0;
4804                 bytes_ps = 0;
4805                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4806                                            0, 0, 0, 10, __LINE__, NULL, quality);
4807                 goto skip_measurement;
4808         } else
4809                 bytes = (th_ack - tp->gput_seq);
4810         bytes_ps = (uint64_t)bytes;
4811         /*
4812          * Don't measure a b/w for pacing unless we have gotten at least
4813          * an initial window's worth of data in this measurement interval.
4814          *
4815          * Small numbers of bytes get badly influenced by delayed ack and
4816          * other artifacts. Note we take the initial window or our
4817          * defined minimum GP (defaulting to 10 which hopefully is the
4818          * IW).
4819          */
4820         if (rack->rc_gp_filled == 0) {
4821                 /*
4822                  * The initial estimate is special. We
4823                  * have blasted out an IW worth of packets
4824                  * without a real valid ack ts result. We
4825                  * then set up the app_limited_needs_set flag;
4826                  * this should get the first ack in (probably 2
4827                  * MSS worth) to be recorded as the timestamp.
4828                  * We thus allow a smaller number of bytes i.e.
4829                  * IW - 2MSS.
4830                  */
4831                 reqbytes -= (2 * segsiz);
4832                 /* Also lets fill previous for our first measurement to be neutral */
4833                 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4834         }
4835         if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4836                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4837                                            rack->r_ctl.rc_app_limited_cnt,
4838                                            0, 0, 10, __LINE__, NULL, quality);
4839                 goto skip_measurement;
4840         }
4841         /*
4842          * We now need to calculate the Timely like status so
4843          * we can update (possibly) the b/w multipliers.
4844          */
4845         new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4846         if (rack->rc_gp_filled == 0) {
4847                 /* No previous reading */
4848                 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4849         } else {
4850                 if (rack->measure_saw_probe_rtt == 0) {
4851                         /*
4852                          * We don't want a probertt to be counted
4853                          * since it will be negative incorrectly. We
4854                          * expect to be reducing the RTT when we
4855                          * pace at a slower rate.
4856                          */
4857                         rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4858                         rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4859                 }
4860         }
4861         timely_says = rack_make_timely_judgement(rack,
4862             rack->r_ctl.rc_gp_srtt,
4863             rack->r_ctl.rc_rtt_diff,
4864             rack->r_ctl.rc_prev_gp_srtt
4865         );
4866         bytes_ps *= HPTS_USEC_IN_SEC;
4867         bytes_ps /= utim;
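             /*
              * e.g. (hypothetical numbers) 150,000 bytes acked over a utim of
              * 12,000us gives a bytes_ps of 150,000 * 1,000,000 / 12,000 =
              * 12,500,000 bytes/sec.
              */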
4868         if (bytes_ps > rack->r_ctl.last_max_bw) {
4869                 /*
4870                  * Something on the path is playing games,
4871                  * since this b/w is not possible based
4872                  * on our BDP (highest rwnd and lowest rtt
4873                  * we saw in the measurement window).
4874                  *
4875                  * Another option here would be to
4876                  * instead skip the measurement.
4877                  */
4878                 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4879                                            bytes_ps, rack->r_ctl.last_max_bw, 0,
4880                                            11, __LINE__, NULL, quality);
4881                 bytes_ps = rack->r_ctl.last_max_bw;
4882         }
4883         /* We store gp for b/w in bytes per second */
4884         if (rack->rc_gp_filled == 0) {
4885                 /* Initial measurement */
4886                 if (bytes_ps) {
4887                         rack->r_ctl.gp_bw = bytes_ps;
4888                         rack->rc_gp_filled = 1;
4889                         rack->r_ctl.num_measurements = 1;
4890                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4891                 } else {
4892                         rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4893                                                    rack->r_ctl.rc_app_limited_cnt,
4894                                                    0, 0, 10, __LINE__, NULL, quality);
4895                 }
4896                 if (tcp_in_hpts(rack->rc_inp) &&
4897                     (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4898                         /*
4899                          * Ok we can't trust the pacer in this case
4900                          * where we transition from un-paced to paced.
4901                          * Or for that matter when the burst mitigation
4902                          * was making a wild guess and got it wrong.
4903                          * Stop the pacer and clear up all the aggregate
4904                          * delays etc.
4905                          */
4906                         tcp_hpts_remove(rack->rc_inp);
4907                         rack->r_ctl.rc_hpts_flags = 0;
4908                         rack->r_ctl.rc_last_output_to = 0;
4909                 }
4910                 did_add = 2;
4911         } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4912                 /* Still a small number run an average */
4913                 rack->r_ctl.gp_bw += bytes_ps;
4914                 addpart = rack->r_ctl.num_measurements;
4915                 rack->r_ctl.num_measurements++;
4916                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4917                         /* We have collected enough to move forward */
4918                         rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4919                 }
4920                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
4921                 did_add = 3;
4922         } else {
4923                 /*
4924                  * We want to take 1/wma of the new goodput and add it in
4925                  * to (wma-1)/wma of the old value, weighted by the srtt. So if your
4926                  * measurement period is say 2 SRTTs long you would get 1/4 as the
4927                  * weight, if it was like 1/2 SRTT then you would get 1/16th.
4928                  *
4929                  * But we must be careful not to take too much i.e. if the
4930                  * srtt is say 20ms and the measurement is taken over
4931                  * 400ms our weight would be 400/20 i.e. 20. On the
4932                  * other hand if we get a measurement over 1ms with a
4933                  * 10ms rtt we only want to take a much smaller portion.
4934                  */
4935                 if (rack->r_ctl.num_measurements < 0xff) {
4936                         rack->r_ctl.num_measurements++;
4937                 }
4938                 srtt = (uint64_t)tp->t_srtt;
4939                 if (srtt == 0) {
4940                         /*
4941                          * Strange why did t_srtt go back to zero?
4942                          */
4943                         if (rack->r_ctl.rc_rack_min_rtt)
4944                                 srtt = rack->r_ctl.rc_rack_min_rtt;
4945                         else
4946                                 srtt = HPTS_USEC_IN_MSEC;
4947                 }
4948                 /*
4949                  * XXXrrs: Note for reviewers, in playing with
4950                  * dynamic pacing I discovered this GP calculation
4951                  * as done originally leads to some undesired results.
4952                  * Basically you can get longer measurements contributing
4953                  * too much to the WMA. Thus I changed it so that if you are
4954                  * doing dynamic adjustments we only do the apportioned adjustment
4955                  * if we have a very small (time wise) measurement. Longer
4956                  * measurements just get their weight (defaulting to 1/8)
4957                  * added to the WMA. We may want to think about changing
4958                  * this to always do that for both sides i.e. dynamic
4959                  * and non-dynamic... but considering lots of folks
4960                  * were playing with this I did not want to change the
4961                  * calculation per se without your thoughts... Lawrence?
4962                  * Peter??
4963                  */
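                     /*
                      * Worked example of the WMA below (hypothetical numbers):
                      * gp_bw = 10,000,000 B/s, new measurement bytes_ps =
                      * 12,000,000 B/s, utim = 2 * srtt. Non-dynamic:
                      * subpart = 10M * 2 / 8 = 2.5M and addpart = 12M * 2 / 8 = 3M,
                      * so the new gp_bw is 10M - 2.5M + 3M = 10.5M B/s. Dynamic
                      * pacing with such a long measurement (utim > srtt) instead
                      * takes a flat 1/rack_wma_divisor portion: with a divisor of
                      * 8 that is 10M - 1.25M + 1.5M = 10.25M B/s.
                      */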
4964                 if (rack->rc_gp_dyn_mul == 0) {
4965                         subpart = rack->r_ctl.gp_bw * utim;
4966                         subpart /= (srtt * 8);
4967                         if (subpart < (rack->r_ctl.gp_bw / 2)) {
4968                                 /*
4969                                  * The b/w update takes no more
4970                                  * away than 1/2 of our running total
4971                                  * so factor it in.
4972                                  */
4973                                 addpart = bytes_ps * utim;
4974                                 addpart /= (srtt * 8);
4975                         } else {
4976                                 /*
4977                                  * Don't allow a single measurement
4978                                  * to account for more than 1/2 of the
4979                                  * WMA. This could happen on a retransmission
4980                                  * where utim becomes huge compared to
4981                                  * srtt (multiple retransmissions when using
4982                                  * the sending rate which factors in all the
4983                                  * transmissions from the first one).
4984                                  */
4985                                 subpart = rack->r_ctl.gp_bw / 2;
4986                                 addpart = bytes_ps / 2;
4987                         }
4988                         resid_bw = rack->r_ctl.gp_bw - subpart;
4989                         rack->r_ctl.gp_bw = resid_bw + addpart;
4990                         did_add = 1;
4991                 } else {
4992                         if ((utim / srtt) <= 1) {
4993                                 /*
4994                                  * The b/w update was over a small period
4995                                  * of time. The idea here is to prevent a small
4996                                  * measurement time period from counting
4997                                  * too much. So we scale it based on the
4998                                  * time so it contributes less than 1/rack_wma_divisor
4999                                  * of its measurement.
5000                                  */
5001                                 subpart = rack->r_ctl.gp_bw * utim;
5002                                 subpart /= (srtt * rack_wma_divisor);
5003                                 addpart = bytes_ps * utim;
5004                                 addpart /= (srtt * rack_wma_divisor);
5005                         } else {
5006                                 /*
5007                                  * The scaled measurement was long
5008                                  * enough so let's just add in the
5009                                  * portion of the measurement i.e. 1/rack_wma_divisor
5010                                  */
5011                                 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
5012                                 addpart = bytes_ps / rack_wma_divisor;
5013                         }
5014                         if ((rack->measure_saw_probe_rtt == 0) ||
5015                             (bytes_ps > rack->r_ctl.gp_bw)) {
5016                                 /*
5017                                  * For probe-rtt we only add it in
5018                                  * if it's larger, all others we just
5019                                  * add in.
5020                                  */
5021                                 did_add = 1;
5022                                 resid_bw = rack->r_ctl.gp_bw - subpart;
5023                                 rack->r_ctl.gp_bw = resid_bw + addpart;
5024                         }
5025                 }
5026                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
5027         }
5028         if ((rack->gp_ready == 0) &&
5029             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
5030                 /* We have enough measurements now */
5031                 rack->gp_ready = 1;
5032                 if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) ||
5033                     rack->rack_hibeta)
5034                         rack_set_cc_pacing(rack);
5035                 if (rack->defer_options)
5036                         rack_apply_deferred_options(rack);
5037         }
5038         rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
5039                                    rack_get_bw(rack), 22, did_add, NULL, quality);
5040         /* We do not update any multipliers if we are in or have seen a probe-rtt */
5041         if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
5042                 rack_update_multiplier(rack, timely_says, bytes_ps,
5043                                        rack->r_ctl.rc_gp_srtt,
5044                                        rack->r_ctl.rc_rtt_diff);
5045         rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
5046                                    rack_get_bw(rack), 3, line, NULL, quality);
5047         rack_log_pacing_delay_calc(rack,
5048                                    bytes, /* flex2 */
5049                                    tim, /* flex1 */
5050                                    bytes_ps, /* bw_inuse */
5051                                    rack->r_ctl.gp_bw, /* delRate */
5052                                    rack_get_lt_bw(rack), /* rttProp */
5053                                    20, line, NULL, 0);
5054         /* reset the gp srtt and setup the new prev */
5055         rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
5056         /* Record the loss count for the next measurement */
5057         rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
5058 skip_measurement:
5059         /*
5060          * We restart our diffs based on the gpsrtt in the
5061          * measurement window.
5062          */
5063         rack->rc_gp_rtt_set = 0;
5064         rack->rc_gp_saw_rec = 0;
5065         rack->rc_gp_saw_ca = 0;
5066         rack->rc_gp_saw_ss = 0;
5067         rack->rc_dragged_bottom = 0;
5068
5069         if (quality == RACK_QUALITY_HIGH) {
5070                 /*
5071                  * Gput in the stats world is in kbps where bytes_ps is
5072                  * bytes per second so we do ((x * 8)/ 1000).
5073                  */
5074                 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000);
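                     /*
                      * e.g. a bytes_ps of 1,250,000 bytes/sec becomes
                      * (1,250,000 * 8) / 1000 = 10,000 kbps for stats(9).
                      */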
5075 #ifdef STATS
5076                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
5077                                          gput);
5078                 /*
5079                  * XXXLAS: This is a temporary hack, and should be
5080                  * chained off VOI_TCP_GPUT when stats(9) grows an
5081                  * API to deal with chained VOIs.
5082                  */
5083                 if (tp->t_stats_gput_prev > 0)
5084                         stats_voi_update_abs_s32(tp->t_stats,
5085                                                  VOI_TCP_GPUT_ND,
5086                                                  ((gput - tp->t_stats_gput_prev) * 100) /
5087                                                  tp->t_stats_gput_prev);
5088 #endif
5089                 tp->t_stats_gput_prev = gput;
5090         }
5091         tp->t_flags &= ~TF_GPUTINPROG;
5092         /*
5093          * Now, are we app limited and is there space from where we
5094          * were to where we want to go?
5095          *
5096          * We don't do the other case i.e. non-applimited here since
5097          * the next send will trigger us picking up the missing data.
5098          */
5099         if (rack->r_ctl.rc_first_appl &&
5100             TCPS_HAVEESTABLISHED(tp->t_state) &&
5101             rack->r_ctl.rc_app_limited_cnt &&
5102             (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
5103             ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
5104              max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
5105                 /*
5106                  * Yep there is enough outstanding to make a measurement here.
5107                  */
5108                 struct rack_sendmap *rsm;
5109
5110                 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
5111                 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
5112                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
5113                 rack->app_limited_needs_set = 0;
5114                 tp->gput_seq = th_ack;
5115                 if (rack->in_probe_rtt)
5116                         rack->measure_saw_probe_rtt = 1;
5117                 else if ((rack->measure_saw_probe_rtt) &&
5118                          (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
5119                         rack->measure_saw_probe_rtt = 0;
5120                 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
5121                         /* There is a full window to gain info from */
5122                         tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
5123                 } else {
5124                         /* We can only measure up to the applimited point */
5125                         tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
5126                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
5127                                 /*
5128                                  * We don't have enough to make a measurement.
5129                                  */
5130                                 tp->t_flags &= ~TF_GPUTINPROG;
5131                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
5132                                                            0, 0, 0, 6, __LINE__, NULL, quality);
5133                                 return;
5134                         }
5135                 }
5136                 if (tp->t_state >= TCPS_FIN_WAIT_1) {
5137                         /*
5138                          * We will get no more data into the SB;
5139                          * this means we need to have the data available
5140                          * before we start a measurement.
5141                          */
5142                         if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) {
5143                                 /* Nope not enough data. */
5144                                 return;
5145                         }
5146                 }
5147                 tp->t_flags |= TF_GPUTINPROG;
5148                 /*
5149                  * Now we need to find the timestamp of the send at tp->gput_seq
5150                  * for the send based measurement.
5151                  */
5152                 rack->r_ctl.rc_gp_cumack_ts = 0;
5153                 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
5154                 if (rsm) {
5155                         /* Ok send-based limit is set */
5156                         if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
5157                                 /*
5158                                  * Move back to include the earlier part
5159                                  * so our ack time lines up right (this may
5160                                  * make an overlapping measurement but that's
5161                                  * ok).
5162                                  */
5163                                 tp->gput_seq = rsm->r_start;
5164                         }
5165                         if (rsm->r_flags & RACK_ACKED) {
5166                                 struct rack_sendmap *nrsm;
5167
5168                                 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
5169                                 tp->gput_seq = rsm->r_end;
5170                                 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
5171                                 if (nrsm)
5172                                         rsm = nrsm;
5173                                 else {
5174                                         rack->app_limited_needs_set = 1;
5175                                 }
5176                         } else
5177                                 rack->app_limited_needs_set = 1;
5178                         /* We always go from the first send */
5179                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
5180                 } else {
5181                         /*
5182                          * If we don't find the rsm due to some
5183                          * send-limit, set the current time, which
5184                          * basically disables the send-limit.
5185                          */
5186                         struct timeval tv;
5187
5188                         microuptime(&tv);
5189                         rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
5190                 }
5191                 rack_tend_gp_marks(tp, rack);
5192                 rack_log_pacing_delay_calc(rack,
5193                                            tp->gput_seq,
5194                                            tp->gput_ack,
5195                                            (uint64_t)rsm,
5196                                            tp->gput_ts,
5197                                            (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
5198                                            9,
5199                                            __LINE__, rsm, quality);
5200                 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
5201         } else {
5202                 /*
5203                  * To make sure proper timestamp merging occurs, we need to clear
5204                  * all GP marks if we don't start a measurement.
5205                  */
5206                 rack_clear_gp_marks(tp, rack);
5207         }
5208 }
5209
5210 /*
5211  * CC wrapper hook functions
5212  */
5213 static void
5214 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
5215     uint16_t type, int32_t recovery)
5216 {
5217         uint32_t prior_cwnd, acked;
5218         struct tcp_log_buffer *lgb = NULL;
5219         uint8_t labc_to_use, quality;
5220
5221         INP_WLOCK_ASSERT(tptoinpcb(tp));
5222         tp->t_ccv.nsegs = nsegs;
5223         acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una);
5224         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
5225                 uint32_t max;
5226
5227                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
5228                 if (tp->t_ccv.bytes_this_ack > max) {
5229                         tp->t_ccv.bytes_this_ack = max;
5230                 }
5231         }
5232 #ifdef STATS
5233         stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
5234             ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
5235 #endif
5236         if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
5237                 /* This will ack everything, so it is time
5238                  * to end any lt_bw_up measurement we
5239                  * have running until something
5240                  * new is sent.
5241                  */
5242                 struct timeval tv;
5243
5244                 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
5245                 rack->r_ctl.lt_seq = tp->snd_max;
5246                 (void)tcp_get_usecs(&tv);
5247                 rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark);
5248                 rack->lt_bw_up = 0;
5249         }
5250         quality = RACK_QUALITY_NONE;
5251         if ((tp->t_flags & TF_GPUTINPROG) &&
5252             rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
5253                 /* Measure the Goodput */
5254                 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
5255         }
5256         /* Which way are we limited; if not cwnd limited, no advance in CA */
5257         if (tp->snd_cwnd <= tp->snd_wnd)
5258                 tp->t_ccv.flags |= CCF_CWND_LIMITED;
5259         else
5260                 tp->t_ccv.flags &= ~CCF_CWND_LIMITED;
5261         if (tp->snd_cwnd > tp->snd_ssthresh) {
5262                 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack,
5263                          nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
5264                 /* For detecting that a window has passed, use the actual (s)cwnd we are using */
5265                 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
5266                         tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
5267                         tp->t_ccv.flags |= CCF_ABC_SENTAWND;
5268                 }
5269         } else {
5270                 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
5271                 tp->t_bytes_acked = 0;
5272         }
5273         prior_cwnd = tp->snd_cwnd;
5274         if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
5275             (rack_client_low_buf && rack->client_bufferlvl &&
5276             (rack->client_bufferlvl < rack_client_low_buf)))
5277                 labc_to_use = rack->rc_labc;
5278         else
5279                 labc_to_use = rack_max_abc_post_recovery;
5280         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
5281                 union tcp_log_stackspecific log;
5282                 struct timeval tv;
5283
5284                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5285                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5286                 log.u_bbr.flex1 = th_ack;
5287                 log.u_bbr.flex2 = tp->t_ccv.flags;
5288                 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
5289                 log.u_bbr.flex4 = tp->t_ccv.nsegs;
5290                 log.u_bbr.flex5 = labc_to_use;
5291                 log.u_bbr.flex6 = prior_cwnd;
5292                 log.u_bbr.flex7 = V_tcp_do_newsack;
5293                 log.u_bbr.flex8 = 1;
5294                 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5295                                      0, &log, false, NULL, __func__, __LINE__,&tv);
5296         }
5297         if (CC_ALGO(tp)->ack_received != NULL) {
5298                 /* XXXLAS: Find a way to live without this */
5299                 tp->t_ccv.curack = th_ack;
5300                 tp->t_ccv.labc = labc_to_use;
5301                 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
5302                 CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
5303         }
5304         if (lgb) {
5305                 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
5306         }
5307         if (rack->r_must_retran) {
5308                 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
5309                         /*
5310                          * We now are beyond the rxt point so lets disable
5311                          * the flag.
5312                          */
5313                         rack->r_ctl.rc_out_at_rto = 0;
5314                         rack->r_must_retran = 0;
5315                 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
5316                         /*
5317                          * Only decrement the rc_out_at_rto if the cwnd advances
5318                          * at least a whole segment. Otherwise next time the peer
5319                          * at least a whole segment. Otherwise, next time the peer
5320                          * acks, we won't be able to send. This generally happens
5321                          */
5322                         if (acked <= rack->r_ctl.rc_out_at_rto){
5323                                 rack->r_ctl.rc_out_at_rto -= acked;
5324                         } else {
5325                                 rack->r_ctl.rc_out_at_rto = 0;
5326                         }
5327                 }
5328         }
5329 #ifdef STATS
5330         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
5331 #endif
5332         if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
5333                 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
5334         }
5335 }
5336
5337 static void
5338 tcp_rack_partialack(struct tcpcb *tp)
5339 {
5340         struct tcp_rack *rack;
5341
5342         rack = (struct tcp_rack *)tp->t_fb_ptr;
5343         INP_WLOCK_ASSERT(tptoinpcb(tp));
5344         /*
5345          * If we are doing PRR and have enough
5346          * room to send <or> we are pacing and prr
5347          * is disabled we will want to see if we
5348          * can send data (by setting r_wanted_output to
5349          * true).
5350          */
5351         if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
5352             rack->rack_no_prr)
5353                 rack->r_wanted_output = 1;
5354 }
5355
5356 static inline void
5357 rack_set_most_aggr(struct tcp_rack *rack)
5358 {
5359         rack->r_fill_less_agg = 0;
5360         /* Once the cwnd has been clamped we don't do fill_cw */
5361         if (rack->r_cwnd_was_clamped == 0)
5362                 rack->rc_pace_to_cwnd = 1;
5363         rack->r_pacing_discount = 0;
5364 }
5365
5366 static inline void
5367 rack_limit_fillcw(struct tcp_rack *rack)
5368 {
5369         rack->r_fill_less_agg = 1;
5370         /* Once the cwnd has been clamped we don't do fill_cw */
5371         if (rack->r_cwnd_was_clamped == 0)
5372                 rack->rc_pace_to_cwnd = 1;
5373         rack->r_pacing_discount = 0;
5374 }
5375
5376 static inline void
5377 rack_disable_fillcw(struct tcp_rack *rack)
5378 {
5379         rack->r_fill_less_agg = 1;
5380         rack->rc_pace_to_cwnd = 0;
5381         rack->r_pacing_discount = 0;
5382 }
5383
5384 static void
5385 rack_client_buffer_level_set(struct tcp_rack *rack)
5386 {
5387         /*
5388          * Only if DGP is on do we do anything that
5389          * changes stack behavior. If DGP is off all
5390          * we will do is issue a BB log (if BB logging is
5391          * on) and return.
5392          */
5393         if (rack->dgp_on == 0) {
5394                 rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl,
5395                                            0, 0, 0, 30, __LINE__, NULL, 0);
5396                 return;
5397         }
5398         if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) {
5399                 goto set_most_agg;
5400         }
5401         /*
5402          * We are in DGP so what setting should we
5403          * apply based on where the client is?
5404          */
5405         switch(rack->r_ctl.rc_dgp_bl_agg) {
5406         default:
5407         case DGP_LEVEL0:
5408 set_most_agg:
5409                 rack_set_most_aggr(rack);
5410                 break;
5411         case DGP_LEVEL1:
5412                 if (rack->client_bufferlvl == 4)
5413                         rack_limit_fillcw(rack);
5414                 else if (rack->client_bufferlvl == 5)
5415                         rack_disable_fillcw(rack);
5416                 else
5417                         rack_set_most_aggr(rack);
5418                 break;
5419         case DGP_LEVEL2:
5420                 if (rack->client_bufferlvl == 3)
5421                         rack_limit_fillcw(rack);
5422                 else if (rack->client_bufferlvl == 4)
5423                         rack_disable_fillcw(rack);
5424                 else if (rack->client_bufferlvl == 5) {
5425                         rack_disable_fillcw(rack);
5426                         rack->r_pacing_discount = 1;
5427                         rack->r_ctl.pacing_discount_amm = 1;
5428                 } else
5429                         rack_set_most_aggr(rack);
5430                 break;
5431         case DGP_LEVEL3:
5432                 if (rack->client_bufferlvl == 2)
5433                         rack_limit_fillcw(rack);
5434                 else if (rack->client_bufferlvl == 3)
5435                         rack_disable_fillcw(rack);
5436                 else if (rack->client_bufferlvl == 4) {
5437                         rack_disable_fillcw(rack);
5438                         rack->r_pacing_discount = 1;
5439                         rack->r_ctl.pacing_discount_amm = 1;
5440                 } else if (rack->client_bufferlvl == 5) {
5441                         rack_disable_fillcw(rack);
5442                         rack->r_pacing_discount = 1;
5443                         rack->r_ctl.pacing_discount_amm = 2;
5444                 } else
5445                         rack_set_most_aggr(rack);
5446                 break;
5447         }
5448         rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0,
5449                                    0, 0, 30, __LINE__, NULL, 0);
5450 }
5451
5452 static void
5453 do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack)
5454 {
5455         /*
5456          * Can we unclamp? We unclamp if more than
5457          * N rounds have transpired with a low enough rxt percentage.
5458          */
5459         uint64_t snds, rxts, rxt_per;
5460         uint32_t rnds;
5461
5462         rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
5463         if ((rack_unclamp_round_thresh > 0) &&
5464             (rnds >= rack_unclamp_round_thresh)) {
5465                 snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
5466                 KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp,
5467                     (uintmax_t)snds));
5468                 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
5469                 rxt_per = rxts * 1000;
5470                 rxt_per /= snds;
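                     /*
                      * rxt_per is retransmitted bytes per mille of bytes sent,
                      * e.g. (hypothetical numbers) 5,000 rxt bytes over
                      * 1,000,000 sent bytes gives 5 (0.5%), which is then
                      * compared against rack_unclamp_rxt_thresh below.
                      */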
5471                 if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) {
5472                         /* Unclamp */
5473                         if (tcp_bblogging_on(rack->rc_tp)) {
5474                                 union tcp_log_stackspecific log;
5475                                 struct timeval tv;
5476
5477                                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5478                                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5479                                 log.u_bbr.flex3 = rnds;
5480                                 log.u_bbr.flex4 = rack_unclamp_round_thresh;
5481                                 log.u_bbr.flex5 = (uint32_t)rxt_per;
5482                                 log.u_bbr.flex8 = 6;
5483                                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
5484                                 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
5485                                 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
5486                                 log.u_bbr.applimited = rack->r_ctl.max_clamps;
5487                                 log.u_bbr.epoch = rack->r_ctl.clamp_options;
5488                                 log.u_bbr.cur_del_rate = rxts;
5489                                 log.u_bbr.bw_inuse = rack_get_lt_bw(rack);
5490                                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
5491                                 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
5492                                 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
5493                                 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5494                                               0, &log, false, NULL, NULL, 0, &tv);
5495                         }
5496                         rack->r_ctl.num_of_clamps_applied = 0;
5497                         rack->r_cwnd_was_clamped = 0;
5498                         rack->excess_rxt_on = 1;
5499                         if (rack->r_ctl.clamp_options) {
5500                                 /*
5501                                  * We only allow fillcw to be toggled
5502                                  * if you are setting a max seg too.
5503                                  */
5504                                 if (rack->r_ctl.clamp_options & 0x1) {
5505                                         if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
5506                                                 /* turn on fill cw  for non-dgp*/
5507                                                 rack->rc_pace_to_cwnd = 0;
5508                                         } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
5509                                                 /* For DGP we want it off */
5510                                                 rack->rc_pace_to_cwnd = 1;
5511                                         }
5512                                 }
5513                         }
5514                         if (rack->dgp_on) {
5515                                 /* Reset all multipliers to 100.0 so just the measured bw */
5516                                 /* Crash any per boosts down to 100% */
5517                                 rack->r_ctl.rack_per_of_gp_rec = 100;
5518                                 rack->r_ctl.rack_per_of_gp_ss = 100;
5519                                 rack->r_ctl.rack_per_of_gp_ca = 100;
5520                                 /* Set in an upper bound for ss/ca % increase */
5521                                 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
5522                                 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
5523                         }
5524                 }
5525         }
5526 }
5527
5528 static void
5529 do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack)
5530 {
5531         /*
5532          * Rack excess rxt accounting is turned on. If we
5533          * are above a threshold of rxt's in at least N
5534          * rounds, then back off the cwnd and ssthresh
5535          * to fit into the long-term b/w.
5536          */
5537         uint64_t snds, rxts, rxt_per, lt_bw, bdp;
5538         uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0;
5539
5540         /* Is it shut off by 0 rounds? */
5541         if (rack_rxt_min_rnds == 0)
5542                 return;
5543         if ((rack->r_ctl.max_clamps > 0) &&
5544             (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) {
5545                 /*
5546                  * The idea, if max_clamps is set, is that if clamping it
5547                  * N times did not help, then there is no sense in
5548                  * clamping it again. The link is just a lossy link and
5549                  * our clamps are doing no good. Turn it off so we don't come
5550                  * back here again.
5551                  */
5552                 rack->excess_rxt_on = 0;
5553                 rack->r_cwnd_was_clamped = 0;
5554                 rack->r_ctl.num_of_clamps_applied = 0;
5555                 return;
5556         }
5557         snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes;
5558         rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes;
5559         rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped;
5560         /* Has enough rounds progressed for us to re-measure? */
5561         if ((rnds >= rack_rxt_min_rnds) &&
5562             (rack->r_ctl.rxt_threshold > 0)){
5563                 rxt_per = rxts * 1000;
5564                 rxt_per /= snds;
5565                 if (rxt_per >= rack->r_ctl.rxt_threshold) {
5566                         /*
5567                          * Action required:
5568                          *  We are above our excess retransmit level, let's
5569                          *  cut down the cwnd and ssthresh to match the long-term
5570                          *  b/w we are getting.
5571                          */
5572                         /* First disable scwnd if enabled */
5573 #ifdef NETFLIX_SHARED_CWND
5574                         rack->rack_enable_scwnd = 0;
5575                         if (rack->r_ctl.rc_scw) {
5576                                 uint32_t limit;
5577
5578                                 shared_cwnd_was_enabled = 1;
5579                                 if (rack->r_limit_scw)
5580                                         limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
5581                                 else
5582                                         limit = 0;
5583                                 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
5584                                                           rack->r_ctl.rc_scw_index,
5585                                                           limit);
5586                                 rack->r_ctl.rc_scw = NULL;
5587                         }
5588
5589 #endif
5590                         /* Calculate what the cwnd and ssthresh should be */
5591                         tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT);
5592                         lt_bw = rack_get_lt_bw(rack);
5593                         if (lt_bw == 0) {
5594                                 /*
5595                                  * No lt_bw, let's chop things to one MSS
5596                                  * and the ssthresh to the iwnd.
5597                                  */
5598 reset_to_iw:
5599                                 new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
5600                                 new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp));
5601                         } else {
5602                                 rtt = rack->rc_rack_rtt;
5603                                 if (rtt == 0) {
5604                                         /* If we have no rack_rtt drop to the IW situation */
5605                                         goto reset_to_iw;
5606                                 }
5607                                 bdp = lt_bw * (uint64_t)rtt;
5608                                 bdp /= HPTS_USEC_IN_SEC;
5609                                 new_cwnd = (uint32_t)bdp;
5610                                 new_ssthresh = new_cwnd - 1;
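                                     /*
                                      * For illustration (hypothetical numbers):
                                      * an lt_bw of 1,250,000 B/s (10Mbps) and a
                                      * rack_rtt of 40,000us give a bdp of
                                      * 1,250,000 * 40,000 / 1,000,000 = 50,000
                                      * bytes, so new_cwnd = 50,000 and
                                      * new_ssthresh = 49,999.
                                      */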
5611                                 if (new_cwnd < ctf_fixed_maxseg(tp)) {
5612                                         /* Rock bottom, go to IW settings */
5613                                         goto reset_to_iw;
5614                                 }
5615                         }
5616                         rack->r_cwnd_was_clamped = 1;
5617                         rack->r_ctl.num_of_clamps_applied++;
5618                         /* Reset the counter from now */
5619                         tp->t_bytes_acked = 0;
5620                         /*
5621                          * Now what about options?
5622                          * We look at the bottom  8 bits:
5623                          * F = fill cw bit (toggle it if set)
5624                          * S = Segment bits
5625                          * M = set max segment bit
5626                          *
5627                          * SSSS SSMF
5628                          */
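                             /*
                              * e.g. a (hypothetical) clamp_options of 0x05 is
                              * 0000 0101: F = 1 (toggle fill cw), M = 0 and a
                              * segment value of 1 in the upper six bits.
                              */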
5629                         if (rack->r_ctl.clamp_options) {
5630                                 if (rack->r_ctl.clamp_options & 0x1) {
5631                                         if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) {
5632                                                 /* turn on fill cw  for non-dgp*/
5633                                                 rack->rc_pace_to_cwnd = 1;
5634                                         } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) {
5635                                                 /* For DGP we want it off */
5636                                                 rack->rc_pace_to_cwnd = 0;
5637                                         }
5638                                 }
5639                         }
5640                         if (rack->dgp_on) {
5641                                 /* Reset all multipliers to 100.0 so just the measured bw */
5642                                 /* Crash any per boosts down to 100% */
5643                                 rack->r_ctl.rack_per_of_gp_rec = 100;
5644                                 rack->r_ctl.rack_per_of_gp_ss = 100;
5645                                 rack->r_ctl.rack_per_of_gp_ca = 100;
5646                                 /* Set in an upper bound for ss/ca % increase */
5647                                 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper;
5648                                 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper;
5649                                 /* Now move to the lt_bw */
5650                                 rack->r_ctl.gp_bw = lt_bw;
5651                                 rack->rc_gp_filled = 1;
5652                                 rack->r_ctl.num_measurements = RACK_REQ_AVG;
5653                         }
5654                         if (tcp_bblogging_on(rack->rc_tp)) {
5655                                 union tcp_log_stackspecific log;
5656                                 struct timeval tv;
5657
5658                                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5659                                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5660                                 log.u_bbr.flex1 = new_cwnd;
5661                                 log.u_bbr.flex2 = new_ssthresh;
5662                                 log.u_bbr.flex3 = rnds;
5663                                 log.u_bbr.flex4 = rack_rxt_min_rnds;
5664                                 log.u_bbr.flex5 = rtt;
5665                                 log.u_bbr.flex6 = shared_cwnd_was_enabled;
5666                                 log.u_bbr.flex8 = 5;
5667                                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs;
5668                                 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd;
5669                                 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied;
5670                                 log.u_bbr.applimited = rack->r_ctl.max_clamps;
5671                                 log.u_bbr.epoch = rack->r_ctl.clamp_options;
5672                                 log.u_bbr.cur_del_rate = rxts;
5673                                 log.u_bbr.delRate = snds;
5674                                 log.u_bbr.rttProp = rack->r_ctl.rxt_threshold;
5675                                 log.u_bbr.bw_inuse = lt_bw;
5676                                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
5677                                 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff);
5678                                 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff);
5679                                 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5680                                                0, &log, false, NULL, NULL, 0, &tv);
5681                         }
5682                         /* Update our point where we did it */
5683                         if (rack->r_ctl.already_had_a_excess == 0) {
5684                                 rack->r_ctl.already_had_a_excess = 1;
5685                                 counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1);
5686                         }
5687                         counter_u64_add(rack_rxt_clamps_cwnd, 1);
5688                         rack->r_ctl.last_sndbytes = tp->t_sndbytes;
5689                         rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes;
5690                         rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round;
5691                         if (new_cwnd < tp->snd_cwnd)
5692                                 tp->snd_cwnd = new_cwnd;
5693                         if (new_ssthresh < tp->snd_ssthresh)
5694                                 tp->snd_ssthresh = new_ssthresh;
5695                 }
5696         }
5697 }
5698
5699 static void
5700 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
5701 {
5702         struct tcp_rack *rack;
5703         uint32_t orig_cwnd;
5704
5705         orig_cwnd = tp->snd_cwnd;
5706         INP_WLOCK_ASSERT(tptoinpcb(tp));
5707         rack = (struct tcp_rack *)tp->t_fb_ptr;
5708         /* only alert CC if we alerted when we entered */
5709         if (CC_ALGO(tp)->post_recovery != NULL) {
5710                 tp->t_ccv.curack = th_ack;
5711                 CC_ALGO(tp)->post_recovery(&tp->t_ccv);
5712                 if (tp->snd_cwnd < tp->snd_ssthresh) {
5713                         /*
5714                          * Rack has burst control and pacing
5715                          * so lets not set this any lower than
5716                          * snd_ssthresh per RFC-6582 (option 2).
5717                          */
5718                         tp->snd_cwnd = tp->snd_ssthresh;
5719                 }
5720         }
5721         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
5722                 union tcp_log_stackspecific log;
5723                 struct timeval tv;
5724
5725                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5726                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5727                 log.u_bbr.flex1 = th_ack;
5728                 log.u_bbr.flex2 = tp->t_ccv.flags;
5729                 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
5730                 log.u_bbr.flex4 = tp->t_ccv.nsegs;
5731                 log.u_bbr.flex5 = V_tcp_abc_l_var;
5732                 log.u_bbr.flex6 = orig_cwnd;
5733                 log.u_bbr.flex7 = V_tcp_do_newsack;
5734                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
5735                 log.u_bbr.flex8 = 2;
5736                 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5737                                0, &log, false, NULL, __func__, __LINE__, &tv);
5738         }
5739         if ((rack->rack_no_prr == 0) &&
5740             (rack->no_prr_addback == 0) &&
5741             (rack->r_ctl.rc_prr_sndcnt > 0)) {
5742                 /*
5743                  * Suck the next prr cnt back into cwnd, but
5744                  * only do that if we are not application limited.
5745                  */
5746                 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) {
5747                         /*
5748                          * We are allowed to add back to the cwnd the amount we did
5749                          * not get out if:
5750                          * a) no_prr_addback is off.
5751                          * b) we are not app limited
5752                          * c) we are doing prr
5753                          * <and>
5754                          * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
5755                          */
5756                         tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
5757                                             rack->r_ctl.rc_prr_sndcnt);
5758                 }
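                /*
                 * A worked example with assumed values: if maxseg is 1448,
                 * rack_prr_addbackmax is 2 and rc_prr_sndcnt is 5000, the
                 * addback is min(1448 * 2, 5000) = 2896 bytes, so cwnd grows
                 * by at most rack_prr_addbackmax segments regardless of how
                 * much PRR credit was left unsent.
                 */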
5759                 rack->r_ctl.rc_prr_sndcnt = 0;
5760                 rack_log_to_prr(rack, 1, 0, __LINE__);
5761         }
5762         rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
5763         tp->snd_recover = tp->snd_una;
5764         if (rack->r_ctl.dsack_persist) {
5765                 rack->r_ctl.dsack_persist--;
5766                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
5767                         rack->r_ctl.num_dsack = 0;
5768                 }
5769                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
5770         }
5771         EXIT_RECOVERY(tp->t_flags);
5772         if (rack->r_ctl.full_dgp_in_rec)
5773                 rack_client_buffer_level_set(rack);
5774 }
5775
5776 static void
5777 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
5778 {
5779         struct tcp_rack *rack;
5780         uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
5781
5782         INP_WLOCK_ASSERT(tptoinpcb(tp));
5783 #ifdef STATS
5784         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
5785 #endif
5786         if (IN_RECOVERY(tp->t_flags) == 0) {
5787                 in_rec_at_entry = 0;
5788                 ssthresh_enter = tp->snd_ssthresh;
5789                 cwnd_enter = tp->snd_cwnd;
5790         } else
5791                 in_rec_at_entry = 1;
5792         rack = (struct tcp_rack *)tp->t_fb_ptr;
5793         switch (type) {
5794         case CC_NDUPACK:
5795                 tp->t_flags &= ~TF_WASFRECOVERY;
5796                 tp->t_flags &= ~TF_WASCRECOVERY;
5797                 if (!IN_FASTRECOVERY(tp->t_flags)) {
5798                         if (rack->dgp_on && rack->r_cwnd_was_clamped) {
5799                                 /* Reset the gains so that on exit we will be softer longer */
5800                                 rack->r_ctl.rack_per_of_gp_rec = 100;
5801                                 rack->r_ctl.rack_per_of_gp_ss = 98;
5802                                 rack->r_ctl.rack_per_of_gp_ca = 98;
5803                         }
5804                         rack->r_ctl.rc_prr_delivered = 0;
5805                         rack->r_ctl.rc_prr_out = 0;
5806                         rack->r_fast_output = 0;
5807                         if (rack->rack_no_prr == 0) {
5808                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5809                                 rack_log_to_prr(rack, 2, in_rec_at_entry, line);
5810                         }
5811                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
5812                         tp->snd_recover = tp->snd_max;
5813                         if (tp->t_flags2 & TF2_ECN_PERMIT)
5814                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
5815                 }
5816                 break;
5817         case CC_ECN:
5818                 if (!IN_CONGRECOVERY(tp->t_flags) ||
5819                     /*
5820                      * Allow ECN reaction on ACK to CWR, if
5821                      * that data segment was also CE marked.
5822                      */
5823                     SEQ_GEQ(ack, tp->snd_recover)) {
5824                         EXIT_CONGRECOVERY(tp->t_flags);
5825                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
5826                         rack->r_fast_output = 0;
5827                         tp->snd_recover = tp->snd_max + 1;
5828                         if (tp->t_flags2 & TF2_ECN_PERMIT)
5829                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
5830                 }
5831                 break;
5832         case CC_RTO:
5833                 tp->t_dupacks = 0;
5834                 tp->t_bytes_acked = 0;
5835                 rack->r_fast_output = 0;
5836                 EXIT_RECOVERY(tp->t_flags);
5837                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
5838                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
5839                 orig_cwnd = tp->snd_cwnd;
5840                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
5841                 rack_log_to_prr(rack, 16, orig_cwnd, line);
5842                 if (tp->t_flags2 & TF2_ECN_PERMIT)
5843                         tp->t_flags2 |= TF2_ECN_SND_CWR;
5844                 break;
5845         case CC_RTO_ERR:
5846                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
5847                 /* RTO was unnecessary, so reset everything. */
5848                 tp->snd_cwnd = tp->snd_cwnd_prev;
5849                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
5850                 tp->snd_recover = tp->snd_recover_prev;
5851                 if (tp->t_flags & TF_WASFRECOVERY) {
5852                         ENTER_FASTRECOVERY(tp->t_flags);
5853                         tp->t_flags &= ~TF_WASFRECOVERY;
5854                 }
5855                 if (tp->t_flags & TF_WASCRECOVERY) {
5856                         ENTER_CONGRECOVERY(tp->t_flags);
5857                         tp->t_flags &= ~TF_WASCRECOVERY;
5858                 }
5859                 tp->snd_nxt = tp->snd_max;
5860                 tp->t_badrxtwin = 0;
5861                 break;
5862         }
5863         if ((CC_ALGO(tp)->cong_signal != NULL)  &&
5864             (type != CC_RTO)){
5865                 tp->t_ccv.curack = ack;
5866                 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
5867         }
5868         if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
5869                 rack_log_to_prr(rack, 15, cwnd_enter, line);
5870                 if (rack->r_ctl.full_dgp_in_rec)
5871                         rack_client_buffer_level_set(rack);
5872                 rack->r_ctl.dsack_byte_cnt = 0;
5873                 rack->r_ctl.retran_during_recovery = 0;
5874                 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
5875                 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
5876                 rack->r_ent_rec_ns = 1;
5877         }
5878 }
5879
5880 static inline void
5881 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
5882 {
5883         uint32_t i_cwnd;
5884
5885         INP_WLOCK_ASSERT(tptoinpcb(tp));
5886
5887         if (CC_ALGO(tp)->after_idle != NULL)
5888                 CC_ALGO(tp)->after_idle(&tp->t_ccv);
5889
5890         if (tp->snd_cwnd == 1)
5891                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
5892         else
5893                 i_cwnd = rc_init_window(rack);
5894
5895         /*
5896          * Being idle is no different from the initial window. If the cc
5897          * clamps it down below the initial window, raise it to the initial
5898          * window.
5899          */
5900         if (tp->snd_cwnd < i_cwnd) {
5901                 tp->snd_cwnd = i_cwnd;
5902         }
5903 }
5904
5905 /*
5906  * Indicate whether this ack should be delayed.  We can delay the ack if
5907  * the following conditions are met:
5908  *      - There is no delayed ack timer in progress.
5909  *      - Our last ack wasn't a 0-sized window. We never want to delay
5910  *        the ack that opens up a 0-sized window.
5911  *      - LRO wasn't used for this segment. We make sure by checking that the
5912  *        segment size is not larger than the MSS.
5913  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
5914  *        connection.
5915  */
5916 #define DELAY_ACK(tp, tlen)                      \
5917         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
5918         ((tp->t_flags & TF_DELACK) == 0) &&      \
5919         (tlen <= tp->t_maxseg) &&                \
5920         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
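/*
 * A rough usage sketch (not the actual input path): after new data is
 * accepted the macro decides between scheduling a delayed ack and forcing
 * an immediate one, roughly:
 *
 *      if (DELAY_ACK(tp, tlen))
 *              tp->t_flags |= TF_DELACK;
 *      else
 *              tp->t_flags |= TF_ACKNOW;
 */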
5921
5922 static struct rack_sendmap *
5923 rack_find_lowest_rsm(struct tcp_rack *rack)
5924 {
5925         struct rack_sendmap *rsm;
5926
5927         /*
5928          * Walk the time-order transmitted list looking for an rsm that is
5929          * not acked. This will be the one that was sent the longest time
5930          * ago that is still outstanding.
5931          */
5932         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
5933                 if (rsm->r_flags & RACK_ACKED) {
5934                         continue;
5935                 }
5936                 goto finish;
5937         }
5938 finish:
5939         return (rsm);
5940 }
5941
5942 static struct rack_sendmap *
5943 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
5944 {
5945         struct rack_sendmap *prsm;
5946
5947         /*
5948          * Walk the sequence order list backward until we arrive at
5949          * the highest seq not acked. In theory when this is called it
5950          * should be the last segment (which it was not).
5951          */
5952         prsm = rsm;
5953
5954         TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) {
5955                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
5956                         continue;
5957                 }
5958                 return (prsm);
5959         }
5960         return (NULL);
5961 }
5962
5963 static uint32_t
5964 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
5965 {
5966         int32_t lro;
5967         uint32_t thresh;
5968
5969         /*
5970          * lro is the flag we use to determine if we have seen reordering.
5971          * If it gets set we have seen reordering. The reorder logic
5972          * works in one of two ways:
5973          *
5974          * If reorder-fade is configured, then we track the last time we saw
5975          * re-ordering occur. If we reach the point where enough time has
5976          * passed we no longer consider reordering to be occurring.
5977          *
5978          * Or if reorder-fade is 0, then once we see reordering we consider
5979          * the connection to always be subject to reordering and just set lro
5980          * to 1.
5981          *
5982          * In the end if lro is non-zero we add the extra time for
5983          * reordering in.
5984          */
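        /*
         * A worked example with assumed values: srtt = 20,000 usecs,
         * rc_pkt_delay = 1,000, reordering seen (lro != 0) and
         * rc_reorder_shift = 3 gives, for the non standards-based case,
         *
         *   thresh = 20000 + 1000 + (20000 >> 3) = 23,500 usecs
         *
         * which is then capped at 2 * srtt and at rack_rto_max below.
         */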
5985         if (srtt == 0)
5986                 srtt = 1;
5987         if (rack->r_ctl.rc_reorder_ts) {
5988                 if (rack->r_ctl.rc_reorder_fade) {
5989                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
5990                                 lro = cts - rack->r_ctl.rc_reorder_ts;
5991                                 if (lro == 0) {
5992                                         /*
5993                                          * No time has passed since the last
5994                                          * reorder, mark it as reordering.
5995                                          */
5996                                         lro = 1;
5997                                 }
5998                         } else {
5999                                 /* Negative time? */
6000                                 lro = 0;
6001                         }
6002                         if (lro > rack->r_ctl.rc_reorder_fade) {
6003                                 /* Turn off reordering seen too */
6004                                 rack->r_ctl.rc_reorder_ts = 0;
6005                                 lro = 0;
6006                         }
6007                 } else {
6008                         /* Reordering does not fade */
6009                         lro = 1;
6010                 }
6011         } else {
6012                 lro = 0;
6013         }
6014         if (rack->rc_rack_tmr_std_based == 0) {
6015                 thresh = srtt + rack->r_ctl.rc_pkt_delay;
6016         } else {
6017                 /* Standards based pkt-delay is 1/4 srtt */
6018                 thresh = srtt +  (srtt >> 2);
6019         }
6020         if (lro && (rack->rc_rack_tmr_std_based == 0)) {
6021                 /* It must be set, if not you get 1/4 rtt */
6022                 if (rack->r_ctl.rc_reorder_shift)
6023                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
6024                 else
6025                         thresh += (srtt >> 2);
6026         }
6027         if (rack->rc_rack_use_dsack &&
6028             lro &&
6029             (rack->r_ctl.num_dsack > 0)) {
6030                 /*
6031                  * We only increase the reordering window if we
6032                  * have seen reordering <and> we have a DSACK count.
6033                  */
6034                 thresh += rack->r_ctl.num_dsack * (srtt >> 2);
6035                 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
6036         }
6037         /* SRTT * 2 is the ceiling */
6038         if (thresh > (srtt * 2)) {
6039                 thresh = srtt * 2;
6040         }
6041         /* And we don't want it above the RTO max either */
6042         if (thresh > rack_rto_max) {
6043                 thresh = rack_rto_max;
6044         }
6045         rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
6046         return (thresh);
6047 }
6048
6049 static uint32_t
6050 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
6051                      struct rack_sendmap *rsm, uint32_t srtt)
6052 {
6053         struct rack_sendmap *prsm;
6054         uint32_t thresh, len;
6055         int segsiz;
6056
6057         if (srtt == 0)
6058                 srtt = 1;
6059         if (rack->r_ctl.rc_tlp_threshold)
6060                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
6061         else
6062                 thresh = (srtt * 2);
6063
6064         /* Get the previous sent packet, if any */
6065         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
6066         len = rsm->r_end - rsm->r_start;
6067         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
6068                 /* Exactly like the ID */
6069                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
6070                         uint32_t alt_thresh;
6071                         /*
6072                          * Compensate for delayed-ack with the d-ack time.
6073                          */
6074                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
6075                         if (alt_thresh > thresh)
6076                                 thresh = alt_thresh;
6077                 }
6078         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
6079                 /* 2.1 behavior */
6080                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
6081                 if (prsm && (len <= segsiz)) {
6082                         /*
6083                          * Two packets outstanding, thresh should be (2*srtt) +
6084                          * possible inter-packet delay (if any).
6085                          */
6086                         uint32_t inter_gap = 0;
6087                         int idx, nidx;
6088
6089                         idx = rsm->r_rtr_cnt - 1;
6090                         nidx = prsm->r_rtr_cnt - 1;
6091                         if (rsm->r_tim_lastsent[idx] >= prsm->r_tim_lastsent[nidx]) {
6092                                 /* Yes it was sent later (or at the same time) */
6093                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
6094                         }
6095                         thresh += inter_gap;
6096                 } else if (len <= segsiz) {
6097                         /*
6098                          * Possibly compensate for delayed-ack.
6099                          */
6100                         uint32_t alt_thresh;
6101
6102                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
6103                         if (alt_thresh > thresh)
6104                                 thresh = alt_thresh;
6105                 }
6106         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
6107                 /* 2.2 behavior */
6108                 if (len <= segsiz) {
6109                         uint32_t alt_thresh;
6110                         /*
6111                          * Compensate for delayed-ack with the d-ack time.
6112                          */
6113                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
6114                         if (alt_thresh > thresh)
6115                                 thresh = alt_thresh;
6116                 }
6117         }
6118         /* Not above an RTO */
6119         if (thresh > tp->t_rxtcur) {
6120                 thresh = tp->t_rxtcur;
6121         }
6122         /* Not above a RTO max */
6123         if (thresh > rack_rto_max) {
6124                 thresh = rack_rto_max;
6125         }
6126         /* Apply user supplied min TLP */
6127         if (thresh < rack_tlp_min) {
6128                 thresh = rack_tlp_min;
6129         }
6130         return (thresh);
6131 }
6132
6133 static uint32_t
6134 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
6135 {
6136         /*
6137          * We want the rack_rtt which is the
6138          * last rtt we measured. However if that
6139          * does not exist we fallback to the srtt (which
6140          * does not exist we fall back to the srtt (which
6141          * resort we use RACK_INITIAL_RTO if no srtt is
6142          * yet set.
6143          */
6144         if (rack->rc_rack_rtt)
6145                 return (rack->rc_rack_rtt);
6146         else if (tp->t_srtt == 0)
6147                 return (RACK_INITIAL_RTO);
6148         return (tp->t_srtt);
6149 }
6150
6151 static struct rack_sendmap *
6152 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
6153 {
6154         /*
6155          * Check to see that we don't need to fall into recovery. We will
6156          * need to do so if our oldest transmit is past the time we should
6157          * have had an ack.
6158          */
6159         struct tcp_rack *rack;
6160         struct rack_sendmap *rsm;
6161         int32_t idx;
6162         uint32_t srtt, thresh;
6163
6164         rack = (struct tcp_rack *)tp->t_fb_ptr;
6165         if (tqhash_empty(rack->r_ctl.tqh)) {
6166                 return (NULL);
6167         }
6168         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6169         if (rsm == NULL)
6170                 return (NULL);
6171
6172
6173         if (rsm->r_flags & RACK_ACKED) {
6174                 rsm = rack_find_lowest_rsm(rack);
6175                 if (rsm == NULL)
6176                         return (NULL);
6177         }
6178         idx = rsm->r_rtr_cnt - 1;
6179         srtt = rack_grab_rtt(tp, rack);
6180         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
6181         if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
6182                 return (NULL);
6183         }
6184         if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
6185                 return (NULL);
6186         }
6187         /* Ok if we reach here we are over-due and this guy can be sent */
6188         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
6189         return (rsm);
6190 }
6191
6192 static uint32_t
6193 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
6194 {
6195         int32_t t;
6196         int32_t tt;
6197         uint32_t ret_val;
6198
6199         t = (tp->t_srtt + (tp->t_rttvar << 2));
6200         RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
6201             rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
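        /*
         * Worked example (assumed values): with t_srtt = 20,000 usecs,
         * t_rttvar = 5,000 and t_rxtshift = 2, the base value is
         * t = 20000 + (5000 << 2) = 40,000, scaled by tcp_backoff[2] = 4
         * to 160,000 usecs, and then clamped by RACK_TCPT_RANGESET()
         * between rack_persist_min and rack_persist_max.
         */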
6202         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
6203         ret_val = (uint32_t)tt;
6204         return (ret_val);
6205 }
6206
6207 static uint32_t
6208 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
6209 {
6210         /*
6211          * Start the FR timer, we do this based on getting the first one in
6212          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
6213          * events we need to stop the running timer (if it's running) before
6214          * starting the new one.
6215          */
6216         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
6217         uint32_t srtt_cur;
6218         int32_t idx;
6219         int32_t is_tlp_timer = 0;
6220         struct rack_sendmap *rsm;
6221
6222         if (rack->t_timers_stopped) {
6223                 /* All timers have been stopped none are to run */
6224                 return (0);
6225         }
6226         if (rack->rc_in_persist) {
6227                 /* We can't start any timer in persists */
6228                 return (rack_get_persists_timer_val(tp, rack));
6229         }
6230         rack->rc_on_min_to = 0;
6231         if ((tp->t_state < TCPS_ESTABLISHED) ||
6232             (rack->sack_attack_disable > 0) ||
6233             ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
6234                 goto activate_rxt;
6235         }
6236         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6237         if ((rsm == NULL) || sup_rack) {
6238                 /* Nothing on the send map or no rack */
6239 activate_rxt:
6240                 time_since_sent = 0;
6241                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6242                 if (rsm) {
6243                         /*
6244                          * Should we discount the RTX timer any?
6245                          *
6246                          * We want to discount it the smallest amount.
6247                          * If a timer (Rack/TLP or RXT) has gone off more
6248                          * recently that's the discount we want to use (now - timer time).
6249                          * If the retransmit of the oldest packet was more recent, then
6250                          * we want to use that (now - oldest-packet-last_transmit_time).
6251                          *
6252                          */
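                        /*
                         * For example (assumed timestamps): if the last
                         * Rack/TLP/RXT timer fired at 1,000,000 usecs, the
                         * oldest packet was last (re)sent at 1,020,000 and
                         * cts is 1,050,000, then tstmp_touse is the later
                         * 1,020,000 and time_since_sent is 30,000 usecs,
                         * the smallest possible discount, which is then
                         * subtracted from t_rxtcur below.
                         */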
6253                         idx = rsm->r_rtr_cnt - 1;
6254                         if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
6255                                 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
6256                         else
6257                                 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
6258                         if (TSTMP_GT(cts, tstmp_touse))
6259                             time_since_sent = cts - tstmp_touse;
6260                 }
6261                 if (SEQ_LT(tp->snd_una, tp->snd_max) ||
6262                     sbavail(&tptosocket(tp)->so_snd)) {
6263                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
6264                         to = tp->t_rxtcur;
6265                         if (to > time_since_sent)
6266                                 to -= time_since_sent;
6267                         else
6268                                 to = rack->r_ctl.rc_min_to;
6269                         if (to == 0)
6270                                 to = 1;
6271                         /* Special case for KEEPINIT */
6272                         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
6273                             (TP_KEEPINIT(tp) != 0) &&
6274                             rsm) {
6275                                 /*
6276                                  * We have to put a ceiling on the rxt timer
6277                                  * of the keep-init timeout.
6278                                  */
6279                                 uint32_t max_time, red;
6280
6281                                 max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
6282                                 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
6283                                         red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
6284                                         if (red < max_time)
6285                                                 max_time -= red;
6286                                         else
6287                                                 max_time = 1;
6288                                 }
6289                                 /* Reduce timeout to the keep value if needed */
6290                                 if (max_time < to)
6291                                         to = max_time;
6292                         }
6293                         return (to);
6294                 }
6295                 return (0);
6296         }
6297         if (rsm->r_flags & RACK_ACKED) {
6298                 rsm = rack_find_lowest_rsm(rack);
6299                 if (rsm == NULL) {
6300                         /* No lowest? */
6301                         goto activate_rxt;
6302                 }
6303         }
6304         if (rack->sack_attack_disable) {
6305                 /*
6306                  * We don't want to do
6307                  * any TLP's if you are an attacker.
6308                  * Though if you are doing what
6309                  * is expected you may still have
6310                  * SACK-PASSED marks.
6311                  */
6312                 goto activate_rxt;
6313         }
6314         /* Convert from ms to usecs */
6315         if ((rsm->r_flags & RACK_SACK_PASSED) ||
6316             (rsm->r_flags & RACK_RWND_COLLAPSED) ||
6317             (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
6318                 if ((tp->t_flags & TF_SENTFIN) &&
6319                     ((tp->snd_max - tp->snd_una) == 1) &&
6320                     (rsm->r_flags & RACK_HAS_FIN)) {
6321                         /*
6322                          * We don't start a rack timer if all we have is a
6323                          * FIN outstanding.
6324                          */
6325                         goto activate_rxt;
6326                 }
6327                 if ((rack->use_rack_rr == 0) &&
6328                     (IN_FASTRECOVERY(tp->t_flags)) &&
6329                     (rack->rack_no_prr == 0) &&
6330                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
6331                         /*
6332                          * We are not cheating, we are in recovery and
6333                          * do not yet have enough acks to get our next
6334                          * retransmission out.
6335                          *
6336                          * Note that classified attackers do not
6337                          * get to use the rack-cheat.
6338                          */
6339                         goto activate_tlp;
6340                 }
6341                 srtt = rack_grab_rtt(tp, rack);
6342                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
6343                 idx = rsm->r_rtr_cnt - 1;
6344                 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
6345                 if (SEQ_GEQ(exp, cts)) {
6346                         to = exp - cts;
6347                         if (to < rack->r_ctl.rc_min_to) {
6348                                 to = rack->r_ctl.rc_min_to;
6349                                 if (rack->r_rr_config == 3)
6350                                         rack->rc_on_min_to = 1;
6351                         }
6352                 } else {
6353                         to = rack->r_ctl.rc_min_to;
6354                         if (rack->r_rr_config == 3)
6355                                 rack->rc_on_min_to = 1;
6356                 }
6357         } else {
6358                 /* Ok we need to do a TLP not RACK */
6359 activate_tlp:
6360                 if ((rack->rc_tlp_in_progress != 0) &&
6361                     (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
6362                         /*
6363                          * The previous send was a TLP and we have sent
6364                          * N TLP's without sending new data.
6365                          */
6366                         goto activate_rxt;
6367                 }
6368                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
6369                 if (rsm == NULL) {
6370                         /* We found no rsm to TLP with. */
6371                         goto activate_rxt;
6372                 }
6373                 if (rsm->r_flags & RACK_HAS_FIN) {
6374                         /* If its a FIN we dont do TLP */
6375                         rsm = NULL;
6376                         goto activate_rxt;
6377                 }
6378                 idx = rsm->r_rtr_cnt - 1;
6379                 time_since_sent = 0;
6380                 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
6381                         tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
6382                 else
6383                         tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
6384                 if (TSTMP_GT(cts, tstmp_touse))
6385                     time_since_sent = cts - tstmp_touse;
6386                 is_tlp_timer = 1;
6387                 if (tp->t_srtt) {
6388                         if ((rack->rc_srtt_measure_made == 0) &&
6389                             (tp->t_srtt == 1)) {
6390                                 /*
6391                                  * If another stack has run and set srtt to 1,
6392                                  * then the srtt was 0, so let's use the initial.
6393                                  */
6394                                 srtt = RACK_INITIAL_RTO;
6395                         } else {
6396                                 srtt_cur = tp->t_srtt;
6397                                 srtt = srtt_cur;
6398                         }
6399                 } else
6400                         srtt = RACK_INITIAL_RTO;
6401                 /*
6402                  * If the SRTT is not keeping up and the
6403                  * rack RTT has spiked we want to use
6404                  * the last RTT not the smoothed one.
6405                  */
6406                 if (rack_tlp_use_greater &&
6407                     tp->t_srtt &&
6408                     (srtt < rack_grab_rtt(tp, rack))) {
6409                         srtt = rack_grab_rtt(tp, rack);
6410                 }
6411                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
6412                 if (thresh > time_since_sent) {
6413                         to = thresh - time_since_sent;
6414                 } else {
6415                         to = rack->r_ctl.rc_min_to;
6416                         rack_log_alt_to_to_cancel(rack,
6417                                                   thresh,               /* flex1 */
6418                                                   time_since_sent,      /* flex2 */
6419                                                   tstmp_touse,          /* flex3 */
6420                                                   rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
6421                                                   (uint32_t)rsm->r_tim_lastsent[idx],
6422                                                   srtt,
6423                                                   idx, 99);
6424                 }
6425                 if (to < rack_tlp_min) {
6426                         to = rack_tlp_min;
6427                 }
6428                 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
6429                         /*
6430                          * If the TLP time works out to larger than the max
6431                          * RTO lets not do TLP.. just RTO.
6432                          */
6433                         goto activate_rxt;
6434                 }
6435         }
6436         if (is_tlp_timer == 0) {
6437                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
6438         } else {
6439                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
6440         }
6441         if (to == 0)
6442                 to = 1;
6443         return (to);
6444 }
6445
6446 static void
6447 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
6448 {
6449         struct timeval tv;
6450
6451         if (rack->rc_in_persist == 0) {
6452                 if (tp->t_flags & TF_GPUTINPROG) {
6453                         /*
6454                          * Stop the goodput now, the calling of the
6455                          * measurement function clears the flag.
6456                          */
6457                         rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
6458                                                     RACK_QUALITY_PERSIST);
6459                 }
6460 #ifdef NETFLIX_SHARED_CWND
6461                 if (rack->r_ctl.rc_scw) {
6462                         tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
6463                         rack->rack_scwnd_is_idle = 1;
6464                 }
6465 #endif
6466                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv);
6467                 if (rack->lt_bw_up) {
6468                         /* Suspend our LT BW measurement */
6469                         uint64_t tmark;
6470
6471                         rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
6472                         rack->r_ctl.lt_seq = snd_una;
6473                         tmark = tcp_tv_to_lusectick(&tv);
6474                         rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
6475                         rack->r_ctl.lt_timemark = tmark;
6476                         rack->lt_bw_up = 0;
6477                         rack->r_persist_lt_bw_off = 1;
6478                 }
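                /*
                 * Bookkeeping sketch with assumed numbers: if 100,000 bytes
                 * have been acked since lt_seq and 50,000 usecs have passed
                 * since lt_timemark, then lt_bw_bytes grows by 100,000,
                 * lt_bw_time grows by 50,000 and the measurement is frozen
                 * (lt_bw_up = 0) until persists is exited.
                 */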
6479                 if (rack->r_ctl.rc_went_idle_time == 0)
6480                         rack->r_ctl.rc_went_idle_time = 1;
6481                 rack_timer_cancel(tp, rack, cts, __LINE__);
6482                 rack->r_ctl.persist_lost_ends = 0;
6483                 rack->probe_not_answered = 0;
6484                 rack->forced_ack = 0;
6485                 tp->t_rxtshift = 0;
6486                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
6487                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
6488                 rack->rc_in_persist = 1;
6489         }
6490 }
6491
6492 static void
6493 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6494 {
6495         struct timeval tv;
6496         uint32_t t_time;
6497
6498         if (tcp_in_hpts(rack->rc_inp)) {
6499                 tcp_hpts_remove(rack->rc_inp);
6500                 rack->r_ctl.rc_hpts_flags = 0;
6501         }
6502 #ifdef NETFLIX_SHARED_CWND
6503         if (rack->r_ctl.rc_scw) {
6504                 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
6505                 rack->rack_scwnd_is_idle = 0;
6506         }
6507 #endif
6508         t_time = tcp_get_usecs(&tv);
6509         if (rack->rc_gp_dyn_mul &&
6510             (rack->use_fixed_rate == 0) &&
6511             (rack->rc_always_pace)) {
6512                 /*
6513                  * Do we count this as if a probe-rtt just
6514                  * finished?
6515                  */
6516                 uint32_t time_idle, idle_min;
6517
6518                 time_idle = t_time - rack->r_ctl.rc_went_idle_time;
6519                 idle_min = rack_min_probertt_hold;
6520                 if (rack_probertt_gpsrtt_cnt_div) {
6521                         uint64_t extra;
6522                         extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
6523                                 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
6524                         extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
6525                         idle_min += (uint32_t)extra;
6526                 }
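                /*
                 * Worked example with assumed tunables: rc_gp_srtt = 30,000,
                 * rack_probertt_gpsrtt_cnt_mul = 2 and
                 * rack_probertt_gpsrtt_cnt_div = 1 add 60,000 usecs to
                 * idle_min, so the idle period only counts as a probe-rtt
                 * if it lasted at least rack_min_probertt_hold + 60,000.
                 */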
6527                 if (time_idle >= idle_min) {
6528                         /* Yes, we count it as a probe-rtt. */
6529                         uint32_t us_cts;
6530
6531                         us_cts = tcp_get_usecs(NULL);
6532                         if (rack->in_probe_rtt == 0) {
6533                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
6534                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
6535                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
6536                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
6537                         } else {
6538                                 rack_exit_probertt(rack, us_cts);
6539                         }
6540                 }
6541         }
6542         if (rack->r_persist_lt_bw_off) {
6543                 /* Continue where we left off */
6544                 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
6545                 rack->lt_bw_up = 1;
6546                 rack->r_persist_lt_bw_off = 0;
6547         }
6548         rack->rc_in_persist = 0;
6549         rack->r_ctl.rc_went_idle_time = 0;
6550         tp->t_rxtshift = 0;
6551         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
6552            rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
6553         rack->r_ctl.rc_agg_delayed = 0;
6554         rack->r_early = 0;
6555         rack->r_late = 0;
6556         rack->r_ctl.rc_agg_early = 0;
6557 }
6558
6559 static void
6560 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
6561                    struct hpts_diag *diag, struct timeval *tv)
6562 {
6563         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6564                 union tcp_log_stackspecific log;
6565
6566                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6567                 log.u_bbr.flex1 = diag->p_nxt_slot;
6568                 log.u_bbr.flex2 = diag->p_cur_slot;
6569                 log.u_bbr.flex3 = diag->slot_req;
6570                 log.u_bbr.flex4 = diag->inp_hptsslot;
6571                 log.u_bbr.flex5 = diag->slot_remaining;
6572                 log.u_bbr.flex6 = diag->need_new_to;
6573                 log.u_bbr.flex7 = diag->p_hpts_active;
6574                 log.u_bbr.flex8 = diag->p_on_min_sleep;
6575                 /* Hijack other fields as needed */
6576                 log.u_bbr.epoch = diag->have_slept;
6577                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
6578                 log.u_bbr.pkts_out = diag->co_ret;
6579                 log.u_bbr.applimited = diag->hpts_sleep_time;
6580                 log.u_bbr.delivered = diag->p_prev_slot;
6581                 log.u_bbr.inflight = diag->p_runningslot;
6582                 log.u_bbr.bw_inuse = diag->wheel_slot;
6583                 log.u_bbr.rttProp = diag->wheel_cts;
6584                 log.u_bbr.timeStamp = cts;
6585                 log.u_bbr.delRate = diag->maxslots;
6586                 log.u_bbr.cur_del_rate = diag->p_curtick;
6587                 log.u_bbr.cur_del_rate <<= 32;
6588                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
6589                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6590                     &rack->rc_inp->inp_socket->so_rcv,
6591                     &rack->rc_inp->inp_socket->so_snd,
6592                     BBR_LOG_HPTSDIAG, 0,
6593                     0, &log, false, tv);
6594         }
6595
6596 }
6597
6598 static void
6599 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
6600 {
6601         if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6602                 union tcp_log_stackspecific log;
6603                 struct timeval tv;
6604
6605                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6606                 log.u_bbr.flex1 = sb->sb_flags;
6607                 log.u_bbr.flex2 = len;
6608                 log.u_bbr.flex3 = sb->sb_state;
6609                 log.u_bbr.flex8 = type;
6610                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
6611                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6612                     &rack->rc_inp->inp_socket->so_rcv,
6613                     &rack->rc_inp->inp_socket->so_snd,
6614                     TCP_LOG_SB_WAKE, 0,
6615                     len, &log, false, &tv);
6616         }
6617 }
6618
6619 static void
6620 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
6621       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
6622 {
6623         struct hpts_diag diag;
6624         struct inpcb *inp = tptoinpcb(tp);
6625         struct timeval tv;
6626         uint32_t delayed_ack = 0;
6627         uint32_t hpts_timeout;
6628         uint32_t entry_slot = slot;
6629         uint8_t stopped;
6630         uint32_t left = 0;
6631         uint32_t us_cts;
6632
6633         if ((tp->t_state == TCPS_CLOSED) ||
6634             (tp->t_state == TCPS_LISTEN)) {
6635                 return;
6636         }
6637         if (tcp_in_hpts(inp)) {
6638                 /* Already on the pacer */
6639                 return;
6640         }
6641         stopped = rack->rc_tmr_stopped;
6642         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
6643                 left = rack->r_ctl.rc_timer_exp - cts;
6644         }
6645         rack->r_ctl.rc_timer_exp = 0;
6646         rack->r_ctl.rc_hpts_flags = 0;
6647         us_cts = tcp_get_usecs(&tv);
6648         /* Now early/late accounting */
6649         rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
6650         if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
6651                 /*
6652                  * We have an early carry over set,
6653                  * we can always add more time so we
6654                  * can always make this compensation.
6655                  *
6656                  * Note if acks are allowed to wake us, do not
6657                  * penalize the next timer for being awakened
6658                  * by an ack, aka the rc_agg_early (non-paced mode).
6659                  */
6660                 slot += rack->r_ctl.rc_agg_early;
6661                 rack->r_early = 0;
6662                 rack->r_ctl.rc_agg_early = 0;
6663         }
6664         if (rack->r_late) {
6665                 /*
6666                  * This is harder, we can
6667                  * compensate some but it
6668                  * really depends on what
6669                  * the current pacing time is.
6670                  */
6671                 if (rack->r_ctl.rc_agg_delayed >= slot) {
6672                         /*
6673                          * We can't compensate for it all.
6674                          * And we have to have some time
6675                          * on the clock. We always have a minimum of
6676                          * 10 slots (10 x 10, i.e. 100 usecs).
6677                          */
6678                         if (slot <= HPTS_TICKS_PER_SLOT) {
6679                                 /* We gain delay */
6680                                 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
6681                                 slot = HPTS_TICKS_PER_SLOT;
6682                         } else {
6683                                 /* We take off some */
6684                                 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
6685                                 slot = HPTS_TICKS_PER_SLOT;
6686                         }
6687                 } else {
6688                         slot -= rack->r_ctl.rc_agg_delayed;
6689                         rack->r_ctl.rc_agg_delayed = 0;
6690                         /* Make sure we have 100 useconds at minimum */
6691                         if (slot < HPTS_TICKS_PER_SLOT) {
6692                                 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
6693                                 slot = HPTS_TICKS_PER_SLOT;
6694                         }
6695                         if (rack->r_ctl.rc_agg_delayed == 0)
6696                                 rack->r_late = 0;
6697                 }
6698         }
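        /*
         * Example of the late compensation (assumed values): with a requested
         * pacing delay of slot = 5,000 usecs and rc_agg_delayed = 2,000, we
         * owe less than the request, so the new slot is 3,000 and
         * rc_agg_delayed drops to 0; had rc_agg_delayed been 8,000 instead,
         * slot would be clamped to HPTS_TICKS_PER_SLOT and the remaining
         * debt carried forward.
         */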
6699         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
6700 #ifdef TCP_SAD_DETECTION
6701         if (rack->sack_attack_disable &&
6702             (rack->r_ctl.ack_during_sd > 0) &&
6703             (slot < tcp_sad_pacing_interval)) {
6704                 /*
6705                  * We have a potential attacker on
6706                  * the line. We have possibly some
6707                  * (or no) pacing time set. We want to
6708                  * slow down the processing of sacks by some
6709                  * amount (if it is an attacker). Set the default
6710                  * slot for attackers in place (unless the original
6711                  * interval is longer). It's stored in
6712                  * micro-seconds, so let's convert to msecs.
6713                  */
6714                 slot = tcp_sad_pacing_interval;
6715                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
6716                 rack->r_ctl.ack_during_sd = 0;
6717         }
6718 #endif
6719         if (tp->t_flags & TF_DELACK) {
6720                 delayed_ack = TICKS_2_USEC(tcp_delacktime);
6721                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
6722         }
6723         if (delayed_ack && ((hpts_timeout == 0) ||
6724                             (delayed_ack < hpts_timeout)))
6725                 hpts_timeout = delayed_ack;
6726         else
6727                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
6728         /*
6729          * If no timers are going to run and we will fall off the hptsi
6730          * wheel, we resort to a keep-alive timer if it's configured.
6731          */
6732         if ((hpts_timeout == 0) &&
6733             (slot == 0)) {
6734                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
6735                     (tp->t_state <= TCPS_CLOSING)) {
6736                         /*
6737                          * Ok we have no timer (persists, rack, tlp, rxt  or
6738                          * del-ack), we don't have segments being paced. So
6739                          * all that is left is the keepalive timer.
6740                          */
6741                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
6742                                 /* Get the established keep-alive time */
6743                                 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
6744                         } else {
6745                                 /*
6746                                  * Get the initial setup keep-alive time,
6747                                  * note that this is probably not going to
6748                                  * happen, since rack will be running a rxt timer
6749                                  * if a SYN of some sort is outstanding. It is
6750                                  * actually handled in rack_timeout_rxt().
6751                                  */
6752                                 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
6753                         }
6754                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
6755                         if (rack->in_probe_rtt) {
6756                                 /*
6757                                  * We want to instead not wake up a long time from
6758                                  * now but to wake up about the time we would
6759                                  * exit probe-rtt and initiate a keep-alive ack.
6760                                  * This will get us out of probe-rtt and update
6761                                  * our min-rtt.
6762                                  */
6763                                 hpts_timeout = rack_min_probertt_hold;
6764                         }
6765                 }
6766         }
6767         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
6768             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
6769                 /*
6770                  * RACK, TLP, persists and RXT timers all are restartable
6771                  * based on input actions, i.e. we received a packet (ack
6772                  * or sack) and that changes things (rw, or snd_una etc).
6773                  * Thus we can restart them with a new value. For
6774                  * keep-alive, delayed_ack we keep track of what was left
6775                  * and restart the timer with a smaller value.
6776                  */
6777                 if (left < hpts_timeout)
6778                         hpts_timeout = left;
6779         }
6780         if (hpts_timeout) {
6781                 /*
6782                  * Hack alert for now we can't time-out over 2,147,483
6783                  * seconds (a bit more than 596 hours), which is probably ok
6784                  * :).
6785                  */
6786                 if (hpts_timeout > 0x7ffffffe)
6787                         hpts_timeout = 0x7ffffffe;
6788                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
6789         }
6790         rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
6791         if ((rack->gp_ready == 0) &&
6792             (rack->use_fixed_rate == 0) &&
6793             (hpts_timeout < slot) &&
6794             (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
6795                 /*
6796                  * We have no good estimate yet for the
6797                  * old clunky burst mitigation or the
6798                  * real pacing. And the tlp or rxt is smaller
6799                  * than the pacing calculation. Lets not
6800                  * pace that long since we know the calculation
6801                  * so far is not accurate.
6802                  */
6803                 slot = hpts_timeout;
6804         }
6805         /**
6806          * Turn off all the flags for queuing by default. The
6807          * flags have important meanings to what happens when
6808          * LRO interacts with the transport. Most likely (by default now)
6809          * mbuf_queueing and ack compression are on. So the transport
6810          * has a couple of flags that control what happens (if those
6811          * are not on then these flags won't have any effect since it
6812          * won't go through the queuing LRO path).
6813          *
6814          * INP_MBUF_QUEUE_READY - This flag says that I am busy
6815          *                        pacing output, so don't disturb. But
6816          *                        it also means LRO can wake me if there
6817          *                        is a SACK arrival.
6818          *
6819          * INP_DONT_SACK_QUEUE - This flag is used in conjunction
6820          *                       with the above flag (QUEUE_READY) and
6821          *                       when present it says don't even wake me
6822          *                       if a SACK arrives.
6823          *
6824          * The idea behind these flags is that if we are pacing we
6825          * set the MBUF_QUEUE_READY and only get woken up if
6826          * a SACK arrives (which could change things) or if
6827          * our pacing timer expires. If, however, we have a rack
6828          * timer running, then we don't even want a sack to wake
6829          * us since the rack timer has to expire before we can send.
6830          *
6831          * Other cases should usually have none of the flags set
6832          * so LRO can call into us.
6833          */
6834         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
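        /*
         * Three cases follow: a pacing slot is armed (when rack_rr is in
         * use the hpts may instead be set to the earlier timer expiry),
         * only a timer is armed, or nothing is started at all.
         */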
6835         if (slot) {
6836                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
6837                 rack->r_ctl.rc_last_output_to = us_cts + slot;
6838                 /*
6839                  * A pacing timer (slot) is being set; in
6840                  * such a case we cannot send (we are blocked by
6841                  * the timer). So let's tell LRO that it should not
6842                  * wake us unless there is a SACK. Note this will
6843                  * only be effective if mbuf queueing is on or
6844                  * compressed acks are being processed.
6845                  */
6846                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
6847                 /*
6848                  * But wait, if we have a Rack timer running
6849                  * then even a SACK should not disturb us (with
6850                  * the exception of r_rr_config == 3).
6851                  */
6852                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) {
6853                         if (rack->r_rr_config != 3)
6854                                 inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
6855                         else if (rack->rc_pace_dnd) {
6856                                 if (IN_RECOVERY(tp->t_flags)) {
6857                                         /*
6858                                          * When DND is on, we only let a sack
6859                                          * interrupt us if we are not in recovery.
6860                                          *
6861                                          * If DND is off, then we never hit here
6862                                          * and let all sacks wake us up.
6863                                          *
6864                                          */
6865                                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
6866                                 }
6867                         }
6868                 }
6869                 /* For sack attackers we want to ignore sacks */
6870                 if (rack->sack_attack_disable == 1) {
6871                         inp->inp_flags2 |= (INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
6872                 } else if (rack->rc_ack_can_sendout_data) {
6873                         /*
6874                          * Ahh but wait, this is that special case
6875                          * where the pacing timer can be disturbed;
6876                          * back out the changes (used for non-paced
6877                          * burst limiting).
6878                          */
6879                         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
6880                 }
6881                 if ((rack->use_rack_rr) &&
6882                     (rack->r_rr_config < 2) &&
6883                     ((hpts_timeout) && (hpts_timeout < slot))) {
6884                         /*
6885                          * Arrange for the hpts to kick back in after the
6886                          * t-o if the t-o does not cause a send.
6887                          */
6888                         (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
6889                                                    __LINE__, &diag);
6890                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6891                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6892                 } else {
6893                         (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
6894                                                    __LINE__, &diag);
6895                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6896                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
6897                 }
6898         } else if (hpts_timeout) {
6899                 /*
6900                  * With respect to inp_flags2 here, let's let any new acks wake
6901                  * us up. Since we are not pacing (no pacing timer), output
6902                  * can happen so we should let it. If it's a Rack timer, then any inbound
6903                  * packet probably won't change the sending (we will be blocked)
6904                  * but it may change the prr stats, so letting it in (the defaults
6905                  * set at the start of this block) is good enough.
6906                  */
6907                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6908                 (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
6909                                            __LINE__, &diag);
6910                 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6911                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6912         } else {
6913                 /* No timer starting */
6914 #ifdef INVARIANTS
6915                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
6916                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
6917                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
6918                 }
6919 #endif
6920         }
6921         rack->rc_tmr_stopped = 0;
6922         if (slot)
6923                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__);
6924 }
6925
6926 /*
6927  * RACK Timer, here we simply do logging and housekeeping.
6928  * The normal rack_output() function will call the
6929  * appropriate thing to check if we need to do a RACK retransmit.
6930  * We return 1, saying don't proceed with rack_output, only
6931  * when all timers have been stopped (destroyed PCB?).
6932  */
6933 static int
6934 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6935 {
6936         /*
6937          * This timer simply provides an internal trigger to send out data.
6938          * The check_recovery_mode call will see if there are needed
6939          * retransmissions, if so we will enter fast-recovery. The output
6940          * call may or may not do the same thing depending on sysctl
6941          * settings.
6942          */
6943         struct rack_sendmap *rsm;
6944
6945         counter_u64_add(rack_to_tot, 1);
6946         if (rack->r_state && (rack->r_state != tp->t_state))
6947                 rack_set_state(tp, rack);
6948         rack->rc_on_min_to = 0;
6949         rsm = rack_check_recovery_mode(tp, cts);
6950         rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
6951         if (rsm) {
6952                 rack->r_ctl.rc_resend = rsm;
6953                 rack->r_timer_override = 1;
6954                 if (rack->use_rack_rr) {
6955                         /*
6956                          * Don't accumulate extra pacing delay;
6957                          * we are allowing the rack timer to
6958                          * over-ride pacing, i.e., rrr takes precedence
6959                          * if the pacing interval is longer than the rrr
6960                          * time (in other words we get the minimum of the
6961                          * pacing time and the rrr time).
6962                          */
6963                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6964                 }
6965         }
6966         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
6967         if (rsm == NULL) {
6968                 /* restart a timer and return 1 */
6969                 rack_start_hpts_timer(rack, tp, cts,
6970                                       0, 0, 0);
6971                 return (1);
6972         }
6973         return (0);
6974 }
6975
6976
6977
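/*
 * Re-sync the rsm's cached view of its mbuf. orig_m_len/orig_t_space are
 * what we recorded at send time; if the socket buffer has since appended
 * data to the tail of the mbuf (trailing room shrank) or trimmed data off
 * the front (m_len shrank), adjust orig_m_len and the data offset (soff)
 * to match.
 */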
6978 static void
6979 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
6980 {
6981
6982         if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) {
6983                 /*
6984                  * The trailing space changed: mbufs can grow
6985                  * at the tail but they can't shrink from
6986                  * it (KASSERT that). Adjust the orig_m_len to
6987                  * compensate for this change.
6988                  */
6989                 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)),
6990                         ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
6991                          rsm->m,
6992                          rsm,
6993                          (intmax_t)M_TRAILINGROOM(rsm->m),
6994                          rsm->orig_t_space,
6995                          rsm->orig_m_len,
6996                          rsm->m->m_len));
6997                 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m));
6998                 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
6999         }
7000         if (rsm->m->m_len < rsm->orig_m_len) {
7001                 /*
7002                  * The mbuf shrank (trimmed off the top by an ack), so our
7003                  * offset changes.
7004                  */
7005                 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)),
7006                         ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n",
7007                          rsm->m, rsm->m->m_len,
7008                          rsm, rsm->orig_m_len,
7009                          rsm->soff));
7010                 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len))
7011                         rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
7012                 else
7013                         rsm->soff = 0;
7014                 rsm->orig_m_len = rsm->m->m_len;
7015 #ifdef INVARIANTS
7016         } else if (rsm->m->m_len > rsm->orig_m_len) {
7017                 panic("rsm:%p m:%p m_len grew outside of t_space compensation",
7018                       rsm, rsm->m);
7019 #endif
7020         }
7021 }
7022
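/*
 * Given src_rsm, the map entry that immediately precedes rsm in sequence
 * space, walk the mbuf chain starting at src_rsm's mbuf/offset to find
 * the mbuf and byte offset where rsm's data begins, and cache that
 * (m, soff, orig_m_len, orig_t_space) in rsm. If the chain unexpectedly
 * runs out we fall back to looking both entries up in the socket buffer.
 */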
7023 static void
7024 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
7025 {
7026         struct mbuf *m;
7027         uint32_t soff;
7028
7029         if (src_rsm->m &&
7030             ((src_rsm->orig_m_len != src_rsm->m->m_len) ||
7031              (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) {
7032                 /* Fix up the orig_m_len and possibly the mbuf offset */
7033                 rack_adjust_orig_mlen(src_rsm);
7034         }
7035         m = src_rsm->m;
7036         soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
7037         while (soff >= m->m_len) {
7038                 /* Move out past this mbuf */
7039                 soff -= m->m_len;
7040                 m = m->m_next;
7041                 KASSERT((m != NULL),
7042                         ("rsm:%p nrsm:%p hit at soff:%u null m",
7043                          src_rsm, rsm, soff));
7044                 if (m == NULL) {
7045                         /* This should *not* happen, which is why there is a KASSERT */
7046                         src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
7047                                                (src_rsm->r_start - rack->rc_tp->snd_una),
7048                                                &src_rsm->soff);
7049                         src_rsm->orig_m_len = src_rsm->m->m_len;
7050                         src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m);
7051                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
7052                                            (rsm->r_start - rack->rc_tp->snd_una),
7053                                            &rsm->soff);
7054                         rsm->orig_m_len = rsm->m->m_len;
7055                         rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
7056                         return;
7057                 }
7058         }
7059         rsm->m = m;
7060         rsm->soff = soff;
7061         rsm->orig_m_len = m->m_len;
7062         rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
7063 }
7064
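/*
 * Split rsm at sequence number "start": the new entry (nrsm) takes over
 * [start, old r_end) and inherits rsm's retransmit history and flags,
 * while rsm is trimmed to end at "start". A SYN stays on the left edge
 * (rsm), while FIN and PUSH stay on the right edge (nrsm). Finally the
 * mbuf position for nrsm is computed from rsm's.
 */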
7065 static __inline void
7066 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
7067                struct rack_sendmap *rsm, uint32_t start)
7068 {
7069         int idx;
7070
7071         nrsm->r_start = start;
7072         nrsm->r_end = rsm->r_end;
7073         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
7074         nrsm->r_flags = rsm->r_flags;
7075         nrsm->r_dupack = rsm->r_dupack;
7076         nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
7077         nrsm->r_rtr_bytes = 0;
7078         nrsm->r_fas = rsm->r_fas;
7079         nrsm->r_bas = rsm->r_bas;
7080         rsm->r_end = nrsm->r_start;
7081         nrsm->r_just_ret = rsm->r_just_ret;
7082         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
7083                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
7084         }
7085         /* Now if we have SYN flag we keep it on the left edge */
7086         if (nrsm->r_flags & RACK_HAS_SYN)
7087                 nrsm->r_flags &= ~RACK_HAS_SYN;
7088         /* Now if we have a FIN flag we keep it on the right edge */
7089         if (rsm->r_flags & RACK_HAS_FIN)
7090                 rsm->r_flags &= ~RACK_HAS_FIN;
7091         /* Push bit must go to the right edge as well */
7092         if (rsm->r_flags & RACK_HAD_PUSH)
7093                 rsm->r_flags &= ~RACK_HAD_PUSH;
7094         /* Clone over the state of the hw_tls flag */
7095         nrsm->r_hw_tls = rsm->r_hw_tls;
7096         /*
7097          * Now we need to find nrsm's new location in the mbuf chain.
7098          * We basically calculate a new offset, which is soff plus
7099          * how much is left in the original rsm. Then we walk out the mbuf
7100          * chain to find the right position; it may be the same mbuf
7101          * or maybe not.
7102          */
7103         KASSERT(((rsm->m != NULL) ||
7104                  (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
7105                 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
7106         if (rsm->m)
7107                 rack_setup_offset_for_rsm(rack, rsm, nrsm);
7108 }
7109
7110 static struct rack_sendmap *
7111 rack_merge_rsm(struct tcp_rack *rack,
7112                struct rack_sendmap *l_rsm,
7113                struct rack_sendmap *r_rsm)
7114 {
7115         /*
7116          * We are merging two ack'd RSM's,
7117          * the l_rsm is on the left (lower seq
7118          * values) and the r_rsm is on the right
7119          * (higher seq value). The simplest way
7120          * to merge these is to move the right
7121          * one into the left. I don't think there
7122          * is any reason we need to try to find
7123          * the oldest (or last oldest retransmitted).
7124          */
7125         rack_log_map_chg(rack->rc_tp, rack, NULL,
7126                          l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
7127         l_rsm->r_end = r_rsm->r_end;
7128         if (l_rsm->r_dupack < r_rsm->r_dupack)
7129                 l_rsm->r_dupack = r_rsm->r_dupack;
7130         if (r_rsm->r_rtr_bytes)
7131                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
7132         if (r_rsm->r_in_tmap) {
7133                 /* This really should not happen */
7134                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
7135                 r_rsm->r_in_tmap = 0;
7136         }
7137
7138         /* Now the flags */
7139         if (r_rsm->r_flags & RACK_HAS_FIN)
7140                 l_rsm->r_flags |= RACK_HAS_FIN;
7141         if (r_rsm->r_flags & RACK_TLP)
7142                 l_rsm->r_flags |= RACK_TLP;
7143         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
7144                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
7145         if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
7146             ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
7147                 /*
7148                  * If both are app-limited then let the
7149                  * free of r_rsm lower the count. If the right is
7150                  * app-limited and the left is not, transfer the flag.
7151                  */
7152                 l_rsm->r_flags |= RACK_APP_LIMITED;
7153                 r_rsm->r_flags &= ~RACK_APP_LIMITED;
7154                 if (r_rsm == rack->r_ctl.rc_first_appl)
7155                         rack->r_ctl.rc_first_appl = l_rsm;
7156         }
7157         tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE);
7158         /*
7159          * We keep the largest value, which is the newest
7160          * send. We do this in case a segment that is
7161          * joined together and not part of a GP estimate
7162          * later gets expanded into the GP estimate.
7163          *
7164          * We prohibit the merging of unlike kinds i.e.
7165          * all pieces that are in the GP estimate can be
7166          * merged and all pieces that are not in a GP estimate
7167          * can be merged, but not dissimilar pieces. Combine
7168          * this with taking the highest here and we should
7169          * be ok unless of course the client reneges. Then
7170          * all bets are off.
7171          */
7172         if (l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] <
7173             r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) {
7174                 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)];
7175         }
7176         /*
7177          * When merging two RSM's we also need to consider the ack time and keep
7178          * the newest. If the ack gets merged into a measurement then that is the
7179          * one we will want to be using.
7180          */
7181         if (l_rsm->r_ack_arrival < r_rsm->r_ack_arrival)
7182                 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival;
7183
7184         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
7185                 /* Transfer the split limit to the map we free */
7186                 r_rsm->r_limit_type = l_rsm->r_limit_type;
7187                 l_rsm->r_limit_type = 0;
7188         }
7189         rack_free(rack, r_rsm);
7190         l_rsm->r_flags |= RACK_MERGED;
7191         return (l_rsm);
7192 }
7193
7194 /*
7195  * TLP Timer, here we simply set up what segment we want to
7196  * have the TLP expire on, and the normal rack_output() will then
7197  * send it out.
7198  *
7199  * We return 1, saying don't proceed with rack_output, only
7200  * when all timers have been stopped (destroyed PCB?).
7201  */
7202 static int
7203 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
7204 {
7205         /*
7206          * Tail Loss Probe.
7207          */
7208         struct rack_sendmap *rsm = NULL;
7209         int insret __diagused;
7210         struct socket *so = tptosocket(tp);
7211         uint32_t amm;
7212         uint32_t out, avail;
7213         int collapsed_win = 0;
7214
7215         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
7216                 /* It's not time yet */
7217                 return (0);
7218         }
7219         if (ctf_progress_timeout_check(tp, true)) {
7220                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7221                 return (-ETIMEDOUT);    /* tcp_drop() */
7222         }
7223         /*
7224          * A TLP timer has expired. We have been idle for 2 rtts. So we now
7225          * need to figure out how to force a full MSS segment out.
7226          */
7227         rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
7228         rack->r_ctl.retran_during_recovery = 0;
7229         rack->r_ctl.dsack_byte_cnt = 0;
7230         counter_u64_add(rack_tlp_tot, 1);
7231         if (rack->r_state && (rack->r_state != tp->t_state))
7232                 rack_set_state(tp, rack);
7233         avail = sbavail(&so->so_snd);
7234         out = tp->snd_max - tp->snd_una;
7235         if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
7236                 /* special case, we need a retransmission */
7237                 collapsed_win = 1;
7238                 goto need_retran;
7239         }
7240         if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
7241                 rack->r_ctl.dsack_persist--;
7242                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
7243                         rack->r_ctl.num_dsack = 0;
7244                 }
7245                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
7246         }
7247         if ((tp->t_flags & TF_GPUTINPROG) &&
7248             (rack->r_ctl.rc_tlp_cnt_out == 1)) {
7249                 /*
7250                  * If this is the second TLP in a row
7251                  * and we are doing a measurement,
7252                  * it's time to abandon the measurement.
7253                  * Something is likely broken on
7254                  * the client's network and measuring a
7255                  * broken network does us no good.
7256                  */
7257                 tp->t_flags &= ~TF_GPUTINPROG;
7258                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7259                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
7260                                            tp->gput_seq,
7261                                            0, 0, 18, __LINE__, NULL, 0);
7262         }
7263         /*
7264          * Check our send-oldest-always setting, and if
7265          * there is an oldest to send, jump to need_retran.
7266          */
7267         if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
7268                 goto need_retran;
7269
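        /*
         * Is there new, never-sent data available that we can use for the
         * probe instead of doing a retransmission?
         */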
7270         if (avail > out) {
7271                 /* New data is available */
7272                 amm = avail - out;
7273                 if (amm > ctf_fixed_maxseg(tp)) {
7274                         amm = ctf_fixed_maxseg(tp);
7275                         if ((amm + out) > tp->snd_wnd) {
7276                                 /* We are rwnd limited */
7277                                 goto need_retran;
7278                         }
7279                 } else if (amm < ctf_fixed_maxseg(tp)) {
7280                         /* not enough to fill a full segment (MSS) */
7281                         goto need_retran;
7282                 }
7283                 if (IN_FASTRECOVERY(tp->t_flags)) {
7284                         /* Unlikely */
7285                         if (rack->rack_no_prr == 0) {
7286                                 if (out + amm <= tp->snd_wnd) {
7287                                         rack->r_ctl.rc_prr_sndcnt = amm;
7288                                         rack->r_ctl.rc_tlp_new_data = amm;
7289                                         rack_log_to_prr(rack, 4, 0, __LINE__);
7290                                 }
7291                         } else
7292                                 goto need_retran;
7293                 } else {
7294                         /* Set the send-new override */
7295                         if (out + amm <= tp->snd_wnd)
7296                                 rack->r_ctl.rc_tlp_new_data = amm;
7297                         else
7298                                 goto need_retran;
7299                 }
7300                 rack->r_ctl.rc_tlpsend = NULL;
7301                 counter_u64_add(rack_tlp_newdata, 1);
7302                 goto send;
7303         }
7304 need_retran:
7305         /*
7306          * Ok we need to arrange the last un-acked segment to be re-sent, or
7307          * optionally the first un-acked segment.
7308          */
7309         if (collapsed_win == 0) {
7310                 if (rack_always_send_oldest)
7311                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7312                 else {
7313                         rsm = tqhash_max(rack->r_ctl.tqh);
7314                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
7315                                 rsm = rack_find_high_nonack(rack, rsm);
7316                         }
7317                 }
7318                 if (rsm == NULL) {
7319 #ifdef TCP_BLACKBOX
7320                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
7321 #endif
7322                         goto out;
7323                 }
7324         } else {
7325                 /*
7326                  * We had a collapsed window, let's find
7327                  * the point before the collapse.
7328                  */
7329                 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una))
7330                         rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1));
7331                 else {
7332                         rsm = tqhash_min(rack->r_ctl.tqh);
7333                 }
7334                 if (rsm == NULL) {
7335                         /* Huh */
7336                         goto out;
7337                 }
7338         }
7339         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
7340                 /*
7341                  * We need to split this, the last segment, in two.
7342                  */
7343                 struct rack_sendmap *nrsm;
7344
7345                 nrsm = rack_alloc_full_limit(rack);
7346                 if (nrsm == NULL) {
7347                         /*
7348                          * No memory to split, we will just exit and punt
7349                          * off to the RXT timer.
7350                          */
7351                         goto out;
7352                 }
7353                 rack_clone_rsm(rack, nrsm, rsm,
7354                                (rsm->r_end - ctf_fixed_maxseg(tp)));
7355                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7356 #ifndef INVARIANTS
7357                 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
7358 #else
7359                 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
7360                         panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
7361                               nrsm, insret, rack, rsm);
7362                 }
7363 #endif
7364                 if (rsm->r_in_tmap) {
7365                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7366                         nrsm->r_in_tmap = 1;
7367                 }
7368                 rsm = nrsm;
7369         }
7370         rack->r_ctl.rc_tlpsend = rsm;
7371 send:
7372         /* Make sure output path knows we are doing a TLP */
7373         *doing_tlp = 1;
7374         rack->r_timer_override = 1;
7375         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
7376         return (0);
7377 out:
7378         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
7379         return (0);
7380 }
7381
7382 /*
7383  * Delayed ack Timer, here we simply need to set the
7384  * TF_ACKNOW flag and remove the TF_DELACK flag. From there
7385  * the output routine will send the ack out.
7386  *
7387  * We only return 1, saying don't proceed, if all timers
7388  * are stopped (destroyed PCB?).
7389  */
7390 static int
7391 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7392 {
7393
7394         rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
7395         tp->t_flags &= ~TF_DELACK;
7396         tp->t_flags |= TF_ACKNOW;
7397         KMOD_TCPSTAT_INC(tcps_delack);
7398         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
7399         return (0);
7400 }
7401
7402 /*
7403  * Persist timer, here we simply send the
7404  * same thing as a keepalive would:
7405  * the one byte send.
7406  *
7407  * We only return 1, saying don't proceed, if all timers
7408  * are stopped (destroyed PCB?).
7409  */
7410 static int
7411 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7412 {
7413         struct tcptemp *t_template;
7414         int32_t retval = 1;
7415
7416         if (rack->rc_in_persist == 0)
7417                 return (0);
7418         if (ctf_progress_timeout_check(tp, false)) {
7419                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7420                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7421                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7422                 return (-ETIMEDOUT);    /* tcp_drop() */
7423         }
7424         /*
7425          * Persistence timer into zero window. Force a byte to be output, if
7426          * possible.
7427          */
7428         KMOD_TCPSTAT_INC(tcps_persisttimeo);
7429         /*
7430          * Hack: if the peer is dead/unreachable, we do not time out if the
7431          * window is closed.  After a full backoff, drop the connection if
7432          * the idle time (no responses to probes) reaches the maximum
7433          * backoff that we would use if retransmitting.
7434          */
7435         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
7436             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
7437              TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
7438                 KMOD_TCPSTAT_INC(tcps_persistdrop);
7439                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7440                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7441                 retval = -ETIMEDOUT;    /* tcp_drop() */
7442                 goto out;
7443         }
7444         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
7445             tp->snd_una == tp->snd_max)
7446                 rack_exit_persist(tp, rack, cts);
7447         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
7448         /*
7449          * If the user has closed the socket then drop a persisting
7450          * connection after a much reduced timeout.
7451          */
7452         if (tp->t_state > TCPS_CLOSE_WAIT &&
7453             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
7454                 KMOD_TCPSTAT_INC(tcps_persistdrop);
7455                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7456                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7457                 retval = -ETIMEDOUT;    /* tcp_drop() */
7458                 goto out;
7459         }
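        /*
         * Send the zero-window probe: a keepalive-style segment using
         * sequence number snd_una - 1 so that it lies outside the
         * receive window and forces the peer to respond.
         */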
7460         t_template = tcpip_maketemplate(rack->rc_inp);
7461         if (t_template) {
7462                 /* only set it if we were answered */
7463                 if (rack->forced_ack == 0) {
7464                         rack->forced_ack = 1;
7465                         rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
7466                 } else {
7467                         rack->probe_not_answered = 1;
7468                         counter_u64_add(rack_persists_loss, 1);
7469                         rack->r_ctl.persist_lost_ends++;
7470                 }
7471                 counter_u64_add(rack_persists_sends, 1);
7472                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
7473                 tcp_respond(tp, t_template->tt_ipgen,
7474                             &t_template->tt_t, (struct mbuf *)NULL,
7475                             tp->rcv_nxt, tp->snd_una - 1, 0);
7476                 /* This sends an ack */
7477                 if (tp->t_flags & TF_DELACK)
7478                         tp->t_flags &= ~TF_DELACK;
7479                 free(t_template, M_TEMP);
7480         }
7481         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
7482                 tp->t_rxtshift++;
7483 out:
7484         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
7485         rack_start_hpts_timer(rack, tp, cts,
7486                               0, 0, 0);
7487         return (retval);
7488 }
7489
7490 /*
7491  * If a keepalive goes off, we had no other timers
7492  * happening. We always return 1 here since this
7493  * routine either drops the connection or sends
7494  * out a segment via tcp_respond().
7495  */
7496 static int
7497 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7498 {
7499         struct tcptemp *t_template;
7500         struct inpcb *inp = tptoinpcb(tp);
7501
7502         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
7503         rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
7504         /*
7505          * Keep-alive timer went off; send something or drop connection if
7506          * idle for too long.
7507          */
7508         KMOD_TCPSTAT_INC(tcps_keeptimeo);
7509         if (tp->t_state < TCPS_ESTABLISHED)
7510                 goto dropit;
7511         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
7512             tp->t_state <= TCPS_CLOSING) {
7513                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
7514                         goto dropit;
7515                 /*
7516                  * Send a packet designed to force a response if the peer is
7517                  * up and reachable: either an ACK if the connection is
7518                  * still alive, or an RST if the peer has closed the
7519                  * connection due to timeout or reboot. Using sequence
7520                  * number tp->snd_una-1 causes the transmitted zero-length
7521                  * segment to lie outside the receive window; by the
7522                  * protocol spec, this requires the correspondent TCP to
7523                  * respond.
7524                  */
7525                 KMOD_TCPSTAT_INC(tcps_keepprobe);
7526                 t_template = tcpip_maketemplate(inp);
7527                 if (t_template) {
7528                         if (rack->forced_ack == 0) {
7529                                 rack->forced_ack = 1;
7530                                 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
7531                         } else {
7532                                 rack->probe_not_answered = 1;
7533                         }
7534                         tcp_respond(tp, t_template->tt_ipgen,
7535                             &t_template->tt_t, (struct mbuf *)NULL,
7536                             tp->rcv_nxt, tp->snd_una - 1, 0);
7537                         free(t_template, M_TEMP);
7538                 }
7539         }
7540         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
7541         return (1);
7542 dropit:
7543         KMOD_TCPSTAT_INC(tcps_keepdrops);
7544         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
7545         return (-ETIMEDOUT);    /* tcp_drop() */
7546 }
7547
7548 /*
7549  * Retransmit helper function, clears up all the ack
7550  * flags and takes care of important bookkeeping.
7551  */
7552 static void
7553 rack_remxt_tmr(struct tcpcb *tp)
7554 {
7555         /*
7556          * The retransmit timer went off; all sack'd blocks must be
7557          * un-acked.
7558          */
7559         struct rack_sendmap *rsm, *trsm = NULL;
7560         struct tcp_rack *rack;
7561
7562         rack = (struct tcp_rack *)tp->t_fb_ptr;
7563         rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
7564         rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
7565         if (rack->r_state && (rack->r_state != tp->t_state))
7566                 rack_set_state(tp, rack);
7567         /*
7568          * Ideally we would like to be able to
7569          * mark SACK-PASS on anything not acked here.
7570          *
7571          * However, if we do that we would burst out
7572          * all that data 1ms apart. This would be unwise,
7573          * so for now we will just let the normal rxt timer
7574          * and tlp timer take care of it.
7575          *
7576          * Also we really need to stick them back in sequence
7577          * order. This way we send in the proper order and any
7578          * sacks that come floating in will "re-ack" the data.
7579          * To do this we zap the tmap with an INIT and then
7580          * walk through and place every rsm in the RB tree
7581          * back in its seq ordered place.
7582          */
7583         TAILQ_INIT(&rack->r_ctl.rc_tmap);
7584
7585         TQHASH_FOREACH(rsm, rack->r_ctl.tqh)  {
7586                 rsm->r_dupack = 0;
7587                 if (rack_verbose_logging)
7588                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7589                 /* We must re-add it back to the tlist */
7590                 if (trsm == NULL) {
7591                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7592                 } else {
7593                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
7594                 }
7595                 rsm->r_in_tmap = 1;
7596                 trsm = rsm;
7597                 if (rsm->r_flags & RACK_ACKED)
7598                         rsm->r_flags |= RACK_WAS_ACKED;
7599                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
7600                 rsm->r_flags |= RACK_MUST_RXT;
7601         }
7602         /* Clear the count (we just un-acked them) */
7603         rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
7604         rack->r_ctl.rc_sacked = 0;
7605         rack->r_ctl.rc_sacklast = NULL;
7606         rack->r_ctl.rc_agg_delayed = 0;
7607         rack->r_early = 0;
7608         rack->r_ctl.rc_agg_early = 0;
7609         rack->r_late = 0;
7610         /* Clear the tlp rtx mark */
7611         rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
7612         if (rack->r_ctl.rc_resend != NULL)
7613                 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7614         rack->r_ctl.rc_prr_sndcnt = 0;
7615         rack_log_to_prr(rack, 6, 0, __LINE__);
7616         rack->r_timer_override = 1;
7617         if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
7618 #ifdef TCP_SAD_DETECTION
7619             || (rack->sack_attack_disable != 0)
7620 #endif
7621                     ) && ((tp->t_flags & TF_SENTFIN) == 0)) {
7622                 /*
7623                  * For non-sack customers new data
7624                  * needs to go out as retransmits until
7625                  * we retransmit up to snd_max.
7626                  */
7627                 rack->r_must_retran = 1;
7628                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
7629                                                 rack->r_ctl.rc_sacked);
7630         }
7631         rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
7632 }
7633
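/*
 * Switch the tcpcb's timer fields over to rack's microsecond granularity
 * and recompute the current RTO (t_rxtcur) via RACK_REXMTVAL(), adding
 * the retransmit slop once the connection is established and clamping
 * the result to rack_rto_max.
 */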
7634 static void
7635 rack_convert_rtts(struct tcpcb *tp)
7636 {
7637         tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
7638         tp->t_rxtcur = RACK_REXMTVAL(tp);
7639         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
7640                 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
7641         }
7642         if (tp->t_rxtcur > rack_rto_max) {
7643                 tp->t_rxtcur = rack_rto_max;
7644         }
7645 }
7646
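/*
 * Wrapper around cc_conn_init(): if the CC module seeded an srtt (e.g.
 * from the hostcache) convert it to rack's microsecond units, make sure
 * ssthresh starts no lower than snd_wnd (effectively "infinite" for
 * slow start), and apply rack's configured initial window to snd_cwnd.
 */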
7647 static void
7648 rack_cc_conn_init(struct tcpcb *tp)
7649 {
7650         struct tcp_rack *rack;
7651         uint32_t srtt;
7652
7653         rack = (struct tcp_rack *)tp->t_fb_ptr;
7654         srtt = tp->t_srtt;
7655         cc_conn_init(tp);
7656         /*
7657          * Now convert to rack's internal format,
7658          * if required.
7659          */
7660         if ((srtt == 0) && (tp->t_srtt != 0))
7661                 rack_convert_rtts(tp);
7662         /*
7663          * We want a chance to stay in slowstart as
7664          * we create a connection. TCP spec says that
7665          * initially ssthresh is infinite. For our
7666          * purposes that is the snd_wnd.
7667          */
7668         if (tp->snd_ssthresh < tp->snd_wnd) {
7669                 tp->snd_ssthresh = tp->snd_wnd;
7670         }
7671         /*
7672          * We also want to assure an IW worth of
7673          * data can get in flight.
7674          */
7675         if (rc_init_window(rack) < tp->snd_cwnd)
7676                 tp->snd_cwnd = rc_init_window(rack);
7677 }
7678
7679 /*
7680  * Re-transmit timeout! If we decide to drop the PCB we will return a
7681  * negative value (for tcp_drop()), otherwise we will set up to retransmit the lowest seq number outstanding.
7682  */
7683 static int
7684 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7685 {
7686         struct inpcb *inp = tptoinpcb(tp);
7687         int32_t rexmt;
7688         int32_t retval = 0;
7689         bool isipv6;
7690
7691         if ((tp->t_flags & TF_GPUTINPROG) &&
7692             (tp->t_rxtshift)) {
7693                 /*
7694                  * We have had a second timeout;
7695                  * measurements on successive rxt's are not profitable.
7696                  * It is unlikely to be of any use (the network is
7697                  * broken or the client went away).
7698                  */
7699                 tp->t_flags &= ~TF_GPUTINPROG;
7700                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7701                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
7702                                            tp->gput_seq,
7703                                            0, 0, 18, __LINE__, NULL, 0);
7704         }
7705         if (ctf_progress_timeout_check(tp, false)) {
7706                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7707                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7708                 return (-ETIMEDOUT);    /* tcp_drop() */
7709         }
7710         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
7711         rack->r_ctl.retran_during_recovery = 0;
7712         rack->rc_ack_required = 1;
7713         rack->r_ctl.dsack_byte_cnt = 0;
7714         if (IN_FASTRECOVERY(tp->t_flags))
7715                 tp->t_flags |= TF_WASFRECOVERY;
7716         else
7717                 tp->t_flags &= ~TF_WASFRECOVERY;
7718         if (IN_CONGRECOVERY(tp->t_flags))
7719                 tp->t_flags |= TF_WASCRECOVERY;
7720         else
7721                 tp->t_flags &= ~TF_WASCRECOVERY;
7722         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7723             (tp->snd_una == tp->snd_max)) {
7724                 /* Nothing outstanding .. nothing to do */
7725                 return (0);
7726         }
7727         if (rack->r_ctl.dsack_persist) {
7728                 rack->r_ctl.dsack_persist--;
7729                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
7730                         rack->r_ctl.num_dsack = 0;
7731                 }
7732                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
7733         }
7734         /*
7735          * Rack can only run one timer at a time, so we cannot
7736          * run a KEEPINIT (gating SYN sending) and a retransmit
7737          * timer for the SYN. So if we are in a front state and
7738          * have a KEEPINIT timer we need to check the first transmit
7739          * against now to see if we have exceeded the KEEPINIT time
7740          * (if one is set).
7741          */
7742         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
7743             (TP_KEEPINIT(tp) != 0)) {
7744                 struct rack_sendmap *rsm;
7745
7746                 rsm = tqhash_min(rack->r_ctl.tqh);
7747                 if (rsm) {
7748                         /* Ok we have something outstanding to test keepinit with */
7749                         if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
7750                             ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
7751                                 /* We have exceeded the KEEPINIT time */
7752                                 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
7753                                 goto drop_it;
7754                         }
7755                 }
7756         }
7757         /*
7758          * Retransmission timer went off.  Message has not been acked within
7759          * retransmit interval.  Back off to a longer retransmit interval
7760          * and retransmit one segment.
7761          */
7762         rack_remxt_tmr(tp);
7763         if ((rack->r_ctl.rc_resend == NULL) ||
7764             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
7765                 /*
7766                  * If the rwnd collapsed on
7767                  * the one we are retransmitting
7768                  * it does not count against the
7769                  * rxt count.
7770                  */
7771                 tp->t_rxtshift++;
7772         }
7773         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
7774                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7775 drop_it:
7776                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
7777                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
7778                 /* XXXGL: previously t_softerror was cast to uint16_t */
7779                 MPASS(tp->t_softerror >= 0);
7780                 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
7781                 goto out;       /* tcp_drop() */
7782         }
7783         if (tp->t_state == TCPS_SYN_SENT) {
7784                 /*
7785                  * If the SYN was retransmitted, indicate CWND to be limited
7786                  * to 1 segment in cc_conn_init().
7787                  */
7788                 tp->snd_cwnd = 1;
7789         } else if (tp->t_rxtshift == 1) {
7790                 /*
7791                  * first retransmit; record ssthresh and cwnd so they can be
7792                  * recovered if this turns out to be a "bad" retransmit. A
7793                  * retransmit is considered "bad" if an ACK for this segment
7794                  * is received within RTT/2 interval; the assumption here is
7795                  * that the ACK was already in flight.  See "On Estimating
7796                  * End-to-End Network Path Properties" by Allman and Paxson
7797                  * for more details.
7798                  */
7799                 tp->snd_cwnd_prev = tp->snd_cwnd;
7800                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
7801                 tp->snd_recover_prev = tp->snd_recover;
7802                 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
7803                 tp->t_flags |= TF_PREVVALID;
7804         } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
7805                 tp->t_flags &= ~TF_PREVVALID;
7806         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
7807         if ((tp->t_state == TCPS_SYN_SENT) ||
7808             (tp->t_state == TCPS_SYN_RECEIVED))
7809                 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
7810         else
7811                 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
7812
7813         RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
7814            max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
7815         /*
7816          * We enter the path for PLMTUD if the connection is established or
7817          * in FIN_WAIT_1 state; the reason for the latter is that if the
7818          * amount of data we send is very small, we could send it in a couple
7819          * of packets and proceed straight to FIN. In that case we won't
7820          * catch the ESTABLISHED state.
7821          */
7822 #ifdef INET6
7823         isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
7824 #else
7825         isipv6 = false;
7826 #endif
7827         if (((V_tcp_pmtud_blackhole_detect == 1) ||
7828             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
7829             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
7830             ((tp->t_state == TCPS_ESTABLISHED) ||
7831             (tp->t_state == TCPS_FIN_WAIT_1))) {
7832                 /*
7833                  * The idea here is that each stage of the mtu probe (usually
7834                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
7835                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
7836                  * should take care of that.
7837                  */
7838                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
7839                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
7840                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
7841                     tp->t_rxtshift % 2 == 0)) {
7842                         /*
7843                          * Enter Path MTU Black-hole Detection mechanism: -
7844                          * Disable Path MTU Discovery (IP "DF" bit). -
7845                          * Reduce MTU to lower value than what we negotiated
7846                          * with peer.
7847                          */
7848                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
7849                                 /* Record that we may have found a black hole. */
7850                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
7851                                 /* Keep track of previous MSS. */
7852                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
7853                         }
7854
7855                         /*
7856                          * Reduce the MSS to blackhole value or to the
7857                          * default in an attempt to retransmit.
7858                          */
7859 #ifdef INET6
7860                         if (isipv6 &&
7861                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
7862                                 /* Use the sysctl tuneable blackhole MSS. */
7863                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
7864                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7865                         } else if (isipv6) {
7866                                 /* Use the default MSS. */
7867                                 tp->t_maxseg = V_tcp_v6mssdflt;
7868                                 /*
7869                                  * Disable Path MTU Discovery when we switch
7870                                  * to minmss.
7871                                  */
7872                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7873                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7874                         }
7875 #endif
7876 #if defined(INET6) && defined(INET)
7877                         else
7878 #endif
7879 #ifdef INET
7880                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
7881                                 /* Use the sysctl tuneable blackhole MSS. */
7882                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
7883                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7884                         } else {
7885                                 /* Use the default MSS. */
7886                                 tp->t_maxseg = V_tcp_mssdflt;
7887                                 /*
7888                                  * Disable Path MTU Discovery when we switch
7889                                  * to minmss.
7890                                  */
7891                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7892                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7893                         }
7894 #endif
7895                 } else {
7896                         /*
7897                          * If further retransmissions are still unsuccessful
7898                          * with a lowered MTU, maybe this isn't a blackhole
7899                          * and we restore the previous MSS and blackhole
7900                          * detection flags. The limit '6' is determined by
7901                          * giving each probe stage (1448, 1188, 524) 2
7902                          * chances to recover.
7903                          */
7904                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
7905                             (tp->t_rxtshift >= 6)) {
7906                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
7907                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
7908                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
7909                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
7910                         }
7911                 }
7912         }
7913         /*
7914          * Disable RFC1323 and SACK if we haven't got any response to
7915          * our third SYN to work-around some broken terminal servers
7916          * (most of which have hopefully been retired) that have bad VJ
7917          * header compression code which trashes TCP segments containing
7918          * unknown-to-them TCP options.
7919          */
7920         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
7921             (tp->t_rxtshift == 3))
7922                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
7923         /*
7924          * If we backed off this far, our srtt estimate is probably bogus.
7925          * Clobber it so we'll take the next rtt measurement as our srtt;
7926          * move the current srtt into rttvar to keep the current retransmit
7927          * times until then.
7928          */
7929         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
7930 #ifdef INET6
7931                 if ((inp->inp_vflag & INP_IPV6) != 0)
7932                         in6_losing(inp);
7933                 else
7934 #endif
7935                         in_losing(inp);
7936                 tp->t_rttvar += tp->t_srtt;
7937                 tp->t_srtt = 0;
7938         }
7939         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
7940         tp->snd_recover = tp->snd_max;
7941         tp->t_flags |= TF_ACKNOW;
7942         tp->t_rtttime = 0;
7943         rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__);
7944 out:
7945         return (retval);
7946 }
7947
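/*
 * Dispatch whichever rack-managed timer (delack, rack, tlp, rxt, persist
 * or keepalive) is armed, once it may have expired. A return of 0 means
 * the caller can proceed with output, a positive value means it should
 * not, and a negative errno means the connection should be dropped
 * (tcp_drop()).
 */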
7948 static int
7949 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
7950 {
7951         int32_t ret = 0;
7952         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
7953
7954         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
7955             (tp->t_flags & TF_GPUTINPROG)) {
7956                 /*
7957                  * We have a goodput measurement in progress
7958                  * and we have entered a late state.
7959                  * Do we have enough data in the sb
7960                  * to handle the GPUT request?
7961                  */
7962                 uint32_t bytes;
7963
7964                 bytes = tp->gput_ack - tp->gput_seq;
7965                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
7966                         bytes += tp->gput_seq - tp->snd_una;
7967                 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
7968                         /*
7969                          * There are not enough bytes in the socket
7970                          * buffer that have been sent to cover this
7971                          * measurement. Cancel it.
7972                          */
7973                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7974                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
7975                                                    tp->gput_seq,
7976                                                    0, 0, 18, __LINE__, NULL, 0);
7977                         tp->t_flags &= ~TF_GPUTINPROG;
7978                 }
7979         }
7980         if (timers == 0) {
7981                 return (0);
7982         }
7983         if (tp->t_state == TCPS_LISTEN) {
7984                 /* no timers on listen sockets */
7985                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
7986                         return (0);
7987                 return (1);
7988         }
7989         if ((timers & PACE_TMR_RACK) &&
7990             rack->rc_on_min_to) {
7991                 /*
7992                  * For the rack timer when we
7993                  * are on a min-timeout (which means rrr_conf = 3)
7994                  * we don't want to check the timer. It may
7995                  * be going off for a pace and that's ok; we
7996                  * want to send the retransmit (if it's ready).
7997                  *
7998                  * If it's on a normal rack timer (non-min) then
7999                  * we will check if it's expired.
8000                  */
8001                 goto skip_time_check;
8002         }
8003         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
8004                 uint32_t left;
8005
8006                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
8007                         ret = -1;
8008                         rack_log_to_processing(rack, cts, ret, 0);
8009                         return (0);
8010                 }
8011                 if (hpts_calling == 0) {
8012                         /*
8013                          * A user send or a queued mbuf (sack) has called
8014                          * us. We return 0 and let the pacing guards
8015                          * decide whether or not they should
8016                          * cause a send.
8017                          */
8018                         ret = -2;
8019                         rack_log_to_processing(rack, cts, ret, 0);
8020                         return (0);
8021                 }
8022                 /*
8023                  * Ok, our timer went off early and we are not pacing: a
8024                  * false alarm, go back to sleep. We make sure the no-sack
8025                  * wakeup flag is cleared since we no longer have a
8026                  * PKT_OUTPUT flag in place.
8027                  */
8028                 rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
8029                 ret = -3;
8030                 left = rack->r_ctl.rc_timer_exp - cts;
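                /*
                 * Re-insert ourselves on the hpts wheel for the time that
                 * remains until the timer should really expire.
                 */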
8031                 tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left));
8032                 rack_log_to_processing(rack, cts, ret, left);
8033                 return (1);
8034         }
8035 skip_time_check:
8036         rack->rc_tmr_stopped = 0;
8037         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
8038         if (timers & PACE_TMR_DELACK) {
8039                 ret = rack_timeout_delack(tp, rack, cts);
8040         } else if (timers & PACE_TMR_RACK) {
8041                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
8042                 rack->r_fast_output = 0;
8043                 ret = rack_timeout_rack(tp, rack, cts);
8044         } else if (timers & PACE_TMR_TLP) {
8045                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
8046                 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
8047         } else if (timers & PACE_TMR_RXT) {
8048                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
8049                 rack->r_fast_output = 0;
8050                 ret = rack_timeout_rxt(tp, rack, cts);
8051         } else if (timers & PACE_TMR_PERSIT) {
8052                 ret = rack_timeout_persist(tp, rack, cts);
8053         } else if (timers & PACE_TMR_KEEP) {
8054                 ret = rack_timeout_keepalive(tp, rack, cts);
8055         }
8056         rack_log_to_processing(rack, cts, ret, timers);
8057         return (ret);
8058 }
8059
8060 static void
8061 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
8062 {
8063         struct timeval tv;
8064         uint32_t us_cts, flags_on_entry;
8065         uint8_t hpts_removed = 0;
8066
8067         flags_on_entry = rack->r_ctl.rc_hpts_flags;
8068         us_cts = tcp_get_usecs(&tv);
8069         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
8070             ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
8071              ((tp->snd_max - tp->snd_una) == 0))) {
8072                 tcp_hpts_remove(rack->rc_inp);
8073                 hpts_removed = 1;
8074                 /* If we were not delayed cancel out the flag. */
8075                 if ((tp->snd_max - tp->snd_una) == 0)
8076                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
8077                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8078         }
8079         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8080                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
8081                 if (tcp_in_hpts(rack->rc_inp) &&
8082                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
8083                         /*
8084                          * Canceling timers when we have no output being
8085                          * paced. We also must remove ourselves from the
8086                          * hpts.
8087                          */
8088                         tcp_hpts_remove(rack->rc_inp);
8089                         hpts_removed = 1;
8090                 }
8091                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
8092         }
8093         if (hpts_removed == 0)
8094                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8095 }
8096
8097 static int
8098 rack_stopall(struct tcpcb *tp)
8099 {
8100         struct tcp_rack *rack;
8101         rack = (struct tcp_rack *)tp->t_fb_ptr;
8102         rack->t_timers_stopped = 1;
8103         return (0);
8104 }
8105
8106 static void
8107 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
8108 {
8109         /*
8110          * Assure no timers are running.
8111          */
8112         if (tcp_timer_active(tp, TT_PERSIST)) {
8113                 /* We are in persist, set the flag appropriately */
8114                 rack->rc_in_persist = 1;
8115         }
8116         if (tcp_in_hpts(rack->rc_inp)) {
8117                 tcp_hpts_remove(rack->rc_inp);
8118         }
8119 }
8120
8121 static void
8122 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
8123     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz)
8124 {
8125         int32_t idx;
8126
8127         rsm->r_rtr_cnt++;
8128         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8129         rsm->r_dupack = 0;
8130         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
8131                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
8132                 rsm->r_flags |= RACK_OVERMAX;
8133         }
8134         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
8135                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
8136                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
8137         }
8138         idx = rsm->r_rtr_cnt - 1;
8139         rsm->r_tim_lastsent[idx] = ts;
8140         /*
8141          * Here we don't add in the len of send, since it's already
8142          * in snd_una <-> snd_max.
8143          */
8144         rsm->r_fas = ctf_flight_size(rack->rc_tp,
8145                                      rack->r_ctl.rc_sacked);
8146         if (rsm->r_flags & RACK_ACKED) {
8147                 /* Probably MTU discovery messing with us */
8148                 rsm->r_flags &= ~RACK_ACKED;
8149                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
8150         }
8151         if (rsm->r_in_tmap) {
8152                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8153                 rsm->r_in_tmap = 0;
8154         }
8155         /* Let's make sure it is correctly marked in or out of the GP window */
8156         rack_mark_in_gp_win(tp, rsm);
8157         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8158         rsm->r_in_tmap = 1;
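        /*
         * r_bas: the number of segsiz-sized segments this block spans,
         * i.e. its length divided by segsiz, rounded up.
         */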
8159         rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz);
8160         /* Take off the must retransmit flag, if it's on */
8161         if (rsm->r_flags & RACK_MUST_RXT) {
8162                 if (rack->r_must_retran)
8163                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
8164                 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
8165                         /*
8166                          * We have retransmitted all we need. Clear
8167                          * any must retransmit flags.
8168                          */
8169                         rack->r_must_retran = 0;
8170                         rack->r_ctl.rc_out_at_rto = 0;
8171                 }
8172                 rsm->r_flags &= ~RACK_MUST_RXT;
8173         }
8174         /* Remove any collapsed flag */
8175         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
8176         if (rsm->r_flags & RACK_SACK_PASSED) {
8177                 /* We have retransmitted due to the SACK pass */
8178                 rsm->r_flags &= ~RACK_SACK_PASSED;
8179                 rsm->r_flags |= RACK_WAS_SACKPASS;
8180         }
8181 }
8182
8183 static uint32_t
8184 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
8185     struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz)
8186 {
8187         /*
8188          * We (re-)transmitted starting at rsm->r_start for some length
8189          * (possibly ending before r_end).
8190          */
8191         struct rack_sendmap *nrsm;
8192         int insret __diagused;
8193         uint32_t c_end;
8194         int32_t len;
8195
8196         len = *lenp;
8197         c_end = rsm->r_start + len;
8198         if (SEQ_GEQ(c_end, rsm->r_end)) {
8199                 /*
8200                  * We retransmitted the whole piece, or more than the
8201                  * whole piece, slopping over into the next rsm.
8202                  */
8203                 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
8204                 if (c_end == rsm->r_end) {
8205                         *lenp = 0;
8206                         return (0);
8207                 } else {
8208                         int32_t act_len;
8209
8210                         /* Hangs over the end; return what's left */
8211                         act_len = rsm->r_end - rsm->r_start;
8212                         *lenp = (len - act_len);
8213                         return (rsm->r_end);
8214                 }
8215                 /* We don't get out of this block. */
8216         }
8217         /*
8218          * Here we retransmitted less than the whole thing which means we
8219          * have to split this into what was transmitted and what was not.
8220          */
8221         nrsm = rack_alloc_full_limit(rack);
8222         if (nrsm == NULL) {
8223                 /*
8224                  * We can't get memory, so lets not proceed.
8225                  */
8226                 *lenp = 0;
8227                 return (0);
8228         }
8229         /*
8230          * So here we are going to take the original rsm and make it what we
8231          * retransmitted. nrsm will be the tail portion we did not
8232          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
8233          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
8234          * 1, 6 and the new piece will be 6, 11.
8235          */
8236         rack_clone_rsm(rack, nrsm, rsm, c_end);
8237         nrsm->r_dupack = 0;
8238         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
8239 #ifndef INVARIANTS
8240         (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
8241 #else
8242         if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
8243                 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
8244                       nrsm, insret, rack, rsm);
8245         }
8246 #endif
8247         if (rsm->r_in_tmap) {
8248                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8249                 nrsm->r_in_tmap = 1;
8250         }
8251         rsm->r_flags &= (~RACK_HAS_FIN);
8252         rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
8253         /* Log a split of rsm into rsm and nrsm */
8254         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
8255         *lenp = 0;
8256         return (0);
8257 }
8258
8259 static void
8260 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
8261                 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
8262                 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb,
8263                 uint32_t s_moff, int hw_tls, int segsiz)
8264 {
8265         struct tcp_rack *rack;
8266         struct rack_sendmap *rsm, *nrsm;
8267         int insret __diagused;
8268
8269         register uint32_t snd_max, snd_una;
8270
8271         /*
8272          * Add to the RACK log of packets in flight or retransmitted. If
8273          * there is a TS option we will use the TS echoed, if not we will
8274          * grab a TS.
8275          *
8276          * Retransmissions will increment the count and move the ts to its
8277          * proper place. Note that if options do not include TS's then we
8278          * won't be able to effectively use the ACK for an RTT on a retran.
8279          *
8280          * Notes about r_start and r_end. Let's consider a send starting at
8281          * sequence 1 for 10 bytes. In such an example the r_start would be
8282          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
8283          * This means that r_end is actually the first sequence for the next
8284          * slot (11).
8285          *
8286          */
8287         /*
8288          * If err is set what do we do XXXrrs? should we not add the thing?
8289          * -- i.e. return if err != 0 or should we pretend we sent it? --
8290          * i.e. proceed with add ** do this for now.
8291          */
8292         INP_WLOCK_ASSERT(tptoinpcb(tp));
8293         if (err)
8294                 /*
8295                  * We don't log errors -- we could but snd_max does not
8296                  * advance in this case either.
8297                  */
8298                 return;
8299
8300         if (th_flags & TH_RST) {
8301                 /*
8302                  * We don't log resets and we return immediately from
8303                  * sending
8304                  */
8305                 return;
8306         }
8307         rack = (struct tcp_rack *)tp->t_fb_ptr;
8308         snd_una = tp->snd_una;
8309         snd_max = tp->snd_max;
8310         if (th_flags & (TH_SYN | TH_FIN)) {
8311                 /*
8312                  * The call to rack_log_output is made before bumping
8313                  * snd_max. This means we can record one extra byte on a SYN
8314                  * or FIN if seq_out is adding more on and a FIN is present
8315                  * (and we are not resending).
8316                  */
8317                 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
8318                         len++;
8319                 if (th_flags & TH_FIN)
8320                         len++;
8321                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
8322                         /*
8323                          * The add/update has not been done for the FIN/SYN
8324                          * yet.
8325                          */
8326                         snd_max = tp->snd_nxt;
8327                 }
8328         }
8329         if (SEQ_LEQ((seq_out + len), snd_una)) {
8330                 /* Are we sending an old segment to induce an ack (keep-alive)? */
8331                 return;
8332         }
8333         if (SEQ_LT(seq_out, snd_una)) {
8334                 /* huh? should we panic? */
8335                 uint32_t end;
8336
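                /*
                 * Clip the start up to snd_una so that only the still
                 * unacked portion (if any) of this send is recorded.
                 */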
8337                 end = seq_out + len;
8338                 seq_out = snd_una;
8339                 if (SEQ_GEQ(end, seq_out))
8340                         len = end - seq_out;
8341                 else
8342                         len = 0;
8343         }
8344         if (len == 0) {
8345                 /* We don't log zero window probes */
8346                 return;
8347         }
8348         if (IN_FASTRECOVERY(tp->t_flags)) {
8349                 rack->r_ctl.rc_prr_out += len;
8350         }
8351         /* First question is it a retransmission or new? */
8352         if (seq_out == snd_max) {
8353                 /* It's new */
8354                 rack_chk_http_and_hybrid_on_out(rack, seq_out, len, cts);
8355 again:
8356                 rsm = rack_alloc(rack);
8357                 if (rsm == NULL) {
8358                         /*
8359                          * Hmm out of memory and the tcb got destroyed while
8360                          * we tried to wait.
8361                          */
8362                         return;
8363                 }
8364                 if (th_flags & TH_FIN) {
8365                         rsm->r_flags = RACK_HAS_FIN|add_flag;
8366                 } else {
8367                         rsm->r_flags = add_flag;
8368                 }
8369                 if (hw_tls)
8370                         rsm->r_hw_tls = 1;
8371                 rsm->r_tim_lastsent[0] = cts;
8372                 rsm->r_rtr_cnt = 1;
8373                 rsm->r_rtr_bytes = 0;
8374                 if (th_flags & TH_SYN) {
8375                         /* The data space is one beyond snd_una */
8376                         rsm->r_flags |= RACK_HAS_SYN;
8377                 }
8378                 rsm->r_start = seq_out;
8379                 rsm->r_end = rsm->r_start + len;
8380                 rack_mark_in_gp_win(tp, rsm);
8381                 rsm->r_dupack = 0;
8382                 /*
8383                  * save off the mbuf location that
8384                  * sndmbuf_noadv returned (which is
8385                  * where we started copying from).
8386                  */
8387                 rsm->m = s_mb;
8388                 rsm->soff = s_moff;
8389                 /*
8390                  * Here we do add in the len of send, since it's not yet
8391                  * reflected in snd_una <-> snd_max
8392                  */
8393                 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
8394                                               rack->r_ctl.rc_sacked) +
8395                               (rsm->r_end - rsm->r_start));
8396                 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
8397                 if (rsm->m) {
8398                         if (rsm->m->m_len <= rsm->soff) {
8399                                 /*
8400                                  * XXXrrs Question, will this happen?
8401                                  *
8402                                  * If sbsndptr is set at the correct place
8403                                  * then s_moff should always be somewhere
8404                                  * within rsm->m. But if the sbsndptr was
8405                                  * off then that won't be true. If it occurs
8406                                  * we need to walk out to the correct location.
8407                                  */
8408                                 struct mbuf *lm;
8409
8410                                 lm = rsm->m;
8411                                 while (lm->m_len <= rsm->soff) {
8412                                         rsm->soff -= lm->m_len;
8413                                         lm = lm->m_next;
8414                                         KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
8415                                                              __func__, rack, s_moff, s_mb, rsm->soff));
8416                                 }
8417                                 rsm->m = lm;
8418                         }
8419                         rsm->orig_m_len = rsm->m->m_len;
8420                         rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
8421                 } else {
8422                         rsm->orig_m_len = 0;
8423                         rsm->orig_t_space = 0;
8424                 }
8425                 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz);
8426                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8427                 /* Log a new rsm */
8428                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
8429 #ifndef INVARIANTS
8430                 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
8431 #else
8432                 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
8433                         panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
8434                               nrsm, insret, rack, rsm);
8435                 }
8436 #endif
8437                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8438                 rsm->r_in_tmap = 1;
8439                 /*
8440                  * Special case detection, is there just a single
8441                  * packet outstanding when we are not in recovery?
8442                  *
8443                  * If this is true mark it so.
8444                  */
8445                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
8446                     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
8447                         struct rack_sendmap *prsm;
8448
8449                         prsm = tqhash_prev(rack->r_ctl.tqh, rsm);
8450                         if (prsm)
8451                                 prsm->r_one_out_nr = 1;
8452                 }
8453                 return;
8454         }
8455         /*
8456          * If we reach here it's a retransmission and we need to find it.
8457          */
8458 more:
8459         if (hintrsm && (hintrsm->r_start == seq_out)) {
8460                 rsm = hintrsm;
8461                 hintrsm = NULL;
8462         } else {
8463                 /* No hints sorry */
8464                 rsm = NULL;
8465         }
8466         if ((rsm) && (rsm->r_start == seq_out)) {
8467                 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8468                 if (len == 0) {
8469                         return;
8470                 } else {
8471                         goto more;
8472                 }
8473         }
8474         /* Ok, it was not the last pointer; go through it the hard way. */
8475 refind:
8476         rsm = tqhash_find(rack->r_ctl.tqh, seq_out);
8477         if (rsm) {
8478                 if (rsm->r_start == seq_out) {
8479                         seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8480                         if (len == 0) {
8481                                 return;
8482                         } else {
8483                                 goto refind;
8484                         }
8485                 }
8486                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
8487                         /* Transmitted within this piece */
8488                         /*
8489                          * Ok we must split off the front and then let the
8490                          * update do the rest
8491                          */
8492                         nrsm = rack_alloc_full_limit(rack);
8493                         if (nrsm == NULL) {
8494                                 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz);
8495                                 return;
8496                         }
8497                         /*
8498                          * copy rsm to nrsm and then trim the front of rsm
8499                          * to not include this part.
8500                          */
8501                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
8502                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
8503 #ifndef INVARIANTS
8504                         (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
8505 #else
8506                         if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
8507                                 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
8508                                       nrsm, insret, rack, rsm);
8509                         }
8510 #endif
8511                         if (rsm->r_in_tmap) {
8512                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8513                                 nrsm->r_in_tmap = 1;
8514                         }
8515                         rsm->r_flags &= (~RACK_HAS_FIN);
8516                         seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz);
8517                         if (len == 0) {
8518                                 return;
8519                         } else if (len > 0)
8520                                 goto refind;
8521                 }
8522         }
8523         /*
8524          * Hmm, not found in the map; did they retransmit both old data and
8525          * on into the new?
8526          */
8527         if (seq_out == tp->snd_max) {
8528                 goto again;
8529         } else if (SEQ_LT(seq_out, tp->snd_max)) {
8530 #ifdef INVARIANTS
8531                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
8532                        seq_out, len, tp->snd_una, tp->snd_max);
8533                 printf("Starting Dump of all rack entries\n");
8534                 TQHASH_FOREACH(rsm, rack->r_ctl.tqh)  {
8535                         printf("rsm:%p start:%u end:%u\n",
8536                                rsm, rsm->r_start, rsm->r_end);
8537                 }
8538                 printf("Dump complete\n");
8539                 panic("seq_out not found rack:%p tp:%p",
8540                       rack, tp);
8541 #endif
8542         } else {
8543 #ifdef INVARIANTS
8544                 /*
8545                  * Hmm beyond snd_max? (only if we are using the new rtt-pack
8546                  * flag)
8547                  */
8548                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
8549                       seq_out, len, tp->snd_max, tp);
8550 #endif
8551         }
8552 }
8553
8554 /*
8555  * Record one of the RTT updates from an ack into
8556  * our sample structure.
8557  */
8558
8559 static void
8560 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
8561                     int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
8562 {
8563         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8564             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
8565                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
8566         }
8567         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8568             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
8569                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
8570         }
8571         if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
8572             if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
8573                 rack->r_ctl.rc_gp_lowrtt = us_rtt;
8574             if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
8575                     rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
8576         }
8577         if ((confidence == 1) &&
8578             ((rsm == NULL) ||
8579              (rsm->r_just_ret) ||
8580              (rsm->r_one_out_nr &&
8581               len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
8582                 /*
8583                  * If the rsm had a just-return
8584                  * hit, then we can't trust the
8585                  * rtt measurement for buffer determination.
8586                  * Note that a confidence of 2 indicates
8587                  * SACK'd, which overrides the r_just_ret or
8588                  * the r_one_out_nr. If it was a CUM-ACK and
8589                  * we had only two outstanding, but got an
8590                  * ack for only one, then that also lowers our
8591                  * confidence.
8592                  */
8593                 confidence = 0;
8594         }
8595         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8596             (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
8597                 if (rack->r_ctl.rack_rs.confidence == 0) {
8598                         /*
8599                          * We take anything with no current confidence
8600                          * saved.
8601                          */
8602                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8603                         rack->r_ctl.rack_rs.confidence = confidence;
8604                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8605                 } else if (confidence != 0) {
8606                         /*
8607                          * Once we have a confident number,
8608                          * we can update it with a smaller
8609                          * value since this confident number
8610                          * may include the DSACK time until
8611                          * the next segment (the second one) arrived.
8612                          */
8613                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8614                         rack->r_ctl.rack_rs.confidence = confidence;
8615                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8616                 }
8617         }
8618         rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
8619         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
8620         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
8621         rack->r_ctl.rack_rs.rs_rtt_cnt++;
8622 }
8623
8624 /*
8625  * Collect new round-trip time estimate
8626  * and update averages and current timeout.
8627  */
8628 static void
8629 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
8630 {
8631         int32_t delta;
8632         int32_t rtt;
8633
8634         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
8635                 /* No valid sample */
8636                 return;
8637         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
8638                 /* We are to use the lowest RTT seen in a single ack */
8639                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
8640         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
8641                 /* We are to use the highest RTT seen in a single ack */
8642                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
8643         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
8644                 /* We are to use the average RTT seen in a single ack */
8645                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
8646                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
8647         } else {
8648 #ifdef INVARIANTS
8649                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
8650 #endif
8651                 return;
8652         }
8653         if (rtt == 0)
8654                 rtt = 1;
8655         if (rack->rc_gp_rtt_set == 0) {
8656                 /*
8657                  * With no RTT we have to accept
8658                  * even one we are not confident of.
8659                  */
8660                 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
8661                 rack->rc_gp_rtt_set = 1;
8662         } else if (rack->r_ctl.rack_rs.confidence) {
8663                 /* update the running gp srtt */
8664                 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
8665                 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
8666         }
8667         if (rack->r_ctl.rack_rs.confidence) {
8668                 /*
8669                  * record the low and high for highly-buffered-path computation;
8670                  * we only do this if we are confident (not a retransmission).
8671                  */
8672                 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
8673                         rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8674                 }
8675                 if (rack->rc_highly_buffered == 0) {
8676                         /*
8677                          * Currently once we declare a path as
8678                          * highly buffered there is no going
8679                          * back, which may be a problem...
8680                          */
8681                         if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
8682                                 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
8683                                                      rack->r_ctl.rc_highest_us_rtt,
8684                                                      rack->r_ctl.rc_lowest_us_rtt,
8685                                                      RACK_RTTS_SEEHBP);
8686                                 rack->rc_highly_buffered = 1;
8687                         }
8688                 }
8689         }
8690         if ((rack->r_ctl.rack_rs.confidence) ||
8691             (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
8692                 /*
8693                  * If we are highly confident of it <or> it was
8694                  * never retransmitted we accept it as the last us_rtt.
8695                  */
8696                 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8697                 /* The lowest rtt can be set if it was not retransmitted */
8698                 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
8699                         rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8700                         if (rack->r_ctl.rc_lowest_us_rtt == 0)
8701                                 rack->r_ctl.rc_lowest_us_rtt = 1;
8702                 }
8703         }
8704         rack = (struct tcp_rack *)tp->t_fb_ptr;
8705         if (tp->t_srtt != 0) {
8706                 /*
8707                  * We keep a simple srtt in microseconds, like our rtt
8708                  * measurement. We don't need to do any tricks with shifting
8709                  * etc. Instead we just add in 1/8th of the new measurement
8710                  * and subtract out 1/8 of the old srtt. We do the same with
8711                  * the variance after finding the absolute value of the
8712                  * difference between this sample and the current srtt.
8713                  */
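                /*
                 * Illustrative numbers only: with srtt = 8000us and a new
                 * sample rtt = 4000us, srtt becomes 8000 - 1000 + 500 =
                 * 7500us, i.e. it moves 1/8th of the way toward the sample.
                 */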
8714                 delta = tp->t_srtt - rtt;
8715                 /* Take off 1/8th of the current sRTT */
8716                 tp->t_srtt -= (tp->t_srtt >> 3);
8717                 /* Add in 1/8th of the new RTT just measured */
8718                 tp->t_srtt += (rtt >> 3);
8719                 if (tp->t_srtt <= 0)
8720                         tp->t_srtt = 1;
8721                 /* Now take the absolute value for the variance */
8722                 if (delta < 0)
8723                         delta = -delta;
8724                 /* Subtract out 1/8th */
8725                 tp->t_rttvar -= (tp->t_rttvar >> 3);
8726                 /* Add in 1/8th of the new variance we just saw */
8727                 tp->t_rttvar += (delta >> 3);
8728                 if (tp->t_rttvar <= 0)
8729                         tp->t_rttvar = 1;
8730         } else {
8731                 /*
8732                  * No rtt measurement yet - use the unsmoothed rtt. Set the
8733                  * variance to half the rtt (so our first retransmit happens
8734                  * at 3*rtt).
8735                  */
8736                 tp->t_srtt = rtt;
8737                 tp->t_rttvar = rtt >> 1;
8738         }
8739         rack->rc_srtt_measure_made = 1;
8740         KMOD_TCPSTAT_INC(tcps_rttupdated);
8741         if (tp->t_rttupdated < UCHAR_MAX)
8742                 tp->t_rttupdated++;
8743 #ifdef STATS
8744         if (rack_stats_gets_ms_rtt == 0) {
8745                 /* Send in the microsecond rtt used for rxt timeout purposes */
8746                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
8747         } else if (rack_stats_gets_ms_rtt == 1) {
8748                 /* Send in the millisecond rtt used for rxt timeout purposes */
8749                 int32_t ms_rtt;
8750
8751                 /* Round up */
8752                 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
8753                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
8754         } else if (rack_stats_gets_ms_rtt == 2) {
8755                 /* Send in the millisecond rtt as close to the path RTT as we can get  */
8756                 int32_t ms_rtt;
8757
8758                 /* Round up */
8759                 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
8760                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
8761         }  else {
8762                 /* Send in the microsecond rtt as close to the path RTT as we can get  */
8763                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
8764         }
8765         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
8766 #endif
8767         /*
8768          * the retransmit should happen at rtt + 4 * rttvar. Because of the
8769          * way we do the smoothing, srtt and rttvar will each average +1/2
8770          * tick of bias.  When we compute the retransmit timer, we want 1/2
8771          * tick of rounding and 1 extra tick because of +-1/2 tick
8772          * uncertainty in the firing of the timer.  The bias will give us
8773          * exactly the 1.5 tick we need.  But, because the bias is
8774          * statistical, we have to test that we don't drop below the minimum
8775          * feasible timer (which is 2 ticks).
8776          */
8777         tp->t_rxtshift = 0;
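        /*
         * Recompute the retransmit timeout from the new srtt/rttvar and
         * clamp it between the rack min and max RTO bounds (with the
         * configured timer slop).
         */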
8778         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8779                       max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
8780         rack_log_rtt_sample(rack, rtt);
8781         tp->t_softerror = 0;
8782 }
8783
8784
8785 static void
8786 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
8787 {
8788         /*
8789          * Apply the inbound us-rtt at us_cts to the min-rtt filter.
8790          */
8791         uint32_t old_rtt;
8792
8793         old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
8794         apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
8795                                us_rtt, us_cts);
8796         if (old_rtt > us_rtt) {
8797                 /* We just hit a new lower rtt time */
8798                 rack_log_rtt_shrinks(rack,  us_cts,  old_rtt,
8799                                      __LINE__, RACK_RTTS_NEWRTT);
8800                 /*
8801                  * Only count it if it's lower than what we saw within our
8802                  * calculated range.
8803                  */
8804                 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
8805                         if (rack_probertt_lower_within &&
8806                             rack->rc_gp_dyn_mul &&
8807                             (rack->use_fixed_rate == 0) &&
8808                             (rack->rc_always_pace)) {
8809                                 /*
8810                                  * We are seeing a new lower rtt very close
8811                                  * to the time that we would have entered probe-rtt.
8812                                  * This is probably due to the fact that a peer flow
8813                                  * has entered probe-rtt. Lets go in now too.
8814                                  */
8815                                 uint32_t val;
8816
8817                                 val = rack_probertt_lower_within * rack_time_between_probertt;
8818                                 val /= 100;
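                                /*
                                 * val is rack_probertt_lower_within percent
                                 * of the normal interval between probe-rtt
                                 * entries.
                                 */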
8819                                 if ((rack->in_probe_rtt == 0)  &&
8820                                     ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
8821                                         rack_enter_probertt(rack, us_cts);
8822                                 }
8823                         }
8824                         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
8825                 }
8826         }
8827 }
8828
8829 static int
8830 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
8831     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
8832 {
8833         uint32_t us_rtt;
8834         int32_t i, all;
8835         uint32_t t, len_acked;
8836
8837         if ((rsm->r_flags & RACK_ACKED) ||
8838             (rsm->r_flags & RACK_WAS_ACKED))
8839                 /* Already done */
8840                 return (0);
8841         if (rsm->r_no_rtt_allowed) {
8842                 /* Not allowed */
8843                 return (0);
8844         }
8845         if (ack_type == CUM_ACKED) {
8846                 if (SEQ_GT(th_ack, rsm->r_end)) {
8847                         len_acked = rsm->r_end - rsm->r_start;
8848                         all = 1;
8849                 } else {
8850                         len_acked = th_ack - rsm->r_start;
8851                         all = 0;
8852                 }
8853         } else {
8854                 len_acked = rsm->r_end - rsm->r_start;
8855                 all = 0;
8856         }
8857         if (rsm->r_rtr_cnt == 1) {
8858
8859                 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8860                 if ((int)t <= 0)
8861                         t = 1;
8862                 if (!tp->t_rttlow || tp->t_rttlow > t)
8863                         tp->t_rttlow = t;
8864                 if (!rack->r_ctl.rc_rack_min_rtt ||
8865                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8866                         rack->r_ctl.rc_rack_min_rtt = t;
8867                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
8868                                 rack->r_ctl.rc_rack_min_rtt = 1;
8869                         }
8870                 }
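                /*
                 * Use the packet's actual receive time for the us rtt when
                 * it is later than the send time; otherwise fall back to
                 * the current clock.
                 */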
8871                 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
8872                         us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8873                 else
8874                         us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8875                 if (us_rtt == 0)
8876                         us_rtt = 1;
8877                 if (CC_ALGO(tp)->rttsample != NULL) {
8878                         /* Kick the RTT to the CC */
8879                         CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
8880                 }
8881                 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
8882                 if (ack_type == SACKED) {
8883                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
8884                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
8885                 } else {
8886                         /*
8887                          * We need to set up what our confidence
8888                          * is in this ack.
8889                          *
8890                          * If the rsm was app limited and it is
8891                          * less than a mss in length (the end
8892                          * of the send) then we have a gap. If we
8893                          * were app limited but say we were sending
8894                          * multiple MSS's then we are more confident
8895                          * in it.
8896                          *
8897                          * When we are not app-limited then we see if
8898                          * the rsm is being included in the current
8899                          * measurement, we tell this by the app_limited_needs_set
8900                          * flag.
8901                          *
8902                          * Note that being cwnd blocked is not app-limited,
8903                          * and the pacing delay between packets when we
8904                          * are sending only 1 or 2 MSS's will also show up
8905                          * in the RTT.
8906                          * a bit more and enhance it to account for the delay
8907                          * between rsm's. We could do that by saving off the
8908                          * pacing delay of each rsm (in an rsm) and then
8909                          * factoring that in somehow though for now I am
8910                          * not sure how :)
8911                          */
8912                         int calc_conf = 0;
8913
8914                         if (rsm->r_flags & RACK_APP_LIMITED) {
8915                                 if (all && (len_acked <= ctf_fixed_maxseg(tp)))
8916                                         calc_conf = 0;
8917                                 else
8918                                         calc_conf = 1;
8919                         } else if (rack->app_limited_needs_set == 0) {
8920                                 calc_conf = 1;
8921                         } else {
8922                                 calc_conf = 0;
8923                         }
8924                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
8925                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
8926                                             calc_conf, rsm, rsm->r_rtr_cnt);
8927                 }
8928                 if ((rsm->r_flags & RACK_TLP) &&
8929                     (!IN_FASTRECOVERY(tp->t_flags))) {
8930                         /* Segment was a TLP and our retrans matched */
8931                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
8932                                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
8933                         }
8934                 }
8935                 if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
8936                     (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8937                             (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
8938                         /* New more recent rack_tmit_time */
8939                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8940                         if (rack->r_ctl.rc_rack_tmit_time == 0)
8941                                 rack->r_ctl.rc_rack_tmit_time = 1;
8942                         rack->rc_rack_rtt = t;
8943                 }
8944                 return (1);
8945         }
8946         /*
8947          * We clear the soft/rxtshift since we got an ack.
8948          * There is no assurance we will call the commit() function
8949          * so we need to clear these to avoid incorrect handling.
8950          */
8951         tp->t_rxtshift = 0;
8952         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8953                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
8954         tp->t_softerror = 0;
8955         if (to && (to->to_flags & TOF_TS) &&
8956             (ack_type == CUM_ACKED) &&
8957             (to->to_tsecr) &&
8958             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
8959                 /*
8960                  * Now which timestamp does it match? In this block the ACK
8961                  * must be coming from a previous transmission.
8962                  */
8963                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
8964                         if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
8965                                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8966                                 if ((int)t <= 0)
8967                                         t = 1;
8968                                 if (CC_ALGO(tp)->rttsample != NULL) {
8969                                         /*
8970                                          * Kick the RTT to the CC, here
8971                                          * we lie a bit in that we know the
8972                                          * retransmission is correct even though
8973                                          * we retransmitted. This is because
8974                                          * we match the timestamps.
8975                                          */
8976                                         if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
8977                                                 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
8978                                         else
8979                                                 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
8980                                         CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
8981                                 }
8982                                 if ((i + 1) < rsm->r_rtr_cnt) {
8983                                         /*
8984                                          * The peer ack'd from our previous
8985                                          * transmission. We have a spurious
8986                                          * retransmission and thus we don't
8987                                          * want to update our rack_rtt.
8988                                          *
8989                                          * Hmm should there be a CC revert here?
8990                                          *
8991                                          */
8992                                         return (0);
8993                                 }
8994                                 if (!tp->t_rttlow || tp->t_rttlow > t)
8995                                         tp->t_rttlow = t;
8996                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8997                                         rack->r_ctl.rc_rack_min_rtt = t;
8998                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
8999                                                 rack->r_ctl.rc_rack_min_rtt = 1;
9000                                         }
9001                                 }
9002                                 if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
9003                                     (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
9004                                             (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
9005                                         /* New more recent rack_tmit_time */
9006                                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
9007                                         if (rack->r_ctl.rc_rack_tmit_time == 0)
9008                                                 rack->r_ctl.rc_rack_tmit_time = 1;
9009                                         rack->rc_rack_rtt = t;
9010                                 }
9011                                 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
9012                                 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
9013                                                     rsm->r_rtr_cnt);
9014                                 return (1);
9015                         }
9016                 }
9017                 /* If we are logging log out the sendmap */
9018                 if (tcp_bblogging_on(rack->rc_tp)) {
9019                         for (i = 0; i < rsm->r_rtr_cnt; i++) {
9020                                 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr);
9021                         }
9022                 }
9023                 goto ts_not_found;
9024         } else {
9025                 /*
9026                  * Ok, it's a SACK block that we retransmitted, or a Windows
9027                  * machine without timestamps. We can tell nothing from the
9028                  * timestamp since it's not there, or it is the time the peer last
9029                  * received a segment that moved forward its cum-ack point.
9030                  */
9031 ts_not_found:
9032                 i = rsm->r_rtr_cnt - 1;
9033                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
9034                 if ((int)t <= 0)
9035                         t = 1;
9036                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
9037                         /*
9038                          * We retransmitted and the ack came back in less
9039                          * than the smallest rtt we have observed. We most
9040                          * likely did an improper retransmit as outlined in
9041                          * 6.2 Step 2 point 2 in the rack-draft so we
9042                          * don't want to update our rack_rtt. We in
9043                          * theory (in future) might want to think about reverting our
9044                          * cwnd state but we won't for now.
9045                          */
9046                         return (0);
9047                 } else if (rack->r_ctl.rc_rack_min_rtt) {
9048                         /*
9049                          * We retransmitted it and the retransmit did the
9050                          * job.
9051                          */
9052                         if (!rack->r_ctl.rc_rack_min_rtt ||
9053                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
9054                                 rack->r_ctl.rc_rack_min_rtt = t;
9055                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
9056                                         rack->r_ctl.rc_rack_min_rtt = 1;
9057                                 }
9058                         }
9059                         if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
9060                             (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
9061                                     (uint32_t)rsm->r_tim_lastsent[i]))) {
9062                                 /* New more recent rack_tmit_time */
9063                                 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
9064                                 if (rack->r_ctl.rc_rack_tmit_time == 0)
9065                                         rack->r_ctl.rc_rack_tmit_time = 1;
9066                                 rack->rc_rack_rtt = t;
9067                         }
9068                         return (1);
9069                 }
9070         }
9071         return (0);
9072 }
9073
9074 /*
9075  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
9076  */
9077 static void
9078 rack_log_sack_passed(struct tcpcb *tp,
9079     struct tcp_rack *rack, struct rack_sendmap *rsm)
9080 {
9081         struct rack_sendmap *nrsm;
9082
9083         nrsm = rsm;
9084         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
9085             rack_head, r_tnext) {
9086                 if (nrsm == rsm) {
9087                         /* Skip the original segment, it is acked */
9088                         continue;
9089                 }
9090                 if (nrsm->r_flags & RACK_ACKED) {
9091                         /*
9092                          * Skip ack'd segments, though we
9093                          * should not see these, since tmap
9094                          * should not have ack'd segments.
9095                          */
9096                         continue;
9097                 }
9098                 if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
9099                         /*
9100                          * If the peer dropped the rwnd on
9101                          * these then we don't worry about them.
9102                          */
9103                         continue;
9104                 }
9105                 if (nrsm->r_flags & RACK_SACK_PASSED) {
9106                         /*
9107                          * We found one that is already marked
9108                          * passed; we have been here before and
9109                          * so all others below this are marked.
9110                          */
9111                         break;
9112                 }
9113                 nrsm->r_flags |= RACK_SACK_PASSED;
9114                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
9115         }
9116 }
9117
9118 static void
9119 rack_need_set_test(struct tcpcb *tp,
9120                    struct tcp_rack *rack,
9121                    struct rack_sendmap *rsm,
9122                    tcp_seq th_ack,
9123                    int line,
9124                    int use_which)
9125 {
9126         struct rack_sendmap *s_rsm;
9127
9128         if ((tp->t_flags & TF_GPUTINPROG) &&
9129             SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
9130                 /*
9131                  * We were app limited, and this ack
9132                  * butts up or goes beyond the point where we want
9133                  * to start our next measurement. We need
9134                  * to record the new gput_ts as here and
9135                  * possibly update the start sequence.
9136                  */
9137                 uint32_t seq, ts;
9138
9139                 if (rsm->r_rtr_cnt > 1) {
9140                         /*
9141                          * This is a retransmit, can we
9142                          * really make any assessment at this
9143                          * point?  We are not really sure of
9144                          * the timestamp, is it this or the
9145                          * previous transmission?
9146                          *
9147                          * Let's wait for something better that
9148                          * is not retransmitted.
9149                          */
9150                         return;
9151                 }
9152                 seq = tp->gput_seq;
9153                 ts = tp->gput_ts;
9154                 rack->app_limited_needs_set = 0;
9155                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
9156                 /* Do we start at a new end? */
9157                 if ((use_which == RACK_USE_BEG) &&
9158                     SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
9159                         /*
9160                          * When we get an ACK that just eats
9161                          * up some of the rsm, we set RACK_USE_BEG
9162                          * since what's at r_start (i.e. th_ack)
9163                          * is left unacked and that's where the
9164                          * measurement now starts.
9165                          */
9166                         tp->gput_seq = rsm->r_start;
9167                 }
9168                 if ((use_which == RACK_USE_END) &&
9169                     SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
9170                         /*
9171                          * We use the end when the cumack
9172                          * is moving forward and completely
9173                          * deleting the rsm passed so basically
9174                          * r_end holds th_ack.
9175                          *
9176                          * For SACK's we also want to use the end
9177                          * since this piece just got sacked and
9178                          * we want to target anything after that
9179                          * in our measurement.
9180                          */
9181                         tp->gput_seq = rsm->r_end;
9182                 }
9183                 if (use_which == RACK_USE_END_OR_THACK) {
9184                         /*
9185                          * special case for ack moving forward,
9186                          * not a sack, we need to move all the
9187                          * way up to where this ack cum-ack moves
9188                          * to.
9189                          */
9190                         if (SEQ_GT(th_ack, rsm->r_end))
9191                                 tp->gput_seq = th_ack;
9192                         else
9193                                 tp->gput_seq = rsm->r_end;
9194                 }
9195                 if (SEQ_LT(tp->gput_seq, tp->snd_max))
9196                         s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
9197                 else
9198                         s_rsm = NULL;
9199                 /*
9200                  * Pick up the correct send time if we can; the rsm passed in
9201                  * may be equal to s_rsm if RACK_USE_BEG was set. For the other
9202                  * two cases (RACK_USE_END or RACK_USE_END_OR_THACK) most likely we will
9203                  * find a different seq i.e. the next send up.
9204                  *
9205                  * If that has not been sent, s_rsm will be NULL and we must
9206                  * arrange it so this function will get called again by setting
9207                  * app_limited_needs_set.
9208                  */
9209                 if (s_rsm)
9210                         rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0];
9211                 else {
9212                         /* If we hit here we have to have *not* sent tp->gput_seq */
9213                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
9214                         /* Set it up so we will go through here again */
9215                         rack->app_limited_needs_set = 1;
9216                 }
9217                 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
9218                         /*
9219                          * We moved beyond this guy's range, re-calculate
9220                          * the new end point.
9221                          */
9222                         if (rack->rc_gp_filled == 0) {
9223                                 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
9224                         } else {
9225                                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
9226                         }
9227                 }
9228                 /*
9229                  * We are moving the goal post, we may be able to clear the
9230                  * measure_saw_probe_rtt flag.
9231                  */
9232                 if ((rack->in_probe_rtt == 0) &&
9233                     (rack->measure_saw_probe_rtt) &&
9234                     (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
9235                         rack->measure_saw_probe_rtt = 0;
9236                 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
9237                                            seq, tp->gput_seq,
9238                                            (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
9239                                             (uint64_t)rack->r_ctl.rc_gp_output_ts),
9240                                            5, line, NULL, 0);
9241                 if (rack->rc_gp_filled &&
9242                     ((tp->gput_ack - tp->gput_seq) <
9243                      max(rc_init_window(rack), (MIN_GP_WIN *
9244                                                 ctf_fixed_maxseg(tp))))) {
9245                         uint32_t ideal_amount;
9246
9247                         ideal_amount = rack_get_measure_window(tp, rack);
9248                         if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) {
9249                                 /*
9250                                  * There is no sense of continuing this measurement
9251                                  * because it's too small to gain us anything we
9252                                  * trust. Skip it and that way we can start a new
9253                                  * measurement quicker.
9254                                  */
9255                                 tp->t_flags &= ~TF_GPUTINPROG;
9256                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
9257                                                            0, 0,
9258                                                            (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
9259                                                             (uint64_t)rack->r_ctl.rc_gp_output_ts),
9260                                                            6, __LINE__, NULL, 0);
9261                         } else {
9262                                 /*
9263                                  * Reset the window further out.
9264                                  */
9265                                 tp->gput_ack = tp->gput_seq + ideal_amount;
9266                         }
9267                 }
9268                 rack_tend_gp_marks(tp, rack);
9269                 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm);
9270         }
9271 }
9272
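/*
 * Return 1 if the rsm overlaps the TLP block we last recorded
 * (last_tlp_acked_start .. last_tlp_acked_end), 0 if it lies
 * entirely before or after that range.
 */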
9273 static inline int
9274 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
9275 {
9276         if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
9277                 /* Behind our TLP definition or right at */
9278                 return (0);
9279         }
9280         if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
9281                 /* The start is beyond or right at our end of TLP definition */
9282                 return (0);
9283         }
9284         /* It has to be a sub-part of the original TLP recorded */
9285         return (1);
9286 }
9287
9288
9289
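/*
 * Process one SACK block against the send map. Pieces the block covers are
 * marked RACK_ACKED, splitting or merging rack_sendmap entries as needed,
 * while RTT samples, reordering state and the SACK accounting counters are
 * updated. The count of newly sacked bytes is accumulated in "changed";
 * *prsm supplies the caller's cached starting hint.
 */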
9290 static uint32_t
9291 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
9292                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
9293                    int *no_extra,
9294                    int *moved_two, uint32_t segsiz)
9295 {
9296         uint32_t start, end, changed = 0;
9297         struct rack_sendmap stack_map;
9298         struct rack_sendmap *rsm, *nrsm, *prev, *next;
9299         int insret __diagused;
9300         int32_t used_ref = 1;
9301         int moved = 0;
9302 #ifdef TCP_SAD_DETECTION
9303         int allow_segsiz;
9304         int first_time_through = 1;
9305 #endif
9306         int noextra = 0;
9307         int can_use_hookery = 0;
9308
9309         start = sack->start;
9310         end = sack->end;
9311         rsm = *prsm;
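        /* Begin with the rsm cached by the caller from the previous block. */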
9312
9313 #ifdef TCP_SAD_DETECTION
9314         /*
9315          * There are a strange number of proxies and middle boxes in the world
9316          * that seem to cut up segments on different boundaries. This gets us
9317          * smaller sacks that are still ok and not a sign of an attacker.
9318          * We use the base segsiz to calculate an allowable smallness but
9319          * also enforce a min on the segsiz in case it is an attacker playing
9320          * games with MSS. So basically if the sack arrives and it is
9321          * larger than a worst case 960 bytes, we don't classify the guy
9322          * as suspicious.
9323          */
9324         allow_segsiz = max(segsiz, 1200) * sad_seg_size_per;
9325         allow_segsiz /= 1000;
9326 #endif
9327 do_rest_ofb:
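        /*
         * Re-entry point: we come back here with "start" advanced after
         * finishing an rsm, until the whole SACK block has been walked.
         */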
9328         if ((rsm == NULL) ||
9329             (SEQ_LT(end, rsm->r_start)) ||
9330             (SEQ_GEQ(start, rsm->r_end)) ||
9331             (SEQ_LT(start, rsm->r_start))) {
9332                 /*
9333                  * We are not in the right spot,
9334                  * find the correct spot in the tree.
9335                  */
9336                 used_ref = 0;
9337                 rsm = tqhash_find(rack->r_ctl.tqh, start);
9338                 moved++;
9339         }
9340         if (rsm == NULL) {
9341                 /* TSNH */
9342                 goto out;
9343         }
9344 #ifdef TCP_SAD_DETECTION
9345         /* Now we must check for suspicious activity */
9346         if ((first_time_through == 1) &&
9347             ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) &&
9348             ((rsm->r_flags & RACK_PMTU_CHG) == 0) &&
9349             ((rsm->r_flags & RACK_TLP) == 0)) {
9350                 /*
9351                  * It's less than a full MSS or the segment being acked;
9352                  * this should only happen if the rsm in question had the
9353                  * r_just_ret flag set <and> the end matches the end of
9354                  * the rsm block.
9355                  *
9356                  * Note we do not look at segments that have had TLP's on
9357                  * them since we can get un-reported rwnd collapses that
9358                  * basically we TLP on and then we get back a sack block
9359                  * that goes from the start to only a small way.
9360                  *
9361                  */
9362                 int loss, ok;
9363
9364                 ok = 0;
9365                 if (SEQ_GEQ(end, rsm->r_end)) {
9366                         if (rsm->r_just_ret == 1) {
9367                                 /* This was at the end of a send which is ok */
9368                                 ok = 1;
9369                         } else {
9370                                 /* A bit harder: was it the end of our segment? */
9371                                 int segs, len;
9372
9373                                 len = (rsm->r_end - rsm->r_start);
9374                                 segs = len / segsiz;
9375                                 segs *= segsiz;
9376                                 if ((segs + (rsm->r_end - start)) == len) {
9377                                         /*
9378                                          * So this last bit was the
9379                                          * end of our send if we cut it
9380                                          * up into segsiz pieces, so it's ok.
9381                                          */
9382                                         ok = 1;
9383                                 }
9384                         }
9385                 }
9386                 if (ok == 0) {
9387                          * This guy is doing something suspicious;
9388                          * let's start detection.
9389                          * lets start detection.
9390                          */
9391                         if (rack->rc_suspicious == 0) {
9392                                 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT);
9393                                 counter_u64_add(rack_sack_attacks_suspect, 1);
9394                                 rack->rc_suspicious = 1;
9395                                 rack_log_sad(rack, 4);
9396                                 if (tcp_bblogging_on(rack->rc_tp)) {
9397                                         union tcp_log_stackspecific log;
9398                                         struct timeval tv;
9399
9400                                         memset(&log.u_bbr, 0, sizeof(log.u_bbr));
9401                                         log.u_bbr.flex1 = end;
9402                                         log.u_bbr.flex2 = start;
9403                                         log.u_bbr.flex3 = rsm->r_end;
9404                                         log.u_bbr.flex4 = rsm->r_start;
9405                                         log.u_bbr.flex5 = segsiz;
9406                                         log.u_bbr.flex6 = rsm->r_fas;
9407                                         log.u_bbr.flex7 = rsm->r_bas;
9408                                         log.u_bbr.flex8 = 5;
9409                                         log.u_bbr.pkts_out = rsm->r_flags;
9410                                         log.u_bbr.bbr_state = rack->rc_suspicious;
9411                                         log.u_bbr.bbr_substate = rsm->r_just_ret;
9412                                         log.u_bbr.timeStamp = tcp_get_usecs(&tv);
9413                                         log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9414                                         TCP_LOG_EVENTP(rack->rc_tp, NULL,
9415                                                        &rack->rc_inp->inp_socket->so_rcv,
9416                                                        &rack->rc_inp->inp_socket->so_snd,
9417                                                        TCP_SAD_DETECTION, 0,
9418                                                        0, &log, false, &tv);
9419                                 }
9420                         }
9421                         /* You lose some ack count every time you sack
9422                          * a small bit that is not butting up against the end
9423                          * of what we have sent. This is because we never
9424                          * send small bits unless it's the end of the sb.
9425                          * Anyone sending a sack that is not at the end
9426                          * is thus very very suspicious.
9427                          */
9428                         loss = (segsiz/2) / (end - start);
9429                         if (loss < rack->r_ctl.ack_count)
9430                                 rack->r_ctl.ack_count -= loss;
9431                         else
9432                                 rack->r_ctl.ack_count = 0;
9433                 }
9434         }
9435         first_time_through = 0;
9436 #endif
9437         /* Ok we have an ACK for some piece of this rsm */
9438         if (rsm->r_start != start) {
9439                 if ((rsm->r_flags & RACK_ACKED) == 0) {
9440                         /*
9441                          * Before any splitting or hookery is
9442                          * done is it a TLP of interest i.e. rxt?
9443                          */
9444                         if ((rsm->r_flags & RACK_TLP) &&
9445                             (rsm->r_rtr_cnt > 1)) {
9446                                 /*
9447                                  * We are splitting a rxt TLP, check
9448                                  * if we need to save off the start/end
9449                                  */
9450                                 if (rack->rc_last_tlp_acked_set &&
9451                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9452                                         /*
9453                                          * We already turned this on since we are inside
9454                                          * the previous TLP block; it was partially sacked and
9455                                          * now we are getting another sack (maybe all of it).
9456                                          *
9457                                          */
9458                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9459                                         /*
9460                                          * Let's make sure we have all of it though.
9461                                          */
9462                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9463                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9464                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9465                                                                      rack->r_ctl.last_tlp_acked_end);
9466                                         }
9467                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9468                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9469                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9470                                                                      rack->r_ctl.last_tlp_acked_end);
9471                                         }
9472                                 } else {
9473                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9474                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9475                                         rack->rc_last_tlp_past_cumack = 0;
9476                                         rack->rc_last_tlp_acked_set = 1;
9477                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9478                                 }
9479                         }
9480                         /**
9481                          * Need to split this in two pieces the before and after,
9482                          * the before remains in the map, the after must be
9483                          * added. In other words we have:
9484                          * rsm        |--------------|
9485                          * sackblk        |------->
9486                          * rsm will become
9487                          *     rsm    |---|
9488                          * and nrsm will be  the sacked piece
9489                          *     nrsm       |----------|
9490                          *
9491                          * But before we start down that path let's
9492                          * see if the sack spans over on top of
9493                          * the next guy and it is already sacked.
9494                          *
9495                          */
9496                         /*
9497                          * Hookery can only be used if the two entries
9498                          * are in the same bucket and neither one of
9499                          * them straddles the bucket line.
9500                          */
9501                         next = tqhash_next(rack->r_ctl.tqh, rsm);
9502                         if (next &&
9503                             (rsm->bindex == next->bindex) &&
9504                             ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9505                             ((next->r_flags & RACK_STRADDLE) == 0) &&
9506                             (rsm->r_flags & RACK_IN_GP_WIN) &&
9507                             (next->r_flags & RACK_IN_GP_WIN))
9508                                 can_use_hookery = 1;
9509                         else if (next &&
9510                                  (rsm->bindex == next->bindex) &&
9511                                  ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9512                                  ((next->r_flags & RACK_STRADDLE) == 0) &&
9513                                  ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
9514                                  ((next->r_flags & RACK_IN_GP_WIN) == 0))
9515                                 can_use_hookery = 1;
9516                         else
9517                                 can_use_hookery = 0;
9518                         if (next && can_use_hookery &&
9519                             (next->r_flags & RACK_ACKED) &&
9520                             SEQ_GEQ(end, next->r_start)) {
9521                                 /**
9522                                  * So the next one is already acked, and
9523                                  * we can thus by hookery use our stack_map
9524                                  * to reflect the piece being sacked and
9525                                  * then adjust the two tree entries moving
9526                                  * the start and ends around. So we start like:
9527                                  *  rsm     |------------|             (not-acked)
9528                                  *  next                 |-----------| (acked)
9529                                  *  sackblk        |-------->
9530                                  *  We want to end like so:
9531                                  *  rsm     |------|                   (not-acked)
9532                                  *  next           |-----------------| (acked)
9533                                  *  nrsm           |-----|
9534                                  * Where nrsm is a temporary stack piece we
9535                                  * use to update all the gizmos.
9536                                  */
9537                                 /* Copy up our fudge block */
9538                                 noextra++;
9539                                 nrsm = &stack_map;
9540                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9541                                 /* Now adjust our tree blocks */
9542                                 rsm->r_end = start;
9543                                 next->r_start = start;
9544                                 rsm->r_flags |= RACK_SHUFFLED;
9545                                 next->r_flags |= RACK_SHUFFLED;
9546                                 /* Now we must adjust back where next->m is */
9547                                 rack_setup_offset_for_rsm(rack, rsm, next);
9548                                 /*
9549                                  * Which timestamp do we keep? It is rather
9550                                  * important in GP measurements to have the
9551                                  * accurate end of the send window.
9552                                  *
9553                                  * We keep the largest value, which is the newest
9554                                  * send. We do this in case a segment that is
9555                                  * joined together and not part of a GP estimate
9556                                  * later gets expanded into the GP estimate.
9557                                  *
9558                                  * We prohibit the merging of unlike kinds i.e.
9559                                  * all pieces that are in the GP estimate can be
9560                                  * merged and all pieces that are not in a GP estimate
9561                                  * can be merged, but not dissimilar pieces. Combine
9562                                  * this with taking the highest here and we should
9563                                  * be ok unless of course the client reneges. Then
9564                                  * all bets are off.
9565                                  */
9566                                 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] <
9567                                     nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)])
9568                                         next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)];
9569                                 /*
9570                                  * And we must keep the newest ack arrival time.
9571                                  */
9572                                 if (next->r_ack_arrival <
9573                                     rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9574                                         next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9575
9576
9577                                 /* We don't need to adjust rsm, it did not change */
9578                                 /* Clear out the dup ack count of the remainder */
9579                                 rsm->r_dupack = 0;
9580                                 rsm->r_just_ret = 0;
9581                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9582                                 /* Now let's make sure our fudge block is right */
9583                                 nrsm->r_start = start;
9584                                 /* Now let's update all the stats and such */
9585                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9586                                 if (rack->app_limited_needs_set)
9587                                         rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9588                                 changed += (nrsm->r_end - nrsm->r_start);
9589                                 /* You get a count for acking a whole segment or more */
9590                                 if ((nrsm->r_end - nrsm->r_start) >= segsiz)
9591                                         rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
9592                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9593                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
9594                                         rack->r_ctl.rc_reorder_ts = cts;
9595                                         if (rack->r_ctl.rc_reorder_ts == 0)
9596                                                 rack->r_ctl.rc_reorder_ts = 1;
9597                                 }
9598                                 /*
9599                                  * Now we want to go up from rsm (the
9600                                  * one left un-acked) to the next one
9601                                  * in the tmap. We do this so when
9602                                  * we walk backwards we include marking
9603                                  * sack-passed on rsm (The one passed in
9604                                  * is skipped since it is generally called
9605                                  * on something sacked before removing it
9606                                  * from the tmap).
9607                                  */
9608                                 if (rsm->r_in_tmap) {
9609                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
9610                                         /*
9611                                          * Now that we have the next
9612                                          * one walk backwards from there.
9613                                          */
9614                                         if (nrsm && nrsm->r_in_tmap)
9615                                                 rack_log_sack_passed(tp, rack, nrsm);
9616                                 }
9617                                 /* Now are we done? */
9618                                 if (SEQ_LT(end, next->r_end) ||
9619                                     (end == next->r_end)) {
9620                                         /* Done with block */
9621                                         goto out;
9622                                 }
9623                                 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
9624                                 counter_u64_add(rack_sack_used_next_merge, 1);
9625                                 /* Position for the next block */
9626                                 start = next->r_end;
9627                                 rsm = tqhash_next(rack->r_ctl.tqh, next);
9628                                 if (rsm == NULL)
9629                                         goto out;
9630                         } else {
9631                                 /**
9632                                  * We can't use any hookery here, so we
9633                                  * need to split the map. We enter like
9634                                  * so:
9635                                  *  rsm      |--------|
9636                                  *  sackblk       |----->
9637                                  * We will add the new block nrsm and
9638                                  * that will be the new portion, and then
9639                                  * fall through after resetting rsm. So we
9640                                  * split and look like this:
9641                                  *  rsm      |----|
9642                                  *  sackblk       |----->
9643                                  *  nrsm          |---|
9644                                  * We then fall through resetting
9645                                  * rsm to nrsm, so the next block
9646                                  * picks it up.
9647                                  */
9648                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9649                                 if (nrsm == NULL) {
9650                                         /*
9651                                          * failed XXXrrs what can we do but lose the sack
9652                                          * info?
9653                                          */
9654                                         goto out;
9655                                 }
9656                                 counter_u64_add(rack_sack_splits, 1);
9657                                 rack_clone_rsm(rack, nrsm, rsm, start);
9658                                 moved++;
9659                                 rsm->r_just_ret = 0;
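                                /* Insert the new right-hand piece (the portion the sack covers) into the send map. */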
9660 #ifndef INVARIANTS
9661                                 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
9662 #else
9663                                 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
9664                                         panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
9665                                               nrsm, insret, rack, rsm);
9666                                 }
9667 #endif
9668                                 if (rsm->r_in_tmap) {
9669                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
9670                                         nrsm->r_in_tmap = 1;
9671                                 }
9672                                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
9673                                 rsm->r_flags &= (~RACK_HAS_FIN);
9674                                 /* Position us to point to the new nrsm that starts the sack blk */
9675                                 rsm = nrsm;
9676                         }
9677                 } else {
9678                         /* Already sacked this piece */
9679                         counter_u64_add(rack_sack_skipped_acked, 1);
9680                         moved++;
9681                         if (end == rsm->r_end) {
9682                                 /* Done with block */
9683                                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9684                                 goto out;
9685                         } else if (SEQ_LT(end, rsm->r_end)) {
9686                                 /* A partial sack to an already sacked block */
9687                                 moved++;
9688                                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9689                                 goto out;
9690                         } else {
9691                                 /*
9692                                  * The end goes beyond this guy;
9693                                  * reposition the start to the
9694                                  * next block.
9695                                  */
9696                                 start = rsm->r_end;
9697                                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9698                                 if (rsm == NULL)
9699                                         goto out;
9700                         }
9701                 }
9702         }
9703         if (SEQ_GEQ(end, rsm->r_end)) {
9704                 /**
9705                  * The end of this block is either beyond this guy or right
9706                  * at this guy. I.e.:
9707                  *  rsm ---                 |-----|
9708                  *  end                     |-----|
9709                  *  <or>
9710                  *  end                     |---------|
9711                  */
9712                 if ((rsm->r_flags & RACK_ACKED) == 0) {
9713                         /*
9714                          * Is it a TLP of interest?
9715                          */
9716                         if ((rsm->r_flags & RACK_TLP) &&
9717                             (rsm->r_rtr_cnt > 1)) {
9718                                 /*
9719                                  * We are splitting a rxt TLP, check
9720                                  * if we need to save off the start/end
9721                                  */
9722                                 if (rack->rc_last_tlp_acked_set &&
9723                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9724                                         /*
9725                                          * We already turned this on since we are inside
9726                                          * the previous TLP block; it was partially sacked and
9727                                          * now we are getting another sack (maybe all of it).
9728                                          */
9729                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9730                                         /*
9731                                          * Let's make sure we have all of it though.
9732                                          */
9733                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9734                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9735                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9736                                                                      rack->r_ctl.last_tlp_acked_end);
9737                                         }
9738                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9739                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9740                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9741                                                                      rack->r_ctl.last_tlp_acked_end);
9742                                         }
9743                                 } else {
9744                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9745                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9746                                         rack->rc_last_tlp_past_cumack = 0;
9747                                         rack->rc_last_tlp_acked_set = 1;
9748                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9749                                 }
9750                         }
9751                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
9752                         changed += (rsm->r_end - rsm->r_start);
9753                         /* You get a count for acking a whole segment or more */
9754                         if ((rsm->r_end - rsm->r_start) >= segsiz)
9755                                 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
9756                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
9757                         if (rsm->r_in_tmap) /* should be true */
9758                                 rack_log_sack_passed(tp, rack, rsm);
9759                         /* Is Reordering occurring? */
9760                         if (rsm->r_flags & RACK_SACK_PASSED) {
9761                                 rsm->r_flags &= ~RACK_SACK_PASSED;
9762                                 rack->r_ctl.rc_reorder_ts = cts;
9763                                 if (rack->r_ctl.rc_reorder_ts == 0)
9764                                         rack->r_ctl.rc_reorder_ts = 1;
9765                         }
9766                         if (rack->app_limited_needs_set)
9767                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
9768                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9769                         rsm->r_flags |= RACK_ACKED;
9770                         if (rsm->r_in_tmap) {
9771                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9772                                 rsm->r_in_tmap = 0;
9773                         }
9774                         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
9775                 } else {
9776                         counter_u64_add(rack_sack_skipped_acked, 1);
9777                         moved++;
9778                 }
9779                 if (end == rsm->r_end) {
9780                         /* This block only - done, setup for next */
9781                         goto out;
9782                 }
9783                 /*
9784                  * There is more not covered by this rsm; move on
9785                  * to the next block in the RB tree.
9786                  */
9787                 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
9788                 start = rsm->r_end;
9789                 rsm = nrsm;
9790                 if (rsm == NULL)
9791                         goto out;
9792                 goto do_rest_ofb;
9793         }
9794         /**
9795          * The end of this sack block is smaller than
9796          * our rsm i.e.:
9797          *  rsm ---                 |-----|
9798          *  end                     |--|
9799          */
9800         if ((rsm->r_flags & RACK_ACKED) == 0) {
9801                 /*
9802                  * Is it a TLP of interest?
9803                  */
9804                 if ((rsm->r_flags & RACK_TLP) &&
9805                     (rsm->r_rtr_cnt > 1)) {
9806                         /*
9807                          * We are splitting a rxt TLP, check
9808                          * if we need to save off the start/end
9809                          */
9810                         if (rack->rc_last_tlp_acked_set &&
9811                             (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9812                                 /*
9813                                  * We already turned this on since we are inside
9814                                  * the previous TLP block; it was partially sacked and
9815                                  * now we are getting another sack (maybe all of it).
9816                                  */
9817                                 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9818                                 /*
9819                                  * Let's make sure we have all of it though.
9820                                  */
9821                                 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9822                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9823                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9824                                                              rack->r_ctl.last_tlp_acked_end);
9825                                 }
9826                                 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9827                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9828                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9829                                                              rack->r_ctl.last_tlp_acked_end);
9830                                 }
9831                         } else {
9832                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9833                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9834                                 rack->rc_last_tlp_past_cumack = 0;
9835                                 rack->rc_last_tlp_acked_set = 1;
9836                                 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9837                         }
9838                 }
9839                 /*
9840                  * Hookery can only be used if the two entries
9841                  * are in the same bucket and neither one of
9842                  * them straddles the bucket line.
9843                  */
9844                 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9845                 if (prev &&
9846                     (rsm->bindex == prev->bindex) &&
9847                     ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9848                     ((prev->r_flags & RACK_STRADDLE) == 0) &&
9849                     (rsm->r_flags & RACK_IN_GP_WIN) &&
9850                     (prev->r_flags & RACK_IN_GP_WIN))
9851                         can_use_hookery = 1;
9852                 else if (prev &&
9853                          (rsm->bindex == prev->bindex) &&
9854                          ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9855                          ((prev->r_flags & RACK_STRADDLE) == 0) &&
9856                          ((rsm->r_flags & RACK_IN_GP_WIN) == 0) &&
9857                          ((prev->r_flags & RACK_IN_GP_WIN) == 0))
9858                         can_use_hookery = 1;
9859                 else
9860                         can_use_hookery = 0;
9861
9862                 if (prev && can_use_hookery &&
9863                     (prev->r_flags & RACK_ACKED)) {
9864                         /**
9865                          * Goal, we want the right remainder of rsm to shrink
9866                          * in place and span from (rsm->r_start = end) to rsm->r_end.
9867                          * We want to expand prev to go all the way
9868                          * to prev->r_end <- end.
9869                          * so in the tree we have before:
9870                          *   prev     |--------|         (acked)
9871                          *   rsm               |-------| (non-acked)
9872                          *   sackblk           |-|
9873                          * We churn it so we end up with
9874                          *   prev     |----------|       (acked)
9875                          *   rsm                 |-----| (non-acked)
9876                          *   nrsm              |-| (temporary)
9877                          *
9878                          * Note if either prev/rsm is a TLP we don't
9879                          * do this.
9880                          */
9881                         noextra++;
9882                         nrsm = &stack_map;
9883                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9884                         prev->r_end = end;
9885                         rsm->r_start = end;
9886                         rsm->r_flags |= RACK_SHUFFLED;
9887                         prev->r_flags |= RACK_SHUFFLED;
9888                         /* Now adjust nrsm (stack copy) to be
9889                          * the one that is the small
9890                          * piece that was "sacked".
9891                          */
9892                         nrsm->r_end = end;
9893                         rsm->r_dupack = 0;
9894                         /*
9895                          * Which timestamp do we keep? It is rather
9896                          * important in GP measurements to have the
9897                          * accurate end of the send window.
9898                          *
9899                          * We keep the largest value, which is the newest
9900                          * send. We do this in case a segment that is
9901                          * joined together and not part of a GP estimate
9902                          * later gets expanded into the GP estimate.
9903                          *
9904                          * We prohibit the merging of unlike kinds i.e.
9905                          * all pieces that are in the GP estimate can be
9906                          * merged and all pieces that are not in a GP estimate
9907                          * can be merged, but not dissimilar pieces. Combine
9908                          * this with taking the highest here and we should
9909                          * be ok unless of course the client reneges. Then
9910                          * all bets are off.
9911                          */
9912                         if (prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] <
9913                            nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) {
9914                                 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9915                         }
9916                         /*
9917                          * And we must keep the newest ack arrival time.
9918                          */
9919
9920                         if (prev->r_ack_arrival <
9921                            rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9922                                 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9923
9924                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9925                         /*
9926                          * Now that the rsm has had its start moved forward
9927                          * let's go ahead and get its new place in the world.
9928                          */
9929                         rack_setup_offset_for_rsm(rack, prev, rsm);
9930                         /*
9931                          * Now nrsm is our new little piece
9932                          * that is acked (which was merged
9933                          * to prev). Update the rtt and changed
9934                          * based on that. Also check for reordering.
9935                          */
9936                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9937                         if (rack->app_limited_needs_set)
9938                                 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9939                         changed += (nrsm->r_end - nrsm->r_start);
9940                         /* You get a count for acking a whole segment or more */
9941                         if ((nrsm->r_end - nrsm->r_start) >= segsiz)
9942                                 rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
9943
9944                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9945                         if (nrsm->r_flags & RACK_SACK_PASSED) {
9946                                 rack->r_ctl.rc_reorder_ts = cts;
9947                                 if (rack->r_ctl.rc_reorder_ts == 0)
9948                                         rack->r_ctl.rc_reorder_ts = 1;
9949                         }
9950                         rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
9951                         rsm = prev;
9952                         counter_u64_add(rack_sack_used_prev_merge, 1);
9953                 } else {
9954                         /**
9955                          * This is the case where our previous
9956                          * block is not acked either, so we must
9957                          * split the block in two.
9958                          */
9959                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9960                         if (nrsm == NULL) {
9961                                 /* failed rrs what can we do but lose the sack info? */
9962                                 goto out;
9963                         }
9964                         if ((rsm->r_flags & RACK_TLP) &&
9965                             (rsm->r_rtr_cnt > 1)) {
9966                                 /*
9967                                  * We are splitting a rxt TLP, check
9968                                  * if we need to save off the start/end
9969                                  */
9970                                 if (rack->rc_last_tlp_acked_set &&
9971                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9972                                         /*
9973                                          * We already turned this on since this block is inside
9974                                          * the previous TLP block; it was partially sacked and
9975                                          * now we are getting another sack (maybe all of it).
9976                                          */
9977                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9978                                         /*
9979                                          * Let's make sure we have all of it though.
9980                                          */
9981                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9982                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9983                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9984                                                                      rack->r_ctl.last_tlp_acked_end);
9985                                         }
9986                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9987                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9988                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9989                                                                      rack->r_ctl.last_tlp_acked_end);
9990                                         }
9991                                 } else {
9992                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9993                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9994                                         rack->rc_last_tlp_acked_set = 1;
9995                                         rack->rc_last_tlp_past_cumack = 0;
9996                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9997                                 }
9998                         }
9999                         /**
10000                          * In this case nrsm becomes
10001                          * nrsm->r_start = end;
10002                          * nrsm->r_end = rsm->r_end;
10003                          * which is un-acked.
10004                          * <and>
10005                          * rsm->r_end = nrsm->r_start;
10006                          * i.e. the remaining un-acked
10007                          * piece is left on the left
10008                          * hand side.
10009                          *
10010                          * So we start like this
10011                          * rsm      |----------| (not acked)
10012                          * sackblk  |---|
10013                          * build it so we have
10014                          * rsm      |---|         (acked)
10015                          * nrsm         |------|  (not acked)
10016                          */
10017                         counter_u64_add(rack_sack_splits, 1);
10018                         rack_clone_rsm(rack, nrsm, rsm, end);
10019                         moved++;
10020                         rsm->r_flags &= (~RACK_HAS_FIN);
10021                         rsm->r_just_ret = 0;
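                        /* Insert the new un-acked right-hand piece into the send map. */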
10022 #ifndef INVARIANTS
10023                         (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
10024 #else
10025                         if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
10026                                 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
10027                                       nrsm, insret, rack, rsm);
10028                         }
10029 #endif
10030                         if (rsm->r_in_tmap) {
10031                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10032                                 nrsm->r_in_tmap = 1;
10033                         }
10034                         nrsm->r_dupack = 0;
10035                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
10036                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
10037                         changed += (rsm->r_end - rsm->r_start);
10038                         /* You get a count for acking a whole segment or more */
10039                         if ((rsm->r_end - rsm->r_start) >= segsiz)
10040                                 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
10041
10042                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
10043                         if (rsm->r_in_tmap) /* should be true */
10044                                 rack_log_sack_passed(tp, rack, rsm);
10045                         /* Is Reordering occurring? */
10046                         if (rsm->r_flags & RACK_SACK_PASSED) {
10047                                 rsm->r_flags &= ~RACK_SACK_PASSED;
10048                                 rack->r_ctl.rc_reorder_ts = cts;
10049                                 if (rack->r_ctl.rc_reorder_ts == 0)
10050                                         rack->r_ctl.rc_reorder_ts = 1;
10051                         }
10052                         if (rack->app_limited_needs_set)
10053                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
10054                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
10055                         rsm->r_flags |= RACK_ACKED;
10056                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
10057                         if (rsm->r_in_tmap) {
10058                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10059                                 rsm->r_in_tmap = 0;
10060                         }
10061                 }
10062         } else if (start != end){
10063                 /*
10064                  * The block was already acked.
10065                  */
10066                 counter_u64_add(rack_sack_skipped_acked, 1);
10067                 moved++;
10068         }
10069 out:
10070         if (rsm &&
10071             ((rsm->r_flags & RACK_TLP) == 0) &&
10072             (rsm->r_flags & RACK_ACKED)) {
10073                 /*
10074                  * Now can we merge where we worked
10075                  * with either the previous or
10076                  * next block?
10077                  */
10078                 next = tqhash_next(rack->r_ctl.tqh, rsm);
10079                 while (next) {
10080                         if (next->r_flags & RACK_TLP)
10081                                 break;
10082                         /* Only allow merges between ones in or out of GP window */
10083                         if ((next->r_flags & RACK_IN_GP_WIN) &&
10084                             ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
10085                                 break;
10086                         }
10087                         if ((rsm->r_flags & RACK_IN_GP_WIN) &&
10088                             ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
10089                                 break;
10090                         }
10091                         if (rsm->bindex != next->bindex)
10092                                 break;
10093                         if (rsm->r_flags & RACK_STRADDLE)
10094                                 break;
10095                         if (next->r_flags & RACK_STRADDLE)
10096                                 break;
10097                         if (next->r_flags & RACK_ACKED) {
10098                                 /* yep this and next can be merged */
10099                                 rsm = rack_merge_rsm(rack, rsm, next);
10100                                 noextra++;
10101                                 next = tqhash_next(rack->r_ctl.tqh, rsm);
10102                         } else
10103                                 break;
10104                 }
10105                 /* Now what about the previous? */
10106                 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
10107                 while (prev) {
10108                         if (prev->r_flags & RACK_TLP)
10109                                 break;
10110                         /* Only allow merges between ones in or out of GP window */
10111                         if ((prev->r_flags & RACK_IN_GP_WIN) &&
10112                             ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
10113                                 break;
10114                         }
10115                         if ((rsm->r_flags & RACK_IN_GP_WIN) &&
10116                             ((prev->r_flags & RACK_IN_GP_WIN) == 0)) {
10117                                 break;
10118                         }
10119                         if (rsm->bindex != prev->bindex)
10120                                 break;
10121                         if (rsm->r_flags & RACK_STRADDLE)
10122                                 break;
10123                         if (prev->r_flags & RACK_STRADDLE)
10124                                 break;
10125                         if (prev->r_flags & RACK_ACKED) {
10126                                 /* yep the previous and this can be merged */
10127                                 rsm = rack_merge_rsm(rack, prev, rsm);
10128                                 noextra++;
10129                                 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
10130                         } else
10131                                 break;
10132                 }
10133         }
10134         if (used_ref == 0) {
10135                 counter_u64_add(rack_sack_proc_all, 1);
10136         } else {
10137                 counter_u64_add(rack_sack_proc_short, 1);
10138         }
10139         /* Save off the next one for quick reference. */
10140         nrsm = tqhash_find(rack->r_ctl.tqh, end);
10141         *prsm = rack->r_ctl.rc_sacklast = nrsm;
10142         /* Pass back the moved. */
10143         *moved_two = moved;
10144         *no_extra = noextra;
10145         return (changed);
10146 }
10147
10148 static void inline
10149 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
10150 {
10151         struct rack_sendmap *tmap;
10152
10153         tmap = NULL;
10154         while (rsm && (rsm->r_flags & RACK_ACKED)) {
10155                 /* It's no longer sacked, mark it so */
10156                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
10157 #ifdef INVARIANTS
10158                 if (rsm->r_in_tmap) {
10159                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
10160                               rack, rsm, rsm->r_flags);
10161                 }
10162 #endif
10163                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
10164                 /* Rebuild it into our tmap */
10165                 if (tmap == NULL) {
10166                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10167                         tmap = rsm;
10168                 } else {
10169                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
10170                         tmap = rsm;
10171                 }
10172                 tmap->r_in_tmap = 1;
10173                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
10174         }
10175         /*
10176          * Now let's possibly clear the sack filter so we start
10177          * recognizing sacks that cover this area.
10178          */
10179         sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
10180
10181 }
10182
10183 static void
10184 rack_do_decay(struct tcp_rack *rack)
10185 {
10186         struct timeval res;
10187
10188 #define timersub(tvp, uvp, vvp)                                         \
10189         do {                                                            \
10190                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
10191                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
10192                 if ((vvp)->tv_usec < 0) {                               \
10193                         (vvp)->tv_sec--;                                \
10194                         (vvp)->tv_usec += 1000000;                      \
10195                 }                                                       \
10196         } while (0)
10197
10198         timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
10199 #undef timersub
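        /*
         * Illustrative example of the subtraction above: with
         * act_rcv_time = {5, 200000} and rc_last_time_decay = {3, 700000}
         * the usec field borrows a second, leaving res = {1, 500000},
         * i.e. 1.5 seconds since the last decay pass.
         */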
10200
10201         rack->r_ctl.input_pkt++;
10202         if ((rack->rc_in_persist) ||
10203             (res.tv_sec >= 1) ||
10204             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
10205                 /*
10206                  * Check for decay of the SAD detection metrics:
10207                  * we want them all to decay about 1/4 for each
10208                  * second (or more) that passes. The current default
10209                  * of 800 means each counter decays to 80% of its
10210                  * value every second.
10211                  */
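                /*
                 * For example, with the default decay value of 800 (80%),
                 * an ack_count of 1000 would drop to 800 after one decay
                 * pass and to 640 after two.
                 */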
10212 #ifdef TCP_SAD_DETECTION
10213                 uint32_t pkt_delta;
10214
10215                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
10216 #endif
10217                 /* Update our saved tracking values */
10218                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
10219                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
10220                 /* Now do we escape without decay? */
10221 #ifdef TCP_SAD_DETECTION
10222                 if (rack->rc_in_persist ||
10223                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
10224                     (pkt_delta < tcp_sad_low_pps)){
10225                         /*
10226                          * We don't decay idle connections
10227                          * or ones that have a low input pps.
10228                          */
10229                         return;
10230                 }
10231                 /* Decay the counters */
10232                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
10233                                                         tcp_sad_decay_val);
10234                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
10235                                                          tcp_sad_decay_val);
10236                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
10237                                                                tcp_sad_decay_val);
10238                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
10239                                                                 tcp_sad_decay_val);
10240 #endif
10241         }
10242 }
10243
10244 static void inline
10245 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from)
10246 {
10247         /*
10248          * We look at advancing the end send time for our GP
10249          * measurement tracking only as the cumulative acknowledgment
10250          * moves forward. You might wonder about this, why not
10251          * at every transmission or retransmission within the
10252          * GP window update the rc_gp_cumack_ts? Well it's rather
10253          * nuanced but basically the GP window *may* expand (as
10254          * it does below) or worse and harder to track it may shrink.
10255          *
10256          * This last makes it impossible to track at the time of
10257          * the send, since you may set forward your rc_gp_cumack_ts
10258          * when you send, because that send *is* in your currently
10259          * "guessed" window, but then it shrinks. Now which was
10260          * the send time of the last bytes in the window, by the
10261          * time you ask that question that part of the sendmap
10262          * is freed. So you don't know and you will have too
10263          * long a send window. Instead, by updating the time
10264          * marker only when the cumack advances this assures us
10265          * that we will have only the sends in the window of our
10266          * GP measurement.
10267          *
10268          * Another complication from this is the
10269          * merging of sendmap entries. During SACK processing this
10270          * can happen to conserve the sendmap size. That breaks
10271          * everything down in tracking the send window of the GP
10272          * estimate. So to prevent that and keep it working with
10273          * a tiny bit more limited merging, we only allow like
10274          * types to be merged. I.e. if two sends are in the GP window
10275          * then it's ok to merge them together. If two sends are not
10276          * in the GP window it's ok to merge them together too. Though
10277          * one send in and one send out cannot be merged. We combine
10278          * this with never allowing the shrinking of the GP window when
10279          * we are in recovery so that we can properly calculate the
10280          * sending times.
10281          *
10282          * This all of course seems complicated, because it is.. :)
10283          *
10284          * The cum-ack is being advanced upon the sendmap.
10285          * If we are not doing a GP estimate don't
10286          * proceed.
10287          */
10288         uint64_t ts;
10289
10290         if ((tp->t_flags & TF_GPUTINPROG) == 0)
10291                 return;
10292         /*
10293          * If this sendmap entry is going
10294          * beyond the measurement window we had picked,
10295          * expand the measurement window by that much.
10296          */
10297         if (SEQ_GT(rsm->r_end, tp->gput_ack)) {
10298                 tp->gput_ack = rsm->r_end;
10299         }
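        /*
         * For example, if gput_ack was 150000 and this rsm ends at 153000,
         * the measurement window is stretched out to 153000 so the bytes
         * being acked still fall inside the goodput sample.
         */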
10300         /*
10301          * If we have not set up an ack, then we
10302          * have no idea if the newly acked pieces
10303          * will be "in our seq measurement range". If
10304          * they are, the timestamp will be updated when
10305          * we clear the app_limited_needs_set flag.
10306          */
10307         if (rack->app_limited_needs_set)
10308                 return;
10309         /*
10310          * Finally, we grab out the latest timestamp
10311          * that this packet was sent and then see
10312          * if:
10313          *  a) The packet touches our newly defined GP range.
10314          *  b) The time is greater (newer) than the
10315          *     one we currently have. If so we update
10316          *     our sending end time window.
10317          *
10318          * Note we *do not* do this at send time. The reason
10319          * is that if you do you *may* pick up a newer timestamp
10320          * for a range you are not going to measure. We project
10321          * out how far and then sometimes modify that to be
10322          * smaller. If that occurs then you will have a send
10323          * that does not belong to the range included.
10324          */
10325         if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <=
10326             rack->r_ctl.rc_gp_cumack_ts)
10327                 return;
10328         if (rack_in_gp_window(tp, rsm)) {
10329                 rack->r_ctl.rc_gp_cumack_ts = ts;
10330                 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end,
10331                                __LINE__, from, rsm);
10332         }
10333 }
10334
10335 static void
10336 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime)
10337 {
10338         struct rack_sendmap *rsm;
10339         /*
10340          * The ACK point is advancing to th_ack, we must drop off
10341          * the packets in the rack log and calculate any eligible
10342          * RTT's.
10343          */
10344
10345         rack->r_wanted_output = 1;
10346         if (SEQ_GT(th_ack, tp->snd_una))
10347             rack->r_ctl.last_cumack_advance = acktime;
10348
10349         /* Tend any TLP that has been marked for 1/2 the seq space (its old)  */
10350         if ((rack->rc_last_tlp_acked_set == 1) &&
10351             (rack->rc_last_tlp_past_cumack == 1) &&
10352             (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
10353                 /*
10354                  * We have reached the point where our last rack
10355                  * tlp retransmit sequence is ahead of the cum-ack.
10356                  * This can only happen when the cum-ack moves all
10357                  * the way around (it's been a full 2^31 + 1 bytes
10358                  * or more since we sent a retransmitted TLP). Let's
10359                  * turn off the valid flag since it's not really valid.
10360                  *
10361                  * Note since SACKs also turn on this event we have
10362                  * a complication, we have to wait to age it out until
10363                  * the cum-ack is by the TLP before checking which is
10364                  * what the next else clause does.
10365                  */
10366                 rack_log_dsack_event(rack, 9, __LINE__,
10367                                      rack->r_ctl.last_tlp_acked_start,
10368                                      rack->r_ctl.last_tlp_acked_end);
10369                 rack->rc_last_tlp_acked_set = 0;
10370                 rack->rc_last_tlp_past_cumack = 0;
10371         } else if ((rack->rc_last_tlp_acked_set == 1) &&
10372                    (rack->rc_last_tlp_past_cumack == 0) &&
10373                    (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
10374                 /*
10375                  * It is safe to start aging TLP's out.
10376                  */
10377                 rack->rc_last_tlp_past_cumack = 1;
10378         }
10379         /* We do the same for the tlp send seq as well */
10380         if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
10381             (rack->rc_last_sent_tlp_past_cumack == 1) &&
10382             (SEQ_GT(rack->r_ctl.last_sent_tlp_seq,  th_ack))) {
10383                 rack_log_dsack_event(rack, 9, __LINE__,
10384                                      rack->r_ctl.last_sent_tlp_seq,
10385                                      (rack->r_ctl.last_sent_tlp_seq +
10386                                       rack->r_ctl.last_sent_tlp_len));
10387                 rack->rc_last_sent_tlp_seq_valid = 0;
10388                 rack->rc_last_sent_tlp_past_cumack = 0;
10389         } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
10390                    (rack->rc_last_sent_tlp_past_cumack == 0) &&
10391                    (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
10392                 /*
10393                  * It is safe to start aging TLP's send.
10394                  * It is safe to start aging the TLP send seq out.
10395                 rack->rc_last_sent_tlp_past_cumack = 1;
10396         }
10397 more:
10398         rsm = tqhash_min(rack->r_ctl.tqh);
10399         if (rsm == NULL) {
10400                 if ((th_ack - 1) == tp->iss) {
10401                         /*
10402                          * For the SYN incoming case we will not
10403                          * have called tcp_output for the sending of
10404                          * the SYN, so there will be no map. All
10405                          * other cases should probably be a panic.
10406                          */
10407                         return;
10408                 }
10409                 if (tp->t_flags & TF_SENTFIN) {
10410                         /* if we sent a FIN we often will not have a map */
10411                         return;
10412                 }
10413 #ifdef INVARIANTS
10414                 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
10415                       tp,
10416                       tp->t_state, th_ack, rack,
10417                       tp->snd_una, tp->snd_max, tp->snd_nxt);
10418 #endif
10419                 return;
10420         }
10421         if (SEQ_LT(th_ack, rsm->r_start)) {
10422                 /* Huh map is missing this */
10423 #ifdef INVARIANTS
10424                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
10425                        rsm->r_start,
10426                        th_ack, tp->t_state, rack->r_state);
10427 #endif
10428                 return;
10429         }
10430         rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
10431
10432         /* Now was it a retransmitted TLP? */
10433         if ((rsm->r_flags & RACK_TLP) &&
10434             (rsm->r_rtr_cnt > 1)) {
10435                 /*
10436                  * Yes, this rsm was a TLP and retransmitted, remember that
10437                  * since if a DSACK comes back on this we don't want
10438                  * to think of it as a reordered segment. This may
10439                  * get updated again with possibly even other TLPs
10440                  * in flight, but that's ok. Only when we don't send
10441                  * a retransmitted TLP for 1/2 the sequences space
10442                  * will it get turned off (above).
10443                  */
10444                 if (rack->rc_last_tlp_acked_set &&
10445                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
10446                         /*
10447                          * We already turned this on since the end matches;
10448                          * the previous one was a partial ack, now we
10449                          * are getting another one (maybe all of it).
10450                          */
10451                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
10452                         /*
10453                          * Let's make sure we have all of it though.
10454                          */
10455                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
10456                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
10457                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
10458                                                      rack->r_ctl.last_tlp_acked_end);
10459                         }
10460                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
10461                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10462                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
10463                                                      rack->r_ctl.last_tlp_acked_end);
10464                         }
10465                 } else {
10466                         rack->rc_last_tlp_past_cumack = 1;
10467                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
10468                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10469                         rack->rc_last_tlp_acked_set = 1;
10470                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
10471                 }
10472         }
10473         /* Now do we consume the whole thing? */
10474         if (SEQ_GEQ(th_ack, rsm->r_end)) {
10475                 /* It's all consumed. */
10476                 uint32_t left;
10477                 uint8_t newly_acked;
10478
10479                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
10480                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
10481                 rsm->r_rtr_bytes = 0;
10482                 /*
10483                  * Record the time of highest cumack sent if it's in our measurement
10484                  * window and possibly bump out the end.
10485                  */
10486                 rack_rsm_sender_update(rack, tp, rsm, 4);
10487                 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
10488                 if (rsm->r_in_tmap) {
10489                         TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10490                         rsm->r_in_tmap = 0;
10491                 }
10492                 newly_acked = 1;
10493                 if (rsm->r_flags & RACK_ACKED) {
10494                         /*
10495                          * It was acked on the scoreboard -- remove
10496                          * it from total
10497                          */
10498                         rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
10499                         newly_acked = 0;
10500                 } else if (rsm->r_flags & RACK_SACK_PASSED) {
10501                         /*
10502                          * There are segments ACKED on the
10503                          * scoreboard further up. We are seeing
10504                          * reordering.
10505                          */
10506                         rsm->r_flags &= ~RACK_SACK_PASSED;
10507                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
10508                         rsm->r_flags |= RACK_ACKED;
10509                         rack->r_ctl.rc_reorder_ts = cts;
10510                         if (rack->r_ctl.rc_reorder_ts == 0)
10511                                 rack->r_ctl.rc_reorder_ts = 1;
10512                         if (rack->r_ent_rec_ns) {
10513                                 /*
10514                                  * We have sent no more, and we saw a sack
10515                                  * then an ack arrive.
10516                                  */
10517                                 rack->r_might_revert = 1;
10518                         }
10519                 }
10520                 if ((rsm->r_flags & RACK_TO_REXT) &&
10521                     (tp->t_flags & TF_RCVD_TSTMP) &&
10522                     (to->to_flags & TOF_TS) &&
10523                     (to->to_tsecr != 0) &&
10524                     (tp->t_flags & TF_PREVVALID)) {
10525                         /*
10526                          * We can use the timestamp to see
10527                          * if this retransmission was from the
10528                          * first transmit. If so we made a mistake.
10529                          */
10530                         tp->t_flags &= ~TF_PREVVALID;
10531                         if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
10532                                 /* The first transmit is what this ack is for */
10533                                 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__);
10534                         }
10535                 }
10536                 left = th_ack - rsm->r_end;
10537                 if (rack->app_limited_needs_set && newly_acked)
10538                         rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
10539                 /* Free back to zone */
10540                 rack_free(rack, rsm);
10541                 if (left) {
10542                         goto more;
10543                 }
10544                 /* Check for reneging */
10545                 rsm = tqhash_min(rack->r_ctl.tqh);
10546                 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
10547                         /*
10548                          * The peer has moved snd_una up to
10549                          * the edge of this send, i.e. one
10550                          * that it had previously acked. The only
10551                          * way that can be true is if the peer threw
10552                          * away data (space issues) that it had
10553                          * previously sacked (else it would have
10554                          * given us snd_una up to rsm->r_end).
10555                          * We need to undo the acked markings here.
10556                          *
10557                          * Note we have to look to make sure th_ack is
10558                          * our rsm->r_start in case we get an old ack
10559                          * where th_ack is behind snd_una.
10560                          */
10561                         rack_peer_reneges(rack, rsm, th_ack);
10562                 }
10563                 return;
10564         }
10565         if (rsm->r_flags & RACK_ACKED) {
10566                 /*
10567                  * It was acked on the scoreboard -- remove it from
10568                  * total for the part being cum-acked.
10569                  */
10570                 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
10571         }
10572         /*
10573          * Clear the dup ack count for
10574          * the piece that remains.
10575          */
10576         rsm->r_dupack = 0;
10577         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
10578         if (rsm->r_rtr_bytes) {
10579                 /*
10580                  * It was retransmitted; adjust the
10581                  * sack holes for what was acked.
10582                  */
10583                 int ack_am;
10584
10585                 ack_am = (th_ack - rsm->r_start);
10586                 if (ack_am >= rsm->r_rtr_bytes) {
10587                         rack->r_ctl.rc_holes_rxt -= ack_am;
10588                         rsm->r_rtr_bytes -= ack_am;
10589                 }
10590         }
10591         /*
10592          * Update where the piece starts and record
10593          * the time of send of highest cumack sent if
10594          * it's in our GP range.
10595          */
10596         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
10597         /* Now we need to move our offset forward too */
10598         if (rsm->m &&
10599             ((rsm->orig_m_len != rsm->m->m_len) ||
10600              (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
10601                 /* Fix up the orig_m_len and possibly the mbuf offset */
10602                 rack_adjust_orig_mlen(rsm);
10603         }
10604         rsm->soff += (th_ack - rsm->r_start);
10605         rack_rsm_sender_update(rack, tp, rsm, 5);
10606         /* The trim will move th_ack into r_start for us */
10607         tqhash_trim(rack->r_ctl.tqh, th_ack);
10608         /* Now do we need to move the mbuf fwd too? */
10609         if (rsm->m) {
10610                 while (rsm->soff >= rsm->m->m_len) {
10611                         rsm->soff -= rsm->m->m_len;
10612                         rsm->m = rsm->m->m_next;
10613                         KASSERT((rsm->m != NULL),
10614                                 (" nrsm:%p hit at soff:%u null m",
10615                                  rsm, rsm->soff));
10616                 }
10617                 rsm->orig_m_len = rsm->m->m_len;
10618                 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
10619         }
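        /*
         * Example of the walk above: if the trim pushed soff to 3000 while
         * the current mbuf holds only 2048 bytes, one iteration drops soff
         * to 952 and advances rsm->m to the next mbuf in the chain before
         * the new orig_m_len and trailing space are recorded.
         */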
10620         if (rack->app_limited_needs_set &&
10621             SEQ_GEQ(th_ack, tp->gput_seq))
10622                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
10623 }
10624
10625 static void
10626 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
10627 {
10628         struct rack_sendmap *rsm;
10629         int sack_pass_fnd = 0;
10630
10631         if (rack->r_might_revert) {
10632                 /*
10633                  * Ok we have reordering, have not sent anything, we
10634                  * might want to revert the congestion state if nothing
10635                  * further has SACK_PASSED on it. Let's check.
10636                  *
10637                  * We also get here when we have DSACKs come in for
10638                  * all the data that we FR'd. Note that an rxt or tlp
10639                  * timer firing prevents this from happening.
10640                  */
10641
10642                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
10643                         if (rsm->r_flags & RACK_SACK_PASSED) {
10644                                 sack_pass_fnd = 1;
10645                                 break;
10646                         }
10647                 }
10648                 if (sack_pass_fnd == 0) {
10649                         /*
10650                          * We went into recovery
10651                          * incorrectly due to reordering!
10652                          */
10653                         int orig_cwnd;
10654
10655                         rack->r_ent_rec_ns = 0;
10656                         orig_cwnd = tp->snd_cwnd;
10657                         tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
10658                         tp->snd_recover = tp->snd_una;
10659                         rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
10660                         EXIT_RECOVERY(tp->t_flags);
10661                 }
10662                 rack->r_might_revert = 0;
10663         }
10664 }
10665
10666 #ifdef TCP_SAD_DETECTION
10667
10668 static void
10669 rack_merge_out_sacks(struct tcp_rack *rack)
10670 {
10671         struct rack_sendmap *cur, *next, *rsm, *trsm = NULL;
10672
10673         cur = tqhash_min(rack->r_ctl.tqh);
10674         while(cur) {
10675                 next = tqhash_next(rack->r_ctl.tqh, cur);
10676                 /*
10677                  * The idea is to go through all of them and merge back
10678                  * together the pieces that were sent together.
10679                  */
10680                 if ((next != NULL) &&
10681                     (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) {
10682                         rack_merge_rsm(rack, cur, next);
10683                 } else {
10684                         cur = next;
10685                 }
10686         }
10687         /*
10688          * Now treat it like an rxt event: everything is outstanding,
10689          * nothing is acked, and dupacks are all zero. If this
10690          * is not an attacker it will have to dupack its way through
10691          * it all.
10692          */
10693         TAILQ_INIT(&rack->r_ctl.rc_tmap);
10694         TQHASH_FOREACH(rsm, rack->r_ctl.tqh)  {
10695                 rsm->r_dupack = 0;
10696                 /* We must re-add it back to the tlist */
10697                 if (trsm == NULL) {
10698                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10699                 } else {
10700                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
10701                 }
10702                 rsm->r_in_tmap = 1;
10703                 trsm = rsm;
10704                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
10705         }
10706         sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una);
10707 }
10708
10709 static void
10710 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t bytes_this_ack, uint32_t segsiz)
10711 {
10712         int do_detection = 0;
10713
10714         if (rack->sack_attack_disable || rack->rc_suspicious) {
10715                 /*
10716                  * If we have been disabled we must detect
10717                  * to possibly reverse it. Or if the guy has
10718                  * sent in suspicious sacks we want to do detection too.
10719                  */
10720                 do_detection = 1;
10721
10722         } else if  ((rack->do_detection || tcp_force_detection) &&
10723                     (tcp_sack_to_ack_thresh > 0) &&
10724                     (tcp_sack_to_move_thresh > 0) &&
10725                     (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) {
10726                 /*
10727                  * We only detect here if:
10728                  * 1) System wide forcing is on <or> do_detection is on
10729                  *   <and>
10730                  * 2) We have thresholds for move and ack (set one to 0 and we are off)
10731                  *   <and>
10732                  * 3) We have maps allocated larger than our min (500).
10733                  */
10734                 do_detection = 1;
10735         }
10736         if (do_detection > 0) {
10737                 /*
10738                  * We have thresholds set to find
10739                  * possible attackers and disable sack.
10740                  * Check them.
10741                  */
10742                 uint64_t ackratio, moveratio, movetotal;
10743
10744                 /* Log detecting */
10745                 rack_log_sad(rack, 1);
10746                 /* Do we establish an ack ratio? */
10747                 if ((rack->r_ctl.sack_count > tcp_map_minimum)  ||
10748                     (rack->rc_suspicious == 1) ||
10749                     (rack->sack_attack_disable > 0)) {
10750                         ackratio = (uint64_t)(rack->r_ctl.sack_count);
10751                         ackratio *= (uint64_t)(1000);
10752                         if (rack->r_ctl.ack_count)
10753                                 ackratio /= (uint64_t)(rack->r_ctl.ack_count);
10754                         else {
10755                                 /* We can hit this due to ack totals degradation (via small sacks) */
10756                                 ackratio = 1000;
10757                         }
10758                 } else {
10759                         /*
10760                          * No ack ratio needed if we have not
10761                          * seen more sacks than the number of map entries.
10762                          * The exception to that is if we have disabled sack then
10763                          * we need to find a ratio.
10764                          */
10765                         ackratio = 0;
10766                 }
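                /*
                 * Example with assumed counts: sack_count = 2500 and
                 * ack_count = 500 give ackratio = (2500 * 1000) / 500 = 5000,
                 * i.e. five SACK blocks seen per cum-ack, kept in integer
                 * math by the 1000x scaling.
                 */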
10767
10768                 if ((rack->sack_attack_disable == 0) &&
10769                     (ackratio > rack_highest_sack_thresh_seen))
10770                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
10771                 /* Do we establish a move ratio? */
10772                 if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) ||
10773                     (rack->rc_suspicious == 1) ||
10774                     (rack->sack_attack_disable > 0)) {
10775                         /*
10776                          * We need to have more sack moves than maps
10777                          * allocated to have a move ratio considered.
10778                          */
10779                         movetotal = rack->r_ctl.sack_moved_extra;
10780                         movetotal += rack->r_ctl.sack_noextra_move;
10781                         moveratio = rack->r_ctl.sack_moved_extra;
10782                         moveratio *= (uint64_t)1000;
10783                         if (movetotal)
10784                                 moveratio /= movetotal;
10785                         else {
10786                                 /* No moves, that's pretty good */
10787                                 moveratio = 0;
10788                         }
10789                 } else {
10790                         /*
10791                          * Not enough moves have occurred to consider
10792                          * if we are out of whack in that ratio.
10793                          * The exception to that is if we have disabled sack then
10794                          * we need to find a ratio.
10795                          */
10796                         moveratio = 0;
10797                 }
10798                 if ((rack->sack_attack_disable == 0) &&
10799                     (moveratio > rack_highest_move_thresh_seen))
10800                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
10801                 /* Now the tests */
10802                 if (rack->sack_attack_disable == 0) {
10803                         /* Not disabled, do we need to disable? */
10804                         if ((ackratio > tcp_sack_to_ack_thresh) &&
10805                             (moveratio > tcp_sack_to_move_thresh)) {
10806                                 /* Disable sack processing */
10807                                 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED);
10808                                 rack->sack_attack_disable = 1;
10809                                 /* set it so we have the built in delay */
10810                                 rack->r_ctl.ack_during_sd = 1;
10811                                 if (rack_merge_out_sacks_on_attack)
10812                                         rack_merge_out_sacks(rack);
10813                                 counter_u64_add(rack_sack_attacks_detected, 1);
10814                                 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED);
10815                                 /* Clamp the cwnd at flight size */
10816                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
10817                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
10818                                 rack_log_sad(rack, 2);
10819                         }
10820                 } else {
10821                         /* We are sack-disabled check for false positives */
10822                         if ((ackratio <= tcp_restoral_thresh) ||
10823                             ((rack_merge_out_sacks_on_attack == 0) &&
10824                              (rack->rc_suspicious == 0) &&
10825                              (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) {
10826                                 rack->sack_attack_disable = 0;
10827                                 rack_log_sad(rack, 3);
10828                                 /* Restart counting */
10829                                 rack->r_ctl.sack_count = 0;
10830                                 rack->r_ctl.sack_moved_extra = 0;
10831                                 rack->r_ctl.sack_noextra_move = 1;
10832                                 rack->rc_suspicious = 0;
10833                                 rack->r_ctl.ack_count = max(1,
10834                                                             (bytes_this_ack / segsiz));
10835
10836                                 counter_u64_add(rack_sack_attacks_reversed, 1);
10837                                 /* Restore the cwnd */
10838                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
10839                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
10840                         }
10841                 }
10842         }
10843 }
10844 #endif
10845
10846 static int
10847 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
10848 {
10849
10850         uint32_t am, l_end;
10851         int was_tlp = 0;
10852
10853         if (SEQ_GT(end, start))
10854                 am = end - start;
10855         else
10856                 am = 0;
10857         if ((rack->rc_last_tlp_acked_set ) &&
10858             (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
10859             (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
10860                 /*
10861                  * The DSACK is because of a TLP, so we don't
10862                  * adjust the reordering window, since
10863                  * it was not reordering that caused the DSACK but
10864                  * our previous retransmitted TLP.
10865                  */
10866                 rack_log_dsack_event(rack, 7, __LINE__, start, end);
10867                 was_tlp = 1;
10868                 goto skip_dsack_round;
10869         }
10870         if (rack->rc_last_sent_tlp_seq_valid) {
10871                 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
10872                 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
10873                     (SEQ_LEQ(end, l_end))) {
10874                         /*
10875                          * This dsack is from the last sent TLP, ignore it
10876                          * for reordering purposes.
10877                          */
10878                         rack_log_dsack_event(rack, 7, __LINE__, start, end);
10879                         was_tlp = 1;
10880                         goto skip_dsack_round;
10881                 }
10882         }
10883         if (rack->rc_dsack_round_seen == 0) {
10884                 rack->rc_dsack_round_seen = 1;
10885                 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
10886                 rack->r_ctl.num_dsack++;
10887                 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
10888                 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
10889         }
10890 skip_dsack_round:
10891         /*
10892          * We keep track of how many DSACK blocks we get
10893          * after a recovery incident.
10894          */
10895         rack->r_ctl.dsack_byte_cnt += am;
10896         if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
10897             rack->r_ctl.retran_during_recovery &&
10898             (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
10899                 /*
10900                  * False recovery most likely culprit is reordering. If
10901                  * nothing else is missing we need to revert.
10902                  */
10903                 rack->r_might_revert = 1;
10904                 rack_handle_might_revert(rack->rc_tp, rack);
10905                 rack->r_might_revert = 0;
10906                 rack->r_ctl.retran_during_recovery = 0;
10907                 rack->r_ctl.dsack_byte_cnt = 0;
10908         }
10909         return (was_tlp);
10910 }
10911
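/*
 * Pipe here is an estimate of bytes still in flight: what is outstanding
 * beyond snd_una, minus what the peer has reported SACKed, plus what we
 * have retransmitted into holes. With illustrative numbers, 30000 bytes
 * outstanding, 10000 SACKed and 3000 retransmitted into holes gives a
 * pipe of 23000 bytes.
 */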
10912 static uint32_t
10913 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
10914 {
10915         return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt);
10916 }
10917
10918 static int32_t
10919 rack_compute_pipe(struct tcpcb *tp)
10920 {
10921         return ((int32_t)do_rack_compute_pipe(tp,
10922                                               (struct tcp_rack *)tp->t_fb_ptr,
10923                                               tp->snd_una));
10924 }
10925
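/*
 * A worked PRR example with assumed values: rc_prr_delivered = 20000,
 * snd_ssthresh = 50000, rc_prr_recovery_fs = 100000 and rc_prr_out = 8000.
 * While pipe is above ssthresh, sndcnt = (20000 * 50000) / 100000 + 1 -
 * 8000 = 2001 bytes may be sent. Once pipe falls below ssthresh we instead
 * allow enough to grow back toward ssthresh, bounded by the limit computed
 * below.
 */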
10926 static void
10927 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
10928 {
10929         /* Deal with changed and PRR here (in recovery only) */
10930         uint32_t pipe, snd_una;
10931
10932         rack->r_ctl.rc_prr_delivered += changed;
10933
10934         if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
10935                 /*
10936                  * It is all outstanding, we are application limited
10937                  * and thus we don't need more room to send anything.
10938                  * Note we use tp->snd_una here and not th_ack because
10939                  * the data has not yet been cut from the sb.
10940                  */
10941                 rack->r_ctl.rc_prr_sndcnt = 0;
10942                 return;
10943         }
10944         /* Compute prr_sndcnt */
10945         if (SEQ_GT(tp->snd_una, th_ack)) {
10946                 snd_una = tp->snd_una;
10947         } else {
10948                 snd_una = th_ack;
10949         }
10950         pipe = do_rack_compute_pipe(tp, rack, snd_una);
10951         if (pipe > tp->snd_ssthresh) {
10952                 long sndcnt;
10953
10954                 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
10955                 if (rack->r_ctl.rc_prr_recovery_fs > 0)
10956                         sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
10957                 else {
10958                         rack->r_ctl.rc_prr_sndcnt = 0;
10959                         rack_log_to_prr(rack, 9, 0, __LINE__);
10960                         sndcnt = 0;
10961                 }
10962                 sndcnt++;
10963                 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
10964                         sndcnt -= rack->r_ctl.rc_prr_out;
10965                 else
10966                         sndcnt = 0;
10967                 rack->r_ctl.rc_prr_sndcnt = sndcnt;
10968                 rack_log_to_prr(rack, 10, 0, __LINE__);
10969         } else {
10970                 uint32_t limit;
10971
10972                 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
10973                         limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
10974                 else
10975                         limit = 0;
10976                 if (changed > limit)
10977                         limit = changed;
10978                 limit += ctf_fixed_maxseg(tp);
10979                 if (tp->snd_ssthresh > pipe) {
10980                         rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
10981                         rack_log_to_prr(rack, 11, 0, __LINE__);
10982                 } else {
10983                         rack->r_ctl.rc_prr_sndcnt = min(0, limit);
10984                         rack_log_to_prr(rack, 12, 0, __LINE__);
10985                 }
10986         }
10987 }
10988
10989 static void
10990 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck,
10991              int *dsack_seen, int *sacks_seen)
10992 {
10993         uint32_t changed;
10994         struct tcp_rack *rack;
10995         struct rack_sendmap *rsm;
10996         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
10997         register uint32_t th_ack;
10998         int32_t i, j, k, num_sack_blks = 0;
10999         uint32_t cts, acked, ack_point;
11000         int loop_start = 0, moved_two = 0, no_extra = 0;
11001         uint32_t tsused;
11002         uint32_t segsiz, o_cnt;
11003
11004
11005         INP_WLOCK_ASSERT(tptoinpcb(tp));
11006         if (tcp_get_flags(th) & TH_RST) {
11007                 /* We don't log resets */
11008                 return;
11009         }
11010         rack = (struct tcp_rack *)tp->t_fb_ptr;
11011         cts = tcp_get_usecs(NULL);
11012         rsm = tqhash_min(rack->r_ctl.tqh);
11013         changed = 0;
11014         th_ack = th->th_ack;
11015         if (rack->sack_attack_disable == 0)
11016                 rack_do_decay(rack);
11017         segsiz = ctf_fixed_maxseg(rack->rc_tp);
11018         if (BYTES_THIS_ACK(tp, th) >=  segsiz) {
11019                 /*
11020                  * You only get credit for
11021                  * MSS and greater (and you get extra
11022                  * credit for larger cum-ack moves).
11023                  */
11024                 int ac;
11025
11026                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
11027                 rack->r_ctl.ack_count += ac;
11028                 counter_u64_add(rack_ack_total, ac);
11029         }
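        /*
         * Example: a cum-ack that advances 4380 bytes with a 1460 byte MSS
         * credits ack_count by 3; an advance smaller than one MSS earns no
         * credit here.
         */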
11030         if (rack->r_ctl.ack_count > 0xfff00000) {
11031                 /*
11032                  * reduce the number to keep us under
11033                  * a uint32_t.
11034                  */
11035                 rack->r_ctl.ack_count /= 2;
11036                 rack->r_ctl.sack_count /= 2;
11037         }
11038         if (SEQ_GT(th_ack, tp->snd_una)) {
11039                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
11040                 tp->t_acktime = ticks;
11041         }
11042         if (rsm && SEQ_GT(th_ack, rsm->r_start))
11043                 changed = th_ack - rsm->r_start;
11044         if (changed) {
11045                 rack_process_to_cumack(tp, rack, th_ack, cts, to,
11046                                        tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
11047         }
11048         if ((to->to_flags & TOF_SACK) == 0) {
11049                 /* We are done, nothing left and no sack. */
11050                 rack_handle_might_revert(tp, rack);
11051                 /*
11052                  * For cases where we struck a dup-ack
11053                  * with no SACK, add to the changes so
11054                  * PRR will work right.
11055                  */
11056                 if (dup_ack_struck && (changed == 0)) {
11057                         changed += ctf_fixed_maxseg(rack->rc_tp);
11058                 }
11059                 goto out;
11060         }
11061         /* Sack block processing */
11062         if (SEQ_GT(th_ack, tp->snd_una))
11063                 ack_point = th_ack;
11064         else
11065                 ack_point = tp->snd_una;
11066         for (i = 0; i < to->to_nsacks; i++) {
11067                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
11068                       &sack, sizeof(sack));
11069                 sack.start = ntohl(sack.start);
11070                 sack.end = ntohl(sack.end);
11071                 if (SEQ_GT(sack.end, sack.start) &&
11072                     SEQ_GT(sack.start, ack_point) &&
11073                     SEQ_LT(sack.start, tp->snd_max) &&
11074                     SEQ_GT(sack.end, ack_point) &&
11075                     SEQ_LEQ(sack.end, tp->snd_max)) {
11076                         sack_blocks[num_sack_blks] = sack;
11077                         num_sack_blks++;
11078                 } else if (SEQ_LEQ(sack.start, th_ack) &&
11079                            SEQ_LEQ(sack.end, th_ack)) {
11080                         int was_tlp;
11081
11082                         if (dsack_seen != NULL)
11083                                 *dsack_seen = 1;
11084                         was_tlp = rack_note_dsack(rack, sack.start, sack.end);
11085                         /*
11086                          * It's a D-SACK block.
11087                          */
11088                         tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
11089                 }
11090         }
11091         if (rack->rc_dsack_round_seen) {
11092                 /* Is the dsack round over? */
11093                 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
11094                         /* Yes it is */
11095                         rack->rc_dsack_round_seen = 0;
11096                         rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
11097                 }
11098         }
11099         /*
11100          * Sort the SACK blocks so we can update the rack scoreboard with
11101          * just one pass.
11102          */
11103         o_cnt = num_sack_blks;
11104         num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
11105                                          num_sack_blks, th->th_ack);
11106         ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
11107         if (sacks_seen != NULL)
11108                 *sacks_seen = num_sack_blks;
11109         if (num_sack_blks == 0) {
11110                 /* Nothing to sack, but we need to update counts */
11111                 if ((o_cnt == 1) &&
11112                     (*dsack_seen != 1))
11113                         rack->r_ctl.sack_count++;
11114                 else if (o_cnt > 1)
11115                         rack->r_ctl.sack_count++;
11116                 goto out_with_totals;
11117         }
11118         if (rack->sack_attack_disable) {
11119                 /*
11120                  * An attacker disablement is in place, for
11121                  * every sack block that is not at least a full MSS
11122                  * count up sack_count.
11123                  */
11124                 for (i = 0; i < num_sack_blks; i++) {
11125                         if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) {
11126                                 rack->r_ctl.sack_count++;
11127                         }
11128                         if (rack->r_ctl.sack_count > 0xfff00000) {
11129                                 /*
11130                                  * reduce the number to keep us under
11131                                  * a uint32_t.
11132                                  */
11133                                 rack->r_ctl.ack_count /= 2;
11134                                 rack->r_ctl.sack_count /= 2;
11135                         }
11136                 }
11137                 goto out;
11138         }
11139         /* It's a sack of some sort */
11140         rack->r_ctl.sack_count += num_sack_blks;
11141         if (rack->r_ctl.sack_count > 0xfff00000) {
11142                 /*
11143                  * reduce the number to keep us under
11144                  * a uint32_t.
11145                  */
11146                 rack->r_ctl.ack_count /= 2;
11147                 rack->r_ctl.sack_count /= 2;
11148         }
11149         if (num_sack_blks < 2) {
11150                 /* Only one, we don't need to sort */
11151                 goto do_sack_work;
11152         }
11153         /* Sort the sacks */
11154         for (i = 0; i < num_sack_blks; i++) {
11155                 for (j = i + 1; j < num_sack_blks; j++) {
11156                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
11157                                 sack = sack_blocks[i];
11158                                 sack_blocks[i] = sack_blocks[j];
11159                                 sack_blocks[j] = sack;
11160                         }
11161                 }
11162         }
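        /*
         * This is a simple exchange sort keyed on the block end; with at
         * most TCP_MAX_SACK + 1 blocks the quadratic cost is negligible.
         */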
11163         /*
11164          * Now are any of the sack block ends the same (yes some
11165          * implementations send these)?
11166          */
11167 again:
11168         if (num_sack_blks == 0)
11169                 goto out_with_totals;
11170         if (num_sack_blks > 1) {
11171                 for (i = 0; i < num_sack_blks; i++) {
11172                         for (j = i + 1; j < num_sack_blks; j++) {
11173                                 if (sack_blocks[i].end == sack_blocks[j].end) {
11174                                         /*
11175                                          * Ok these two have the same end we
11176                                          * want the smallest end and then
11177                                          * throw away the larger and start
11178                                          * again.
11179                                          */
11180                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
11181                                                 /*
11182                                                  * The second block covers
11183                                                  * more area use that
11184                                                  */
11185                                                 sack_blocks[i].start = sack_blocks[j].start;
11186                                         }
11187                                         /*
11188                                          * Now collapse out the dup-sack and
11189                                          * lower the count
11190                                          */
11191                                         for (k = (j + 1); k < num_sack_blks; k++) {
11192                                                 sack_blocks[j].start = sack_blocks[k].start;
11193                                                 sack_blocks[j].end = sack_blocks[k].end;
11194                                                 j++;
11195                                         }
11196                                         num_sack_blks--;
11197                                         goto again;
11198                                 }
11199                         }
11200                 }
11201         }
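              /*
               * For example (hypothetical sequence numbers): blocks
               * {10,20} {5,20} {25,30} collapse to {5,20} {25,30}; the two
               * blocks sharing end 20 are merged by keeping the smaller
               * start (5), the remaining blocks are slid down and the count
               * is dropped by one before the scan restarts at "again".
               */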
11202 do_sack_work:
11203         /*
11204          * First let's look to see if
11205          * we have retransmitted and the first
11206          * SACK block overlaps the head of the transmit map.
11207          */
11208         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
11209         if (rsm &&
11210             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
11211             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
11212                 /*
11213                  * We probably did the FR and the next
11214                  * SACK coming in continues as we would expect.
11215                  */
11216                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz);
11217                 if (acked) {
11218                         rack->r_wanted_output = 1;
11219                         changed += acked;
11220                 }
11221                 if (num_sack_blks == 1) {
11222                         /*
11223                          * This is what we would expect a
11224                          * normal implementation to do
11225                          * after we have retransmitted during FR,
11226                          * i.e. the sack-filter pushes down
11227                          * to 1 block and the next sequence to be retransmitted
11228                          * is the one in the sack block (as more
11229                          * is acked). Count this as ACK'd data to boost
11230                          * up the chances of recovering from any false positives.
11231                          */
11232                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
11233                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
11234                         counter_u64_add(rack_express_sack, 1);
11235                         if (rack->r_ctl.ack_count > 0xfff00000) {
11236                                 /*
11237                                  * reduce the number to keep us under
11238                                  * a uint32_t.
11239                                  */
11240                                 rack->r_ctl.ack_count /= 2;
11241                                 rack->r_ctl.sack_count /= 2;
11242                         }
11243                         if (moved_two) {
11244                                 /*
11245                                  * If we did not get a SACK for at least a MSS and
11246                                  * had to move at all, or if we moved more than our
11247                                  * threshold, it counts against the "extra" move.
11248                                  */
11249                                 rack->r_ctl.sack_moved_extra += moved_two;
11250                                 rack->r_ctl.sack_noextra_move += no_extra;
11251                                 counter_u64_add(rack_move_some, 1);
11252                         } else {
11253                                 /*
11254                                  * else we did not have to move
11255                                  * any more than we would expect.
11256                                  */
11257                                 rack->r_ctl.sack_noextra_move += no_extra;
11258                                 rack->r_ctl.sack_noextra_move++;
11259                                 counter_u64_add(rack_move_none, 1);
11260                         }
11261                         if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
11262                             (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
11263                                 rack->r_ctl.sack_moved_extra /= 2;
11264                                 rack->r_ctl.sack_noextra_move /= 2;
11265                         }
11266                         goto out_with_totals;
11267                 } else {
11268                         /*
11269                          * Start the loop through the
11270                          * rest of blocks, past the first block.
11271                          */
11272                         loop_start = 1;
11273                 }
11274         }
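              /*
               * Process any remaining blocks. rc_sacklast appears to act as
               * a hint of where the previous pass through the sendmap left
               * off, so rack_proc_sack_blk() does not have to restart its
               * walk from the head of the map for every block.
               */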
11275         counter_u64_add(rack_sack_total, 1);
11276         rsm = rack->r_ctl.rc_sacklast;
11277         for (i = loop_start; i < num_sack_blks; i++) {
11278                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz);
11279                 if (acked) {
11280                         rack->r_wanted_output = 1;
11281                         changed += acked;
11282                 }
11283                 if (moved_two) {
11284                         /*
11285                          * If we did not get a SACK for at least a MSS and
11286                          * had to move at all, or if we moved more than our
11287                          * threshold, it counts against the "extra" move.
11288                          */
11289                         rack->r_ctl.sack_moved_extra += moved_two;
11290                         rack->r_ctl.sack_noextra_move += no_extra;
11291                         counter_u64_add(rack_move_some, 1);
11292                 } else {
11293                         /*
11294                          * else we did not have to move
11295                          * any more than we would expect.
11296                          */
11297                         rack->r_ctl.sack_noextra_move += no_extra;
11298                         rack->r_ctl.sack_noextra_move++;
11299                         counter_u64_add(rack_move_none, 1);
11300                 }
11301                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
11302                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
11303                         rack->r_ctl.sack_moved_extra /= 2;
11304                         rack->r_ctl.sack_noextra_move /= 2;
11305                 }
11306                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
11307                         /*
11308                          * If the SACK was not a full MSS then
11309                          * we add to sack_count the number of
11310                          * MSS's (or possibly more than
11311                          * an MSS if it's a TSO send) we had to skip by.
11312                          */
11313                         rack->r_ctl.sack_count += moved_two;
11314                         if (rack->r_ctl.sack_count > 0xfff00000) {
11315                                 rack->r_ctl.ack_count /= 2;
11316                                 rack->r_ctl.sack_count /= 2;
11317                         }
11318                         counter_u64_add(rack_sack_total, moved_two);
11319                 }
11320                 /*
11321                  * Now we need to set up for the next
11322                  * round. First we make sure we won't
11323                  * exceed the size of our uint32_t on
11324                  * the various counts, and then clear out
11325                  * moved_two.
11326                  */
11327                 moved_two = 0;
11328                 no_extra = 0;
11329         }
11330 out_with_totals:
11331         if (num_sack_blks > 1) {
11332                 /*
11333                  * You get an extra stroke if
11334                  * you have more than one sack-blk; this
11335                  * could be where we are skipping forward
11336                  * and the sack-filter is still working, or
11337                  * it could be an attacker constantly
11338                  * moving us.
11339                  */
11340                 rack->r_ctl.sack_moved_extra++;
11341                 counter_u64_add(rack_move_some, 1);
11342         }
11343 out:
11344 #ifdef TCP_SAD_DETECTION
11345         rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
11346 #endif
11347         if (changed) {
11348                 /* Something changed cancel the rack timer */
11349                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11350         }
11351         tsused = tcp_get_usecs(NULL);
11352         rsm = tcp_rack_output(tp, rack, tsused);
11353         if ((!IN_FASTRECOVERY(tp->t_flags)) &&
11354             rsm &&
11355             ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
11356                 /* Enter recovery */
11357                 entered_recovery = 1;
11358                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
11359                 /*
11360                  * When we enter recovery we need to ensure we send
11361                  * one packet.
11362                  */
11363                 if (rack->rack_no_prr == 0) {
11364                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
11365                         rack_log_to_prr(rack, 8, 0, __LINE__);
11366                 }
11367                 rack->r_timer_override = 1;
11368                 rack->r_early = 0;
11369                 rack->r_ctl.rc_agg_early = 0;
11370         } else if (IN_FASTRECOVERY(tp->t_flags) &&
11371                    rsm &&
11372                    (rack->r_rr_config == 3)) {
11373                 /*
11374                  * Ensure we can output and that we get no
11375                  * remembered pacing time except for the retransmit.
11376                  */
11377                 rack->r_timer_override = 1;
11378                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
11379                 rack->r_ctl.rc_resend = rsm;
11380         }
11381         if (IN_FASTRECOVERY(tp->t_flags) &&
11382             (rack->rack_no_prr == 0) &&
11383             (entered_recovery == 0)) {
11384                 rack_update_prr(tp, rack, changed, th_ack);
11385                 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
11386                      ((tcp_in_hpts(rack->rc_inp) == 0) &&
11387                       ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
11388                         /*
11389                          * If you are pacing output you don't want
11390                          * to override.
11391                          */
11392                         rack->r_early = 0;
11393                         rack->r_ctl.rc_agg_early = 0;
11394                         rack->r_timer_override = 1;
11395                 }
11396         }
11397 }
11398
11399 static void
11400 rack_strike_dupack(struct tcp_rack *rack)
11401 {
11402         struct rack_sendmap *rsm;
11403
11404         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
11405         while (rsm) {
11406                 /*
11407                  * We need to skip anything already set
11408                  * to be retransmitted.
11409                  */
11410                 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD)  ||
11411                     (rsm->r_flags & RACK_MUST_RXT)) {
11412                         rsm = TAILQ_NEXT(rsm, r_tnext);
11413                         continue;
11414                 }
11415                 break;
11416         }
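              /*
               * rsm now points at the first entry not already queued for
               * retransmit (or is NULL). r_dupack saturates at 0xff; once
               * it reaches DUP_ACK_THRESHOLD we ask tcp_rack_output()
               * whether a retransmission is actually due yet.
               */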
11417         if (rsm && (rsm->r_dupack < 0xff)) {
11418                 rsm->r_dupack++;
11419                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
11420                         struct timeval tv;
11421                         uint32_t cts;
11422                         /*
11423                          * Here we see if we need to retransmit. For
11424                          * a SACK type connection if enough time has passed
11425                          * we will get a return of the rsm. For a non-sack
11426                          * connection we will get the rsm returned if the
11427                          * dupack value is 3 or more.
11428                          */
11429                         cts = tcp_get_usecs(&tv);
11430                         rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
11431                         if (rack->r_ctl.rc_resend != NULL) {
11432                                 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
11433                                         rack_cong_signal(rack->rc_tp, CC_NDUPACK,
11434                                                          rack->rc_tp->snd_una, __LINE__);
11435                                 }
11436                                 rack->r_wanted_output = 1;
11437                                 rack->r_timer_override = 1;
11438                                 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
11439                         }
11440                 } else {
11441                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
11442                 }
11443         }
11444 }
11445
11446 static void
11447 rack_check_bottom_drag(struct tcpcb *tp,
11448                        struct tcp_rack *rack,
11449                        struct socket *so)
11450 {
11451         uint32_t segsiz, minseg;
11452
11453         segsiz = ctf_fixed_maxseg(tp);
11454         minseg = segsiz;
11455         if (tp->snd_max == tp->snd_una) {
11456                 /*
11457                  * We are doing dynamic pacing and we are way
11458                  * under. Basically everything got acked while
11459                  * we were still waiting on the pacer to expire.
11460                  *
11461                  * This means we need to boost the b/w in
11462                  * addition to any earlier boosting of
11463                  * the multiplier.
11464                  */
11465                 uint64_t lt_bw;
11466
11467                 lt_bw = rack_get_lt_bw(rack);
11468                 rack->rc_dragged_bottom = 1;
11469                 rack_validate_multipliers_at_or_above100(rack);
11470                 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
11471                     (lt_bw > 0)) {
11472                         /*
11473                          * Let's use the long-term b/w we have
11474                          * been getting as a base.
11475                          */
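                              /*
                               * For example (illustrative numbers): with no goodput
                               * measurement yet (rc_gp_filled == 0) an lt_bw of, say,
                               * 5Mbps is clamped to ONE_POINT_TWO_MEG before being
                               * installed as gp_bw; once a measurement exists we only
                               * adopt lt_bw when it exceeds the current gp_bw,
                               * otherwise we just bump the b/w multiplier.
                               */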
11476                         if (rack->rc_gp_filled == 0) {
11477                                 if (lt_bw > ONE_POINT_TWO_MEG) {
11478                                         /*
11479                                          * If we have no measurement
11480                                          * don't let us set in more than
11481                                          * 1.2Mbps. If we are still too
11482                                          * low after pacing with this we
11483                                          * will hopefully have a max b/w
11484                                          * available to sanity check things.
11485                                          */
11486                                         lt_bw = ONE_POINT_TWO_MEG;
11487                                 }
11488                                 rack->r_ctl.rc_rtt_diff = 0;
11489                                 rack->r_ctl.gp_bw = lt_bw;
11490                                 rack->rc_gp_filled = 1;
11491                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
11492                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
11493                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
11494                         } else if (lt_bw > rack->r_ctl.gp_bw) {
11495                                 rack->r_ctl.rc_rtt_diff = 0;
11496                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
11497                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
11498                                 rack->r_ctl.gp_bw = lt_bw;
11499                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
11500                         } else
11501                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
11502                         if ((rack->gp_ready == 0) &&
11503                             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
11504                                 /* We have enough measurements now */
11505                                 rack->gp_ready = 1;
11506                                 if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) ||
11507                                     rack->rack_hibeta)
11508                                         rack_set_cc_pacing(rack);
11509                                 if (rack->defer_options)
11510                                         rack_apply_deferred_options(rack);
11511                         }
11512                 } else {
11513                         /*
11514                          * Zero rtt possibly? Settle for just a plain old increase.
11515                          */
11516                         rack_increase_bw_mul(rack, -1, 0, 0, 1);
11517                 }
11518         } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
11519                    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
11520                                                minseg)) &&
11521                    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
11522                    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
11523                    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
11524                     (segsiz * rack_req_segs))) {
11525                 /*
11526                  * We are doing dynamic GP pacing and
11527                  * we have only 1 MSS or less of data
11528                  * left outstanding. We are still pacing away,
11529                  * and there is data that could be sent. This
11530                  * means we are inserting delayed-ack time into
11531                  * our measurements because we are pacing too slowly.
11532                  */
11533                 rack_validate_multipliers_at_or_above100(rack);
11534                 rack->rc_dragged_bottom = 1;
11535                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
11536         }
11537 }
11538
11539 #ifdef TCP_REQUEST_TRK
11540 static void
11541 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq,
11542                 struct http_sendfile_track *cur, uint8_t mod, int line, int err)
11543 {
11544         int do_log;
11545
11546         do_log = tcp_bblogging_on(rack->rc_tp);
11547         if (do_log == 0) {
11548                 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) == 0)
11549                         return;
11550                 /* We only allow the three below with point logging on */
11551                 if ((mod != HYBRID_LOG_RULES_APP) &&
11552                     (mod != HYBRID_LOG_RULES_SET) &&
11553                     (mod != HYBRID_LOG_REQ_COMP))
11554                         return;
11555                 
11556         }
11557         if (do_log) {
11558                 union tcp_log_stackspecific log;
11559                 struct timeval tv;
11560
11561                 /* Convert our ms to a microsecond */
11562                 memset(&log, 0, sizeof(log));
11563                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11564                 log.u_bbr.flex1 = seq;
11565                 log.u_bbr.cwnd_gain = line;
11566                 if (cur != NULL) {
11567                         uint64_t off;
11568
11569                         log.u_bbr.flex2 = cur->start_seq;
11570                         log.u_bbr.flex3 = cur->end_seq;
11571                         log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
11572                         log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff);
11573                         log.u_bbr.flex6 = cur->flags;
11574                         log.u_bbr.pkts_out = cur->hybrid_flags;
11575                         log.u_bbr.rttProp = cur->timestamp;
11576                         log.u_bbr.cur_del_rate = cur->cspr;
11577                         log.u_bbr.bw_inuse = cur->start;
11578                         log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff);
11579                         log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ;
11580                         log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
11581                         log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ;
11582                         log.u_bbr.bbr_state = 1;
11583                         off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]);
11584                         log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct http_sendfile_track));
11585                 } else {
11586                         log.u_bbr.flex2 = err;
11587                 }
11588                 /*
11589                  * Fill in flex7 to be CHD (catchup|hybrid|DGP)
11590                  */
11591                 log.u_bbr.flex7 = rack->rc_catch_up;
11592                 log.u_bbr.flex7 <<= 1;
11593                 log.u_bbr.flex7 |= rack->rc_hybrid_mode;
11594                 log.u_bbr.flex7 <<= 1;
11595                 log.u_bbr.flex7 |= rack->dgp_on;
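                      /*
                       * i.e. bit 2 = rc_catch_up, bit 1 = rc_hybrid_mode and
                       * bit 0 = dgp_on; a flex7 of 0x5 would mean catch-up and
                       * DGP are on with hybrid mode off.
                       */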
11596                 log.u_bbr.flex8 = mod;
11597                 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
11598                 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
11599                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
11600                 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start;
11601                 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error;
11602                 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop;
11603                 tcp_log_event(rack->rc_tp, NULL,
11604                     &rack->rc_inp->inp_socket->so_rcv,
11605                     &rack->rc_inp->inp_socket->so_snd,
11606                     TCP_HYBRID_PACING_LOG, 0,
11607                     0, &log, false, NULL, __func__, __LINE__, &tv);
11608         }
11609 }
11610 #endif
11611
11612 #ifdef TCP_REQUEST_TRK
11613 static void
11614 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len)
11615 {
11616         struct http_sendfile_track *rc_cur;
11617         struct tcpcb *tp;
11618         int err = 0;
11619
11620         rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, seq);
11621         if (rc_cur == NULL) {
11622                 /* If it's not at the beginning, what about the end piece? */
11623                 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
11624                 rc_cur = tcp_http_find_req_for_seq(rack->rc_tp, (seq + len - 1));
11625         } else {
11626                 err = 12345;
11627         }
11628         /* If we find no parameters we are in straight DGP mode */
11629         if (rc_cur == NULL) {
11630                 /* None found for this seq, just DGP for now */
11631                 rack->r_ctl.client_suggested_maxseg = 0;
11632                 rack->rc_catch_up = 0;
11633                 rack->r_ctl.bw_rate_cap = 0;
11634                 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
11635                 if (rack->r_ctl.rc_last_sft) {
11636                         rack->r_ctl.rc_last_sft = NULL;
11637                 }
11638                 return;
11639         }
11640         /*
11641          * Ok, if we have a new entry *or* have never
11642          * set up an entry we need to proceed. If
11643          * we have already set up this entry we
11644          * just continue along with what we already
11645          * set up.
11646          */
11647         tp = rack->rc_tp;
11648         if ((rack->r_ctl.rc_last_sft != NULL) &&
11649             (rack->r_ctl.rc_last_sft == rc_cur)) {
11650                 /* It's already in place */
11651                 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0);
11652                 return;
11653         }
11654         if (rack->rc_hybrid_mode == 0) {
11655                 rack->r_ctl.rc_last_sft = rc_cur;
11656                 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
11657                 return;
11658         }
11659         if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr) {
11660                 /* Compensate for all the header overheads */
11661                 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
11662         } else
11663                 rack->r_ctl.bw_rate_cap = 0;
11664         if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
11665                 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
11666         else
11667                 rack->r_ctl.client_suggested_maxseg = 0;
11668         if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
11669             (rc_cur->cspr > 0)) {
11670                 uint64_t len;
11671
11672                 rack->rc_catch_up = 1;
11673                 /*
11674                  * Calculate the deadline time, first set the
11675                  * time to when the request arrived.
11676                  */
11677                 rc_cur->deadline = rc_cur->localtime;
11678                 /*
11679                  * Next calculate the length and compensate for
11680                  * TLS if need be.
11681                  */
11682                 len = rc_cur->end - rc_cur->start;
11683                 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) {
11684                         /*
11685                          * This session is doing TLS. Take a swag
11686                          * at the overhead.
11687                          */
11688                         len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len);
11689                 }
11690                 /*
11691                  * Now, considering the size and the cspr, what is the time that
11692                  * would be required at the cspr rate? Here we use the raw
11693                  * cspr value since the client only looks at the raw data. We
11694                  * do use len which includes TLS overhead, but not the TCP/IP etc.
11695                  * That will get made up for in the CU pacing rate set.
11696                  */
11697                 len *= HPTS_USEC_IN_SEC;
11698                 len /= rc_cur->cspr;
11699                 rc_cur->deadline += len;
11700         } else {
11701                 rack->rc_catch_up = 0;
11702                 rc_cur->deadline = 0;
11703         }
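              /*
               * As a rough example (assuming cspr is in bytes per second,
               * which is what the arithmetic above implies): a 1,000,000
               * byte object with a cspr of 2,000,000 adds
               * 1000000 * HPTS_USEC_IN_SEC / 2000000 = 500,000 usec, i.e.
               * the deadline lands half a second after the request arrived.
               */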
11704         if (rack->r_ctl.client_suggested_maxseg != 0) {
11705                 /*
11706                  * We need to reset the max pace segs if we have a
11707                  * client_suggested_maxseg.
11708                  */
11709                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
11710         }
11711         rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
11712         /* Remember it for next time and for CU mode */
11713         rack->r_ctl.rc_last_sft = rc_cur;
11714 }
11715 #endif
11716
11717 static void
11718 rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
11719 {
11720 #ifdef TCP_REQUEST_TRK
11721         struct http_sendfile_track *ent;
11722
11723         ent = rack->r_ctl.rc_last_sft;
11724         if ((ent == NULL) ||
11725             (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) ||
11726             (SEQ_GEQ(seq, ent->end_seq))) {
11727                 /* Time to update the track. */
11728                 rack_set_dgp_hybrid_mode(rack, seq, len);
11729                 ent = rack->r_ctl.rc_last_sft;
11730         }
11731         /* Out of all */
11732         if (ent == NULL) {
11733                 return;
11734         }
11735         if (SEQ_LT(ent->end_seq, (seq + len))) {
11736                 /*
11737                  * This is the case where our end_seq guess
11738                  * was wrong. This is usually due to TLS having
11739                  * more bytes than our guess. It could also be the
11740                  * case that the client sent in two requests closely
11741                  * and the SB is full of both so we are sending part
11742                  * of each (end|beg). In such a case let's move this
11743                  * guy's end to match the end of this send. That
11744                  * way it will complete when all of it is acked.
11745                  */
11746                 ent->end_seq = (seq + len);
11747                 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent);
11748         }
11749         /* Now validate we have set the send time of this one */
11750         if ((ent->flags & TCP_HTTP_TRACK_FLG_FSND) == 0) {
11751                 ent->flags |= TCP_HTTP_TRACK_FLG_FSND;
11752                 ent->first_send = cts;
11753                 ent->sent_at_fs = rack->rc_tp->t_sndbytes;
11754                 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
11755         }
11756 #endif
11757 }
11758
11759 static void
11760 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
11761 {
11762         /*
11763          * The fast output path is enabled and we
11764          * have moved the cumack forward. Let's see if
11765          * we can expand forward the fast path length by
11766          * that amount. What we would ideally like to
11767          * do is increase the number of bytes in the
11768          * fast path block (left_to_send) by the
11769          * acked amount. However we have to gate that
11770          * by two factors:
11771          * 1) The amount outstanding and the rwnd of the peer
11772          *    (i.e. we don't want to exceed the rwnd of the peer).
11773          *    <and>
11774          * 2) The amount of data left in the socket buffer (i.e.
11775          *    we can't send beyond what is in the buffer).
11776          *
11777          * Note that this does not take into account any increase
11778          * in the cwnd. We will only extend the fast path by
11779          * what was acked.
11780          */
11781         uint32_t new_total, gating_val;
11782
11783         new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
11784         gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
11785                          (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
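              /*
               * For example (hypothetical numbers): with 100k available in
               * the send buffer, a 64k send window and 30k outstanding,
               * gating_val is min(70k, 34k) = 34k; a left_to_send of 20k
               * extended by a 10k cumack (new_total of 30k) still fits, so
               * fast output may grow by the acked amount.
               */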
11786         if (new_total <= gating_val) {
11787                 /* We can increase left_to_send by the acked amount */
11788                 counter_u64_add(rack_extended_rfo, 1);
11789                 rack->r_ctl.fsb.left_to_send = new_total;
11790                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
11791                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
11792                          rack, rack->r_ctl.fsb.left_to_send,
11793                          sbavail(&rack->rc_inp->inp_socket->so_snd),
11794                          (tp->snd_max - tp->snd_una)));
11795
11796         }
11797 }
11798
11799 static void
11800 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb)
11801 {
11802         /*
11803          * Here any sendmap entry that points to the
11804          * beginning mbuf must be adjusted to the correct
11805          * offset. This must be called with:
11806          * 1) The socket buffer locked
11807          * 2) snd_una adjusted to its new position.
11808          *
11809          * Note that (2) implies rack_ack_received has also
11810          * been called and all the sbcut's have been done.
11811          *
11812          * We grab the first mbuf in the socket buffer and
11813          * then go through the front of the sendmap, recalculating
11814          * the stored offset for any sendmap entry that has
11815          * that mbuf. We must use the sb functions to do this
11816          * since it's possible an add was done as well as
11817          * the subtraction we may have just completed. This should
11818          * not be a penalty though, since we just referenced the sb
11819          * to go in and trim off the mbufs that we freed (of course
11820          * there will be a penalty for the sendmap references though).
11821          *
11822          * Note also that with INVARIANTS on, we validate with a KASSERT
11823          * that the first sendmap entry has a soff of 0.
11824          *
11825          */
11826         struct mbuf *m;
11827         struct rack_sendmap *rsm;
11828         tcp_seq snd_una;
11829 #ifdef INVARIANTS
11830         int first_processed = 0;
11831 #endif
11832
11833         snd_una = rack->rc_tp->snd_una;
11834         SOCKBUF_LOCK_ASSERT(sb);
11835         m = sb->sb_mb;
11836         rsm = tqhash_min(rack->r_ctl.tqh);
11837         if ((rsm == NULL) || (m == NULL)) {
11838                 /* Nothing outstanding */
11839                 return;
11840         }
11841         /* The very first RSM's mbuf must point to the head mbuf in the sb */
11842         KASSERT((rsm->m == m),
11843                 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb",
11844                  rack, sb, rsm));
11845         while (rsm->m && (rsm->m == m)) {
11846                 /* one to adjust */
11847 #ifdef INVARIANTS
11848                 struct mbuf *tm;
11849                 uint32_t soff;
11850
11851                 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
11852                 if ((rsm->orig_m_len != m->m_len) ||
11853                     (rsm->orig_t_space != M_TRAILINGROOM(m))){
11854                         rack_adjust_orig_mlen(rsm);
11855                 }
11856                 if (first_processed == 0) {
11857                         KASSERT((rsm->soff == 0),
11858                                 ("Rack:%p rsm:%p -- rsm at head but soff not zero",
11859                                  rack, rsm));
11860                         first_processed = 1;
11861                 }
11862                 if ((rsm->soff != soff) || (rsm->m != tm)) {
11863                         /*
11864                          * This is not a fatal error; we anticipate it
11865                          * might happen (the else code), so we count it here
11866                          * so that under INVARIANTS we can see that it really
11867                          * does happen.
11868                          */
11869                         counter_u64_add(rack_adjust_map_bw, 1);
11870                 }
11871                 rsm->m = tm;
11872                 rsm->soff = soff;
11873                 if (tm) {
11874                         rsm->orig_m_len = rsm->m->m_len;
11875                         rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
11876                 } else {
11877                         rsm->orig_m_len = 0;
11878                         rsm->orig_t_space = 0;
11879                 }
11880 #else
11881                 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
11882                 if (rsm->m) {
11883                         rsm->orig_m_len = rsm->m->m_len;
11884                         rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
11885                 } else {
11886                         rsm->orig_m_len = 0;
11887                         rsm->orig_t_space = 0;
11888                 }
11889 #endif
11890                 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
11891                 if (rsm == NULL)
11892                         break;
11893         }
11894 }
11895
11896 #ifdef TCP_REQUEST_TRK
11897 static inline void
11898 rack_http_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack)
11899 {
11900         struct http_sendfile_track *ent;
11901         int i;
11902
11903         if ((rack->rc_hybrid_mode == 0) &&
11904             (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) {
11905                 /*
11906                  * Just do normal completions; hybrid pacing is not on
11907                  * and CLDL is off as well.
11908                  */
11909                 tcp_http_check_for_comp(rack->rc_tp, th_ack);
11910                 return;
11911         }
11912         /*
11913          * Originally I was just going to find the th_ack associated
11914          * with an entry. But then I realized a large stretch ack could
11915          * in theory ack two or more requests at once. So instead we
11916          * need to find all entries that are completed by th_ack, not
11917          * just a single entry, and do our logging.
11918          */
11919         ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
11920         while (ent != NULL) {
11921                 /*
11922                  * We may be doing hybrid pacing or CLDL and possibly need more details,
11923                  * so we do it manually instead of calling
11924                  * tcp_http_check_for_comp()
11925                  */
11926                 uint64_t laa, tim, data, cbw, ftim;
11927
11928                 /* Ok this ack frees it */
11929                 rack_log_hybrid(rack, th_ack,
11930                                 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0);
11931                 /* calculate the time based on the ack arrival */
11932                 data = ent->end - ent->start;
11933                 laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time);
11934                 if (ent->flags & TCP_HTTP_TRACK_FLG_FSND) {
11935                         if (ent->first_send > ent->localtime)
11936                                 ftim = ent->first_send;
11937                         else
11938                                 ftim = ent->localtime;
11939                 } else {
11940                         /* TSNH */
11941                         ftim = ent->localtime;
11942                 }
11943                 if (laa > ent->localtime)
11944                         tim = laa - ftim;
11945                 else
11946                         tim = 0;
11947                 cbw = data * HPTS_USEC_IN_SEC;
11948                 if (tim > 0)
11949                         cbw /= tim;
11950                 else
11951                         cbw = 0;
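                      /*
                       * e.g. (illustrative numbers): 500,000 bytes completed
                       * 250,000 usec after the first send gives cbw =
                       * 500000 * HPTS_USEC_IN_SEC / 250000 = 2,000,000
                       * bytes/sec (about 16 Mbps).
                       */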
11952                 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent);
11953                 /*
11954                  * Check to see if we are freeing what we are pointing to send-wise;
11955                  * if so, be sure to NULL the pointer so we know we are no longer
11956                  * set to anything.
11957                  */
11958                 if (ent == rack->r_ctl.rc_last_sft)
11959                         rack->r_ctl.rc_last_sft = NULL;
11960                 /* Generate the log that the tcp_netflix call would have */
11961                 tcp_http_log_req_info(rack->rc_tp, ent,
11962                                       i, TCP_HTTP_REQ_LOG_FREED, 0, 0);
11963                 /* Free it and see if there is another one */
11964                 tcp_http_free_a_slot(rack->rc_tp, ent);
11965                 ent = tcp_http_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
11966         }
11967 }
11968 #endif
11969
11970
11971 /*
11972  * Return value of 1: we do not need to call rack_process_data().
11973  * Return value of 0: rack_process_data() can be called.
11974  * For ret_val, if it's 0 the TCB is locked; if it's non-zero
11975  * it's unlocked and probably unsafe to touch the TCB.
11976  */
11977 static int
11978 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
11979     struct tcpcb *tp, struct tcpopt *to,
11980     uint32_t tiwin, int32_t tlen,
11981     int32_t * ofia, int32_t thflags, int32_t *ret_val)
11982 {
11983         int32_t ourfinisacked = 0;
11984         int32_t nsegs, acked_amount;
11985         int32_t acked;
11986         struct mbuf *mfree;
11987         struct tcp_rack *rack;
11988         int32_t under_pacing = 0;
11989         int32_t recovery = 0;
11990
11991         INP_WLOCK_ASSERT(tptoinpcb(tp));
11992
11993         rack = (struct tcp_rack *)tp->t_fb_ptr;
11994         if (SEQ_GT(th->th_ack, tp->snd_max)) {
11995                 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
11996                                       &rack->r_ctl.challenge_ack_ts,
11997                                       &rack->r_ctl.challenge_ack_cnt);
11998                 rack->r_wanted_output = 1;
11999                 return (1);
12000         }
12001         if (rack->gp_ready &&
12002             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12003                 under_pacing = 1;
12004         }
12005         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
12006                 int in_rec, dup_ack_struck = 0;
12007                 int dsack_seen = 0, sacks_seen = 0;
12008
12009                 in_rec = IN_FASTRECOVERY(tp->t_flags);
12010                 if (rack->rc_in_persist) {
12011                         tp->t_rxtshift = 0;
12012                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
12013                                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
12014                 }
12015
12016                 if ((th->th_ack == tp->snd_una) &&
12017                     (tiwin == tp->snd_wnd) &&
12018                     ((to->to_flags & TOF_SACK) == 0)) {
12019                         rack_strike_dupack(rack);
12020                         dup_ack_struck = 1;
12021                 }
12022                 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
12023                              dup_ack_struck, &dsack_seen, &sacks_seen);
12024                 if ((rack->sack_attack_disable > 0) &&
12025                     (th->th_ack == tp->snd_una) &&
12026                     (tiwin == tp->snd_wnd) &&
12027                     (dsack_seen == 0) &&
12028                     (sacks_seen > 0)) {
12029                         /*
12030                          * If sacks have been disabled we may
12031                          * want to strike a dup-ack "ignoring" the
12032                          * sack as long as the sack was not a "dsack". Note
12033                          * that if no sack is sent (TOF_SACK is off) then the
12034                          * normal dsack code above rack_log_ack() would have
12035                          * already struck. So this is just to catch the case
12036                          * where we are ignoring sacks from this guy due to
12037                          * it being a suspected attacker.
12038                          */
12039                         rack_strike_dupack(rack);
12040                 }
12041
12042         }
12043         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
12044                 /*
12045                  * Old ack, behind (or duplicate to) the last one rcv'd.
12046                  * Note: We mark reordering as occurring if it's
12047                  * less than and we have not closed our window.
12048                  */
12049                 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
12050                         rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
12051                         if (rack->r_ctl.rc_reorder_ts == 0)
12052                                 rack->r_ctl.rc_reorder_ts = 1;
12053                 }
12054                 return (0);
12055         }
12056         /*
12057          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
12058          * something we sent.
12059          */
12060         if (tp->t_flags & TF_NEEDSYN) {
12061                 /*
12062                  * T/TCP: Connection was half-synchronized, and our SYN has
12063                  * been ACK'd (so connection is now fully synchronized).  Go
12064                  * to non-starred state, increment snd_una for ACK of SYN,
12065                  * and check if we can do window scaling.
12066                  */
12067                 tp->t_flags &= ~TF_NEEDSYN;
12068                 tp->snd_una++;
12069                 /* Do window scaling? */
12070                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
12071                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
12072                         tp->rcv_scale = tp->request_r_scale;
12073                         /* Send window already scaled. */
12074                 }
12075         }
12076         nsegs = max(1, m->m_pkthdr.lro_nsegs);
12077
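              /*
               * BYTES_THIS_ACK() is simply (th->th_ack - tp->snd_una), i.e.
               * how far this ACK advances the cumulative ack point.
               */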
12078         acked = BYTES_THIS_ACK(tp, th);
12079         if (acked) {
12080                 /*
12081                  * Any time we move the cum-ack forward, clear the
12082                  * keep-alive tied probe-not-answered flag. The
12083                  * persist logic clears its own on entry.
12084                  */
12085                 rack->probe_not_answered = 0;
12086         }
12087         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
12088         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
12089         /*
12090          * If we just performed our first retransmit, and the ACK arrives
12091          * within our recovery window, then it was a mistake to do the
12092          * retransmit in the first place.  Recover our original cwnd and
12093          * ssthresh, and proceed to transmit where we left off.
12094          */
12095         if ((tp->t_flags & TF_PREVVALID) &&
12096             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
12097                 tp->t_flags &= ~TF_PREVVALID;
12098                 if (tp->t_rxtshift == 1 &&
12099                     (int)(ticks - tp->t_badrxtwin) < 0)
12100                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
12101         }
12102         if (acked) {
12103                 /* assure we are not backed off */
12104                 tp->t_rxtshift = 0;
12105                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
12106                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
12107                 rack->rc_tlp_in_progress = 0;
12108                 rack->r_ctl.rc_tlp_cnt_out = 0;
12109                 /*
12110                  * If it is the RXT timer we want to
12111                  * stop it, so we can restart a TLP.
12112                  */
12113                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
12114                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12115 #ifdef TCP_REQUEST_TRK
12116                 rack_http_check_for_comp(rack, th->th_ack);
12117 #endif
12118         }
12119         /*
12120          * If we have a timestamp reply, update smoothed round trip time. If
12121          * no timestamp is present but transmit timer is running and timed
12122          * sequence number was acked, update smoothed round trip time. Since
12123          * we now have an rtt measurement, cancel the timer backoff (cf.,
12124          * Phil Karn's retransmit alg.). Recompute the initial retransmit
12125          * timer.
12126          *
12127          * Some boxes send broken timestamp replies during the SYN+ACK
12128          * phase, ignore timestamps of 0 or we could calculate a huge RTT
12129          * and blow up the retransmit timer.
12130          */
12131         /*
12132          * If all outstanding data is acked, stop retransmit timer and
12133          * remember to restart (more output or persist). If there is more
12134          * data to be acked, restart retransmit timer, using current
12135          * (possibly backed-off) value.
12136          */
12137         if (acked == 0) {
12138                 if (ofia)
12139                         *ofia = ourfinisacked;
12140                 return (0);
12141         }
12142         if (IN_RECOVERY(tp->t_flags)) {
12143                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
12144                     (SEQ_LT(th->th_ack, tp->snd_max))) {
12145                         tcp_rack_partialack(tp);
12146                 } else {
12147                         rack_post_recovery(tp, th->th_ack);
12148                         recovery = 1;
12149                 }
12150         }
12151         /*
12152          * Let the congestion control algorithm update congestion control
12153          * related information. This typically means increasing the
12154          * congestion window.
12155          */
12156         rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
12157         SOCKBUF_LOCK(&so->so_snd);
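              /*
               * acked can exceed what is in the send buffer when our FIN
               * has been acked (the FIN consumes a sequence number but no
               * socket-buffer data); the (acked > acked_amount) test below
               * relies on that to detect ourfinisacked.
               */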
12158         acked_amount = min(acked, (int)sbavail(&so->so_snd));
12159         tp->snd_wnd -= acked_amount;
12160         mfree = sbcut_locked(&so->so_snd, acked_amount);
12161         if ((sbused(&so->so_snd) == 0) &&
12162             (acked > acked_amount) &&
12163             (tp->t_state >= TCPS_FIN_WAIT_1) &&
12164             (tp->t_flags & TF_SENTFIN)) {
12165                 /*
12166                  * We must be sure our fin
12167                  * was sent and acked (we can be
12168                  * in FIN_WAIT_1 without having
12169                  * sent the fin).
12170                  */
12171                 ourfinisacked = 1;
12172         }
12173         tp->snd_una = th->th_ack;
12174         /* wakeups? */
12175         if (acked_amount && sbavail(&so->so_snd))
12176                 rack_adjust_sendmap_head(rack, &so->so_snd);
12177         rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
12178         /* NB: sowwakeup_locked() does an implicit unlock. */
12179         sowwakeup_locked(so);
12180         /* now check the rxt clamps */
12181         if ((recovery == 1) &&
12182             (rack->excess_rxt_on) &&
12183             (rack->r_cwnd_was_clamped == 0))  {
12184                 do_rack_excess_rxt(tp, rack);
12185         } else if (rack->r_cwnd_was_clamped)
12186                 do_rack_check_for_unclamp(tp, rack);
12187         m_freem(mfree);
12188         if (SEQ_GT(tp->snd_una, tp->snd_recover))
12189                 tp->snd_recover = tp->snd_una;
12190
12191         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
12192                 tp->snd_nxt = tp->snd_una;
12193         }
12194         if (under_pacing &&
12195             (rack->use_fixed_rate == 0) &&
12196             (rack->in_probe_rtt == 0) &&
12197             rack->rc_gp_dyn_mul &&
12198             rack->rc_always_pace) {
12199                 /* Check if we are dragging bottom */
12200                 rack_check_bottom_drag(tp, rack, so);
12201         }
12202         if (tp->snd_una == tp->snd_max) {
12203                 /* Nothing left outstanding */
12204                 tp->t_flags &= ~TF_PREVVALID;
12205                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
12206                 rack->r_ctl.retran_during_recovery = 0;
12207                 rack->r_ctl.dsack_byte_cnt = 0;
12208                 if (rack->r_ctl.rc_went_idle_time == 0)
12209                         rack->r_ctl.rc_went_idle_time = 1;
12210                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
12211                 if (sbavail(&tptosocket(tp)->so_snd) == 0)
12212                         tp->t_acktime = 0;
12213                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12214                 rack->rc_suspicious = 0;
12215                 /* Set need output so persist might get set */
12216                 rack->r_wanted_output = 1;
12217                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
12218                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
12219                     (sbavail(&so->so_snd) == 0) &&
12220                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
12221                         /*
12222                          * The socket was gone and the
12223                          * peer sent data (now or in the past), time to
12224                          * reset him.
12225                          */
12226                         *ret_val = 1;
12227                         /* tcp_close will kill the inp pre-log the Reset */
12228                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
12229                         tp = tcp_close(tp);
12230                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
12231                         return (1);
12232                 }
12233         }
12234         if (ofia)
12235                 *ofia = ourfinisacked;
12236         return (0);
12237 }
12238
12239
12240 static void
12241 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
12242                   int dir, uint32_t flags, struct rack_sendmap *rsm)
12243 {
12244         if (tcp_bblogging_on(rack->rc_tp)) {
12245                 union tcp_log_stackspecific log;
12246                 struct timeval tv;
12247
12248                 memset(&log, 0, sizeof(log));
12249                 log.u_bbr.flex1 = cnt;
12250                 log.u_bbr.flex2 = split;
12251                 log.u_bbr.flex3 = out;
12252                 log.u_bbr.flex4 = line;
12253                 log.u_bbr.flex5 = rack->r_must_retran;
12254                 log.u_bbr.flex6 = flags;
12255                 log.u_bbr.flex7 = rack->rc_has_collapsed;
12256                 log.u_bbr.flex8 = dir;  /*
12257                                          * 1 is collapsed, 0 is uncollapsed,
12258                                          * 2 is log of a rsm being marked, 3 is a split.
12259                                          */
12260                 if (rsm == NULL)
12261                         log.u_bbr.rttProp = 0;
12262                 else
12263                         log.u_bbr.rttProp = (uint64_t)rsm;
12264                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
12265                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
12266                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
12267                     &rack->rc_inp->inp_socket->so_rcv,
12268                     &rack->rc_inp->inp_socket->so_snd,
12269                     TCP_RACK_LOG_COLLAPSE, 0,
12270                     0, &log, false, &tv);
12271         }
12272 }
12273
12274 static void
12275 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line)
12276 {
12277         /*
12278          * Here all we do is mark the collapsed point and set the flag.
12279          * This may happen again and again, but there is no
12280          * sense splitting our map until we know where the
12281          * peer finally lands in the collapse.
12282          */
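              /*
               * The collapse point is the new right edge the peer is
               * advertising (th_ack + snd_wnd). For example (hypothetical):
               * if snd_max is 100k but th_ack is 60k with a 10k window,
               * everything we sent beyond 70k now lies outside the peer's
               * window and is remembered via last_collapse_point and
               * high_collapse_point.
               */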
12283         tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
12284         if ((rack->rc_has_collapsed == 0) ||
12285             (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd)))
12286                 counter_u64_add(rack_collapsed_win_seen, 1);
12287         rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd;
12288         rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
12289         rack->rc_has_collapsed = 1;
12290         rack->r_collapse_point_valid = 1;
12291         rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
12292 }
12293
12294 static void
12295 rack_un_collapse_window(struct tcp_rack *rack, int line)
12296 {
12297         struct rack_sendmap *nrsm, *rsm;
12298         int cnt = 0, split = 0;
12299         int insret __diagused;
12300
12301
12302         tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
12303         rack->rc_has_collapsed = 0;
12304         rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
12305         if (rsm == NULL) {
12306                 /* Nothing to do maybe the peer ack'ed it all */
12307                 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
12308                 return;
12309         }
12310         /* Now do we need to split this one? */
12311         if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
12312                 rack_log_collapse(rack, rsm->r_start, rsm->r_end,
12313                                   rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
12314                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
12315                 if (nrsm == NULL) {
12316                         /* We can't get a rsm, mark all? */
12317                         nrsm = rsm;
12318                         goto no_split;
12319                 }
12320                 /* Clone it */
12321                 split = 1;
12322                 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
12323 #ifndef INVARIANTS
12324                 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
12325 #else
12326                 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
12327                         panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
12328                               nrsm, insret, rack, rsm);
12329                 }
12330 #endif
12331                 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
12332                                  rack->r_ctl.last_collapse_point, __LINE__);
12333                 if (rsm->r_in_tmap) {
12334                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
12335                         nrsm->r_in_tmap = 1;
12336                 }
12337                 /*
12338                  * Set in the new RSM as the
12339                  * collapsed starting point
12340                  */
12341                 rsm = nrsm;
12342         }
12343
12344 no_split:
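              /*
               * From the (possibly just split) rsm at the collapse point to
               * the end of the send map, flag every entry RACK_RWND_COLLAPSED
               * so later output and retransmit decisions know those bytes now
               * fall outside the peer's advertised window.
               */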
12345         TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm)  {
12347                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
12348                 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
12349                 cnt++;
12350         }
12351         if (cnt) {
12352                 counter_u64_add(rack_collapsed_win, 1);
12353         }
12354         rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
12355 }
12356
12357 static void
12358 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
12359                         int32_t tlen, int32_t tfo_syn)
12360 {
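              /*
               * Either arm a delayed ACK (cancelling any pending rack timer
               * first) or, when delaying is not appropriate, force an
               * immediate ACK and request an output pass.
               */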
12361         if (DELAY_ACK(tp, tlen) || tfo_syn) {
12362                 rack_timer_cancel(tp, rack,
12363                                   rack->r_ctl.rc_rcvtime, __LINE__);
12364                 tp->t_flags |= TF_DELACK;
12365         } else {
12366                 rack->r_wanted_output = 1;
12367                 tp->t_flags |= TF_ACKNOW;
12368         }
12369 }
12370
12371 static void
12372 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
12373 {
12374         /*
12375          * If fast output is in progress, lets validate that
12376          * the new window did not shrink on us and make it
12377          * so fast output should end.
12378          */
12379         if (rack->r_fast_output) {
12380                 uint32_t out;
12381
12382                 /*
12383                  * Calculate what we will send if left as is
12384                  * and compare that to our send window.
12385                  */
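                      /*
                       * For example (illustrative numbers): with out = 30000,
                       * left_to_send = 20000 and snd_wnd = 40000 we would
                       * overrun the window, so left_to_send is trimmed to
                       * 10000; were that remainder smaller than one MSS we
                       * would give up on fast output entirely.
                       */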
12386                 out = ctf_outstanding(tp);
12387                 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
12388                         /* ok we have an issue */
12389                         if (out >= tp->snd_wnd) {
12390                                 /* Turn off fast output, the window is met or collapsed */
12391                                 rack->r_fast_output = 0;
12392                         } else {
12393                                 /* we have some room left */
12394                                 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
12395                                 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
12396                                         /* If not at least 1 full segment never mind */
12397                                         rack->r_fast_output = 0;
12398                                 }
12399                         }
12400                 }
12401         }
12402 }
12403
12404
12405 /*
12406  * Return value of 1, the TCB is unlocked and most
12407  * likely gone, return value of 0, the TCP is still
12408  * locked.
12409  */
12410 static int
12411 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
12412     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
12413     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
12414 {
12415         /*
12416          * Update window information. Don't look at window if no ACK: TAC's
12417          * send garbage on first SYN.
12418          */
12419         int32_t nsegs;
12420         int32_t tfo_syn;
12421         struct tcp_rack *rack;
12422
12423         INP_WLOCK_ASSERT(tptoinpcb(tp));
12424
12425         rack = (struct tcp_rack *)tp->t_fb_ptr;
12426         nsegs = max(1, m->m_pkthdr.lro_nsegs);
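              /*
               * Accept the window update only if it is "new" in the RFC 793
               * sense: the segment sequence advances snd_wl1, or it matches
               * snd_wl1 and the ack advances snd_wl2, or both match and the
               * advertised window grew.
               */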
12427         if ((thflags & TH_ACK) &&
12428             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
12429             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
12430             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
12431                 /* keep track of pure window updates */
12432                 if (tlen == 0 &&
12433                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
12434                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
12435                 tp->snd_wnd = tiwin;
12436                 rack_validate_fo_sendwin_up(tp, rack);
12437                 tp->snd_wl1 = th->th_seq;
12438                 tp->snd_wl2 = th->th_ack;
12439                 if (tp->snd_wnd > tp->max_sndwnd)
12440                         tp->max_sndwnd = tp->snd_wnd;
12441                 rack->r_wanted_output = 1;
12442         } else if (thflags & TH_ACK) {
12443                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
12444                         tp->snd_wnd = tiwin;
12445                         rack_validate_fo_sendwin_up(tp, rack);
12446                         tp->snd_wl1 = th->th_seq;
12447                         tp->snd_wl2 = th->th_ack;
12448                 }
12449         }
12450         if (tp->snd_wnd < ctf_outstanding(tp))
12451                 /* The peer collapsed the window */
12452                 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12453         else if (rack->rc_has_collapsed)
12454                 rack_un_collapse_window(rack, __LINE__);
12455         if ((rack->r_collapse_point_valid) &&
12456             (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
12457                 rack->r_collapse_point_valid = 0;
12458         /* Was persist timer active and now we have window space? */
12459         if ((rack->rc_in_persist != 0) &&
12460             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12461                                 rack->r_ctl.rc_pace_min_segs))) {
12462                 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
12463                 tp->snd_nxt = tp->snd_max;
12464                 /* Make sure we output to start the timer */
12465                 rack->r_wanted_output = 1;
12466         }
12467         /* Do we enter persists? */
12468         if ((rack->rc_in_persist == 0) &&
12469             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12470             TCPS_HAVEESTABLISHED(tp->t_state) &&
12471             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12472             sbavail(&tptosocket(tp)->so_snd) &&
12473             (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12474                 /*
12475                  * Here the rwnd is less than
12476                  * the pacing size, we are established,
12477                  * nothing is outstanding (or the window has
12478                  * collapsed), and there is data to send. Enter persists.
12479                  */
12480                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
12481         }
12482         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
12483                 m_freem(m);
12484                 return (0);
12485         }
12486         /*
12487          * Don't process the URG bit; just ignore it and drag
12488          * the urgent pointer (rcv_up) along with rcv_nxt.
12489          */
12490         tp->rcv_up = tp->rcv_nxt;
12491
12492         /*
12493          * Process the segment text, merging it into the TCP sequencing
12494          * queue, and arranging for acknowledgment of receipt if necessary.
12495          * This process logically involves adjusting tp->rcv_wnd as data is
12496          * presented to the user (this happens in tcp_usrreq.c, case
12497          * PRU_RCVD).  If a FIN has already been received on this connection
12498          * then we just ignore the text.
12499          */
12500         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
12501                    IS_FASTOPEN(tp->t_flags));
12502         if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
12503             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12504                 tcp_seq save_start = th->th_seq;
12505                 tcp_seq save_rnxt  = tp->rcv_nxt;
12506                 int     save_tlen  = tlen;
12507
12508                 m_adj(m, drop_hdrlen);  /* delayed header drop */
12509                 /*
12510                  * Insert segment which includes th into TCP reassembly
12511                  * queue with control block tp.  Set thflags to whether
12512                  * reassembly now includes a segment with FIN.  This handles
12513                  * the common case inline (segment is the next to be
12514                  * received on an established connection, and the queue is
12515                  * empty), avoiding linkage into and removal from the queue
12516                  * and repetition of various conversions. Set DELACK for
12517                  * segments received in order, but ack immediately when
12518                  * segments are out of order (so fast retransmit can work).
12519                  */
12520                 if (th->th_seq == tp->rcv_nxt &&
12521                     SEGQ_EMPTY(tp) &&
12522                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
12523                     tfo_syn)) {
12524 #ifdef NETFLIX_SB_LIMITS
12525                         u_int mcnt, appended;
12526
12527                         if (so->so_rcv.sb_shlim) {
12528                                 mcnt = m_memcnt(m);
12529                                 appended = 0;
12530                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12531                                     CFO_NOSLEEP, NULL) == false) {
12532                                         counter_u64_add(tcp_sb_shlim_fails, 1);
12533                                         m_freem(m);
12534                                         return (0);
12535                                 }
12536                         }
12537 #endif
12538                         rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
12539                         tp->rcv_nxt += tlen;
12540                         if (tlen &&
12541                             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12542                             (tp->t_fbyte_in == 0)) {
12543                                 tp->t_fbyte_in = ticks;
12544                                 if (tp->t_fbyte_in == 0)
12545                                         tp->t_fbyte_in = 1;
12546                                 if (tp->t_fbyte_out && tp->t_fbyte_in)
12547                                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12548                         }
12549                         thflags = tcp_get_flags(th) & TH_FIN;
12550                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12551                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12552                         SOCKBUF_LOCK(&so->so_rcv);
12553                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12554                                 m_freem(m);
12555                         } else
12556 #ifdef NETFLIX_SB_LIMITS
12557                                 appended =
12558 #endif
12559                                         sbappendstream_locked(&so->so_rcv, m, 0);
12560
12561                         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
12562                         /* NB: sorwakeup_locked() does an implicit unlock. */
12563                         sorwakeup_locked(so);
12564 #ifdef NETFLIX_SB_LIMITS
12565                         if (so->so_rcv.sb_shlim && appended != mcnt)
12566                                 counter_fo_release(so->so_rcv.sb_shlim,
12567                                     mcnt - appended);
12568 #endif
12569                 } else {
12570                         /*
12571                          * XXX: Due to the header drop above, "th" is
12572                          * theoretically invalid by now.  Fortunately
12573                          * m_adj() doesn't actually free any mbufs when
12574                          * trimming from the head.
12575                          */
12576                         tcp_seq temp = save_start;
12577
12578                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
12579                         tp->t_flags |= TF_ACKNOW;
12580                         if (tp->t_flags & TF_WAKESOR) {
12581                                 tp->t_flags &= ~TF_WAKESOR;
12582                                 /* NB: sorwakeup_locked() does an implicit unlock. */
12583                                 sorwakeup_locked(so);
12584                         }
12585                 }
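                      /*
                       * With SACK enabled and data in the segment, update the
                       * SACK/D-SACK information we will report back, depending
                       * on how much of the segment was new versus duplicate.
                       */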
12586                 if ((tp->t_flags & TF_SACK_PERMIT) &&
12587                     (save_tlen > 0) &&
12588                     TCPS_HAVEESTABLISHED(tp->t_state)) {
12589                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
12590                                 /*
12591                                  * DSACK actually handled in the fastpath
12592                                  * above.
12593                                  */
12594                                 tcp_update_sack_list(tp, save_start,
12595                                     save_start + save_tlen);
12596                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
12597                                 if ((tp->rcv_numsacks >= 1) &&
12598                                     (tp->sackblks[0].end == save_start)) {
12599                                         /*
12600                                          * Partial overlap, recorded at todrop
12601                                          * above.
12602                                          */
12603                                         tcp_update_sack_list(tp,
12604                                             tp->sackblks[0].start,
12605                                             tp->sackblks[0].end);
12606                                 } else {
12607                                         tcp_update_dsack_list(tp, save_start,
12608                                             save_start + save_tlen);
12609                                 }
12610                         } else if (tlen >= save_tlen) {
12611                                 /* Update of sackblks. */
12612                                 tcp_update_dsack_list(tp, save_start,
12613                                     save_start + save_tlen);
12614                         } else if (tlen > 0) {
12615                                 tcp_update_dsack_list(tp, save_start,
12616                                     save_start + tlen);
12617                         }
12618                 }
12619         } else {
12620                 m_freem(m);
12621                 thflags &= ~TH_FIN;
12622         }
12623
12624         /*
12625          * If FIN is received ACK the FIN and let the user know that the
12626          * connection is closing.
12627          */
12628         if (thflags & TH_FIN) {
12629                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12630                         /* The socket upcall is handled by socantrcvmore. */
12631                         socantrcvmore(so);
12632                         /*
12633                          * If connection is half-synchronized (ie NEEDSYN
12634                          * flag on) then delay ACK, so it may be piggybacked
12635                          * when SYN is sent. Otherwise, since we received a
12636                          * FIN then no more input can be expected, send ACK
12637                          * now.
12638                          */
12639                         if (tp->t_flags & TF_NEEDSYN) {
12640                                 rack_timer_cancel(tp, rack,
12641                                     rack->r_ctl.rc_rcvtime, __LINE__);
12642                                 tp->t_flags |= TF_DELACK;
12643                         } else {
12644                                 tp->t_flags |= TF_ACKNOW;
12645                         }
12646                         tp->rcv_nxt++;
12647                 }
12648                 switch (tp->t_state) {
12649                         /*
12650                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
12651                          * CLOSE_WAIT state.
12652                          */
12653                 case TCPS_SYN_RECEIVED:
12654                         tp->t_starttime = ticks;
12655                         /* FALLTHROUGH */
12656                 case TCPS_ESTABLISHED:
12657                         rack_timer_cancel(tp, rack,
12658                             rack->r_ctl.rc_rcvtime, __LINE__);
12659                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
12660                         break;
12661
12662                         /*
12663                          * If still in FIN_WAIT_1 STATE FIN has not been
12664                          * acked so enter the CLOSING state.
12665                          */
12666                 case TCPS_FIN_WAIT_1:
12667                         rack_timer_cancel(tp, rack,
12668                             rack->r_ctl.rc_rcvtime, __LINE__);
12669                         tcp_state_change(tp, TCPS_CLOSING);
12670                         break;
12671
12672                         /*
12673                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
12674                          * starting the time-wait timer, turning off the
12675                          * other standard timers.
12676                          */
12677                 case TCPS_FIN_WAIT_2:
12678                         rack_timer_cancel(tp, rack,
12679                             rack->r_ctl.rc_rcvtime, __LINE__);
12680                         tcp_twstart(tp);
12681                         return (1);
12682                 }
12683         }
12684         /*
12685          * Return any desired output.
12686          */
12687         if ((tp->t_flags & TF_ACKNOW) ||
12688             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
12689                 rack->r_wanted_output = 1;
12690         }
12691         return (0);
12692 }
12693
12694 /*
12695  * Here nothing is really faster, it's just that we
12696  * have broken out the fast-data path also just like
12697  * the fast-ack.
12698  */
12699 static int
12700 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
12701     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12702     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
12703 {
12704         int32_t nsegs;
12705         int32_t newsize = 0;    /* automatic sockbuf scaling */
12706         struct tcp_rack *rack;
12707 #ifdef NETFLIX_SB_LIMITS
12708         u_int mcnt, appended;
12709 #endif
12710
12711         /*
12712          * If last ACK falls within this segment's sequence numbers, record
12713          * the timestamp. NOTE that the test is modified according to the
12714          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12715          */
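              /*
               * A quick series of gates keeps us on this fast path only for
               * the simple case: the next expected in-sequence data, nothing
               * being retransmitted, an unchanged window, no pending SYN/FIN
               * work, a timestamp that is not old, an ack that sits exactly
               * at snd_una, and room in the receive buffer.  Any failure
               * sends the segment to the slow path (return 0).
               */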
12716         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
12717                 return (0);
12718         }
12719         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
12720                 return (0);
12721         }
12722         if (tiwin && tiwin != tp->snd_wnd) {
12723                 return (0);
12724         }
12725         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
12726                 return (0);
12727         }
12728         if (__predict_false((to->to_flags & TOF_TS) &&
12729             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
12730                 return (0);
12731         }
12732         if (__predict_false((th->th_ack != tp->snd_una))) {
12733                 return (0);
12734         }
12735         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
12736                 return (0);
12737         }
12738         if ((to->to_flags & TOF_TS) != 0 &&
12739             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12740                 tp->ts_recent_age = tcp_ts_getticks();
12741                 tp->ts_recent = to->to_tsval;
12742         }
12743         rack = (struct tcp_rack *)tp->t_fb_ptr;
12744         /*
12745          * This is a pure, in-sequence data packet with nothing on the
12746          * reassembly queue and we have enough buffer space to take it.
12747          */
12748         nsegs = max(1, m->m_pkthdr.lro_nsegs);
12749
12750 #ifdef NETFLIX_SB_LIMITS
12751         if (so->so_rcv.sb_shlim) {
12752                 mcnt = m_memcnt(m);
12753                 appended = 0;
12754                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12755                     CFO_NOSLEEP, NULL) == false) {
12756                         counter_u64_add(tcp_sb_shlim_fails, 1);
12757                         m_freem(m);
12758                         return (1);
12759                 }
12760         }
12761 #endif
12762         /* Clean receiver SACK report if present */
12763         if (tp->rcv_numsacks)
12764                 tcp_clean_sackreport(tp);
12765         KMOD_TCPSTAT_INC(tcps_preddat);
12766         tp->rcv_nxt += tlen;
12767         if (tlen &&
12768             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12769             (tp->t_fbyte_in == 0)) {
12770                 tp->t_fbyte_in = ticks;
12771                 if (tp->t_fbyte_in == 0)
12772                         tp->t_fbyte_in = 1;
12773                 if (tp->t_fbyte_out && tp->t_fbyte_in)
12774                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12775         }
12776         /*
12777          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
12778          */
12779         tp->snd_wl1 = th->th_seq;
12780         /*
12781          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
12782          */
12783         tp->rcv_up = tp->rcv_nxt;
12784         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12785         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12786         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
12787
12788         /* Add data to socket buffer. */
12789         SOCKBUF_LOCK(&so->so_rcv);
12790         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12791                 m_freem(m);
12792         } else {
12793                 /*
12794                  * Set new socket buffer size. Give up when limit is
12795                  * reached.
12796                  */
12797                 if (newsize)
12798                         if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
12799                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
12800                 m_adj(m, drop_hdrlen);  /* delayed header drop */
12801 #ifdef NETFLIX_SB_LIMITS
12802                 appended =
12803 #endif
12804                         sbappendstream_locked(&so->so_rcv, m, 0);
12805                 ctf_calc_rwin(so, tp);
12806         }
12807         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
12808         /* NB: sorwakeup_locked() does an implicit unlock. */
12809         sorwakeup_locked(so);
12810 #ifdef NETFLIX_SB_LIMITS
12811         if (so->so_rcv.sb_shlim && mcnt != appended)
12812                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
12813 #endif
12814         rack_handle_delayed_ack(tp, rack, tlen, 0);
12815         if (tp->snd_una == tp->snd_max)
12816                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
12817         return (1);
12818 }
12819
12820 /*
12821  * This subfunction is used to try to highly optimize the
12822  * fast path. We again allow window updates that are
12823  * in sequence to remain in the fast-path. We also add
12824  * in the __predict's to attempt to help the compiler.
12825  * Note that if we return a 0, then we can *not* process
12826  * it and the caller should push the packet into the
12827  * slow-path.
12828  */
12829 static int
12830 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12831     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12832     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
12833 {
12834         int32_t acked;
12835         int32_t nsegs;
12836         int32_t under_pacing = 0;
12837         struct tcp_rack *rack;
12838
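              /*
               * Gate checks for the pure-ack fast path: the ack must advance
               * snd_una but not pass snd_max, we must not be retransmitting,
               * the window must be non-zero, no SYN/FIN work may be pending,
               * the timestamp must not be old, and we must not be in recovery
               * or holding SACKed data.  Otherwise return 0 (slow path).
               */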
12839         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
12840                 /* Old ack, behind (or duplicate of) the last one rcv'd */
12841                 return (0);
12842         }
12843         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
12844                 /* Above what we have sent? */
12845                 return (0);
12846         }
12847         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
12848                 /* We are retransmitting */
12849                 return (0);
12850         }
12851         if (__predict_false(tiwin == 0)) {
12852                 /* zero window */
12853                 return (0);
12854         }
12855         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
12856                 /* We need a SYN or a FIN, unlikely.. */
12857                 return (0);
12858         }
12859         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
12860                 /* Timestamp is behind .. old ack with seq wrap? */
12861                 return (0);
12862         }
12863         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
12864                 /* Still recovering */
12865                 return (0);
12866         }
12867         rack = (struct tcp_rack *)tp->t_fb_ptr;
12868         if (rack->r_ctl.rc_sacked) {
12869                 /* We have sack holes on our scoreboard */
12870                 return (0);
12871         }
12872         /* Ok if we reach here, we can process a fast-ack */
12873         if (rack->gp_ready &&
12874             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12875                 under_pacing = 1;
12876         }
12877         nsegs = max(1, m->m_pkthdr.lro_nsegs);
12878         rack_log_ack(tp, to, th, 0, 0, NULL, NULL);
12879         /* Did the window get updated? */
12880         if (tiwin != tp->snd_wnd) {
12881                 tp->snd_wnd = tiwin;
12882                 rack_validate_fo_sendwin_up(tp, rack);
12883                 tp->snd_wl1 = th->th_seq;
12884                 if (tp->snd_wnd > tp->max_sndwnd)
12885                         tp->max_sndwnd = tp->snd_wnd;
12886         }
12887         /* Do we exit persists? */
12888         if ((rack->rc_in_persist != 0) &&
12889             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12890                                rack->r_ctl.rc_pace_min_segs))) {
12891                 rack_exit_persist(tp, rack, cts);
12892         }
12893         /* Do we enter persists? */
12894         if ((rack->rc_in_persist == 0) &&
12895             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12896             TCPS_HAVEESTABLISHED(tp->t_state) &&
12897             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12898             sbavail(&tptosocket(tp)->so_snd) &&
12899             (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12900                 /*
12901                  * Here the rwnd is less than
12902                  * the pacing size, we are established,
12903                  * nothing is outstanding (or the window has
12904                  * collapsed), and there is data to send. Enter persists.
12905                  */
12906                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack);
12907         }
12908         /*
12909          * If last ACK falls within this segment's sequence numbers, record
12910          * the timestamp. NOTE that the test is modified according to the
12911          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12912          */
12913         if ((to->to_flags & TOF_TS) != 0 &&
12914             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12915                 tp->ts_recent_age = tcp_ts_getticks();
12916                 tp->ts_recent = to->to_tsval;
12917         }
12918         /*
12919          * This is a pure ack for outstanding data.
12920          */
12921         KMOD_TCPSTAT_INC(tcps_predack);
12922
12923         /*
12924          * "bad retransmit" recovery.
12925          */
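              /*
               * Without timestamps to prove it, fall back to the t_badrxtwin
               * heuristic: if this ack arrives soon enough after our first
               * RTO retransmission, the retransmit is considered spurious and
               * the congestion state is rolled back via CC_RTO_ERR.
               */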
12926         if ((tp->t_flags & TF_PREVVALID) &&
12927             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
12928                 tp->t_flags &= ~TF_PREVVALID;
12929                 if (tp->t_rxtshift == 1 &&
12930                     (int)(ticks - tp->t_badrxtwin) < 0)
12931                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
12932         }
12933         /*
12934          * Recalculate the transmit timer / rtt.
12935          *
12936          * Some boxes send broken timestamp replies during the SYN+ACK
12937          * phase, ignore timestamps of 0 or we could calculate a huge RTT
12938          * and blow up the retransmit timer.
12939          */
12940         acked = BYTES_THIS_ACK(tp, th);
12941
12942 #ifdef TCP_HHOOK
12943         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
12944         hhook_run_tcp_est_in(tp, th, to);
12945 #endif
12946         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
12947         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
12948         if (acked) {
12949                 struct mbuf *mfree;
12950
12951                 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
12952                 SOCKBUF_LOCK(&so->so_snd);
12953                 mfree = sbcut_locked(&so->so_snd, acked);
12954                 tp->snd_una = th->th_ack;
12955                 /* Note we want to hold the sb lock through the sendmap adjust */
12956                 rack_adjust_sendmap_head(rack, &so->so_snd);
12957                 /* Wake up the socket if we have room to write more */
12958                 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
12959                 sowwakeup_locked(so);
12960                 m_freem(mfree);
12961                 tp->t_rxtshift = 0;
12962                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
12963                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
12964                 rack->rc_tlp_in_progress = 0;
12965                 rack->r_ctl.rc_tlp_cnt_out = 0;
12966                 /*
12967                  * If it is the RXT timer we want to
12968                  * stop it, so we can restart a TLP.
12969                  */
12970                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
12971                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12972
12973 #ifdef TCP_REQUEST_TRK
12974                 rack_http_check_for_comp(rack, th->th_ack);
12975 #endif
12976         }
12977         /*
12978          * Let the congestion control algorithm update congestion control
12979          * related information. This typically means increasing the
12980          * congestion window.
12981          */
12982         if (tp->snd_wnd < ctf_outstanding(tp)) {
12983                 /* The peer collapsed the window */
12984                 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12985         } else if (rack->rc_has_collapsed)
12986                 rack_un_collapse_window(rack, __LINE__);
12987         if ((rack->r_collapse_point_valid) &&
12988             (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
12989                 rack->r_collapse_point_valid = 0;
12990         /*
12991          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
12992          */
12993         tp->snd_wl2 = th->th_ack;
12994         tp->t_dupacks = 0;
12995         m_freem(m);
12996         /* ND6_HINT(tp);         *//* Some progress has been made. */
12997
12998         /*
12999          * If all outstanding data are acked, stop retransmit timer,
13000          * otherwise restart timer using current (possibly backed-off)
13001          * value. If process is waiting for space, wakeup/selwakeup/signal.
13002          * If data are ready to send, let tcp_output decide between more
13003          * output or persist.
13004          */
13005         if (under_pacing &&
13006             (rack->use_fixed_rate == 0) &&
13007             (rack->in_probe_rtt == 0) &&
13008             rack->rc_gp_dyn_mul &&
13009             rack->rc_always_pace) {
13010                 /* Check if we are dragging bottom */
13011                 rack_check_bottom_drag(tp, rack, so);
13012         }
13013         if (tp->snd_una == tp->snd_max) {
13014                 tp->t_flags &= ~TF_PREVVALID;
13015                 rack->r_ctl.retran_during_recovery = 0;
13016                 rack->rc_suspicious = 0;
13017                 rack->r_ctl.dsack_byte_cnt = 0;
13018                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
13019                 if (rack->r_ctl.rc_went_idle_time == 0)
13020                         rack->r_ctl.rc_went_idle_time = 1;
13021                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
13022                 if (sbavail(&tptosocket(tp)->so_snd) == 0)
13023                         tp->t_acktime = 0;
13024                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13025         }
13026         if (acked && rack->r_fast_output)
13027                 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
13028         if (sbavail(&so->so_snd)) {
13029                 rack->r_wanted_output = 1;
13030         }
13031         return (1);
13032 }
13033
13034 /*
13035  * Return value of 1, the TCB is unlocked and most
13036  * likely gone, return value of 0, the TCP is still
13037  * locked.
13038  */
13039 static int
13040 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
13041     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13042     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13043 {
13044         int32_t ret_val = 0;
13045         int32_t todrop;
13046         int32_t ourfinisacked = 0;
13047         struct tcp_rack *rack;
13048
13049         INP_WLOCK_ASSERT(tptoinpcb(tp));
13050
13051         ctf_calc_rwin(so, tp);
13052         /*
13053          * If the state is SYN_SENT: if the segment contains an ACK, but not
13054          * for our SYN, drop the input. If it contains a RST, drop the
13055          * connection. If it does not contain a SYN, drop it. Otherwise this
13056          * is an acceptable SYN segment: initialize tp->rcv_nxt and tp->irs;
13057          * if the segment contains an ACK then advance tp->snd_una; if it
13058          * contains an ECE and ECN support is enabled, the stream is ECN
13059          * capable. If the SYN has been acked, change to ESTABLISHED, else to
13060          * the SYN_RCVD state; arrange for the segment to be acked
13061          * (eventually) and continue processing the rest of data/controls.
13062          */
13063         if ((thflags & TH_ACK) &&
13064             (SEQ_LEQ(th->th_ack, tp->iss) ||
13065             SEQ_GT(th->th_ack, tp->snd_max))) {
13066                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
13067                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13068                 return (1);
13069         }
13070         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
13071                 TCP_PROBE5(connect__refused, NULL, tp,
13072                     mtod(m, const char *), tp, th);
13073                 tp = tcp_drop(tp, ECONNREFUSED);
13074                 ctf_do_drop(m, tp);
13075                 return (1);
13076         }
13077         if (thflags & TH_RST) {
13078                 ctf_do_drop(m, tp);
13079                 return (1);
13080         }
13081         if (!(thflags & TH_SYN)) {
13082                 ctf_do_drop(m, tp);
13083                 return (1);
13084         }
13085         tp->irs = th->th_seq;
13086         tcp_rcvseqinit(tp);
13087         rack = (struct tcp_rack *)tp->t_fb_ptr;
13088         if (thflags & TH_ACK) {
13089                 int tfo_partial = 0;
13090
13091                 KMOD_TCPSTAT_INC(tcps_connects);
13092                 soisconnected(so);
13093 #ifdef MAC
13094                 mac_socketpeer_set_from_mbuf(m, so);
13095 #endif
13096                 /* Do window scaling on this connection? */
13097                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
13098                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
13099                         tp->rcv_scale = tp->request_r_scale;
13100                 }
13101                 tp->rcv_adv += min(tp->rcv_wnd,
13102                     TCP_MAXWIN << tp->rcv_scale);
13103                 /*
13104                  * If not all the data that was sent in the TFO SYN
13105                  * has been acked, resend the remainder right away.
13106                  */
13107                 if (IS_FASTOPEN(tp->t_flags) &&
13108                     (tp->snd_una != tp->snd_max)) {
13109                         tp->snd_nxt = th->th_ack;
13110                         tfo_partial = 1;
13111                 }
13112                 /*
13113                  * If there's data, delay the ACK; if there's also a FIN,
13114                  * ACKNOW will be turned on later.
13115                  */
13116                 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
13117                         rack_timer_cancel(tp, rack,
13118                                           rack->r_ctl.rc_rcvtime, __LINE__);
13119                         tp->t_flags |= TF_DELACK;
13120                 } else {
13121                         rack->r_wanted_output = 1;
13122                         tp->t_flags |= TF_ACKNOW;
13123                 }
13124
13125                 tcp_ecn_input_syn_sent(tp, thflags, iptos);
13126
13127                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
13128                         /*
13129                          * We advance snd_una for the
13130                          * fast open case. If th_ack is
13131                          * acknowledging data beyond
13132                          * snd_una we can't just call
13133                          * ack-processing since the
13134                          * data stream in our send-map
13135                          * will start at snd_una + 1 (one
13136                          * beyond the SYN). If it's just
13137                          * equal we don't need to do that
13138                          * and there is no send_map.
13139                          */
13140                         tp->snd_una++;
13141                 }
13142                 /*
13143                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
13144                  * SYN_SENT --> ESTABLISHED, SYN_SENT* --> FIN_WAIT_1
13145                  */
13146                 tp->t_starttime = ticks;
13147                 if (tp->t_flags & TF_NEEDFIN) {
13148                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
13149                         tp->t_flags &= ~TF_NEEDFIN;
13150                         thflags &= ~TH_SYN;
13151                 } else {
13152                         tcp_state_change(tp, TCPS_ESTABLISHED);
13153                         TCP_PROBE5(connect__established, NULL, tp,
13154                             mtod(m, const char *), tp, th);
13155                         rack_cc_conn_init(tp);
13156                 }
13157         } else {
13158                 /*
13159                  * Received initial SYN in SYN-SENT[*] state => simultaneous
13160                  * open.  If segment contains CC option and there is a
13161                  * cached CC, apply TAO test. If it succeeds, connection is
13162                  * half-synchronized. Otherwise, do 3-way handshake:
13163                  * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
13164                  * there was no CC option, clear cached CC value.
13165                  */
13166                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
13167                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
13168         }
13169         /*
13170          * Advance th->th_seq to correspond to first data byte. If data,
13171          * trim to stay within window, dropping FIN if necessary.
13172          */
13173         th->th_seq++;
13174         if (tlen > tp->rcv_wnd) {
13175                 todrop = tlen - tp->rcv_wnd;
13176                 m_adj(m, -todrop);
13177                 tlen = tp->rcv_wnd;
13178                 thflags &= ~TH_FIN;
13179                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
13180                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
13181         }
13182         tp->snd_wl1 = th->th_seq - 1;
13183         tp->rcv_up = th->th_seq;
13184         /*
13185          * Client side of transaction: already sent SYN and data. If the
13186          * remote host used T/TCP to validate the SYN, our data will be
13187          * ACK'd; if so, enter normal data segment processing in the middle
13188          * of step 5, ack processing. Otherwise, goto step 6.
13189          */
13190         if (thflags & TH_ACK) {
13191                 /* For syn-sent we need to possibly update the rtt */
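                      /*
                       * The echoed timestamp is in millisecond ticks; the
                       * difference is scaled by HPTS_USEC_IN_MSEC so the rack
                       * timer code can work in microseconds.
                       */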
13192                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
13193                         uint32_t t, mcts;
13194
13195                         mcts = tcp_ts_getticks();
13196                         t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
13197                         if (!tp->t_rttlow || tp->t_rttlow > t)
13198                                 tp->t_rttlow = t;
13199                         rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
13200                         tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
13201                         tcp_rack_xmit_timer_commit(rack, tp);
13202                 }
13203                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
13204                         return (ret_val);
13205                 /* We may have changed to FIN_WAIT_1 above */
13206                 if (tp->t_state == TCPS_FIN_WAIT_1) {
13207                         /*
13208                          * In FIN_WAIT_1 STATE in addition to the processing
13209                          * for the ESTABLISHED state if our FIN is now
13210                          * acknowledged then enter FIN_WAIT_2.
13211                          */
13212                         if (ourfinisacked) {
13213                                 /*
13214                                  * If we can't receive any more data, then
13215                                  * closing user can proceed. Starting the
13216                                  * timer is contrary to the specification,
13217                                  * but if we don't get a FIN we'll hang
13218                                  * forever.
13219                                  *
13220                                  * XXXjl: we should release the tp also, and
13221                                  * use a compressed state.
13222                                  */
13223                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13224                                         soisdisconnected(so);
13225                                         tcp_timer_activate(tp, TT_2MSL,
13226                                             (tcp_fast_finwait2_recycle ?
13227                                             tcp_finwait2_timeout :
13228                                             TP_MAXIDLE(tp)));
13229                                 }
13230                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13231                         }
13232                 }
13233         }
13234         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13235            tiwin, thflags, nxt_pkt));
13236 }
13237
13238 /*
13239  * Return value of 1, the TCB is unlocked and most
13240  * likely gone, return value of 0, the TCP is still
13241  * locked.
13242  */
13243 static int
13244 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
13245     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13246     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13247 {
13248         struct tcp_rack *rack;
13249         int32_t ret_val = 0;
13250         int32_t ourfinisacked = 0;
13251
13252         ctf_calc_rwin(so, tp);
13253         if ((thflags & TH_ACK) &&
13254             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
13255             SEQ_GT(th->th_ack, tp->snd_max))) {
13256                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
13257                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13258                 return (1);
13259         }
13260         rack = (struct tcp_rack *)tp->t_fb_ptr;
13261         if (IS_FASTOPEN(tp->t_flags)) {
13262                 /*
13263                  * When a TFO connection is in SYN_RECEIVED, the
13264                  * only valid packets are the initial SYN, a
13265                  * retransmit/copy of the initial SYN (possibly with
13266                  * a subset of the original data), a valid ACK, a
13267                  * FIN, or a RST.
13268                  */
13269                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
13270                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
13271                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13272                         return (1);
13273                 } else if (thflags & TH_SYN) {
13274                         /* non-initial SYN is ignored */
13275                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
13276                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
13277                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
13278                                 ctf_do_drop(m, NULL);
13279                                 return (0);
13280                         }
13281                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
13282                         ctf_do_drop(m, NULL);
13283                         return (0);
13284                 }
13285         }
13286
13287         if ((thflags & TH_RST) ||
13288             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13289                 return (__ctf_process_rst(m, th, so, tp,
13290                                           &rack->r_ctl.challenge_ack_ts,
13291                                           &rack->r_ctl.challenge_ack_cnt));
13292         /*
13293          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13294          * it's less than ts_recent, drop it.
13295          */
13296         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13297             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13298                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13299                         return (ret_val);
13300         }
13301         /*
13302          * In the SYN-RECEIVED state, validate that the packet belongs to
13303          * this connection before trimming the data to fit the receive
13304          * window.  Check the sequence number versus IRS since we know the
13305          * sequence numbers haven't wrapped.  This is a partial fix for the
13306          * "LAND" DoS attack.
13307          */
13308         if (SEQ_LT(th->th_seq, tp->irs)) {
13309                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
13310                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13311                 return (1);
13312         }
13313         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13314                               &rack->r_ctl.challenge_ack_ts,
13315                               &rack->r_ctl.challenge_ack_cnt)) {
13316                 return (ret_val);
13317         }
13318         /*
13319          * If last ACK falls within this segment's sequence numbers, record
13320          * its timestamp. NOTE: 1) That the test incorporates suggestions
13321          * from the latest proposal of the tcplw@cray.com list (Braden
13322          * 1993/04/26). 2) That updating only on newer timestamps interferes
13323          * with our earlier PAWS tests, so this check should be solely
13324          * predicated on the sequence space of this segment. 3) That we
13325          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13326          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13327          * SEG.Len, This modified check allows us to overcome RFC1323's
13328          * SEG.Len.  This modified check allows us to overcome RFC1323's
13329          * p.869. In such cases, we can still calculate the RTT correctly
13330          * when RCV.NXT == Last.ACK.Sent.
13331          */
13332         if ((to->to_flags & TOF_TS) != 0 &&
13333             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13334             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13335             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13336                 tp->ts_recent_age = tcp_ts_getticks();
13337                 tp->ts_recent = to->to_tsval;
13338         }
13339         tp->snd_wnd = tiwin;
13340         rack_validate_fo_sendwin_up(tp, rack);
13341         /*
13342          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
13343          * is on (half-synchronized state), then queue data for later
13344          * processing; else drop segment and return.
13345          */
13346         if ((thflags & TH_ACK) == 0) {
13347                 if (IS_FASTOPEN(tp->t_flags)) {
13348                         rack_cc_conn_init(tp);
13349                 }
13350                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13351                     tiwin, thflags, nxt_pkt));
13352         }
13353         KMOD_TCPSTAT_INC(tcps_connects);
13354         if (tp->t_flags & TF_SONOTCONN) {
13355                 tp->t_flags &= ~TF_SONOTCONN;
13356                 soisconnected(so);
13357         }
13358         /* Do window scaling? */
13359         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
13360             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
13361                 tp->rcv_scale = tp->request_r_scale;
13362         }
13363         /*
13364          * Make transitions: SYN-RECEIVED -> ESTABLISHED, SYN-RECEIVED* ->
13365          * FIN-WAIT-1
13366          */
13367         tp->t_starttime = ticks;
13368         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
13369                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
13370                 tp->t_tfo_pending = NULL;
13371         }
13372         if (tp->t_flags & TF_NEEDFIN) {
13373                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
13374                 tp->t_flags &= ~TF_NEEDFIN;
13375         } else {
13376                 tcp_state_change(tp, TCPS_ESTABLISHED);
13377                 TCP_PROBE5(accept__established, NULL, tp,
13378                     mtod(m, const char *), tp, th);
13379                 /*
13380                  * TFO connections call cc_conn_init() during SYN
13381                  * processing.  Calling it again here for such connections
13382                  * is not harmless as it would undo the snd_cwnd reduction
13383                  * that occurs when a TFO SYN|ACK is retransmitted.
13384                  */
13385                 if (!IS_FASTOPEN(tp->t_flags))
13386                         rack_cc_conn_init(tp);
13387         }
13388         /*
13389          * Account for the ACK of our SYN prior to
13390          * regular ACK processing below, except for
13391          * simultaneous SYN, which is handled later.
13392          */
13393         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
13394                 tp->snd_una++;
13395         /*
13396          * If segment contains data or ACK, will call tcp_reass() later; if
13397          * not, do so now to pass queued data to user.
13398          */
13399         if (tlen == 0 && (thflags & TH_FIN) == 0) {
13400                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
13401                     (struct mbuf *)0);
13402                 if (tp->t_flags & TF_WAKESOR) {
13403                         tp->t_flags &= ~TF_WAKESOR;
13404                         /* NB: sorwakeup_locked() does an implicit unlock. */
13405                         sorwakeup_locked(so);
13406                 }
13407         }
13408         tp->snd_wl1 = th->th_seq - 1;
13409         /* For syn-recv we need to possibly update the rtt */
13410         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
13411                 uint32_t t, mcts;
13412
13413                 mcts = tcp_ts_getticks();
13414                 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
13415                 if (!tp->t_rttlow || tp->t_rttlow > t)
13416                         tp->t_rttlow = t;
13417                 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
13418                 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
13419                 tcp_rack_xmit_timer_commit(rack, tp);
13420         }
13421         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
13422                 return (ret_val);
13423         }
13424         if (tp->t_state == TCPS_FIN_WAIT_1) {
13425                 /* We could have gone to FIN_WAIT_1 (or EST) above */
13426                 /*
13427                  * In FIN_WAIT_1 STATE in addition to the processing for the
13428                  * ESTABLISHED state if our FIN is now acknowledged then
13429                  * enter FIN_WAIT_2.
13430                  */
13431                 if (ourfinisacked) {
13432                         /*
13433                          * If we can't receive any more data, then closing
13434                          * user can proceed. Starting the timer is contrary
13435                          * to the specification, but if we don't get a FIN
13436                          * we'll hang forever.
13437                          *
13438                          * XXXjl: we should release the tp also, and use a
13439                          * compressed state.
13440                          */
13441                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13442                                 soisdisconnected(so);
13443                                 tcp_timer_activate(tp, TT_2MSL,
13444                                     (tcp_fast_finwait2_recycle ?
13445                                     tcp_finwait2_timeout :
13446                                     TP_MAXIDLE(tp)));
13447                         }
13448                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
13449                 }
13450         }
13451         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13452             tiwin, thflags, nxt_pkt));
13453 }
13454
13455 /*
13456  * Return value of 1, the TCB is unlocked and most
13457  * likely gone, return value of 0, the TCP is still
13458  * locked.
13459  */
13460 static int
13461 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
13462     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13463     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13464 {
13465         int32_t ret_val = 0;
13466         struct tcp_rack *rack;
13467
13468         /*
13469          * Header prediction: check for the two common cases of a
13470          * uni-directional data xfer.  If the packet has no control flags,
13471          * is in-sequence, the window didn't change and we're not
13472          * retransmitting, it's a candidate.  If the length is zero and the
13473          * ack moved forward, we're the sender side of the xfer.  Just free
13474          * the data acked & wake any higher level process that was blocked
13475          * waiting for space.  If the length is non-zero and the ack didn't
13476          * move, we're the receiver side.  If we're getting packets in-order
13477          * (the reassembly queue is empty), add the data to the socket
13478          * buffer and note that we need a delayed ack. Make sure that the
13479          * hidden state-flags are also off. Since we check for
13480          * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
13481          */
13482         rack = (struct tcp_rack *)tp->t_fb_ptr;
13483         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
13484             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
13485             __predict_true(SEGQ_EMPTY(tp)) &&
13486             __predict_true(th->th_seq == tp->rcv_nxt)) {
13487                 if (tlen == 0) {
13488                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
13489                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
13490                                 return (0);
13491                         }
13492                 } else {
13493                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
13494                             tiwin, nxt_pkt, iptos)) {
13495                                 return (0);
13496                         }
13497                 }
13498         }
13499         ctf_calc_rwin(so, tp);
13500
13501         if ((thflags & TH_RST) ||
13502             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13503                 return (__ctf_process_rst(m, th, so, tp,
13504                                           &rack->r_ctl.challenge_ack_ts,
13505                                           &rack->r_ctl.challenge_ack_cnt));
13506
13507         /*
13508          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13509          * synchronized state.
13510          */
13511         if (thflags & TH_SYN) {
13512                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13513                 return (ret_val);
13514         }
13515         /*
13516          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13517          * it's less than ts_recent, drop it.
13518          */
13519         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13520             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13521                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13522                         return (ret_val);
13523         }
13524         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13525                               &rack->r_ctl.challenge_ack_ts,
13526                               &rack->r_ctl.challenge_ack_cnt)) {
13527                 return (ret_val);
13528         }
13529         /*
13530          * If last ACK falls within this segment's sequence numbers, record
13531          * its timestamp. NOTE: 1) That the test incorporates suggestions
13532          * from the latest proposal of the tcplw@cray.com list (Braden
13533          * 1993/04/26). 2) That updating only on newer timestamps interferes
13534          * with our earlier PAWS tests, so this check should be solely
13535          * predicated on the sequence space of this segment. 3) That we
13536          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13537          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13538          * SEG.Len. This modified check allows us to overcome RFC1323's
13539          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13540          * p.869. In such cases, we can still calculate the RTT correctly
13541          * when RCV.NXT == Last.ACK.Sent.
13542          */
13543         if ((to->to_flags & TOF_TS) != 0 &&
13544             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13545             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13546             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13547                 tp->ts_recent_age = tcp_ts_getticks();
13548                 tp->ts_recent = to->to_tsval;
13549         }
13550         /*
13551          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
13552          * is on (half-synchronized state), then queue data for later
13553          * processing; else drop segment and return.
13554          */
13555         if ((thflags & TH_ACK) == 0) {
13556                 if (tp->t_flags & TF_NEEDSYN) {
13557                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13558                             tiwin, thflags, nxt_pkt));
13559
13560                 } else if (tp->t_flags & TF_ACKNOW) {
13561                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13562                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13563                         return (ret_val);
13564                 } else {
13565                         ctf_do_drop(m, NULL);
13566                         return (0);
13567                 }
13568         }
13569         /*
13570          * Ack processing.
13571          */
13572         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
13573                 return (ret_val);
13574         }
13575         if (sbavail(&so->so_snd)) {
13576                 if (ctf_progress_timeout_check(tp, true)) {
13577                         rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
13578                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13579                         return (1);
13580                 }
13581         }
13582         /* State changes only happen in rack_process_data() */
13583         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13584             tiwin, thflags, nxt_pkt));
13585 }
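/*
 * A minimal caller sketch (hypothetical; not taken from this file) of the
 * locking convention noted above each rack_do_* handler: a return of 1
 * means the handler has already unlocked, and most likely freed, the TCB,
 * so the caller may only touch tp again when the handler returned 0.
 *
 *	retval = rack_do_established(m, th, so, tp, &to, drop_hdrlen,
 *	    tlen, tiwin, thflags, nxt_pkt, iptos);
 *	if (retval == 1)
 *		return;		// TCB is unlocked and most likely gone
 *	// retval == 0: tp is still locked, safe to keep using
 */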
13586
13587 /*
13588  * Return value of 1, the TCB is unlocked and most
13589  * likely gone, return value of 0, the TCP is still
13590  * locked.
13591  */
13592 static int
13593 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
13594     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13595     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13596 {
13597         int32_t ret_val = 0;
13598         struct tcp_rack *rack;
13599
13600         rack = (struct tcp_rack *)tp->t_fb_ptr;
13601         ctf_calc_rwin(so, tp);
13602         if ((thflags & TH_RST) ||
13603             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13604                 return (__ctf_process_rst(m, th, so, tp,
13605                                           &rack->r_ctl.challenge_ack_ts,
13606                                           &rack->r_ctl.challenge_ack_cnt));
13607         /*
13608          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13609          * synchronized state.
13610          */
13611         if (thflags & TH_SYN) {
13612                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13613                 return (ret_val);
13614         }
13615         /*
13616          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13617          * it's less than ts_recent, drop it.
13618          */
13619         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13620             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13621                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13622                         return (ret_val);
13623         }
13624         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13625                               &rack->r_ctl.challenge_ack_ts,
13626                               &rack->r_ctl.challenge_ack_cnt)) {
13627                 return (ret_val);
13628         }
13629         /*
13630          * If last ACK falls within this segment's sequence numbers, record
13631          * its timestamp. NOTE: 1) That the test incorporates suggestions
13632          * from the latest proposal of the tcplw@cray.com list (Braden
13633          * 1993/04/26). 2) That updating only on newer timestamps interferes
13634          * with our earlier PAWS tests, so this check should be solely
13635          * predicated on the sequence space of this segment. 3) That we
13636          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13637          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13638          * SEG.Len. This modified check allows us to overcome RFC1323's
13639          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13640          * p.869. In such cases, we can still calculate the RTT correctly
13641          * when RCV.NXT == Last.ACK.Sent.
13642          */
13643         if ((to->to_flags & TOF_TS) != 0 &&
13644             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13645             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13646             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13647                 tp->ts_recent_age = tcp_ts_getticks();
13648                 tp->ts_recent = to->to_tsval;
13649         }
13650         /*
13651          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
13652          * is on (half-synchronized state), then queue data for later
13653          * processing; else drop segment and return.
13654          */
13655         if ((thflags & TH_ACK) == 0) {
13656                 if (tp->t_flags & TF_NEEDSYN) {
13657                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13658                             tiwin, thflags, nxt_pkt));
13659
13660                 } else if (tp->t_flags & TF_ACKNOW) {
13661                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13662                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13663                         return (ret_val);
13664                 } else {
13665                         ctf_do_drop(m, NULL);
13666                         return (0);
13667                 }
13668         }
13669         /*
13670          * Ack processing.
13671          */
13672         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
13673                 return (ret_val);
13674         }
13675         if (sbavail(&so->so_snd)) {
13676                 if (ctf_progress_timeout_check(tp, true)) {
13677                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13678                                                 tp, tick, PROGRESS_DROP, __LINE__);
13679                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13680                         return (1);
13681                 }
13682         }
13683         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13684             tiwin, thflags, nxt_pkt));
13685 }
13686
13687 static int
13688 rack_check_data_after_close(struct mbuf *m,
13689     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
13690 {
13691         struct tcp_rack *rack;
13692
13693         rack = (struct tcp_rack *)tp->t_fb_ptr;
13694         if (rack->rc_allow_data_af_clo == 0) {
13695         close_now:
13696                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13697                 /* tcp_close will kill the inp, so pre-log the Reset */
13698                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13699                 tp = tcp_close(tp);
13700                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
13701                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
13702                 return (1);
13703         }
13704         if (sbavail(&so->so_snd) == 0)
13705                 goto close_now;
13706         /* Ok we allow data that is ignored and a followup reset */
13707         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13708         tp->rcv_nxt = th->th_seq + *tlen;
13709         tp->t_flags2 |= TF2_DROP_AF_DATA;
13710         rack->r_wanted_output = 1;
13711         *tlen = 0;
13712         return (0);
13713 }
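/*
 * Summary for the callers below: if data after close is not allowed, or
 * nothing remains in the send buffer, rack_check_data_after_close() closes
 * the connection, drops the segment with a reset and returns 1 (TCB gone).
 * Otherwise the arriving data is swallowed: rcv_nxt is advanced past it,
 * TF2_DROP_AF_DATA is set, *tlen is zeroed and 0 is returned so normal
 * processing continues.
 */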
13714
13715 /*
13716  * Return value of 1, the TCB is unlocked and most
13717  * likely gone, return value of 0, the TCP is still
13718  * locked.
13719  */
13720 static int
13721 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
13722     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13723     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13724 {
13725         int32_t ret_val = 0;
13726         int32_t ourfinisacked = 0;
13727         struct tcp_rack *rack;
13728
13729         rack = (struct tcp_rack *)tp->t_fb_ptr;
13730         ctf_calc_rwin(so, tp);
13731
13732         if ((thflags & TH_RST) ||
13733             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13734                 return (__ctf_process_rst(m, th, so, tp,
13735                                           &rack->r_ctl.challenge_ack_ts,
13736                                           &rack->r_ctl.challenge_ack_cnt));
13737         /*
13738          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13739          * synchronized state.
13740          */
13741         if (thflags & TH_SYN) {
13742                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13743                 return (ret_val);
13744         }
13745         /*
13746          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13747          * it's less than ts_recent, drop it.
13748          */
13749         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13750             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13751                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13752                         return (ret_val);
13753         }
13754         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13755                               &rack->r_ctl.challenge_ack_ts,
13756                               &rack->r_ctl.challenge_ack_cnt)) {
13757                 return (ret_val);
13758         }
13759         /*
13760          * If new data are received on a connection after the user processes
13761          * are gone, then RST the other end.
13762          */
13763         if ((tp->t_flags & TF_CLOSED) && tlen &&
13764             rack_check_data_after_close(m, tp, &tlen, th, so))
13765                 return (1);
13766         /*
13767          * If last ACK falls within this segment's sequence numbers, record
13768          * its timestamp. NOTE: 1) That the test incorporates suggestions
13769          * from the latest proposal of the tcplw@cray.com list (Braden
13770          * 1993/04/26). 2) That updating only on newer timestamps interferes
13771          * with our earlier PAWS tests, so this check should be solely
13772          * predicated on the sequence space of this segment. 3) That we
13773          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13774          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13775          * SEG.Len. This modified check allows us to overcome RFC1323's
13776          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13777          * p.869. In such cases, we can still calculate the RTT correctly
13778          * when RCV.NXT == Last.ACK.Sent.
13779          */
13780         if ((to->to_flags & TOF_TS) != 0 &&
13781             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13782             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13783             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13784                 tp->ts_recent_age = tcp_ts_getticks();
13785                 tp->ts_recent = to->to_tsval;
13786         }
13787         /*
13788          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
13789          * is on (half-synchronized state), then queue data for later
13790          * processing; else drop segment and return.
13791          */
13792         if ((thflags & TH_ACK) == 0) {
13793                 if (tp->t_flags & TF_NEEDSYN) {
13794                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13795                             tiwin, thflags, nxt_pkt));
13796                 } else if (tp->t_flags & TF_ACKNOW) {
13797                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13798                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13799                         return (ret_val);
13800                 } else {
13801                         ctf_do_drop(m, NULL);
13802                         return (0);
13803                 }
13804         }
13805         /*
13806          * Ack processing.
13807          */
13808         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
13809                 return (ret_val);
13810         }
13811         if (ourfinisacked) {
13812                 /*
13813                  * If we can't receive any more data, then closing user can
13814                  * proceed. Starting the timer is contrary to the
13815                  * specification, but if we don't get a FIN we'll hang
13816                  * forever.
13817                  *
13818                  * XXXjl: we should release the tp also, and use a
13819                  * compressed state.
13820                  */
13821                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13822                         soisdisconnected(so);
13823                         tcp_timer_activate(tp, TT_2MSL,
13824                             (tcp_fast_finwait2_recycle ?
13825                             tcp_finwait2_timeout :
13826                             TP_MAXIDLE(tp)));
13827                 }
13828                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13829         }
13830         if (sbavail(&so->so_snd)) {
13831                 if (ctf_progress_timeout_check(tp, true)) {
13832                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13833                                                 tp, tick, PROGRESS_DROP, __LINE__);
13834                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13835                         return (1);
13836                 }
13837         }
13838         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13839             tiwin, thflags, nxt_pkt));
13840 }
13841
13842 /*
13843  * Return value of 1, the TCB is unlocked and most
13844  * likely gone, return value of 0, the TCP is still
13845  * locked.
13846  */
13847 static int
13848 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
13849     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13850     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13851 {
13852         int32_t ret_val = 0;
13853         int32_t ourfinisacked = 0;
13854         struct tcp_rack *rack;
13855
13856         rack = (struct tcp_rack *)tp->t_fb_ptr;
13857         ctf_calc_rwin(so, tp);
13858
13859         if ((thflags & TH_RST) ||
13860             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13861                 return (__ctf_process_rst(m, th, so, tp,
13862                                           &rack->r_ctl.challenge_ack_ts,
13863                                           &rack->r_ctl.challenge_ack_cnt));
13864         /*
13865          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13866          * synchronized state.
13867          */
13868         if (thflags & TH_SYN) {
13869                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13870                 return (ret_val);
13871         }
13872         /*
13873          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13874          * it's less than ts_recent, drop it.
13875          */
13876         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13877             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13878                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13879                         return (ret_val);
13880         }
13881         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13882                               &rack->r_ctl.challenge_ack_ts,
13883                               &rack->r_ctl.challenge_ack_cnt)) {
13884                 return (ret_val);
13885         }
13886         /*
13887          * If new data are received on a connection after the user processes
13888          * are gone, then RST the other end.
13889          */
13890         if ((tp->t_flags & TF_CLOSED) && tlen &&
13891             rack_check_data_after_close(m, tp, &tlen, th, so))
13892                 return (1);
13893         /*
13894          * If last ACK falls within this segment's sequence numbers, record
13895          * its timestamp. NOTE: 1) That the test incorporates suggestions
13896          * from the latest proposal of the tcplw@cray.com list (Braden
13897          * 1993/04/26). 2) That updating only on newer timestamps interferes
13898          * with our earlier PAWS tests, so this check should be solely
13899          * predicated on the sequence space of this segment. 3) That we
13900          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13901          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13902          * SEG.Len. This modified check allows us to overcome RFC1323's
13903          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13904          * p.869. In such cases, we can still calculate the RTT correctly
13905          * when RCV.NXT == Last.ACK.Sent.
13906          */
13907         if ((to->to_flags & TOF_TS) != 0 &&
13908             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13909             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13910             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13911                 tp->ts_recent_age = tcp_ts_getticks();
13912                 tp->ts_recent = to->to_tsval;
13913         }
13914         /*
13915          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
13916          * is on (half-synchronized state), then queue data for later
13917          * processing; else drop segment and return.
13918          */
13919         if ((thflags & TH_ACK) == 0) {
13920                 if (tp->t_flags & TF_NEEDSYN) {
13921                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13922                             tiwin, thflags, nxt_pkt));
13923                 } else if (tp->t_flags & TF_ACKNOW) {
13924                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13925                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13926                         return (ret_val);
13927                 } else {
13928                         ctf_do_drop(m, NULL);
13929                         return (0);
13930                 }
13931         }
13932         /*
13933          * Ack processing.
13934          */
13935         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
13936                 return (ret_val);
13937         }
13938         if (ourfinisacked) {
13939                 tcp_twstart(tp);
13940                 m_freem(m);
13941                 return (1);
13942         }
13943         if (sbavail(&so->so_snd)) {
13944                 if (ctf_progress_timeout_check(tp, true)) {
13945                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13946                                                 tp, tick, PROGRESS_DROP, __LINE__);
13947                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
13948                         return (1);
13949                 }
13950         }
13951         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13952             tiwin, thflags, nxt_pkt));
13953 }
13954
13955 /*
13956  * Return value of 1, the TCB is unlocked and most
13957  * likely gone, return value of 0, the TCP is still
13958  * locked.
13959  */
13960 static int
13961 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
13962     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13963     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13964 {
13965         int32_t ret_val = 0;
13966         int32_t ourfinisacked = 0;
13967         struct tcp_rack *rack;
13968
13969         rack = (struct tcp_rack *)tp->t_fb_ptr;
13970         ctf_calc_rwin(so, tp);
13971
13972         if ((thflags & TH_RST) ||
13973             (tp->t_fin_is_rst && (thflags & TH_FIN)))
13974                 return (__ctf_process_rst(m, th, so, tp,
13975                                           &rack->r_ctl.challenge_ack_ts,
13976                                           &rack->r_ctl.challenge_ack_cnt));
13977         /*
13978          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13979          * synchronized state.
13980          */
13981         if (thflags & TH_SYN) {
13982                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13983                 return (ret_val);
13984         }
13985         /*
13986          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13987          * it's less than ts_recent, drop it.
13988          */
13989         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13990             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13991                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13992                         return (ret_val);
13993         }
13994         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
13995                               &rack->r_ctl.challenge_ack_ts,
13996                               &rack->r_ctl.challenge_ack_cnt)) {
13997                 return (ret_val);
13998         }
13999         /*
14000          * If new data are received on a connection after the user processes
14001          * are gone, then RST the other end.
14002          */
14003         if ((tp->t_flags & TF_CLOSED) && tlen &&
14004             rack_check_data_after_close(m, tp, &tlen, th, so))
14005                 return (1);
14006         /*
14007          * If last ACK falls within this segment's sequence numbers, record
14008          * its timestamp. NOTE: 1) That the test incorporates suggestions
14009          * from the latest proposal of the tcplw@cray.com list (Braden
14010          * 1993/04/26). 2) That updating only on newer timestamps interferes
14011          * with our earlier PAWS tests, so this check should be solely
14012          * predicated on the sequence space of this segment. 3) That we
14013          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
14014          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
14015          * SEG.Len. This modified check allows us to overcome RFC1323's
14016          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
14017          * p.869. In such cases, we can still calculate the RTT correctly
14018          * when RCV.NXT == Last.ACK.Sent.
14019          */
14020         if ((to->to_flags & TOF_TS) != 0 &&
14021             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
14022             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
14023             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
14024                 tp->ts_recent_age = tcp_ts_getticks();
14025                 tp->ts_recent = to->to_tsval;
14026         }
14027         /*
14028          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
14029          * is on (half-synchronized state), then queue data for later
14030          * processing; else drop segment and return.
14031          */
14032         if ((thflags & TH_ACK) == 0) {
14033                 if (tp->t_flags & TF_NEEDSYN) {
14034                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
14035                             tiwin, thflags, nxt_pkt));
14036                 } else if (tp->t_flags & TF_ACKNOW) {
14037                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
14038                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
14039                         return (ret_val);
14040                 } else {
14041                         ctf_do_drop(m, NULL);
14042                         return (0);
14043                 }
14044         }
14045         /*
14046          * Ack processing (the TCPS_LAST_ACK case).
14047          */
14048         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
14049                 return (ret_val);
14050         }
14051         if (ourfinisacked) {
14052                 tp = tcp_close(tp);
14053                 ctf_do_drop(m, tp);
14054                 return (1);
14055         }
14056         if (sbavail(&so->so_snd)) {
14057                 if (ctf_progress_timeout_check(tp, true)) {
14058                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
14059                                                 tp, tick, PROGRESS_DROP, __LINE__);
14060                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
14061                         return (1);
14062                 }
14063         }
14064         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
14065             tiwin, thflags, nxt_pkt));
14066 }
14067
14068 /*
14069  * Return value of 1, the TCB is unlocked and most
14070  * likely gone, return value of 0, the TCP is still
14071  * locked.
14072  */
14073 static int
14074 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
14075     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
14076     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
14077 {
14078         int32_t ret_val = 0;
14079         int32_t ourfinisacked = 0;
14080         struct tcp_rack *rack;
14081
14082         rack = (struct tcp_rack *)tp->t_fb_ptr;
14083         ctf_calc_rwin(so, tp);
14084
14085         /* Reset receive buffer auto scaling when not in bulk receive mode. */
14086         if ((thflags & TH_RST) ||
14087             (tp->t_fin_is_rst && (thflags & TH_FIN)))
14088                 return (__ctf_process_rst(m, th, so, tp,
14089                                           &rack->r_ctl.challenge_ack_ts,
14090                                           &rack->r_ctl.challenge_ack_cnt));
14091         /*
14092          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
14093          * synchronized state.
14094          */
14095         if (thflags & TH_SYN) {
14096                 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
14097                 return (ret_val);
14098         }
14099         /*
14100          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
14101          * it's less than ts_recent, drop it.
14102          */
14103         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
14104             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
14105                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
14106                         return (ret_val);
14107         }
14108         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
14109                               &rack->r_ctl.challenge_ack_ts,
14110                               &rack->r_ctl.challenge_ack_cnt)) {
14111                 return (ret_val);
14112         }
14113         /*
14114          * If new data are received on a connection after the user processes
14115          * are gone, then RST the other end.
14116          */
14117         if ((tp->t_flags & TF_CLOSED) && tlen &&
14118             rack_check_data_after_close(m, tp, &tlen, th, so))
14119                 return (1);
14120         /*
14121          * If last ACK falls within this segment's sequence numbers, record
14122          * its timestamp. NOTE: 1) That the test incorporates suggestions
14123          * from the latest proposal of the tcplw@cray.com list (Braden
14124          * 1993/04/26). 2) That updating only on newer timestamps interferes
14125          * with our earlier PAWS tests, so this check should be solely
14126          * predicated on the sequence space of this segment. 3) That we
14127          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
14128          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
14129          * SEG.Len. This modified check allows us to overcome RFC1323's
14130          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
14131          * p.869. In such cases, we can still calculate the RTT correctly
14132          * when RCV.NXT == Last.ACK.Sent.
14133          */
14134         if ((to->to_flags & TOF_TS) != 0 &&
14135             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
14136             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
14137             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
14138                 tp->ts_recent_age = tcp_ts_getticks();
14139                 tp->ts_recent = to->to_tsval;
14140         }
14141         /*
14142          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
14143          * is on (half-synchronized state), then queue data for later
14144          * processing; else drop segment and return.
14145          */
14146         if ((thflags & TH_ACK) == 0) {
14147                 if (tp->t_flags & TF_NEEDSYN) {
14148                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
14149                             tiwin, thflags, nxt_pkt));
14150                 } else if (tp->t_flags & TF_ACKNOW) {
14151                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
14152                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
14153                         return (ret_val);
14154                 } else {
14155                         ctf_do_drop(m, NULL);
14156                         return (0);
14157                 }
14158         }
14159         /*
14160          * Ack processing.
14161          */
14162         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
14163                 return (ret_val);
14164         }
14165         if (sbavail(&so->so_snd)) {
14166                 if (ctf_progress_timeout_check(tp, true)) {
14167                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
14168                                                 tp, tick, PROGRESS_DROP, __LINE__);
14169                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
14170                         return (1);
14171                 }
14172         }
14173         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
14174             tiwin, thflags, nxt_pkt));
14175 }
14176
14177 static void inline
14178 rack_clear_rate_sample(struct tcp_rack *rack)
14179 {
14180         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
14181         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
14182         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
14183 }
14184
14185 static void
14186 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
14187 {
14188         uint64_t bw_est, rate_wanted;
14189         int chged = 0;
14190         uint32_t user_max, orig_min, orig_max;
14191
14192 #ifdef TCP_REQUEST_TRK
14193         if (rack->rc_hybrid_mode &&
14194             (rack->r_ctl.rc_pace_max_segs != 0) &&
14195             (rack_hybrid_allow_set_maxseg == 1) &&
14196             (rack->r_ctl.rc_last_sft != NULL)) {
14197                 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS;
14198                 return;
14199         }
14200 #endif
14201         orig_min = rack->r_ctl.rc_pace_min_segs;
14202         orig_max = rack->r_ctl.rc_pace_max_segs;
14203         user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
14204         if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
14205                 chged = 1;
14206         rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
14207         if (rack->use_fixed_rate || rack->rc_force_max_seg) {
14208                 if (user_max != rack->r_ctl.rc_pace_max_segs)
14209                         chged = 1;
14210         }
14211         if (rack->rc_force_max_seg) {
14212                 rack->r_ctl.rc_pace_max_segs = user_max;
14213         } else if (rack->use_fixed_rate) {
14214                 bw_est = rack_get_bw(rack);
14215                 if ((rack->r_ctl.crte == NULL) ||
14216                     (bw_est != rack->r_ctl.crte->rate)) {
14217                         rack->r_ctl.rc_pace_max_segs = user_max;
14218                 } else {
14219                         /* We are pacing right at the hardware rate */
14220                         uint32_t segsiz, pace_one;
14221
14222                         if (rack_pace_one_seg ||
14223                             (rack->r_ctl.rc_user_set_min_segs == 1))
14224                                 pace_one = 1;
14225                         else
14226                                 pace_one = 0;
14227                         segsiz = min(ctf_fixed_maxseg(tp),
14228                                      rack->r_ctl.rc_pace_min_segs);
14229                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(
14230                                 tp, bw_est, segsiz, pace_one,
14231                                 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
14232                 }
14233         } else if (rack->rc_always_pace) {
14234                 if (rack->r_ctl.gp_bw ||
14235                     rack->r_ctl.init_rate) {
14236                         /* We have a rate of some sort set */
14237                         uint32_t  orig;
14238
14239                         bw_est = rack_get_bw(rack);
14240                         orig = rack->r_ctl.rc_pace_max_segs;
14241                         if (fill_override)
14242                                 rate_wanted = *fill_override;
14243                         else
14244                                 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
14245                         if (rate_wanted) {
14246                                 /* We have something */
14247                                 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
14248                                                                                    rate_wanted,
14249                                                                                    ctf_fixed_maxseg(rack->rc_tp));
14250                         } else
14251                                 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
14252                         if (orig != rack->r_ctl.rc_pace_max_segs)
14253                                 chged = 1;
14254                 } else if ((rack->r_ctl.gp_bw == 0) &&
14255                            (rack->r_ctl.rc_pace_max_segs == 0)) {
14256                         /*
14257                          * If we have nothing limit us to bursting
14258                          * out IW sized pieces.
14259                          */
14260                         chged = 1;
14261                         rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
14262                 }
14263         }
14264         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
14265                 chged = 1;
14266                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
14267         }
14268         if (chged)
14269                 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
14270 }
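/*
 * Recap of the sizing precedence implemented above: rc_force_max_seg always
 * wins and uses the user maximum; use_fixed_rate uses either the user
 * maximum or, when hardware pacing is active at that rate, the burst size
 * computed for the crte; otherwise, with pacing enabled, the size follows
 * the wanted output rate (or the initial window if no rate is known yet).
 * The result is always clamped to PACE_MAX_IP_BYTES.
 */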
14271
14272
14273 static void
14274 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags)
14275 {
14276 #ifdef INET6
14277         struct ip6_hdr *ip6 = NULL;
14278 #endif
14279 #ifdef INET
14280         struct ip *ip = NULL;
14281 #endif
14282         struct udphdr *udp = NULL;
14283
14284         /* Ok, let's fill in the fast block; it can only be used with no IP options! */
14285 #ifdef INET6
14286         if (rack->r_is_v6) {
14287                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
14288                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
14289                 if (tp->t_port) {
14290                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
14291                         udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
14292                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
14293                         udp->uh_dport = tp->t_port;
14294                         rack->r_ctl.fsb.udp = udp;
14295                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
14296                 } else
14297                 {
14298                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
14299                         rack->r_ctl.fsb.udp = NULL;
14300                 }
14301                 tcpip_fillheaders(rack->rc_inp,
14302                                   tp->t_port,
14303                                   ip6, rack->r_ctl.fsb.th);
14304                 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL);
14305         } else
14306 #endif                          /* INET6 */
14307 #ifdef INET
14308         {
14309                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
14310                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
14311                 if (tp->t_port) {
14312                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
14313                         udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
14314                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
14315                         udp->uh_dport = tp->t_port;
14316                         rack->r_ctl.fsb.udp = udp;
14317                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
14318                 } else
14319                 {
14320                         rack->r_ctl.fsb.udp = NULL;
14321                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
14322                 }
14323                 tcpip_fillheaders(rack->rc_inp,
14324                                   tp->t_port,
14325                                   ip, rack->r_ctl.fsb.th);
14326                 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl;
14327         }
14328 #endif
14329         rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0),
14330             (long)TCP_MAXWIN << tp->rcv_scale);
14331         rack->r_fsb_inited = 1;
14332 }
14333
14334 static int
14335 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
14336 {
14337         /*
14338          * Allocate the larger of the two spaces, V6 if available, else just
14339          * V4, and include a udphdr (overbook).
14340          */
14341 #ifdef INET6
14342         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
14343 #else
14344         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
14345 #endif
14346         rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
14347                                             M_TCPFSB, M_NOWAIT|M_ZERO);
14348         if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
14349                 return (ENOMEM);
14350         }
14351         rack->r_fsb_inited = 0;
14352         return (0);
14353 }
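/*
 * Note the two-phase setup: rack_init_fsb() only allocates the header
 * template, over-booked for the larger of the v4/v6 cases plus a udphdr,
 * and leaves r_fsb_inited at 0.  rack_init_fsb_block() above is what later
 * fills the template in and sets r_fsb_inited once the address family and
 * any tunneling port are known.
 */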
14354
14355 static void
14356 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod)
14357 {
14358         /*
14359          * Types of logs (mod value)
14360          * 20 - Initial round setup
14361          * 21 - Rack declares a new round.
14362          */
14363         struct tcpcb *tp;
14364
14365         tp = rack->rc_tp;
14366         if (tcp_bblogging_on(tp)) {
14367                 union tcp_log_stackspecific log;
14368                 struct timeval tv;
14369
14370                 memset(&log, 0, sizeof(log));
14371                 log.u_bbr.flex1 = rack->r_ctl.current_round;
14372                 log.u_bbr.flex2 = rack->r_ctl.roundends;
14373                 log.u_bbr.flex3 = high_seq;
14374                 log.u_bbr.flex4 = tp->snd_max;
14375                 log.u_bbr.flex8 = mod;
14376                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14377                 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
14378                 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
14379                 TCP_LOG_EVENTP(tp, NULL,
14380                     &tptosocket(tp)->so_rcv,
14381                     &tptosocket(tp)->so_snd,
14382                     TCP_HYSTART, 0,
14383                     0, &log, false, &tv);
14384         }
14385 }
14386
14387 static void
14388 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack)
14389 {
14390         rack->rack_deferred_inited = 1;
14391         rack->r_ctl.roundends = tp->snd_max;
14392         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
14393         rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
14394 }
14395
14396 static void
14397 rack_init_retransmit_value(struct tcp_rack *rack, int ctl)
14398 {
14399         /* Retransmit bit controls.
14400          *
14401          * The setting of these values controls one of
14402          * three modes you can have and dictates
14403          * how rack does retransmissions. Note this
14404          * applies in *any* mode, i.e. pacing on or off, DGP,
14405          * fixed rate pacing, or just bursting rack.
14406          *
14407          * 1 - Use full sized retransmits i.e. limit
14408          *     the size to whatever the pace_max_segments
14409          *     size is.
14410          *
14411          * 2 - Use pacer min granularity as a guide to
14412          *     the size combined with the current calculated
14413          *     goodput b/w measurement. So for example if
14414          *     the goodput is measured at 20Mbps we would
14415          *     calculate 8125 (pacer minimum 250usec in
14416          *     that b/w) and then round it up to the next
14417          *     MSS i.e. for 1448 mss 6 MSS or 8688 bytes.
14418          *
14419          * 0 - The rack default of 1 MSS (anything not 0/1/2
14420          *     falls here too if we are setting via rack_init()).
14421          *
14422          */
14423         if (ctl == 1) {
14424                 rack->full_size_rxt = 1;
14425                 rack->shape_rxt_to_pacing_min  = 0;
14426         } else if (ctl == 2) {
14427                 rack->full_size_rxt = 0;
14428                 rack->shape_rxt_to_pacing_min  = 1;
14429         } else {
14430                 rack->full_size_rxt = 0;
14431                 rack->shape_rxt_to_pacing_min  = 0;
14432         }
14433 }
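/*
 * A rough sketch (hypothetical helper, names assumed; not part of this
 * file) of the option-2 sizing described in the comment above: take the
 * bytes the measured goodput would cover in one pacer-minimum interval
 * and round that up to a whole number of segments.
 *
 *	static uint32_t
 *	rxt_size_from_goodput(uint64_t bw_bytes_per_sec, uint32_t min_us,
 *	    uint32_t segsiz)
 *	{
 *		uint64_t bytes;
 *
 *		bytes = (bw_bytes_per_sec * min_us) / 1000000;
 *		// Round up to the next full MSS.
 *		return (((bytes + segsiz - 1) / segsiz) * segsiz);
 *	}
 */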
14434
14435 static void
14436 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod,
14437                   uint32_t flex1,
14438                   uint32_t flex2,
14439                   uint32_t flex3)
14440 {
14441         if (tcp_bblogging_on(rack->rc_tp)) {
14442                 union tcp_log_stackspecific log;
14443                 struct timeval tv;
14444
14445                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14446                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14447                 log.u_bbr.flex8 = mod;
14448                 log.u_bbr.flex1 = flex1;
14449                 log.u_bbr.flex2 = flex2;
14450                 log.u_bbr.flex3 = flex3;
14451                 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0,
14452                                0, &log, false, NULL, __func__, __LINE__, &tv);
14453         }
14454 }
14455
14456 static int
14457 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr)
14458 {
14459         struct tcp_rack *rack;
14460         struct rack_sendmap *rsm;
14461         int i;
14462
14463
14464         rack = (struct tcp_rack *)tp->t_fb_ptr;
14465         switch (reqr->req) {
14466         case TCP_QUERY_SENDMAP:
14467                 if ((reqr->req_param == tp->snd_max) ||
14468                     (tp->snd_max == tp->snd_una)){
14469                         /* Unlikely */
14470                         return (0);
14471                 }
14472                 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param);
14473                 if (rsm == NULL) {
14474                         /* Can't find that seq -- unlikely */
14475                         return (0);
14476                 }
14477                 reqr->sendmap_start = rsm->r_start;
14478                 reqr->sendmap_end = rsm->r_end;
14479                 reqr->sendmap_send_cnt = rsm->r_rtr_cnt;
14480                 reqr->sendmap_fas = rsm->r_fas;
14481                 if (reqr->sendmap_send_cnt > SNDMAP_NRTX)
14482                         reqr->sendmap_send_cnt = SNDMAP_NRTX;
14483                 for(i=0; i<reqr->sendmap_send_cnt; i++)
14484                         reqr->sendmap_time[i] = rsm->r_tim_lastsent[i];
14485                 reqr->sendmap_ack_arrival = rsm->r_ack_arrival;
14486                 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK;
14487                 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes;
14488                 reqr->sendmap_dupacks = rsm->r_dupack;
14489                 rack_log_chg_info(tp, rack, 1,
14490                                   rsm->r_start,
14491                                   rsm->r_end,
14492                                   rsm->r_flags);
14493                 return (1);
14494                 break;
14495         case TCP_QUERY_TIMERS_UP:
14496                 if (rack->r_ctl.rc_hpts_flags == 0) {
14497                         /* no timers up */
14498                         return (0);
14499                 }
14500                 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags;
14501                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14502                         reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to;
14503                 }
14504                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
14505                         reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp;
14506                 }
14507                 rack_log_chg_info(tp, rack, 2,
14508                                   rack->r_ctl.rc_hpts_flags,
14509                                   rack->r_ctl.rc_last_output_to,
14510                                   rack->r_ctl.rc_timer_exp);
14511                 return (1);
14512                 break;
14513         case TCP_QUERY_RACK_TIMES:
14514                 /* Reordering items */
14515                 reqr->rack_num_dsacks = rack->r_ctl.num_dsack;
14516                 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts;
14517                 /* Timerstamps and timers */
14518                 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time;
14519                 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt;
14520                 reqr->rack_rtt = rack->rc_rack_rtt;
14521                 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time;
14522                 reqr->rack_srtt_measured = rack->rc_srtt_measure_made;
14523                 /* PRR data */
14524                 reqr->rack_sacked = rack->r_ctl.rc_sacked;
14525                 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt;
14526                 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered;
14527                 reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs;
14528                 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt;
14529                 reqr->rack_prr_out = rack->r_ctl.rc_prr_out;
14530                 /* TLP and persists info */
14531                 reqr->rack_tlp_out = rack->rc_tlp_in_progress;
14532                 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out;
14533                 if (rack->rc_in_persist) {
14534                         reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time;
14535                         reqr->rack_in_persist = 1;
14536                 } else {
14537                         reqr->rack_time_went_idle = 0;
14538                         reqr->rack_in_persist = 0;
14539                 }
14540                 if (rack->r_wanted_output)
14541                         reqr->rack_wanted_output = 1;
14542                 else
14543                         reqr->rack_wanted_output = 0;
14544                 return (1);
14545                 break;
14546         default:
14547                 return (-EINVAL);
14548         }
14549 }
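/*
 * Hypothetical usage sketch (local names assumed): a stack taking over the
 * connection can recover any pending pacer or timer deadline through
 * tfb_chg_query before arming its own timers, mirroring the
 * TCP_QUERY_TIMERS_UP case handled above.
 *
 *	struct tcp_query_resp qr;
 *
 *	memset(&qr, 0, sizeof(qr));
 *	qr.req = TCP_QUERY_TIMERS_UP;
 *	if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 1) {
 *		// qr.timer_hpts_flags, qr.timer_pacing_to and
 *		// qr.timer_timer_exp describe the departing stack's state.
 *	}
 */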
14550
14551 static void
14552 rack_switch_failed(struct tcpcb *tp)
14553 {
14554         /*
14555          * This method gets called if a stack switch was
14556          * attempted and it failed. We are left in place,
14557          * but our hpts timers were stopped and we
14558          * need to validate time units and inp_flags2.
14559          */
14560         struct inpcb *inp = tptoinpcb(tp);
14561         struct tcp_rack *rack;
14562         struct timeval tv;
14563         uint32_t cts;
14564         uint32_t toval;
14565         struct hpts_diag diag;
14566
14567         rack = (struct tcp_rack *)tp->t_fb_ptr;
14568         tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
14569         if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
14570                 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
14571         else
14572                 inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
14573         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
14574                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
14575         if (inp->inp_in_hpts) {
14576                 /* Strange */
14577                 return;
14578         }
14579         cts = tcp_get_usecs(&tv);
14580         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14581                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
14582                         toval = rack->r_ctl.rc_last_output_to - cts;
14583                 } else {
14584                         /* one slot please */
14585                         toval = HPTS_TICKS_PER_SLOT;
14586                 }
14587         } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
14588                 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
14589                         toval = rack->r_ctl.rc_timer_exp - cts;
14590                 } else {
14591                         /* one slot please */
14592                         toval = HPTS_TICKS_PER_SLOT;
14593                 }
14594         } else
14595                 toval = HPTS_TICKS_PER_SLOT;
14596         (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
14597                                    __LINE__, &diag);
14598         rack_log_hpts_diag(rack, cts, &diag, &tv);
14599 }
14600
14601 static int
14602 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr)
14603 {
14604         struct rack_sendmap *rsm, *ersm;
14605         int insret __diagused;
14606         /*
14607          * When initing outstanding, we must be quite careful
14608          * to not refer to tp->t_fb_ptr. This has the old rack
14609          * pointer in it, not the "new" one (when we are doing
14610          * a stack switch).
14611          */
14612
14613
14614         if (tp->t_fb->tfb_chg_query == NULL) {
14615                 /* Create a send map for the current outstanding data */
14616
14617                 rsm = rack_alloc(rack);
14618                 if (rsm == NULL) {
14619                         uma_zfree(rack_pcb_zone, ptr);
14620                         return (ENOMEM);
14621                 }
14622                 rsm->r_no_rtt_allowed = 1;
14623                 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
14624                 rsm->r_rtr_cnt = 1;
14625                 rsm->r_rtr_bytes = 0;
14626                 if (tp->t_flags & TF_SENTFIN)
14627                         rsm->r_flags |= RACK_HAS_FIN;
14628                 rsm->r_end = tp->snd_max;
14629                 if (tp->snd_una == tp->iss) {
14630                         /* The data space is one beyond snd_una */
14631                         rsm->r_flags |= RACK_HAS_SYN;
14632                         rsm->r_start = tp->iss;
14633                         rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
14634                 } else
14635                         rsm->r_start = tp->snd_una;
14636                 rsm->r_dupack = 0;
14637                 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
14638                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
14639                         if (rsm->m) {
14640                                 rsm->orig_m_len = rsm->m->m_len;
14641                                 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14642                         } else {
14643                                 rsm->orig_m_len = 0;
14644                                 rsm->orig_t_space = 0;
14645                         }
14646                 } else {
14647                         /*
14648                          * This can happen if we have a stand-alone FIN or SYN.
14650                          */
14651                         rsm->m = NULL;
14652                         rsm->orig_m_len = 0;
14653                         rsm->orig_t_space = 0;
14654                         rsm->soff = 0;
14655                 }
14656 #ifdef INVARIANTS
14657                 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14658                         panic("Insert in rb tree fails ret:%d rack:%p rsm:%p",
14659                               insret, rack, rsm);
14660                 }
14661 #else
14662                 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14663 #endif
14664                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14665                 rsm->r_in_tmap = 1;
14666         } else {
14667                 /* We have a query mechanism, lets use it */
14668                 struct tcp_query_resp qr;
14669                 int i;
14670                 tcp_seq at;
14671
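                      /*
                       * Walk from snd_una to snd_max, asking the old stack
                       * for one send map entry at a time and cloning each
                       * answer into a rack_sendmap of our own.
                       */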
14672                 at = tp->snd_una;
14673                 while (at != tp->snd_max) {
14674                         memset(&qr, 0, sizeof(qr));
14675                         qr.req = TCP_QUERY_SENDMAP;
14676                         qr.req_param = at;
14677                         if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
14678                                 break;
14679                         /* Move forward */
14680                         at = qr.sendmap_end;
14681                         /* Now lets build the entry for this one */
14682                         rsm = rack_alloc(rack);
14683                         if (rsm == NULL) {
14684                                 uma_zfree(rack_pcb_zone, ptr);
14685                                 return (ENOMEM);
14686                         }
14687                         memset(rsm, 0, sizeof(struct rack_sendmap));
14688                         /* Now configure the rsm and insert it */
14689                         rsm->r_dupack = qr.sendmap_dupacks;
14690                         rsm->r_start = qr.sendmap_start;
14691                         rsm->r_end = qr.sendmap_end;
14692                         if (qr.sendmap_fas)
14693                                 rsm->r_fas = qr.sendmap_fas;
14694                         else
14695                                 rsm->r_fas = rsm->r_start - tp->snd_una;
14696                         /*
14697                          * We have carefully aligned the bits
14698                          * so that all we have to do is copy over
14699                          * the bits with the mask.
14700                          */
14701                         rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK;
14702                         rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes;
14703                         rsm->r_rtr_cnt = qr.sendmap_send_cnt;
14704                         rsm->r_ack_arrival = qr.sendmap_ack_arrival;
14705                         for (i = 0; i < rsm->r_rtr_cnt; i++)
14706                                 rsm->r_tim_lastsent[i] = qr.sendmap_time[i];
14707                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
14708                                            (rsm->r_start - tp->snd_una), &rsm->soff);
14709                         if (rsm->m) {
14710                                 rsm->orig_m_len = rsm->m->m_len;
14711                                 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14712                         } else {
14713                                 rsm->orig_m_len = 0;
14714                                 rsm->orig_t_space = 0;
14715                         }
14716 #ifdef INVARIANTS
14717                         if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14718                                 panic("Insert in rb tree fails ret:%d rack:%p rsm:%p",
14719                                       insret, rack, rsm);
14720                         }
14721 #else
14722                         (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14723 #endif
14724                         if ((rsm->r_flags & RACK_ACKED) == 0)  {
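                                      /*
                                       * Un-acked entries also live on the
                                       * tmap, kept in order of last transmit
                                       * time; insert before the first entry
                                       * that was sent later than this one,
                                       * or at the tail if there is none.
                                       */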
14725                                 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) {
14726                                         if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] >
14727                                             rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) {
14728                                                 /*
14729                                                  * If the existing ersm was sent at
14730                                                  * a later time than the new one, then
14731                                                  * the new one should appear ahead of this
14732                                                  * ersm.
14733                                                  */
14734                                                 rsm->r_in_tmap = 1;
14735                                                 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext);
14736                                                 break;
14737                                         }
14738                                 }
14739                                 if (rsm->r_in_tmap == 0) {
14740                                         /*
14741                                          * Not found so shove it on the tail.
14742                                          */
14743                                         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14744                                         rsm->r_in_tmap = 1;
14745                                 }
14746                         } else {
14747                                 if ((rack->r_ctl.rc_sacklast == NULL) ||
14748                                     (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) {
14749                                         rack->r_ctl.rc_sacklast = rsm;
14750                                 }
14751                         }
14752                         rack_log_chg_info(tp, rack, 3,
14753                                           rsm->r_start,
14754                                           rsm->r_end,
14755                                           rsm->r_flags);
14756                 }
14757         }
14758         return (0);
14759 }
14760
14761 static void
14762 rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval)
14763 {
14764         /*
14765          * P = percent bits
14766          * F = fill cw bit -- Toggle fillcw if this bit is set.
14767          * S = Segment bits
14768          * M = set max segment bit
14769          * U = Unclaimed (unused)
14770          * C = If set to non-zero override the max number of clamps.
14771          * L = Bit to indicate if clamped gets lower.
14772          *
14773          * CCCC CCCC UUUU UULF PPPP PPPP PPPP PPPP
14774          *
14775          * The lowest 4 nibbles (16 bits) hold the percentage in tenths
14776          * of a percent, .1 - 6553.5%, where 101 = 10.1%, max 6553.5.
14777          * The upper 16 bits hold the options.
14778          * The F bit will turn fill-cw on if you are
14779          * not pacing; it will turn it off if dgp is on.
14780          * The L bit changes it so that when clamped we get
14781          * the min(gp, lt-bw) for dgp.
14782          */
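              /*
               * For example (illustrative only): optval = 101 (0x65) gives a
               * 10.1% threshold with all option bits clear, so there is no
               * clamp limit and r_clamped_gets_lower stays 0.
               */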
14783         uint16_t per;
14784
14785         rack->r_ctl.saved_rxt_clamp_val = optval;
14786         per = optval & 0x0000ffff;
14787         rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff);
14788         if (optval > 0) {
14789                 uint16_t clamp_opt;
14790
14791                 rack->excess_rxt_on = 1;
14792                 clamp_opt = ((optval & 0xffff0000) >> 16);
14793                 rack->r_ctl.clamp_options = clamp_opt & 0x00ff;
14794                 if (clamp_opt & 0xff00) {
14795                         /* A max clamp count is also present */
14796                         rack->r_ctl.max_clamps = (clamp_opt >> 8);
14797                 } else {
14798                         /* No specified clamps means no limit */
14799                         rack->r_ctl.max_clamps = 0;
14800                 }
14801                 if (rack->r_ctl.clamp_options & 0x0002) {
14802                         rack->r_clamped_gets_lower  = 1;
14803                 } else {
14804                         rack->r_clamped_gets_lower  = 0;
14805                 }
14806         } else {
14807                 /* Turn it off back to default */
14808                 rack->excess_rxt_on = 0;
14809                 rack->r_clamped_gets_lower  = 0;
14810         }
14811
14812 }
14813
14814
14815 static int32_t
14816 rack_init(struct tcpcb *tp, void **ptr)
14817 {
14818         struct inpcb *inp = tptoinpcb(tp);
14819         struct tcp_rack *rack = NULL;
14820         uint32_t iwin, snt, us_cts;
14821         int err, no_query;
14822
14823         /*
14824          * First, are we the initial stack or a switched-in stack?
14825          * If we are initing via tcp_newtcpcb the ptr passed
14826          * will be tp->t_fb_ptr. If it's a stack switch that
14827          * has a previous stack we can query, it will be a local
14828          * var that will in the end be set into t_fb_ptr.
14829          */
14830         if (ptr == &tp->t_fb_ptr)
14831                 no_query = 1;
14832         else
14833                 no_query = 0;
14834         *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
14835         if (*ptr == NULL) {
14836                 /*
14837                  * We need to allocate memory but can't. The INP and INP_INFO
14838                  * locks are held and they are recursive (this happens during
14839                  * setup), so a scheme to drop the locks fails. :(
14840                  */
14842                 return(ENOMEM);
14843         }
14844         memset(*ptr, 0, sizeof(struct tcp_rack));
14845         rack = (struct tcp_rack *)*ptr;
14846         rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT);
14847         if (rack->r_ctl.tqh == NULL) {
14848                 uma_zfree(rack_pcb_zone, rack);
14849                 return(ENOMEM);
14850         }
14851         tqhash_init(rack->r_ctl.tqh);
14852         TAILQ_INIT(&rack->r_ctl.rc_free);
14853         TAILQ_INIT(&rack->r_ctl.rc_tmap);
14854         rack->rc_tp = tp;
14855         rack->rc_inp = inp;
14856         /* Set the flag */
14857         rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0;
14858         /* Probably not needed but lets be sure */
14859         rack_clear_rate_sample(rack);
14860         /*
14861          * Save off the default values, socket options will poke
14862          * at these if pacing is not on or we have not yet
14863          * reached where pacing is on (gp_ready/fixed enabled).
14864          * When they get set into the CC module (when gp_ready
14865          * is enabled or we enable fixed) then we will set these
14866          * values into the CC and place in here the old values
14867          * so we have a restoral. Then we will set the flag
14868          * rc_pacing_cc_set. That way whenever we turn off pacing
14869          * or switch off this stack, we will know to go restore
14870          * the saved values.
14871          *
14872          * We specifically put into the beta the ecn value for pacing.
14873          */
14874         rack->rc_new_rnd_needed = 1;
14875         rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
14876         rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
14877         rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
14878         /* We want abe like behavior as well */
14879         rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
14880         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
14881         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
14882         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
14883         if (rack_rxt_clamp_thresh) {
14884                 rack_translate_clamp_value(rack, rack_rxt_clamp_thresh);
14885                 rack->excess_rxt_on = 1;
14886         }
14887         if (rack_uses_full_dgp_in_rec)
14888                 rack->r_ctl.full_dgp_in_rec = 1;
14889         if (rack_fill_cw_state)
14890                 rack->rc_pace_to_cwnd = 1;
14891         if (rack_pacing_min_seg)
14892                 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg;
14893         if (use_rack_rr)
14894                 rack->use_rack_rr = 1;
14895         if (rack_dnd_default) {
14896                 rack->rc_pace_dnd = 1;
14897         }
14898         if (V_tcp_delack_enabled)
14899                 tp->t_delayed_ack = 1;
14900         else
14901                 tp->t_delayed_ack = 0;
14902 #ifdef TCP_ACCOUNTING
14903         if (rack_tcp_accounting) {
14904                 tp->t_flags2 |= TF2_TCP_ACCOUNTING;
14905         }
14906 #endif
14907         rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
14908         rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
14909         if (rack_enable_shared_cwnd)
14910                 rack->rack_enable_scwnd = 1;
14911         rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
14912         rack->rc_user_set_max_segs = rack_hptsi_segments;
14913         rack->rc_force_max_seg = 0;
14914         TAILQ_INIT(&rack->r_ctl.opt_list);
14915         if (rack_hibeta_setting)
14916                 rack->rack_hibeta = 1;
14917         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
14918         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
14919         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
14920         rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
14921         rack->r_ctl.rc_highest_us_rtt = 0;
14922         rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
14923         rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
14924         if (rack_use_cmp_acks)
14925                 rack->r_use_cmp_ack = 1;
14926         if (rack_disable_prr)
14927                 rack->rack_no_prr = 1;
14928         if (rack_gp_no_rec_chg)
14929                 rack->rc_gp_no_rec_chg = 1;
14930         if (rack_pace_every_seg && tcp_can_enable_pacing()) {
14931                 rack->rc_always_pace = 1;
14932                 if ((rack->gp_ready) && (rack->rc_always_pace && (rack->use_fixed_rate == 0)))
14933                         rack_set_cc_pacing(rack);
14934         } else
14935                 rack->rc_always_pace = 0;
14936         if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
14937                 rack->r_mbuf_queue = 1;
14938         else
14939                 rack->r_mbuf_queue = 0;
14940         rack_set_pace_segments(tp, rack, __LINE__, NULL);
14941         if (rack_limits_scwnd)
14942                 rack->r_limit_scw = 1;
14943         else
14944                 rack->r_limit_scw = 0;
14945         rack_init_retransmit_value(rack, rack_rxt_controls);
14946         rack->rc_labc = V_tcp_abc_l_var;
14947         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
14948         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
14949         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
14950         rack->r_ctl.rc_min_to = rack_min_to;
14951         microuptime(&rack->r_ctl.act_rcv_time);
14952         rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
14953         rack->rc_init_win = rack_default_init_window;
14954         rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
14955         if (rack_hw_up_only)
14956                 rack->r_up_only = 1;
14957         if (rack_do_dyn_mul) {
14958                 /* When dynamic adjustment is on CA needs to start at 100% */
14959                 rack->rc_gp_dyn_mul = 1;
14960                 if (rack_do_dyn_mul >= 100)
14961                         rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
14962         } else
14963                 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
14964         rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
14965         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
14966         rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
14967         setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
14968                                 rack_probertt_filter_life);
14969         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
14970         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
14971         rack->r_ctl.rc_time_of_last_probertt = us_cts;
14972         rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
14973         rack->r_ctl.rc_time_probertt_starts = 0;
14974         if (rack_dsack_std_based & 0x1) {
14975                 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
14976                 rack->rc_rack_tmr_std_based = 1;
14977         }
14978         if (rack_dsack_std_based & 0x2) {
14979                 /* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
14980                 rack->rc_rack_use_dsack = 1;
14981         }
14982         /* We require at least one measurement, even if the sysctl is 0 */
14983         if (rack_req_measurements)
14984                 rack->r_ctl.req_measurements = rack_req_measurements;
14985         else
14986                 rack->r_ctl.req_measurements = 1;
14987         if (rack_enable_hw_pacing)
14988                 rack->rack_hdw_pace_ena = 1;
14989         if (rack_hw_rate_caps)
14990                 rack->r_rack_hw_rate_caps = 1;
14991 #ifdef TCP_SAD_DETECTION
14992         rack->do_detection = 1;
14993 #else
14994         rack->do_detection = 0;
14995 #endif
14996         if (rack_non_rxt_use_cr)
14997                 rack->rack_rec_nonrxt_use_cr = 1;
14998         /* Lets setup the fsb block */
14999         err = rack_init_fsb(tp, rack);
15000         if (err) {
15001                 uma_zfree(rack_pcb_zone, *ptr);
15002                 *ptr = NULL;
15003                 return (err);
15004         }
15005         if (rack_do_hystart) {
15006                 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
15007                 if (rack_do_hystart > 1)
15008                         tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
15009                 if (rack_do_hystart > 2)
15010                         tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
15011         }
15012         /* Log what we will do with queries */
15013         rack_log_chg_info(tp, rack, 7,
15014                           no_query, 0, 0);
15015         if (rack_def_profile)
15016                 rack_set_profile(rack, rack_def_profile);
15017         /* Cancel the GP measurement in progress */
15018         tp->t_flags &= ~TF_GPUTINPROG;
15019         if ((tp->t_state != TCPS_CLOSED) &&
15020             (tp->t_state != TCPS_TIME_WAIT)) {
15021                 /*
15022                  * We are already open, we may
15023                  * need to adjust a few things.
15024                  */
15025                 if (SEQ_GT(tp->snd_max, tp->iss))
15026                         snt = tp->snd_max - tp->iss;
15027                 else
15028                         snt = 0;
15029                 iwin = rc_init_window(rack);
15030                 if ((snt < iwin) &&
15031                     (no_query == 1)) {
15032                         /* We are not past the initial window
15033                          * on the first init (i.e. a stack switch
15034                          * has not yet occurred) so we need to make
15035                          * sure cwnd and ssthresh are correct.
15036                          */
15037                         if (tp->snd_cwnd < iwin)
15038                                 tp->snd_cwnd = iwin;
15039                         /*
15040                          * If we are within the initial window
15041                          * we want ssthresh to be unlimited. Setting
15042                          * it to the rwnd (which the default stack does
15043                          * and older racks) is not really a good idea
15044                          * since we want to be in SS and grow both the
15045                          * cwnd and the rwnd (via dynamic rwnd growth). If
15046                          * we set it to the rwnd then as the peer grows its
15047                          * rwnd we will be stuck in CA and never hit SS.
15048                          *
15049                          * It's far better to raise it up high (this takes the
15050                          * risk that there has been a loss already, probably
15051                          * we should have an indicator of loss in all stacks,
15052                          * but we don't), but considering the normal use this
15053                          * is a risk worth taking. The consequences of not
15054                          * hitting SS are far worse than going one more time
15055                          * into it early on (before we have sent even an IW).
15056                          * It is highly unlikely that we will have had a loss
15057                          * before getting the IW out.
15058                          */
15059                         tp->snd_ssthresh = 0xffffffff;
15060                 }
15061                 /*
15062                  * Any init based on sequence numbers
15063                  * should be done in the deferred init path
15064                  * since we can be CLOSED and not have them
15065                  * inited when rack_init() is called. We
15066                  * are not closed so lets call it.
15067                  */
15068                 rack_deferred_init(tp, rack);
15069         }
15070         if ((tp->t_state != TCPS_CLOSED) &&
15071             (tp->t_state != TCPS_TIME_WAIT) &&
15072             (no_query == 0) &&
15073             (tp->snd_una != tp->snd_max))  {
15074                 err = rack_init_outstanding(tp, rack, us_cts, *ptr);
15075                 if (err) {
15076                         *ptr = NULL;
15077                         return(err);
15078                 }
15079         }
15080         rack_stop_all_timers(tp, rack);
15081         /* Setup all the inp_flags2 */
15082         if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
15083                 tptoinpcb(tp)->inp_flags2 |= INP_SUPPORTS_MBUFQ;
15084         else
15085                 tptoinpcb(tp)->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
15086         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
15087                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
15088         /*
15089          * Timers in Rack are kept in microseconds so lets
15090          * convert any initial incoming variables
15091          * from ticks into usecs. Note that we
15092          * also change the values of t_srtt and t_rttvar, if
15093          * they are non-zero. They are kept with a 5
15094          * bit decimal so we have to carefully convert
15095          * these to get the full precision.
15096          */
15097         rack_convert_rtts(tp);
15098         rack_log_hystart_event(rack, rack->r_ctl.roundends, 20);
15099         if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) {
15100                 /* We do not start any timers on DROPPED connections */
15101                 if (tp->t_fb->tfb_chg_query == NULL) {
15102                         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
15103                 } else {
15104                         struct tcp_query_resp qr;
15105                         int ret;
15106
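                              /*
                               * The old stack can be queried; pull over its
                               * RACK timing state (reorder ts, DSACK count,
                               * RTTs, PRR and persist state) and then any
                               * timers it still had running, so nothing is
                               * lost across the switch.
                               */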
15107                         memset(&qr, 0, sizeof(qr));
15108
15109                         /* Get the misc time stamps and such for rack */
15110                         qr.req = TCP_QUERY_RACK_TIMES;
15111                         ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
15112                         if (ret == 1) {
15113                                 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts;
15114                                 rack->r_ctl.num_dsack  = qr.rack_num_dsacks;
15115                                 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time;
15116                                 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt;
15117                                 rack->rc_rack_rtt = qr.rack_rtt;
15118                                 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time;
15119                                 rack->r_ctl.rc_sacked = qr.rack_sacked;
15120                                 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt;
15121                                 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered;
15122                                 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs;
15123                                 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt;
15124                                 rack->r_ctl.rc_prr_out = qr.rack_prr_out;
15125                                 if (qr.rack_tlp_out) {
15126                                         rack->rc_tlp_in_progress = 1;
15127                                         rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out;
15128                                 } else {
15129                                         rack->rc_tlp_in_progress = 0;
15130                                         rack->r_ctl.rc_tlp_cnt_out = 0;
15131                                 }
15132                                 if (qr.rack_srtt_measured)
15133                                         rack->rc_srtt_measure_made = 1;
15134                                 if (qr.rack_in_persist == 1) {
15135                                         rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle;
15136 #ifdef NETFLIX_SHARED_CWND
15137                                         if (rack->r_ctl.rc_scw) {
15138                                                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
15139                                                 rack->rack_scwnd_is_idle = 1;
15140                                         }
15141 #endif
15142                                         rack->r_ctl.persist_lost_ends = 0;
15143                                         rack->probe_not_answered = 0;
15144                                         rack->forced_ack = 0;
15145                                         tp->t_rxtshift = 0;
15146                                         rack->rc_in_persist = 1;
15147                                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
15148                                                            rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
15149                                 }
15150                                 if (qr.rack_wanted_output)
15151                                         rack->r_wanted_output = 1;
15152                                 rack_log_chg_info(tp, rack, 6,
15153                                                   qr.rack_min_rtt,
15154                                                   qr.rack_rtt,
15155                                                   qr.rack_reorder_ts);
15156                         }
15157                         /* Get the old stack timers */
15158                         qr.req_param = 0;
15159                         qr.req = TCP_QUERY_TIMERS_UP;
15160                         ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
15161                         if (ret) {
15162                                 /*
15163                                  * A non-zero return means we have a timer (or
15164                                  * timers) to start. Zero means no timer (no
15165                                  * keepalive, I suppose).
15166                                  */
15167                                 uint32_t tov = 0;
15168
15169                                 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags;
15170                                 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) {
15171                                         rack->r_ctl.rc_last_output_to = qr.timer_pacing_to;
15172                                         if (TSTMP_GT(qr.timer_pacing_to, us_cts))
15173                                                 tov = qr.timer_pacing_to - us_cts;
15174                                         else
15175                                                 tov = HPTS_TICKS_PER_SLOT;
15176                                 }
15177                                 if (qr.timer_hpts_flags & PACE_TMR_MASK) {
15178                                         rack->r_ctl.rc_timer_exp = qr.timer_timer_exp;
15179                                         if (tov == 0) {
15180                                                 if (TSTMP_GT(qr.timer_timer_exp, us_cts))
15181                                                         tov = qr.timer_timer_exp - us_cts;
15182                                                 else
15183                                                         tov = HPTS_TICKS_PER_SLOT;
15184                                         }
15185                                 }
15186                                 rack_log_chg_info(tp, rack, 4,
15187                                                   rack->r_ctl.rc_hpts_flags,
15188                                                   rack->r_ctl.rc_last_output_to,
15189                                                   rack->r_ctl.rc_timer_exp);
15190                                 if (tov) {
15191                                         struct hpts_diag diag;
15192
15193                                         (void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov),
15194                                                                    __LINE__, &diag);
15195                                         rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
15196                                 }
15197                         }
15198                 }
15199                 rack_log_rtt_shrinks(rack,  us_cts,  tp->t_rxtcur,
15200                                      __LINE__, RACK_RTTS_INIT);
15201         }
15202         return (0);
15203 }
15204
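      /*
       * Can this connection be handed over to rack? Returns 0 when the
       * switch is acceptable in the current state, EAGAIN when it should be
       * retried later (SYN states, or a FIN sent with data still
       * outstanding), and EINVAL when SACK is not enabled and not
       * overridden by rack_sack_not_required.
       */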
15205 static int
15206 rack_handoff_ok(struct tcpcb *tp)
15207 {
15208         if ((tp->t_state == TCPS_CLOSED) ||
15209             (tp->t_state == TCPS_LISTEN)) {
15210                 /* Sure no problem though it may not stick */
15211                 return (0);
15212         }
15213         if ((tp->t_state == TCPS_SYN_SENT) ||
15214             (tp->t_state == TCPS_SYN_RECEIVED)) {
15215                 /*
15216                  * We really don't know if you support sack,
15217                  * you have to get to ESTAB or beyond to tell.
15218                  */
15219                 return (EAGAIN);
15220         }
15221         if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
15222                 /*
15223                  * Rack will only send a FIN after all data is acknowledged.
15224                  * So in this case we have more data outstanding. We can't
15225                  * switch stacks until either all data and only the FIN
15226                  * is left (in which case rack_init() now knows how
15227                  * to deal with that) <or> all is acknowledged and we
15228                  * are only left with incoming data, though why you
15229                  * would want to switch to rack after all data is acknowledged
15230                  * I have no idea (rrs)!
15231                  */
15232                 return (EAGAIN);
15233         }
15234         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required) {
15235                 return (0);
15236         }
15237         /*
15238          * If we reach here we don't do SACK on this connection so we can
15239          * never do rack.
15240          */
15241         return (EINVAL);
15242 }
15243
15244 static void
15245 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
15246 {
15247
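              /*
               * Tear down everything rack allocated for this connection:
               * shared cwnd, fast-send block, pacing state, deferred
               * options, and every send map entry, then release the rack
               * pcb itself.
               */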
15248         if (tp->t_fb_ptr) {
15249                 uint32_t cnt_free = 0;
15250                 struct tcp_rack *rack;
15251                 struct rack_sendmap *rsm;
15252
15253                 tcp_handle_orphaned_packets(tp);
15255                 rack = (struct tcp_rack *)tp->t_fb_ptr;
15256                 rack_log_pacing_delay_calc(rack,
15257                                            0,
15258                                            0,
15259                                            0,
15260                                            rack_get_gp_est(rack), /* delRate */
15261                                            rack_get_lt_bw(rack), /* rttProp */
15262                                            20, __LINE__, NULL, 0);
15263 #ifdef NETFLIX_SHARED_CWND
15264                 if (rack->r_ctl.rc_scw) {
15265                         uint32_t limit;
15266
15267                         if (rack->r_limit_scw)
15268                                 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
15269                         else
15270                                 limit = 0;
15271                         tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
15272                                                   rack->r_ctl.rc_scw_index,
15273                                                   limit);
15274                         rack->r_ctl.rc_scw = NULL;
15275                 }
15276 #endif
15277                 if (rack->r_ctl.fsb.tcp_ip_hdr) {
15278                         free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
15279                         rack->r_ctl.fsb.tcp_ip_hdr = NULL;
15280                         rack->r_ctl.fsb.th = NULL;
15281                 }
15282                 if (rack->rc_always_pace) {
15283                         tcp_decrement_paced_conn();
15284                         rack_undo_cc_pacing(rack);
15285                         rack->rc_always_pace = 0;
15286                 }
15287                 /* Clean up any options if they were not applied */
15288                 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
15289                         struct deferred_opt_list *dol;
15290
15291                         dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
15292                         TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
15293                         free(dol, M_TCPDO);
15294                 }
15295                 /* rack does not use force data but other stacks may clear it */
15296                 if (rack->r_ctl.crte != NULL) {
15297                         tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
15298                         rack->rack_hdrw_pacing = 0;
15299                         rack->r_ctl.crte = NULL;
15300                 }
15301 #ifdef TCP_BLACKBOX
15302                 tcp_log_flowend(tp);
15303 #endif
15304                 /*
15305                  * Let's take a different approach to purging: just
15306                  * get each one and free it like a cum-ack would,
15307                  * rather than using a foreach loop.
15308                  */
15309                 rsm = tqhash_min(rack->r_ctl.tqh);
15310                 while (rsm) {
15311                         tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
15312                         rack->r_ctl.rc_num_maps_alloced--;
15313                         uma_zfree(rack_zone, rsm);
15314                         rsm = tqhash_min(rack->r_ctl.tqh);
15315                 }
15316                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
15317                 while (rsm) {
15318                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
15319                         rack->r_ctl.rc_num_maps_alloced--;
15320                         rack->rc_free_cnt--;
15321                         cnt_free++;
15322                         uma_zfree(rack_zone, rsm);
15323                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
15324                 }
15325                 if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
15326                     (tcp_bblogging_on(tp))) {
15327                         union tcp_log_stackspecific log;
15328                         struct timeval tv;
15329
15330                         memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15331                         log.u_bbr.flex8 = 10;
15332                         log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced;
15333                         log.u_bbr.flex2 = rack->rc_free_cnt;
15334                         log.u_bbr.flex3 = cnt_free;
15335                         log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15336                         rsm = tqhash_min(rack->r_ctl.tqh);
15337                         log.u_bbr.delRate = (uint64_t)rsm;
15338                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
15339                         log.u_bbr.cur_del_rate = (uint64_t)rsm;
15340                         log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15341                         log.u_bbr.pkt_epoch = __LINE__;
15342                         (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
15343                                              0, &log, false, NULL, NULL, 0, &tv);
15344                 }
15345                 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0),
15346                         ("rack:%p num_aloc:%u after freeing all?",
15347                          rack,
15348                          rack->r_ctl.rc_num_maps_alloced));
15349                 rack->rc_free_cnt = 0;
15350                 free(rack->r_ctl.tqh, M_TCPFSB);
15351                 rack->r_ctl.tqh = NULL;
15352                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
15353                 tp->t_fb_ptr = NULL;
15354         }
15355         /* Make sure snd_nxt is correctly set */
15356         tp->snd_nxt = tp->snd_max;
15357 }
15358
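      /*
       * Sync rack's cached state with the TCP state machine and point
       * r_substate at the input handler for that state.
       */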
15359 static void
15360 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
15361 {
15362         if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
15363                 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0;
15364         }
15365         switch (tp->t_state) {
15366         case TCPS_SYN_SENT:
15367                 rack->r_state = TCPS_SYN_SENT;
15368                 rack->r_substate = rack_do_syn_sent;
15369                 break;
15370         case TCPS_SYN_RECEIVED:
15371                 rack->r_state = TCPS_SYN_RECEIVED;
15372                 rack->r_substate = rack_do_syn_recv;
15373                 break;
15374         case TCPS_ESTABLISHED:
15375                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15376                 rack->r_state = TCPS_ESTABLISHED;
15377                 rack->r_substate = rack_do_established;
15378                 break;
15379         case TCPS_CLOSE_WAIT:
15380                 rack->r_state = TCPS_CLOSE_WAIT;
15381                 rack->r_substate = rack_do_close_wait;
15382                 break;
15383         case TCPS_FIN_WAIT_1:
15384                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15385                 rack->r_state = TCPS_FIN_WAIT_1;
15386                 rack->r_substate = rack_do_fin_wait_1;
15387                 break;
15388         case TCPS_CLOSING:
15389                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15390                 rack->r_state = TCPS_CLOSING;
15391                 rack->r_substate = rack_do_closing;
15392                 break;
15393         case TCPS_LAST_ACK:
15394                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15395                 rack->r_state = TCPS_LAST_ACK;
15396                 rack->r_substate = rack_do_lastack;
15397                 break;
15398         case TCPS_FIN_WAIT_2:
15399                 rack->r_state = TCPS_FIN_WAIT_2;
15400                 rack->r_substate = rack_do_fin_wait_2;
15401                 break;
15402         case TCPS_LISTEN:
15403         case TCPS_CLOSED:
15404         case TCPS_TIME_WAIT:
15405         default:
15406                 break;
15407         };
15408         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
15409                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
15410
15411 }
15412
15413 static void
15414 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
15415 {
15416         /*
15417          * We received an ack, and then did not
15418          * call send or were bounced out because the
15419          * hpts was running. Now a timer is up as well; is
15420          * it the right timer?
15421          */
15422         struct rack_sendmap *rsm;
15423         int tmr_up;
15424
15425         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
15426         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
15427                 return;
15428         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
15429         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
15430             (tmr_up == PACE_TMR_RXT)) {
15431                 /* Should be an RXT */
15432                 return;
15433         }
15434         if (rsm == NULL) {
15435                 /* Nothing outstanding? */
15436                 if (tp->t_flags & TF_DELACK) {
15437                         if (tmr_up == PACE_TMR_DELACK)
15438                                 /* We are supposed to have delayed ack up and we do */
15439                                 return;
15440                 } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) {
15441                         /*
15442                          * if we hit enobufs then we would expect the possibility
15443                          * of nothing outstanding and the RXT up (and the hptsi timer).
15444                          */
15445                         return;
15446                 } else if (((V_tcp_always_keepalive ||
15447                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
15448                             (tp->t_state <= TCPS_CLOSING)) &&
15449                            (tmr_up == PACE_TMR_KEEP) &&
15450                            (tp->snd_max == tp->snd_una)) {
15451                         /* We should have keep alive up and we do */
15452                         return;
15453                 }
15454         }
15455         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
15456                    ((tmr_up == PACE_TMR_TLP) ||
15457                     (tmr_up == PACE_TMR_RACK) ||
15458                     (tmr_up == PACE_TMR_RXT))) {
15459                 /*
15460                  * Either a Rack, TLP or RXT is fine if  we
15461                  * have outstanding data.
15462                  */
15463                 return;
15464         } else if (tmr_up == PACE_TMR_DELACK) {
15465                 /*
15466                  * If the delayed ack was going to go off
15467                  * before the rtx/tlp/rack timer were going to
15468                  * expire, then that would be the timer in control.
15469                  * Note we don't check the time here trusting the
15470                  * code is correct.
15471                  */
15472                 return;
15473         }
15474         /*
15475          * Ok the timer originally started is not what we want now.
15476          * We will force the hpts to be stopped if any, and restart
15477          * with the slot set to what was in the saved slot.
15478          */
15479         if (tcp_in_hpts(rack->rc_inp)) {
15480                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
15481                         uint32_t us_cts;
15482
15483                         us_cts = tcp_get_usecs(NULL);
15484                         if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
15485                                 rack->r_early = 1;
15486                                 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
15487                         }
15488                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
15489                 }
15490                 tcp_hpts_remove(rack->rc_inp);
15491         }
15492         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15493         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
15494 }
15495
15496
15497 static void
15498 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts)
15499 {
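              /*
               * Classic window update check (RFC 793 style): accept the
               * advertised window if this segment is newer (wl1 < seq), or
               * it is the same segment with a newer ack, or the same ack
               * with a larger window.
               */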
15500         if ((SEQ_LT(tp->snd_wl1, seq) ||
15501             (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
15502             (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
15503                 /* keep track of pure window updates */
15504                 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
15505                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
15506                 tp->snd_wnd = tiwin;
15507                 rack_validate_fo_sendwin_up(tp, rack);
15508                 tp->snd_wl1 = seq;
15509                 tp->snd_wl2 = ack;
15510                 if (tp->snd_wnd > tp->max_sndwnd)
15511                         tp->max_sndwnd = tp->snd_wnd;
15512                 rack->r_wanted_output = 1;
15513         } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
15514                 tp->snd_wnd = tiwin;
15515                 rack_validate_fo_sendwin_up(tp, rack);
15516                 tp->snd_wl1 = seq;
15517                 tp->snd_wl2 = ack;
15518         } else {
15519                 /* Not a valid win update */
15520                 return;
15521         }
15522         if (tp->snd_wnd > tp->max_sndwnd)
15523                 tp->max_sndwnd = tp->snd_wnd;
15524         /* Do we exit persists? */
15525         if ((rack->rc_in_persist != 0) &&
15526             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
15527                                 rack->r_ctl.rc_pace_min_segs))) {
15528                 rack_exit_persist(tp, rack, cts);
15529         }
15530         /* Do we enter persists? */
15531         if ((rack->rc_in_persist == 0) &&
15532             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
15533             TCPS_HAVEESTABLISHED(tp->t_state) &&
15534             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
15535             sbavail(&tptosocket(tp)->so_snd) &&
15536             (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
15537                 /*
15538                  * Here the rwnd is less than
15539                  * the pacing size, we are established,
15540                  * nothing is outstanding, and there is
15541                  * data to send. Enter persists.
15542                  */
15543                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack);
15544         }
15545 }
15546
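      /*
       * Black-box log one entry from a compressed-ack mbuf. Since there is
       * no real TCP header for these, a minimal header (and timestamp
       * option, if present) is synthesized from the ack entry so the
       * TCP_LOG_IN record looks like a normal inbound segment.
       */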
15547 static void
15548 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
15549 {
15550
15551         if (tcp_bblogging_on(rack->rc_tp)) {
15552                 struct inpcb *inp = tptoinpcb(tp);
15553                 union tcp_log_stackspecific log;
15554                 struct timeval ltv;
15555                 char tcp_hdr_buf[60];
15556                 struct tcphdr *th;
15557                 struct timespec ts;
15558                 uint32_t orig_snd_una;
15559                 uint8_t xx = 0;
15560
15561 #ifdef TCP_REQUEST_TRK
15562                 struct http_sendfile_track *http_req;
15563
15564                 if (SEQ_GT(ae->ack, tp->snd_una)) {
15565                         http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
15566                 } else {
15567                         http_req = tcp_http_find_req_for_seq(tp, ae->ack);
15568                 }
15569 #endif
15570                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15571                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
15572                 if (rack->rack_no_prr == 0)
15573                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
15574                 else
15575                         log.u_bbr.flex1 = 0;
15576                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
15577                 log.u_bbr.use_lt_bw <<= 1;
15578                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
15579                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
15580                 log.u_bbr.bbr_state = rack->rc_free_cnt;
15581                 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
15582                 log.u_bbr.pkts_out = tp->t_maxseg;
15583                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
15584                 log.u_bbr.flex7 = 1;
15585                 log.u_bbr.lost = ae->flags;
15586                 log.u_bbr.cwnd_gain = ackval;
15587                 log.u_bbr.pacing_gain = 0x2;
15588                 if (ae->flags & TSTMP_HDWR) {
15589                         /* Record the hardware timestamp if present */
15590                         log.u_bbr.flex3 = M_TSTMP;
15591                         ts.tv_sec = ae->timestamp / 1000000000;
15592                         ts.tv_nsec = ae->timestamp % 1000000000;
15593                         ltv.tv_sec = ts.tv_sec;
15594                         ltv.tv_usec = ts.tv_nsec / 1000;
15595                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
15596                 } else if (ae->flags & TSTMP_LRO) {
15597                         /* Record the LRO arrival timestamp */
15598                         log.u_bbr.flex3 = M_TSTMP_LRO;
15599                         ts.tv_sec = ae->timestamp / 1000000000;
15600                         ts.tv_nsec = ae->timestamp % 1000000000;
15601                         ltv.tv_sec = ts.tv_sec;
15602                         ltv.tv_usec = ts.tv_nsec / 1000;
15603                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
15604                 }
15605                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
15606                 /* Log the rcv time */
15607                 log.u_bbr.delRate = ae->timestamp;
15608 #ifdef TCP_REQUEST_TRK
15609                 log.u_bbr.applimited = tp->t_http_closed;
15610                 log.u_bbr.applimited <<= 8;
15611                 log.u_bbr.applimited |= tp->t_http_open;
15612                 log.u_bbr.applimited <<= 8;
15613                 log.u_bbr.applimited |= tp->t_http_req;
15614                 if (http_req) {
15615                         /* Copy out any client req info */
15616                         /* seconds */
15617                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
15618                         /* useconds */
15619                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
15620                         log.u_bbr.rttProp = http_req->timestamp;
15621                         log.u_bbr.cur_del_rate = http_req->start;
15622                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
15623                                 log.u_bbr.flex8 |= 1;
15624                         } else {
15625                                 log.u_bbr.flex8 |= 2;
15626                                 log.u_bbr.bw_inuse = http_req->end;
15627                         }
15628                         log.u_bbr.flex6 = http_req->start_seq;
15629                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
15630                                 log.u_bbr.flex8 |= 4;
15631                                 log.u_bbr.epoch = http_req->end_seq;
15632                         }
15633                 }
15634 #endif
15635                 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
15636                 th = (struct tcphdr *)tcp_hdr_buf;
15637                 th->th_seq = ae->seq;
15638                 th->th_ack = ae->ack;
15639                 th->th_win = ae->win;
15640                 /* Now fill in the ports */
15641                 th->th_sport = inp->inp_fport;
15642                 th->th_dport = inp->inp_lport;
15643                 tcp_set_flags(th, ae->flags);
15644                 /* Now do we have a timestamp option? */
15645                 if (ae->flags & HAS_TSTMP) {
15646                         u_char *cp;
15647                         uint32_t val;
15648
15649                         th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
15650                         cp = (u_char *)(th + 1);
15651                         *cp = TCPOPT_NOP;
15652                         cp++;
15653                         *cp = TCPOPT_NOP;
15654                         cp++;
15655                         *cp = TCPOPT_TIMESTAMP;
15656                         cp++;
15657                         *cp = TCPOLEN_TIMESTAMP;
15658                         cp++;
15659                         val = htonl(ae->ts_value);
15660                         bcopy((char *)&val,
15661                               (char *)cp, sizeof(uint32_t));
15662                         val = htonl(ae->ts_echo);
15663                         bcopy((char *)&val,
15664                               (char *)(cp + 4), sizeof(uint32_t));
15665                 } else
15666                         th->th_off = (sizeof(struct tcphdr) >> 2);
15667
15668                 /*
15669                  * For sane logging we need to play a little trick.
15670                  * If the ack were fully processed we would have moved
15671                  * snd_una to high_seq, but since compressed acks are
15672                  * processed in two phases, at this point (logging) snd_una
15673                  * won't be advanced. So we would see multiple acks showing
15674                  * the advancement. We can prevent that by "pretending" that
15675                  * snd_una was advanced and then un-advancing it so that the
15676                  * logging code has the right value for tlb_snd_una.
15677                  */
15678                 if (tp->snd_una != high_seq) {
15679                         orig_snd_una = tp->snd_una;
15680                         tp->snd_una = high_seq;
15681                         xx = 1;
15682                 } else
15683                         xx = 0;
15684                 TCP_LOG_EVENTP(tp, th,
15685                                &tptosocket(tp)->so_rcv,
15686                                &tptosocket(tp)->so_snd, TCP_LOG_IN, 0,
15687                                0, &log, true, &ltv);
15688                 if (xx) {
15689                         tp->snd_una = orig_snd_una;
15690                 }
15691         }
15692
15693 }
15694
15695 static void
15696 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
15697 {
15698         uint32_t us_rtt;
15699         /*
15700          * A persist or keep-alive was forced out, update our
15701          * min rtt time. Note that we now worry about lost responses.
15702          * When a subsequent keep-alive or persist times out
15703          * and forced_ack is still on, then the last probe
15704          * was not responded to. In such cases we have a
15705          * sysctl that controls the behavior. Either we apply
15706          * the rtt but with reduced confidence (0). Or we just
15707          * plain don't apply the rtt estimate. Having data flow
15708          * will clear the probe_not_answered flag i.e. cum-ack
15709          * move forward <or> exiting and reentering persists.
15710          */
15711
15712         rack->forced_ack = 0;
15713         rack->rc_tp->t_rxtshift = 0;
15714         if ((rack->rc_in_persist &&
15715              (tiwin == rack->rc_tp->snd_wnd)) ||
15716             (rack->rc_in_persist == 0)) {
15717                 /*
15718                  * In persists only apply the RTT update if this is
15719                  * a response to our window probe. And that
15720                  * means the rwnd sent must match the current
15721                  * snd_wnd. If it does not, then we got a
15722                  * window update ack instead. For keepalive
15723                  * we allow the answer no matter what the window.
15724                  *
15725                  * Note that if the probe_not_answered is set then
15726                  * the forced_ack_ts is the oldest one i.e. the first
15727                  * probe sent that might have been lost. This assures
15728                  * us that if we do calculate an RTT it is longer, not
15729                  * some short thing.
15730                  */
15731                 if (rack->rc_in_persist)
15732                         counter_u64_add(rack_persists_acks, 1);
15733                 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
15734                 if (us_rtt == 0)
15735                         us_rtt = 1;
15736                 if (rack->probe_not_answered == 0) {
15737                         rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15738                         tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
15739                 } else {
15740                         /* We have a retransmitted probe here too */
15741                         if (rack_apply_rtt_with_reduced_conf) {
15742                                 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15743                                 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
15744                         }
15745                 }
15746         }
15747 }
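/*
 * Usage sketch (added note, not original code): the compressed-ack loop
 * below calls rack_handle_probe_response() when a window-update ack
 * arrives while a forced probe (persist or keep-alive) is outstanding:
 *
 *	if (rack->forced_ack)
 *		rack_handle_probe_response(rack, tiwin,
 *		    tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
 *
 * The us_cts argument is the packet arrival time in microseconds, so the
 * probe RTT computed above is simply us_cts - r_ctl.forced_ack_ts.
 */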
15748
15749 static int
15750 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
15751 {
15752         /*
15753          * Handle a "special" compressed ack mbuf. Each incoming
15754          * ack has only four possible dispositions:
15755          *
15756          * A) It moves the cum-ack forward
15757          * B) It is behind the cum-ack.
15758          * C) It is a window-update ack.
15759          * D) It is a dup-ack.
15760          *
15761          * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
15762          * in the incoming mbuf. We also need to still pay attention
15763          * to nxt_pkt since there may be another packet after this
15764          * one.
15765          */
15766 #ifdef TCP_ACCOUNTING
15767         uint64_t ts_val;
15768         uint64_t rdstc;
15769 #endif
15770         int segsiz;
15771         struct timespec ts;
15772         struct tcp_rack *rack;
15773         struct tcp_ackent *ae;
15774         uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
15775         int cnt, i, did_out, ourfinisacked = 0;
15776         struct tcpopt to_holder, *to = NULL;
15777 #ifdef TCP_ACCOUNTING
15778         int win_up_req = 0;
15779 #endif
15780         int nsegs = 0;
15781         int under_pacing = 0;
15782         int recovery = 0;
15783 #ifdef TCP_ACCOUNTING
15784         sched_pin();
15785 #endif
15786         rack = (struct tcp_rack *)tp->t_fb_ptr;
15787         if (rack->gp_ready &&
15788             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
15789                 under_pacing = 1;
15790
15791         if (rack->r_state != tp->t_state)
15792                 rack_set_state(tp, rack);
15793         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
15794             (tp->t_flags & TF_GPUTINPROG)) {
15795                 /*
15796                  * We have a goodput in progress
15797                  * and we have entered a late state.
15798                  * Do we have enough data in the sb
15799                  * to handle the GPUT request?
15800                  */
15801                 uint32_t bytes;
15802
15803                 bytes = tp->gput_ack - tp->gput_seq;
15804                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
15805                         bytes += tp->gput_seq - tp->snd_una;
15806                 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
15807                         /*
15808                          * There are not enough bytes in the socket
15809                          * buffer that have been sent to cover this
15810                          * measurement. Cancel it.
15811                          */
15812                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
15813                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
15814                                                    tp->gput_seq,
15815                                                    0, 0, 18, __LINE__, NULL, 0);
15816                         tp->t_flags &= ~TF_GPUTINPROG;
15817                 }
15818         }
15819         to = &to_holder;
15820         to->to_flags = 0;
15821         KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
15822                 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
15823         cnt = m->m_len / sizeof(struct tcp_ackent);
15824         counter_u64_add(rack_multi_single_eq, cnt);
15825         high_seq = tp->snd_una;
15826         the_win = tp->snd_wnd;
15827         win_seq = tp->snd_wl1;
15828         win_upd_ack = tp->snd_wl2;
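        /*
         * Added note: high_seq, the_win, win_seq and win_upd_ack are local
         * shadows of snd_una, snd_wnd, snd_wl1 and snd_wl2 while we walk the
         * batch of compressed acks. high_seq tracks the highest cum-ack seen
         * in the batch; snd_una itself is only advanced once the whole batch
         * has been classified and processed below.
         */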
15829         cts = tcp_tv_to_usectick(tv);
15830         ms_cts = tcp_tv_to_mssectick(tv);
15831         rack->r_ctl.rc_rcvtime = cts;
15832         segsiz = ctf_fixed_maxseg(tp);
15833         if ((rack->rc_gp_dyn_mul) &&
15834             (rack->use_fixed_rate == 0) &&
15835             (rack->rc_always_pace)) {
15836                 /* Check in on probertt */
15837                 rack_check_probe_rtt(rack, cts);
15838         }
15839         for (i = 0; i < cnt; i++) {
15840 #ifdef TCP_ACCOUNTING
15841                 ts_val = get_cyclecount();
15842 #endif
15843                 rack_clear_rate_sample(rack);
15844                 ae = ((mtod(m, struct tcp_ackent *)) + i);
15845                 if (ae->flags & TH_FIN)
15846                         rack_log_pacing_delay_calc(rack,
15847                                                    0,
15848                                                    0,
15849                                                    0,
15850                                                    rack_get_gp_est(rack), /* delRate */
15851                                                    rack_get_lt_bw(rack), /* rttProp */
15852                                                    20, __LINE__, NULL, 0);
15853                 /* Setup the window */
15854                 tiwin = ae->win << tp->snd_scale;
15855                 if (tiwin > rack->r_ctl.rc_high_rwnd)
15856                         rack->r_ctl.rc_high_rwnd = tiwin;
15857                 /* figure out the type of ack */
15858                 if (SEQ_LT(ae->ack, high_seq)) {
15859                         /* Case B */
15860                         ae->ack_val_set = ACK_BEHIND;
15861                 } else if (SEQ_GT(ae->ack, high_seq)) {
15862                         /* Case A */
15863                         ae->ack_val_set = ACK_CUMACK;
15864                 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
15865                         /* Case D */
15866                         ae->ack_val_set = ACK_DUPACK;
15867                 } else {
15868                         /* Case C */
15869                         ae->ack_val_set = ACK_RWND;
15870                 }
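                /*
                 * Added note: the classification above amounts to the
                 * following table (high_seq is the running cum-ack):
                 *
                 *	ack <  high_seq                     -> ACK_BEHIND (B)
                 *	ack >  high_seq                     -> ACK_CUMACK (A)
                 *	ack == high_seq, window unchanged,
                 *	    not in persist                  -> ACK_DUPACK (D)
                 *	otherwise (window changed/persist)  -> ACK_RWND   (C)
                 */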
15871                 if (rack->sack_attack_disable > 0) {
15872                         rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
15873                         rack->r_ctl.ack_during_sd++;
15874                 }
15875                 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
15876                 /* Validate timestamp */
15877                 if (ae->flags & HAS_TSTMP) {
15878                         /* Setup for a timestamp */
15879                         to->to_flags = TOF_TS;
15880                         ae->ts_echo -= tp->ts_offset;
15881                         to->to_tsecr = ae->ts_echo;
15882                         to->to_tsval = ae->ts_value;
15883                         /*
15884                          * If echoed timestamp is later than the current time, fall back to
15885                          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
15886                          * were used when this connection was established.
15887                          */
15888                         if (TSTMP_GT(ae->ts_echo, ms_cts))
15889                                 to->to_tsecr = 0;
15890                         if (tp->ts_recent &&
15891                             TSTMP_LT(ae->ts_value, tp->ts_recent)) {
15892                                 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
15893 #ifdef TCP_ACCOUNTING
15894                                         rdstc = get_cyclecount();
15895                                         if (rdstc > ts_val) {
15896                                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15897                                                         tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
15898                                                 }
15899                                         }
15900 #endif
15901                                         continue;
15902                                 }
15903                         }
15904                         if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
15905                             SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
15906                                 tp->ts_recent_age = tcp_ts_getticks();
15907                                 tp->ts_recent = ae->ts_value;
15908                         }
15909                 } else {
15910                         /* Setup for no options */
15911                         to->to_flags = 0;
15912                 }
15913                 /* Update the rcv time and perform idle reduction possibly */
15914                 if  (tp->t_idle_reduce &&
15915                      (tp->snd_max == tp->snd_una) &&
15916                      (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
15917                         counter_u64_add(rack_input_idle_reduces, 1);
15918                         rack_cc_after_idle(rack, tp);
15919                 }
15920                 tp->t_rcvtime = ticks;
15921                 /* Now what about ECN of a chain of pure ACKs? */
15922                 if (tcp_ecn_input_segment(tp, ae->flags, 0,
15923                         tcp_packets_this_ack(tp, ae->ack),
15924                         ae->codepoint))
15925                         rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
15926 #ifdef TCP_ACCOUNTING
15927                 /* Count for the specific type of ack in */
15928                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15929                         tp->tcp_cnt_counters[ae->ack_val_set]++;
15930                 }
15931 #endif
15932                 /*
15933                  * Note how we could have moved these up into the determination
15934                  * above, but we don't, so that the timestamp checks (and ECN)
15935                  * are done first, before we do any processing on the ACK.
15936                  * The non-compressed path through the code has this
15937                  * weakness (noted by @jtl) in that it actually does some
15938                  * processing before verifying the timestamp information.
15939                  * We don't take that path here, which is why we set
15940                  * the ack_val_set first, do the timestamp and ECN
15941                  * processing, and then look at what we have set up.
15942                  */
15943                 if (ae->ack_val_set == ACK_BEHIND) {
15944                         /*
15945                          * Case B: flag reordering, but only if the window is not
15946                          * closed; otherwise it could be a keep-alive or persist probe.
15947                          */
15948                         if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
15949                                 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
15950                                 if (rack->r_ctl.rc_reorder_ts == 0)
15951                                         rack->r_ctl.rc_reorder_ts = 1;
15952                         }
15953                 } else if (ae->ack_val_set == ACK_DUPACK) {
15954                         /* Case D */
15955                         rack_strike_dupack(rack);
15956                 } else if (ae->ack_val_set == ACK_RWND) {
15957                         /* Case C */
15958                         if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
15959                                 ts.tv_sec = ae->timestamp / 1000000000;
15960                                 ts.tv_nsec = ae->timestamp % 1000000000;
15961                                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
15962                                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
15963                         } else {
15964                                 rack->r_ctl.act_rcv_time = *tv;
15965                         }
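                        /*
                         * Added note: ae->timestamp is the LRO/hardware
                         * arrival time in nanoseconds; it was just split into
                         * seconds plus nanoseconds and the latter converted
                         * to microseconds for the timeval used by the
                         * usec-based RTT paths below.
                         */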
15966                         if (rack->forced_ack) {
15967                                 rack_handle_probe_response(rack, tiwin,
15968                                                            tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
15969                         }
15970 #ifdef TCP_ACCOUNTING
15971                         win_up_req = 1;
15972 #endif
15973                         win_upd_ack = ae->ack;
15974                         win_seq = ae->seq;
15975                         the_win = tiwin;
15976                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15977                 } else {
15978                         /* Case A */
15979                         if (SEQ_GT(ae->ack, tp->snd_max)) {
15980                                 /*
15981                                  * We just send an ack since the incoming
15982                                  * ack is beyond the largest seq we sent.
15983                                  */
15984                                 if ((tp->t_flags & TF_ACKNOW) == 0) {
15985                                         ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
15986                                         if (tp->t_flags & TF_ACKNOW)
15987                                                 rack->r_wanted_output = 1;
15988                                 }
15989                         } else {
15990                                 nsegs++;
15991                                 /* If the window changed setup to update */
15992                                 if (tiwin != tp->snd_wnd) {
15993                                         win_upd_ack = ae->ack;
15994                                         win_seq = ae->seq;
15995                                         the_win = tiwin;
15996                                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15997                                 }
15998 #ifdef TCP_ACCOUNTING
15999                                 /* Account for the acks */
16000                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16001                                         tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
16002                                 }
16003 #endif
16004                                 high_seq = ae->ack;
16005                                 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
16006                                         rack_log_hystart_event(rack, high_seq, 8);
16007                                 /* Setup our act_rcv_time */
16008                                 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
16009                                         ts.tv_sec = ae->timestamp / 1000000000;
16010                                         ts.tv_nsec = ae->timestamp % 1000000000;
16011                                         rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
16012                                         rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
16013                                 } else {
16014                                         rack->r_ctl.act_rcv_time = *tv;
16015                                 }
16016                                 rack_process_to_cumack(tp, rack, ae->ack, cts, to,
16017                                                        tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
16018 #ifdef TCP_REQUEST_TRK
16019                                 rack_http_check_for_comp(rack, high_seq);
16020 #endif
16021                                 if (rack->rc_dsack_round_seen) {
16022                                         /* Is the dsack round over? */
16023                                         if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
16024                                                 /* Yes it is */
16025                                                 rack->rc_dsack_round_seen = 0;
16026                                                 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
16027                                         }
16028                                 }
16029                         }
16030                 }
16031                 /* And lets be sure to commit the rtt measurements for this ack */
16032                 tcp_rack_xmit_timer_commit(rack, tp);
16033 #ifdef TCP_ACCOUNTING
16034                 rdstc = get_cyclecount();
16035                 if (rdstc > ts_val) {
16036                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16037                                 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
16038                                 if (ae->ack_val_set == ACK_CUMACK)
16039                                         tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
16040                         }
16041                 }
16042 #endif
16043         }
16044 #ifdef TCP_ACCOUNTING
16045         ts_val = get_cyclecount();
16046 #endif
16047         /* Tend to any collapsed window */
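        /*
         * Added note (sketch): the window is treated as collapsed when the
         * peer's advertised window no longer covers the data already sent
         * past the cum-ack, i.e. snd_wnd < (snd_max - high_seq). The collapse
         * point is recorded and becomes invalid again once the cum-ack moves
         * beyond it.
         */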
16048         if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
16049                 /* The peer collapsed the window */
16050                 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__);
16051         } else if (rack->rc_has_collapsed)
16052                 rack_un_collapse_window(rack, __LINE__);
16053         if ((rack->r_collapse_point_valid) &&
16054             (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
16055                 rack->r_collapse_point_valid = 0;
16056         acked_amount = acked = (high_seq - tp->snd_una);
16057         if (acked) {
16058                 /*
16059                  * The draft (v3) calls for us to use SEQ_GEQ, but that
16060                  * causes issues when we are just going app limited. Let's
16061                  * instead use SEQ_GT <or> where it's equal but more data
16062                  * is outstanding.
16063                  *
16064                  * Also make sure we are on the last ack of a series. We
16065                  * have to have all the acks processed in queue to know
16066                  * if there is something left outstanding.
16067                  *
16068                  */
16069                 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
16070                     (rack->rc_new_rnd_needed == 0) &&
16071                     (nxt_pkt == 0)) {
16072                         rack_log_hystart_event(rack, high_seq, 21);
16073                         rack->r_ctl.current_round++;
16074                         /* Force the next send to setup the next round */
16075                         rack->rc_new_rnd_needed = 1;
16076                         if (CC_ALGO(tp)->newround != NULL) {
16077                                 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
16078                         }
16079                 }
16080                 /*
16081                  * Clear the probe not answered flag
16082                  * since cum-ack moved forward.
16083                  */
16084                 rack->probe_not_answered = 0;
16085                 if (rack->sack_attack_disable == 0)
16086                         rack_do_decay(rack);
16087                 if (acked >= segsiz) {
16088                         /*
16089                          * You only get credit for
16090                          * MSS and greater (and you get extra
16091                          * credit for larger cum-ack moves).
16092                          */
16093                         int ac;
16094
16095                         ac = acked / segsiz;
16096                         rack->r_ctl.ack_count += ac;
16097                         counter_u64_add(rack_ack_total, ac);
16098                 }
16099                 if (rack->r_ctl.ack_count > 0xfff00000) {
16100                         /*
16101                          * reduce the number to keep us under
16102                          * a uint32_t.
16103                          */
16104                         rack->r_ctl.ack_count /= 2;
16105                         rack->r_ctl.sack_count /= 2;
16106                 }
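                /*
                 * Added note: both counters are halved (rather than just the
                 * one nearing overflow) so that, presumably, their relative
                 * proportions remain meaningful to the consumers of these
                 * counts while staying within a uint32_t.
                 */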
16107                 if (tp->t_flags & TF_NEEDSYN) {
16108                         /*
16109                          * T/TCP: Connection was half-synchronized, and our SYN has
16110                          * been ACK'd (so connection is now fully synchronized).  Go
16111                          * to non-starred state, increment snd_una for ACK of SYN,
16112                          * and check if we can do window scaling.
16113                          */
16114                         tp->t_flags &= ~TF_NEEDSYN;
16115                         tp->snd_una++;
16116                         acked_amount = acked = (high_seq - tp->snd_una);
16117                 }
16118                 if (acked > sbavail(&so->so_snd))
16119                         acked_amount = sbavail(&so->so_snd);
16120 #ifdef TCP_SAD_DETECTION
16121                 /*
16122                  * We only care on a cum-ack move if we are in a sack-disabled
16123                  * state. We have already added in to the ack_count, and we never
16124                  * would disable on a cum-ack move, so we only care to do the
16125                  * detection if it may "undo" it, i.e. we were in disabled already.
16126                  */
16127                 if (rack->sack_attack_disable)
16128                         rack_do_detection(tp, rack, acked_amount, segsiz);
16129 #endif
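                /*
                 * Added note: with the cum-ack known, do the loss-recovery
                 * bookkeeping. In fast recovery (and with PRR enabled) PRR
                 * accounts for the newly acked data; then, if we are in
                 * recovery, a cum-ack still below snd_recover (and snd_max)
                 * is a partial ack, otherwise recovery is complete.
                 */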
16130                 if (IN_FASTRECOVERY(tp->t_flags) &&
16131                     (rack->rack_no_prr == 0))
16132                         rack_update_prr(tp, rack, acked_amount, high_seq);
16133                 if (IN_RECOVERY(tp->t_flags)) {
16134                         if (SEQ_LT(high_seq, tp->snd_recover) &&
16135                             (SEQ_LT(high_seq, tp->snd_max))) {
16136                                 tcp_rack_partialack(tp);
16137                         } else {
16138                                 rack_post_recovery(tp, high_seq);
16139                                 recovery = 1;
16140                         }
16141                 }
16142                 /* Handle the rack-log-ack part (sendmap) */
16143                 if ((sbused(&so->so_snd) == 0) &&
16144                     (acked > acked_amount) &&
16145                     (tp->t_state >= TCPS_FIN_WAIT_1) &&
16146                     (tp->t_flags & TF_SENTFIN)) {
16147                         /*
16148                          * We must be sure our fin
16149                          * was sent and acked (we can be
16150                          * in FIN_WAIT_1 without having
16151                          * sent the fin).
16152                          */
16153                         ourfinisacked = 1;
16154                         /*
16155                          * Let's make sure snd_una is updated
16156                          * since most likely acked_amount = 0 (it
16157                          * should be).
16158                          */
16159                         tp->snd_una = high_seq;
16160                 }
16161                 /* Did we make a RTO error? */
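                /*
                 * Added note: this detects a spurious retransmission timeout
                 * when timestamps are not in use. If the first (and only)
                 * retransmission is acked while we are still inside
                 * t_badrxtwin, the RTO was in error and CC_RTO_ERR lets the
                 * congestion control undo its response.
                 */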
16162                 if ((tp->t_flags & TF_PREVVALID) &&
16163                     ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
16164                         tp->t_flags &= ~TF_PREVVALID;
16165                         if (tp->t_rxtshift == 1 &&
16166                             (int)(ticks - tp->t_badrxtwin) < 0)
16167                                 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__);
16168                 }
16169                 /* Handle the data in the socket buffer */
16170                 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
16171                 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
16172                 if (acked_amount > 0) {
16173                         struct mbuf *mfree;
16174
16175                         rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
16176                         SOCKBUF_LOCK(&so->so_snd);
16177                         mfree = sbcut_locked(&so->so_snd, acked_amount);
16178                         tp->snd_una = high_seq;
16179                         /* Note we want to hold the sb lock through the sendmap adjust */
16180                         rack_adjust_sendmap_head(rack, &so->so_snd);
16181                         /* Wake up the socket if we have room to write more */
16182                         rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
16183                         sowwakeup_locked(so);
16184                         if ((recovery == 1) &&
16185                             (rack->excess_rxt_on) &&
16186                             (rack->r_cwnd_was_clamped == 0)) {
16187                                 do_rack_excess_rxt(tp, rack);
16188                         } else if (rack->r_cwnd_was_clamped)
16189                                 do_rack_check_for_unclamp(tp, rack);
16190                         m_freem(mfree);
16191                 }
16192                 /* update progress */
16193                 tp->t_acktime = ticks;
16194                 rack_log_progress_event(rack, tp, tp->t_acktime,
16195                                         PROGRESS_UPDATE, __LINE__);
16196                 /* Clear out shifts and such */
16197                 tp->t_rxtshift = 0;
16198                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
16199                                    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
16200                 rack->rc_tlp_in_progress = 0;
16201                 rack->r_ctl.rc_tlp_cnt_out = 0;
16202                 /* Send recover and snd_nxt must be dragged along */
16203                 if (SEQ_GT(tp->snd_una, tp->snd_recover))
16204                         tp->snd_recover = tp->snd_una;
16205                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
16206                         tp->snd_nxt = tp->snd_una;
16207                 /*
16208                  * If the RXT timer is running we want to
16209                  * stop it, so we can restart a TLP (or new RXT).
16210                  */
16211                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
16212                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
16213                 tp->snd_wl2 = high_seq;
16214                 tp->t_dupacks = 0;
16215                 if (under_pacing &&
16216                     (rack->use_fixed_rate == 0) &&
16217                     (rack->in_probe_rtt == 0) &&
16218                     rack->rc_gp_dyn_mul &&
16219                     rack->rc_always_pace) {
16220                         /* Check if we are dragging bottom */
16221                         rack_check_bottom_drag(tp, rack, so);
16222                 }
16223                 if (tp->snd_una == tp->snd_max) {
16224                         tp->t_flags &= ~TF_PREVVALID;
16225                         rack->r_ctl.retran_during_recovery = 0;
16226                         rack->rc_suspicious = 0;
16227                         rack->r_ctl.dsack_byte_cnt = 0;
16228                         rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
16229                         if (rack->r_ctl.rc_went_idle_time == 0)
16230                                 rack->r_ctl.rc_went_idle_time = 1;
16231                         rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
16232                         if (sbavail(&tptosocket(tp)->so_snd) == 0)
16233                                 tp->t_acktime = 0;
16234                         /* Set so we might enter persists... */
16235                         rack->r_wanted_output = 1;
16236                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
16237                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
16238                         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
16239                             (sbavail(&so->so_snd) == 0) &&
16240                             (tp->t_flags2 & TF2_DROP_AF_DATA)) {
16241                                 /*
16242                                  * The socket was gone and the
16243                                  * peer sent data (not now, but earlier), so it is time to
16244                                  * reset the connection.
16245                                  */
16246                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
16247                                 /* tcp_close will kill the inp pre-log the Reset */
16248                                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
16249 #ifdef TCP_ACCOUNTING
16250                                 rdstc = get_cyclecount();
16251                                 if (rdstc > ts_val) {
16252                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16253                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16254                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16255                                         }
16256                                 }
16257 #endif
16258                                 m_freem(m);
16259                                 tp = tcp_close(tp);
16260                                 if (tp == NULL) {
16261 #ifdef TCP_ACCOUNTING
16262                                         sched_unpin();
16263 #endif
16264                                         return (1);
16265                                 }
16266                                 /*
16267                                  * We would normally do drop-with-reset which would
16268                                  * send back a reset. We can't since we don't have
16269                                  * all the needed bits. Instead let's arrange for
16270                                  * a call to tcp_output(). That way since we
16271                                  * are in the closed state we will generate a reset.
16272                                  *
16273                                  * Note if tcp_accounting is on we don't unpin since
16274                                  * we do that after the goto label.
16275                                  */
16276                                 goto send_out_a_rst;
16277                         }
16278                         if ((sbused(&so->so_snd) == 0) &&
16279                             (tp->t_state >= TCPS_FIN_WAIT_1) &&
16280                             (tp->t_flags & TF_SENTFIN)) {
16281                                 /*
16282                                  * If we can't receive any more data, then closing user can
16283                                  * proceed. Starting the timer is contrary to the
16284                                  * specification, but if we don't get a FIN we'll hang
16285                                  * forever.
16286                                  *
16287                                  */
16288                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
16289                                         soisdisconnected(so);
16290                                         tcp_timer_activate(tp, TT_2MSL,
16291                                                            (tcp_fast_finwait2_recycle ?
16292                                                             tcp_finwait2_timeout :
16293                                                             TP_MAXIDLE(tp)));
16294                                 }
16295                                 if (ourfinisacked == 0) {
16296                                         /*
16297                                          * We don't change to fin-wait-2 if we have our fin acked
16298                                          * which means we are probably in TCPS_CLOSING.
16299                                          */
16300                                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
16301                                 }
16302                         }
16303                 }
16304                 /* Wake up the socket if we have room to write more */
16305                 if (sbavail(&so->so_snd)) {
16306                         rack->r_wanted_output = 1;
16307                         if (ctf_progress_timeout_check(tp, true)) {
16308                                 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
16309                                                         tp, tick, PROGRESS_DROP, __LINE__);
16310                                 /*
16311                                  * We cheat here and don't send a RST, we should send one
16312                                  * when the pacer drops the connection.
16313                                  */
16314 #ifdef TCP_ACCOUNTING
16315                                 rdstc = get_cyclecount();
16316                                 if (rdstc > ts_val) {
16317                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16318                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16319                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16320                                         }
16321                                 }
16322                                 sched_unpin();
16323 #endif
16324                                 (void)tcp_drop(tp, ETIMEDOUT);
16325                                 m_freem(m);
16326                                 return (1);
16327                         }
16328                 }
16329                 if (ourfinisacked) {
16330                         switch(tp->t_state) {
16331                         case TCPS_CLOSING:
16332 #ifdef TCP_ACCOUNTING
16333                                 rdstc = get_cyclecount();
16334                                 if (rdstc > ts_val) {
16335                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16336                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16337                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16338                                         }
16339                                 }
16340                                 sched_unpin();
16341 #endif
16342                                 tcp_twstart(tp);
16343                                 m_freem(m);
16344                                 return (1);
16345                                 break;
16346                         case TCPS_LAST_ACK:
16347 #ifdef TCP_ACCOUNTING
16348                                 rdstc = get_cyclecount();
16349                                 if (rdstc > ts_val) {
16350                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16351                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16352                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16353                                         }
16354                                 }
16355                                 sched_unpin();
16356 #endif
16357                                 tp = tcp_close(tp);
16358                                 ctf_do_drop(m, tp);
16359                                 return (1);
16360                                 break;
16361                         case TCPS_FIN_WAIT_1:
16362 #ifdef TCP_ACCOUNTING
16363                                 rdstc = get_cyclecount();
16364                                 if (rdstc > ts_val) {
16365                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16366                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16367                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16368                                         }
16369                                 }
16370 #endif
16371                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
16372                                         soisdisconnected(so);
16373                                         tcp_timer_activate(tp, TT_2MSL,
16374                                                            (tcp_fast_finwait2_recycle ?
16375                                                             tcp_finwait2_timeout :
16376                                                             TP_MAXIDLE(tp)));
16377                                 }
16378                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
16379                                 break;
16380                         default:
16381                                 break;
16382                         }
16383                 }
16384                 if (rack->r_fast_output) {
16385                         /*
16386                          * We are doing fast output. Can we expand that?
16387                          */
16388                         rack_gain_for_fastoutput(rack, tp, so, acked_amount);
16389                 }
16390 #ifdef TCP_ACCOUNTING
16391                 rdstc = get_cyclecount();
16392                 if (rdstc > ts_val) {
16393                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16394                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16395                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16396                         }
16397                 }
16398
16399         } else if (win_up_req) {
16400                 rdstc = get_cyclecount();
16401                 if (rdstc > ts_val) {
16402                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16403                                 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
16404                         }
16405                 }
16406 #endif
16407         }
16408         /* Now is there a next packet, if so we are done */
16409         m_freem(m);
16410         did_out = 0;
16411         if (nxt_pkt) {
16412 #ifdef TCP_ACCOUNTING
16413                 sched_unpin();
16414 #endif
16415                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
16416                 return (0);
16417         }
16418         rack_handle_might_revert(tp, rack);
16419         ctf_calc_rwin(so, tp);
16420         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
16421         send_out_a_rst:
16422                 if (tcp_output(tp) < 0) {
16423 #ifdef TCP_ACCOUNTING
16424                         sched_unpin();
16425 #endif
16426                         return (1);
16427                 }
16428                 did_out = 1;
16429         }
16430         rack_free_trim(rack);
16431 #ifdef TCP_ACCOUNTING
16432         sched_unpin();
16433 #endif
16434         rack_timer_audit(tp, rack, &so->so_snd);
16435         rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
16436         return (0);
16437 }
16438
16439 #define TCP_LRO_TS_OPTION \
16440     ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
16441           (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
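/*
 * Added note: TCP_LRO_TS_OPTION is the first 32-bit word of an RFC 1323
 * "appendix A" formatted timestamp option: NOP, NOP, TIMESTAMP kind,
 * TIMESTAMP length. The fast check below can then recognize a pure
 * timestamp-only option block (TCPOLEN_TSTAMP_APPA bytes) by comparing a
 * single word instead of parsing the options.
 */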
16442
16443 static int
16444 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
16445     int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt,
16446     struct timeval *tv)
16447 {
16448         struct inpcb *inp = tptoinpcb(tp);
16449         struct socket *so = tptosocket(tp);
16450 #ifdef TCP_ACCOUNTING
16451         uint64_t ts_val;
16452 #endif
16453         int32_t thflags, retval, did_out = 0;
16454         int32_t way_out = 0;
16455         /*
16456          * cts - is the current time from tv (caller gets ts) in microseconds.
16457          * ms_cts - is the current time from tv in milliseconds.
16458          * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
16459          */
16460         uint32_t cts, us_cts, ms_cts;
16461         uint32_t tiwin, high_seq;
16462         struct timespec ts;
16463         struct tcpopt to;
16464         struct tcp_rack *rack;
16465         struct rack_sendmap *rsm;
16466         int32_t prev_state = 0;
16467         int no_output = 0;
16468         int slot_remaining = 0;
16469 #ifdef TCP_ACCOUNTING
16470         int ack_val_set = 0xf;
16471 #endif
16472         int nsegs;
16473
16474         NET_EPOCH_ASSERT();
16475         INP_WLOCK_ASSERT(inp);
16476
16477         /*
16478          * tv passed from common code is from either M_TSTMP_LRO or
16479          * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
16480          */
16481         rack = (struct tcp_rack *)tp->t_fb_ptr;
16482         if (rack->rack_deferred_inited == 0) {
16483                 /*
16484                  * If we are the connecting socket we will
16485                  * hit rack_init() when no sequence numbers
16486                  * are setup. This makes it so we must defer
16487                  * some initialization. Call that now.
16488                  */
16489                 rack_deferred_init(tp, rack);
16490         }
16491         /*
16492          * Check to see if we need to skip any output plans. This
16493          * can happen in the non-LRO path where we are pacing and
16494          * must process the ack coming in but need to defer sending
16495          * anything because a pacing timer is running.
16496          */
16497         us_cts = tcp_tv_to_usectick(tv);
16498         if ((rack->rc_always_pace == 1) &&
16499             (rack->rc_ack_can_sendout_data == 0) &&
16500             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16501             (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) {
16502                 /*
16503                  * Ok, conditions are right for queuing the packets, but
16504                  * we do have to check the flags in the inp. It could be
16505                  * that, if a SACK is present, we want to be awoken and
16506                  * so should process the packets.
16507                  */
16508                 slot_remaining = rack->r_ctl.rc_last_output_to - us_cts;
16509                 if (rack->rc_inp->inp_flags2 & INP_DONT_SACK_QUEUE) {
16510                         no_output = 1;
16511                 } else {
16512                         /*
16513                          * If there are no options, or just a
16514                          * timestamp option, we will want to queue
16515                          * the packets. This is the same that LRO does
16516                          * and will need to change with accurate ECN.
16517                          */
16518                         uint32_t *ts_ptr;
16519                         int optlen;
16520
16521                         optlen = (th->th_off << 2) - sizeof(struct tcphdr);
16522                         ts_ptr = (uint32_t *)(th + 1);
16523                         if ((optlen == 0) ||
16524                             ((optlen == TCPOLEN_TSTAMP_APPA) &&
16525                              (*ts_ptr == TCP_LRO_TS_OPTION)))
16526                                 no_output = 1;
16527                 }
16528         }
16529         if (m->m_flags & M_ACKCMP) {
16530                 /*
16531                  * All compressed acks are acks by definition, so
16532                  * remove any ack required flag and then do the processing.
16533                  */
16534                 rack->rc_ack_required = 0;
16535                 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
16536         }
16537         thflags = tcp_get_flags(th);
16538         /*
16539          * If there is a RST or FIN, let's dump out the bw.
16540          * With a FIN the connection may go on, but we
16541          * may not.
16542          */
16543         if ((thflags & TH_FIN) || (thflags & TH_RST))
16544                 rack_log_pacing_delay_calc(rack,
16545                                            rack->r_ctl.gp_bw,
16546                                            0,
16547                                            0,
16548                                            rack_get_gp_est(rack), /* delRate */
16549                                            rack_get_lt_bw(rack), /* rttProp */
16550                                            20, __LINE__, NULL, 0);
16551         if (m->m_flags & M_ACKCMP) {
16552                 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
16553         }
16554         cts = tcp_tv_to_usectick(tv);
16555         ms_cts =  tcp_tv_to_mssectick(tv);
16556         nsegs = m->m_pkthdr.lro_nsegs;
16557         counter_u64_add(rack_proc_non_comp_ack, 1);
16558 #ifdef TCP_ACCOUNTING
16559         sched_pin();
16560         if (thflags & TH_ACK)
16561                 ts_val = get_cyclecount();
16562 #endif
16563         if ((m->m_flags & M_TSTMP) ||
16564             (m->m_flags & M_TSTMP_LRO)) {
16565                 mbuf_tstmp2timespec(m, &ts);
16566                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
16567                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
16568         } else
16569                 rack->r_ctl.act_rcv_time = *tv;
16570         kern_prefetch(rack, &prev_state);
16571         prev_state = 0;
16572         /*
16573          * Unscale the window into a 32-bit value. For the SYN_SENT state
16574          * the scale is zero.
16575          */
16576         tiwin = th->th_win << tp->snd_scale;
16577 #ifdef TCP_ACCOUNTING
16578         if (thflags & TH_ACK) {
16579                 /*
16580                  * We have a tradeoff here. We can either do what we are
16581                  * doing i.e. pinning to this CPU and then doing the accounting
16582                  * <or> we could do a critical enter, setup the rdtsc and cpu
16583                  * as in below, and then validate we are on the same CPU on
16584                  * exit. I have chosen not to do the critical enter since
16585                  * that often will gain you a context switch, and instead lock
16586                  * us (line above this if) to the same CPU with sched_pin(). This
16587                  * means we may be context switched out for a higher priority
16588                  * interrupt but we won't be moved to another CPU.
16589                  *
16590                  * If this occurs (which it won't very often, since we most likely
16591                  * are running this code in interrupt context and only a higher
16592                  * priority will bump us ... clock?) we will falsely add the
16593                  * interrupt processing time on top of the ack processing
16594                  * time. This is ok since it's a rare event.
16595                  */
16596                 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
16597                                                     ctf_fixed_maxseg(tp));
16598         }
16599 #endif
16600         /*
16601          * Parse options on any incoming segment.
16602          */
16603         memset(&to, 0, sizeof(to));
16604         tcp_dooptions(&to, (u_char *)(th + 1),
16605             (th->th_off << 2) - sizeof(struct tcphdr),
16606             (thflags & TH_SYN) ? TO_SYN : 0);
16607         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
16608             __func__));
16609         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
16610             __func__));
16611
16612         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
16613             (tp->t_flags & TF_GPUTINPROG)) {
16614                 /*
16615                  * We have a goodput in progress
16616                  * and we have entered a late state.
16617                  * Do we have enough data in the sb
16618                  * to handle the GPUT request?
16619                  */
16620                 uint32_t bytes;
16621
16622                 bytes = tp->gput_ack - tp->gput_seq;
16623                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
16624                         bytes += tp->gput_seq - tp->snd_una;
16625                 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
16626                         /*
16627                          * There are not enough bytes in the socket
16628                          * buffer that have been sent to cover this
16629                          * measurement. Cancel it.
16630                          */
16631                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
16632                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
16633                                                    tp->gput_seq,
16634                                                    0, 0, 18, __LINE__, NULL, 0);
16635                         tp->t_flags &= ~TF_GPUTINPROG;
16636                 }
16637         }
16638         high_seq = th->th_ack;
16639         if (tcp_bblogging_on(rack->rc_tp)) {
16640                 union tcp_log_stackspecific log;
16641                 struct timeval ltv;
16642 #ifdef TCP_REQUEST_TRK
16643                 struct http_sendfile_track *http_req;
16644
16645                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
16646                         http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
16647                 } else {
16648                         http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
16649                 }
16650 #endif
16651                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16652                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
16653                 if (rack->rack_no_prr == 0)
16654                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16655                 else
16656                         log.u_bbr.flex1 = 0;
16657                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
16658                 log.u_bbr.use_lt_bw <<= 1;
16659                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
16660                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
16661                 log.u_bbr.bbr_state = rack->rc_free_cnt;
16662                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16663                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
16664                 log.u_bbr.flex3 = m->m_flags;
16665                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
16666                 log.u_bbr.lost = thflags;
16667                 log.u_bbr.pacing_gain = 0x1;
16668 #ifdef TCP_ACCOUNTING
16669                 log.u_bbr.cwnd_gain = ack_val_set;
16670 #endif
16671                 log.u_bbr.flex7 = 2;
16672                 if (m->m_flags & M_TSTMP) {
16673                         /* Record the hardware timestamp if present */
16674                         mbuf_tstmp2timespec(m, &ts);
16675                         ltv.tv_sec = ts.tv_sec;
16676                         ltv.tv_usec = ts.tv_nsec / 1000;
16677                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
16678                 } else if (m->m_flags & M_TSTMP_LRO) {
16679                         /* Record the LRO arrival timestamp */
16680                         mbuf_tstmp2timespec(m, &ts);
16681                         ltv.tv_sec = ts.tv_sec;
16682                         ltv.tv_usec = ts.tv_nsec / 1000;
16683                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
16684                 }
16685                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
16686                 /* Log the rcv time */
16687                 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
16688 #ifdef TCP_REQUEST_TRK
16689                 log.u_bbr.applimited = tp->t_http_closed;
16690                 log.u_bbr.applimited <<= 8;
16691                 log.u_bbr.applimited |= tp->t_http_open;
16692                 log.u_bbr.applimited <<= 8;
16693                 log.u_bbr.applimited |= tp->t_http_req;
16694                 if (http_req) {
16695                         /* Copy out any client req info */
16696                         /* seconds */
16697                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
16698                         /* useconds */
16699                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
16700                         log.u_bbr.rttProp = http_req->timestamp;
16701                         log.u_bbr.cur_del_rate = http_req->start;
16702                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
16703                                 log.u_bbr.flex8 |= 1;
16704                         } else {
16705                                 log.u_bbr.flex8 |= 2;
16706                                 log.u_bbr.bw_inuse = http_req->end;
16707                         }
16708                         log.u_bbr.flex6 = http_req->start_seq;
16709                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
16710                                 log.u_bbr.flex8 |= 4;
16711                                 log.u_bbr.epoch = http_req->end_seq;
16712                         }
16713                 }
16714 #endif
16715                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
16716                     tlen, &log, true, &ltv);
16717         }
16718         /* Remove ack required flag if set, we have one  */
16719         if (thflags & TH_ACK)
16720                 rack->rc_ack_required = 0;
16721         if (rack->sack_attack_disable > 0) {
16722                 rack->r_ctl.ack_during_sd++;
16723                 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
16724         }
16725         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
16726                 way_out = 4;
16727                 retval = 0;
16728                 m_freem(m);
16729                 goto done_with_input;
16730         }
16731         /*
16732          * If a segment with the ACK-bit set arrives in the SYN-SENT state
16733          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
16734          */
16735         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
16736             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
16737                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
16738                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
16739 #ifdef TCP_ACCOUNTING
16740                 sched_unpin();
16741 #endif
16742                 return (1);
16743         }
16744         /*
16745          * If timestamps were negotiated during SYN/ACK and a
16746          * segment without a timestamp is received, silently drop
16747          * the segment, unless it is a RST segment or missing timestamps are
16748          * tolerated.
16749          * See section 3.2 of RFC 7323.
16750          */
16751         if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
16752             ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
16753                 way_out = 5;
16754                 retval = 0;
16755                 m_freem(m);
16756                 goto done_with_input;
16757         }
16758
16759         /*
16760          * Segment received on connection. Reset idle time and keep-alive
16761          * timer. XXX: This should be done after segment validation to
16762          * ignore broken/spoofed segs.
16763          */
16764         if  (tp->t_idle_reduce &&
16765              (tp->snd_max == tp->snd_una) &&
16766              (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
16767                 counter_u64_add(rack_input_idle_reduces, 1);
16768                 rack_cc_after_idle(rack, tp);
16769         }
16770         tp->t_rcvtime = ticks;
16771 #ifdef STATS
16772         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
16773 #endif
16774         if (tiwin > rack->r_ctl.rc_high_rwnd)
16775                 rack->r_ctl.rc_high_rwnd = tiwin;
16776         /*
16777          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
16778          * this to occur after we've validated the segment.
16779          */
16780         if (tcp_ecn_input_segment(tp, thflags, tlen,
16781             tcp_packets_this_ack(tp, th->th_ack),
16782             iptos))
16783                 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
16784
16785         /*
16786          * If echoed timestamp is later than the current time, fall back to
16787          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
16788          * were used when this connection was established.
16789          */
16790         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
16791                 to.to_tsecr -= tp->ts_offset;
16792                 if (TSTMP_GT(to.to_tsecr, ms_cts))
16793                         to.to_tsecr = 0;
16794         }
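        /*
         * Illustrative note (added; the value is hypothetical): if syncookies
         * were used, tp->ts_offset holds the offset that was added to our
         * timestamps, e.g. ts_offset = 0x1000.  A peer echo of
         * to_tsecr = our_tsval + 0x1000 is normalized back to our_tsval by
         * the subtraction above; if the normalized echo still appears to be
         * in the future relative to ms_cts it is cleared, so the RTT sample
         * falls back to the non-RFC1323 path.
         */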
16795
16796         /*
16797          * If it's the first time in, we need to take care of options and
16798          * verify we can do SACK for rack!
16799          */
16800         if (rack->r_state == 0) {
16801                 /* Should be init'd by rack_init() */
16802                 KASSERT(rack->rc_inp != NULL,
16803                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
16804                 if (rack->rc_inp == NULL) {
16805                         rack->rc_inp = inp;
16806                 }
16807
16808                 /*
16809                  * Process options only when we get SYN/ACK back. The SYN
16810                  * case for incoming connections is handled in tcp_syncache.
16811                  * According to RFC1323 the window field in a SYN (i.e., a
16812                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
16813                  * this is traditional behavior, may need to be cleaned up.
16814                  */
16815                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
16816                         /* Handle parallel SYN for ECN */
16817                         tcp_ecn_input_parallel_syn(tp, thflags, iptos);
16818                         if ((to.to_flags & TOF_SCALE) &&
16819                             (tp->t_flags & TF_REQ_SCALE)) {
16820                                 tp->t_flags |= TF_RCVD_SCALE;
16821                                 tp->snd_scale = to.to_wscale;
16822                         } else
16823                                 tp->t_flags &= ~TF_REQ_SCALE;
16824                         /*
16825                          * Initial send window.  It will be updated with the
16826                          * next incoming segment to the scaled value.
16827                          */
16828                         tp->snd_wnd = th->th_win;
16829                         rack_validate_fo_sendwin_up(tp, rack);
16830                         if ((to.to_flags & TOF_TS) &&
16831                             (tp->t_flags & TF_REQ_TSTMP)) {
16832                                 tp->t_flags |= TF_RCVD_TSTMP;
16833                                 tp->ts_recent = to.to_tsval;
16834                                 tp->ts_recent_age = cts;
16835                         } else
16836                                 tp->t_flags &= ~TF_REQ_TSTMP;
16837                         if (to.to_flags & TOF_MSS) {
16838                                 tcp_mss(tp, to.to_mss);
16839                         }
16840                         if ((tp->t_flags & TF_SACK_PERMIT) &&
16841                             (to.to_flags & TOF_SACKPERM) == 0)
16842                                 tp->t_flags &= ~TF_SACK_PERMIT;
16843                         if (IS_FASTOPEN(tp->t_flags)) {
16844                                 if (to.to_flags & TOF_FASTOPEN) {
16845                                         uint16_t mss;
16846
16847                                         if (to.to_flags & TOF_MSS)
16848                                                 mss = to.to_mss;
16849                                         else
16850                                                 if ((inp->inp_vflag & INP_IPV6) != 0)
16851                                                         mss = TCP6_MSS;
16852                                                 else
16853                                                         mss = TCP_MSS;
16854                                         tcp_fastopen_update_cache(tp, mss,
16855                                             to.to_tfo_len, to.to_tfo_cookie);
16856                                 } else
16857                                         tcp_fastopen_disable_path(tp);
16858                         }
16859                 }
16860                 /*
16861                  * At this point we are at the initial call. Here we decide
16862                  * if we are doing RACK or not. We do this by seeing if
16863                  * TF_SACK_PERMIT is set and the sack-not-required is clear.
16864                  * The code now does dup-ack counting, so if you don't
16865                  * switch back you won't get RACK & TLP, but you will still
16866                  * get this stack.
16867                  */
16868
16869                 if ((rack_sack_not_required == 0) &&
16870                     ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
16871                         tcp_switch_back_to_default(tp);
16872                         (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen,
16873                             tlen, iptos);
16874 #ifdef TCP_ACCOUNTING
16875                         sched_unpin();
16876 #endif
16877                         return (1);
16878                 }
16879                 tcp_set_hpts(inp);
16880                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
16881         }
16882         if (thflags & TH_FIN)
16883                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
16884         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
16885         if ((rack->rc_gp_dyn_mul) &&
16886             (rack->use_fixed_rate == 0) &&
16887             (rack->rc_always_pace)) {
16888                 /* Check in on probertt */
16889                 rack_check_probe_rtt(rack, us_cts);
16890         }
16891         rack_clear_rate_sample(rack);
16892         if ((rack->forced_ack) &&
16893             ((tcp_get_flags(th) & TH_RST) == 0)) {
16894                 rack_handle_probe_response(rack, tiwin, us_cts);
16895         }
16896         /*
16897          * This is the one exception case where we set the rack state
16898          * always. All other times (timers etc) we must have a rack-state
16899          * set (so we assure we have done the checks above for SACK).
16900          */
16901         rack->r_ctl.rc_rcvtime = cts;
16902         if (rack->r_state != tp->t_state)
16903                 rack_set_state(tp, rack);
16904         if (SEQ_GT(th->th_ack, tp->snd_una) &&
16905             (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL)
16906                 kern_prefetch(rsm, &prev_state);
16907         prev_state = rack->r_state;
16908         if ((thflags & TH_RST) &&
16909             ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
16910               SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
16911              (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) {
16912                 /* The connection will be killed by a reset; check the tracepoint */
16913                 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV);
16914         }
16915         retval = (*rack->r_substate) (m, th, so,
16916             tp, &to, drop_hdrlen,
16917             tlen, tiwin, thflags, nxt_pkt, iptos);
16918         if (retval == 0) {
16919                 /*
16920                  * If retval is 1 the tcb is unlocked and most likely the tp
16921                  * is gone.
16922                  */
16923                 INP_WLOCK_ASSERT(inp);
16924                 if ((rack->rc_gp_dyn_mul) &&
16925                     (rack->rc_always_pace) &&
16926                     (rack->use_fixed_rate == 0) &&
16927                     rack->in_probe_rtt &&
16928                     (rack->r_ctl.rc_time_probertt_starts == 0)) {
16929                         /*
16930                          * If we are going for target, lets recheck before
16931                          * we output.
16932                          */
16933                         rack_check_probe_rtt(rack, us_cts);
16934                 }
16935                 if (rack->set_pacing_done_a_iw == 0) {
16936                         /* How much has been acked? */
16937                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
16938                                 /* We have enough to set in the pacing segment size */
16939                                 rack->set_pacing_done_a_iw = 1;
16940                                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
16941                         }
16942                 }
16943                 tcp_rack_xmit_timer_commit(rack, tp);
16944 #ifdef TCP_ACCOUNTING
16945                 /*
16946                  * If we set ack_val_set to what ack processing we are doing
16947                  * we also want to track how many cycles we burned. Note that
16948                  * the bits after tcp_output we let be "free". This is because
16949                  * we are also tracking the tcp_output times as well. Note the
16950                  * use of 0xf here since we only have 11 counters (0 - 0xa) and
16951                  * 0xf cannot be returned and is what we initialize it to, to
16952                  * indicate we are not doing the tabulations.
16953                  */
16954                 if (ack_val_set != 0xf) {
16955                         uint64_t crtsc;
16956
16957                         crtsc = get_cyclecount();
16958                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16959                                 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
16960                         }
16961                 }
16962 #endif
16963                 if ((nxt_pkt == 0) && (no_output == 0)) {
16964                         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
16965 do_output_now:
16966                                 if (tcp_output(tp) < 0) {
16967 #ifdef TCP_ACCOUNTING
16968                                         sched_unpin();
16969 #endif
16970                                         return (1);
16971                                 }
16972                                 did_out = 1;
16973                         }
16974                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16975                         rack_free_trim(rack);
16976                 } else if ((no_output == 1) &&
16977                            (nxt_pkt == 0)  &&
16978                            (tcp_in_hpts(rack->rc_inp) == 0)) {
16979                         /*
16980                          * We are not in hpts and we had a pacing timer up. Use
16981                          * the remaining time (slot_remaining) to restart the timer.
16982                          */
16983                         KASSERT((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
16984                         rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0);
16985                         rack_free_trim(rack);
16986                 }
16987                 /* Update any rounds needed */
16988                 if (rack_verbose_logging &&  tcp_bblogging_on(rack->rc_tp))
16989                         rack_log_hystart_event(rack, high_seq, 8);
16990                 /*
16991                  * The draft (v3) calls for us to use SEQ_GEQ, but that
16992                  * causes issues when we are just going app limited. Let's
16993                  * instead use SEQ_GT <or> where it's equal but more data
16994                  * is outstanding.
16995                  *
16996                  * Also make sure we are on the last ack of a series. We
16997                  * have to have all the acks processed in queue to know
16998                  * if there is something left outstanding.
16999                  */
17000                 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
17001                     (rack->rc_new_rnd_needed == 0) &&
17002                     (nxt_pkt == 0)) {
17003                         rack_log_hystart_event(rack, tp->snd_una, 21);
17004                         rack->r_ctl.current_round++;
17005                         /* Force the next send to setup the next round */
17006                         rack->rc_new_rnd_needed = 1;
17007                         if (CC_ALGO(tp)->newround != NULL) {
17008                                 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
17009                         }
17010                 }
17011                 if ((nxt_pkt == 0) &&
17012                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
17013                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
17014                      (tp->t_flags & TF_DELACK) ||
17015                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
17016                       (tp->t_state <= TCPS_CLOSING)))) {
17017                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
17018                         if ((tp->snd_max == tp->snd_una) &&
17019                             ((tp->t_flags & TF_DELACK) == 0) &&
17020                             (tcp_in_hpts(rack->rc_inp)) &&
17021                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
17022                                 /* keep-alive not needed while we still have hptsi output pending */
17023                                 ;
17024                         } else {
17025                                 int late = 0;
17026                                 if (tcp_in_hpts(inp)) {
17027                                         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
17028                                                 us_cts = tcp_get_usecs(NULL);
17029                                                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
17030                                                         rack->r_early = 1;
17031                                                         rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
17032                                                 } else
17033                                                         late = 1;
17034                                                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
17035                                         }
17036                                         tcp_hpts_remove(inp);
17037                                 }
17038                                 if (late && (did_out == 0)) {
17039                                         /*
17040                                          * We are late in the sending
17041                                          * and we did not call the output
17042                                          * (this probably should not happen).
17043                                          */
17044                                         goto do_output_now;
17045                                 }
17046                                 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
17047                         }
17048                         way_out = 1;
17049                 } else if (nxt_pkt == 0) {
17050                         /* Do we have the correct timer running? */
17051                         rack_timer_audit(tp, rack, &so->so_snd);
17052                         way_out = 2;
17053                 }
17054         done_with_input:
17055                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
17056                 if (did_out)
17057                         rack->r_wanted_output = 0;
17058         }
17059 #ifdef TCP_ACCOUNTING
17060         sched_unpin();
17061 #endif
17062         return (retval);
17063 }
17064
17065 static void
17066 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
17067     int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
17068 {
17069         struct timeval tv;
17070
17071         /* First lets see if we have old packets */
17072         if (!STAILQ_EMPTY(&tp->t_inqueue)) {
17073                 if (ctf_do_queued_segments(tp, 1)) {
17074                         m_freem(m);
17075                         return;
17076                 }
17077         }
17078         if (m->m_flags & M_TSTMP_LRO) {
17079                 mbuf_tstmp2timeval(m, &tv);
17080         } else {
17081                 /* Should not happen; should we kassert instead? */
17082                 tcp_get_usecs(&tv);
17083         }
17084         if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0,
17085             &tv) == 0) {
17086                 INP_WUNLOCK(tptoinpcb(tp));
17087         }
17088 }
17089
17090 struct rack_sendmap *
17091 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
17092 {
17093         struct rack_sendmap *rsm = NULL;
17094         int32_t idx;
17095         uint32_t srtt = 0, thresh = 0, ts_low = 0;
17096         int no_sack = 0;
17097
17098         /* Return the next guy to be re-transmitted */
17099         if (tqhash_empty(rack->r_ctl.tqh)) {
17100                 return (NULL);
17101         }
17102         if (tp->t_flags & TF_SENTFIN) {
17103                 /* retran the end FIN? */
17104                 return (NULL);
17105         }
17106         /* ok lets look at this one */
17107         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
17108         if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) {
17109                 return (rsm);
17110         }
17111         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
17112                 goto check_it;
17113         }
17114         rsm = rack_find_lowest_rsm(rack);
17115         if (rsm == NULL) {
17116                 return (NULL);
17117         }
17118 check_it:
17119         if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) ||
17120             (rack->sack_attack_disable > 0)) {
17121                 no_sack = 1;
17122         }
17123         if ((no_sack > 0) &&
17124             (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
17125                 /*
17126                  * No sack so we automatically do the 3 strikes and
17127                  * retransmit (no rack timer would be started).
17128                  */
17129                 return (rsm);
17130         }
17131         if (rsm->r_flags & RACK_ACKED) {
17132                 return (NULL);
17133         }
17134         if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
17135             (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
17136                 /* It's not yet ready */
17137                 return (NULL);
17138         }
17139         srtt = rack_grab_rtt(tp, rack);
17140         idx = rsm->r_rtr_cnt - 1;
17141         ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
17142         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
17143         if ((tsused == ts_low) ||
17144             (TSTMP_LT(tsused, ts_low))) {
17145                 /* No time since sending */
17146                 return (NULL);
17147         }
17148         if ((tsused - ts_low) < thresh) {
17149                 /* It has not been long enough yet */
17150                 return (NULL);
17151         }
17152         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
17153             ((rsm->r_flags & RACK_SACK_PASSED) &&
17154              (rack->sack_attack_disable == 0))) {
17155                 /*
17156                  * We have passed the dup-ack threshold <or>
17157                  * a SACK has indicated this is missing.
17158                  * Note that if you are a declared attacker
17159                  * it is only the dup-ack threshold that
17160                  * will cause retransmits.
17161                  */
17162                 /* log retransmit reason */
17163                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
17164                 rack->r_fast_output = 0;
17165                 return (rsm);
17166         }
17167         return (NULL);
17168 }
17169
17170 static void
17171 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
17172                            uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
17173                            int line, struct rack_sendmap *rsm, uint8_t quality)
17174 {
17175         if (tcp_bblogging_on(rack->rc_tp)) {
17176                 union tcp_log_stackspecific log;
17177                 struct timeval tv;
17178
17179                 memset(&log, 0, sizeof(log));
17180                 log.u_bbr.flex1 = slot;
17181                 log.u_bbr.flex2 = len;
17182                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
17183                 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
17184                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
17185                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
17186                 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
17187                 log.u_bbr.use_lt_bw <<= 1;
17188                 log.u_bbr.use_lt_bw |= rack->r_late;
17189                 log.u_bbr.use_lt_bw <<= 1;
17190                 log.u_bbr.use_lt_bw |= rack->r_early;
17191                 log.u_bbr.use_lt_bw <<= 1;
17192                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
17193                 log.u_bbr.use_lt_bw <<= 1;
17194                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
17195                 log.u_bbr.use_lt_bw <<= 1;
17196                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
17197                 log.u_bbr.use_lt_bw <<= 1;
17198                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
17199                 log.u_bbr.use_lt_bw <<= 1;
17200                 log.u_bbr.use_lt_bw |= rack->gp_ready;
17201                 log.u_bbr.pkt_epoch = line;
17202                 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
17203                 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
17204                 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
17205                 log.u_bbr.bw_inuse = bw_est;
17206                 log.u_bbr.delRate = bw;
17207                 if (rack->r_ctl.gp_bw == 0)
17208                         log.u_bbr.cur_del_rate = 0;
17209                 else
17210                         log.u_bbr.cur_del_rate = rack_get_bw(rack);
17211                 log.u_bbr.rttProp = len_time;
17212                 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
17213                 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
17214                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
17215                 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
17216                         /* We are in slow start */
17217                         log.u_bbr.flex7 = 1;
17218                 } else {
17219                         /* we are on congestion avoidance */
17220                         log.u_bbr.flex7 = 0;
17221                 }
17222                 log.u_bbr.flex8 = method;
17223                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
17224                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
17225                 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
17226                 log.u_bbr.cwnd_gain <<= 1;
17227                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
17228                 log.u_bbr.cwnd_gain <<= 1;
17229                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
17230                 log.u_bbr.bbr_substate = quality;
17231                 log.u_bbr.bbr_state = rack->dgp_on;
17232                 log.u_bbr.bbr_state <<= 1;
17233                 log.u_bbr.bbr_state |= rack->r_fill_less_agg;
17234                 log.u_bbr.bbr_state <<= 1;
17235                 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
17236                 log.u_bbr.bbr_state <<= 2;
17237                 log.u_bbr.bbr_state |= rack->r_pacing_discount;
17238                 log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7);
17239                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
17240                     &rack->rc_inp->inp_socket->so_rcv,
17241                     &rack->rc_inp->inp_socket->so_snd,
17242                     BBR_LOG_HPTSI_CALC, 0,
17243                     0, &log, false, &tv);
17244         }
17245 }
17246
17247 static uint32_t
17248 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
17249 {
17250         uint32_t new_tso, user_max, pace_one;
17251
17252         user_max = rack->rc_user_set_max_segs * mss;
17253         if (rack->rc_force_max_seg) {
17254                 return (user_max);
17255         }
17256         if (rack->use_fixed_rate &&
17257             ((rack->r_ctl.crte == NULL) ||
17258              (bw != rack->r_ctl.crte->rate))) {
17259                 /* Use the user mss since we are not exactly matched */
17260                 return (user_max);
17261         }
17262         if (rack_pace_one_seg ||
17263             (rack->r_ctl.rc_user_set_min_segs == 1))
17264                 pace_one = 1;
17265         else
17266                 pace_one = 0;
17267
17268         new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss,
17269                      pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
17270         if (new_tso > user_max)
17271                 new_tso = user_max;
17272         if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) {
17273                 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso)
17274                         new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss;
17275         }
17276         if (rack->r_ctl.rc_user_set_min_segs &&
17277             ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso))
17278             new_tso = rack->r_ctl.rc_user_set_min_segs * mss;
17279         return (new_tso);
17280 }
17281
17282 static int32_t
17283 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
17284 {
17285         uint64_t lentim, fill_bw;
17286
17287         /* Lets first see if we are full, if so continue with normal rate */
17288         rack->r_via_fill_cw = 0;
17289         if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
17290                 return (slot);
17291         if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
17292                 return (slot);
17293         if (rack->r_ctl.rc_last_us_rtt == 0)
17294                 return (slot);
17295         if (rack->rc_pace_fill_if_rttin_range &&
17296             (rack->r_ctl.rc_last_us_rtt >=
17297              (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
17298                 /* The rtt is huge, N * smallest, lets not fill */
17299                 return (slot);
17300         }
17301         /*
17302          * First let's calculate the b/w based on the last us-rtt
17303          * and the smallest send window.
17304          */
17305         fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
17306         /* Take the rwnd if its smaller */
17307         if (fill_bw > rack->rc_tp->snd_wnd)
17308                 fill_bw = rack->rc_tp->snd_wnd;
17309         /* Now lets make it into a b/w */
17310         fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
17311         fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
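        /*
         * Illustrative example (added; numbers are hypothetical): with an
         * effective window of 64000 bytes and rc_last_us_rtt = 20000 usec,
         * fill_bw = 64000 * 1000000 / 20000 = 3200000 bytes/sec
         * (about 25.6 Mbit/s), i.e. the rate needed to drain one full
         * window per RTT.
         */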
17312         if (rack->r_fill_less_agg) {
17313                 /*
17314                  * We want the average of the rate_wanted
17315                  * and our fill-cw calculated bw. We also want
17316                  * to cap any increase to be no more than
17317                  * X times the lt_bw (where X is the rack_bw_multipler).
17318                  */
17319                 uint64_t lt_bw, rate;
17320
17321                 lt_bw = rack_get_lt_bw(rack);
17322                 if (lt_bw > *rate_wanted)
17323                         rate = lt_bw;
17324                 else
17325                         rate = *rate_wanted;
17326                 fill_bw += rate;
17327                 fill_bw /= 2;
17328                 if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) {
17329                         fill_bw = rate * rack_bw_multipler;
17330                 }
17331         }
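        /*
         * Example of the averaging above (added; values hypothetical): with
         * *rate_wanted = 2000000 bytes/sec, lt_bw = 1500000 and a fill-cw
         * estimate of 4000000, rate = 2000000 and fill_bw becomes
         * (4000000 + 2000000) / 2 = 3000000; with rack_bw_multipler = 2 the
         * cap is 4000000, so no clamping occurs.
         */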
17332         /* We are below the min b/w */
17333         if (non_paced)
17334                 *rate_wanted = fill_bw;
17335         if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
17336                 return (slot);
17337         rack->r_via_fill_cw = 1;
17338         if (rack->r_rack_hw_rate_caps &&
17339             (rack->r_ctl.crte != NULL)) {
17340                 uint64_t high_rate;
17341
17342                 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
17343                 if (fill_bw > high_rate) {
17344                         /* We are capping bw at the highest rate table entry */
17345                         if (*rate_wanted > high_rate) {
17346                                 /* The original rate was also capped */
17347                                 rack->r_via_fill_cw = 0;
17348                         }
17349                         rack_log_hdwr_pacing(rack,
17350                                              fill_bw, high_rate, __LINE__,
17351                                              0, 3);
17352                         fill_bw = high_rate;
17353                         if (capped)
17354                                 *capped = 1;
17355                 }
17356         } else if ((rack->r_ctl.crte == NULL) &&
17357                    (rack->rack_hdrw_pacing == 0) &&
17358                    (rack->rack_hdw_pace_ena) &&
17359                    rack->r_rack_hw_rate_caps &&
17360                    (rack->rack_attempt_hdwr_pace == 0) &&
17361                    (rack->rc_inp->inp_route.ro_nh != NULL) &&
17362                    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
17363                 /*
17364                  * Ok we may have a first attempt that is greater than our top rate
17365                  * lets check.
17366                  */
17367                 uint64_t high_rate;
17368
17369                 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
17370                 if (high_rate) {
17371                         if (fill_bw > high_rate) {
17372                                 fill_bw = high_rate;
17373                                 if (capped)
17374                                         *capped = 1;
17375                         }
17376                 }
17377         }
17378         if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
17379                 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
17380                                    fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL);
17381                 fill_bw = rack->r_ctl.bw_rate_cap;
17382         }
17383         /*
17384          * Ok fill_bw holds our mythical b/w to fill the cwnd
17385          * in an rtt (unless it was capped); what does that
17386          * equate to time-wise?
17387          */
17388         lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
17389         lentim /= fill_bw;
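        /*
         * Worked example (added; hypothetical values): continuing the case
         * above with fill_bw = 3200000 bytes/sec, a len of 16000 bytes gives
         * lentim = 16000 * 1000000 / 3200000 = 5000 usec; if that is shorter
         * than the slot computed from the normal rate, the fill-cw time is
         * used instead.
         */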
17390         *rate_wanted = fill_bw;
17391         if (non_paced || (lentim < slot)) {
17392                 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
17393                                            0, lentim, 12, __LINE__, NULL, 0);
17394                 return ((int32_t)lentim);
17395         } else
17396                 return (slot);
17397 }
17398
17399 static int32_t
17400 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
17401 {
17402         uint64_t srtt;
17403         int32_t slot = 0;
17404         int32_t minslot = 0;
17405         int can_start_hw_pacing = 1;
17406         int err;
17407         int pace_one;
17408
17409         if (rack_pace_one_seg ||
17410             (rack->r_ctl.rc_user_set_min_segs == 1))
17411                 pace_one = 1;
17412         else
17413                 pace_one = 0;
17414         if (rack->rc_always_pace == 0) {
17415                 /*
17416                  * We use the most optimistic possible cwnd/srtt for
17417                  * sending calculations. This will make our
17418                  * calculation anticipate getting more through
17419                  * quicker than possible. But that's ok; we don't want
17420                  * the peer to have a gap in data sending.
17421                  */
17422                 uint64_t cwnd, tr_perms = 0;
17423                 int32_t reduce = 0;
17424
17425         old_method:
17426                 /*
17427                  * We keep no precise pacing with the old method
17428                  * instead we use the pacer to mitigate bursts.
17429                  */
17430                 if (rack->r_ctl.rc_rack_min_rtt)
17431                         srtt = rack->r_ctl.rc_rack_min_rtt;
17432                 else
17433                         srtt = max(tp->t_srtt, 1);
17434                 if (rack->r_ctl.rc_rack_largest_cwnd)
17435                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
17436                 else
17437                         cwnd = rack->r_ctl.cwnd_to_use;
17438                 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
17439                 tr_perms = (cwnd * 1000) / srtt;
17440                 if (tr_perms == 0) {
17441                         tr_perms = ctf_fixed_maxseg(tp);
17442                 }
17443                 /*
17444                  * Calculate how long this will take to drain. If the
17445                  * calculation comes out to zero, that's ok; we will use
17446                  * send_a_lot to possibly spin around for more, increasing
17447                  * tot_len_this_send to the point that it's going to
17448                  * require a pace, or we hit the cwnd, in which case we
17449                  * are just waiting for an ACK.
17451                  */
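                /*
                 * Worked example (added; values hypothetical): with
                 * cwnd = 100000 bytes and srtt = 50000 usec,
                 * tr_perms = (100000 * 1000) / 50000 = 2000 bytes per msec;
                 * a len of 20000 bytes then yields slot = 10 msec before the
                 * optional reduction and the conversion to usec below.
                 */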
17452                 slot = len / tr_perms;
17453                 /* Now do we reduce the time so we don't run dry? */
17454                 if (slot && rack_slot_reduction) {
17455                         reduce = (slot / rack_slot_reduction);
17456                         if (reduce < slot) {
17457                                 slot -= reduce;
17458                         } else
17459                                 slot = 0;
17460                 }
17461                 slot *= HPTS_USEC_IN_MSEC;
17462                 if (rack->rc_pace_to_cwnd) {
17463                         uint64_t rate_wanted = 0;
17464
17465                         slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
17466                         rack->rc_ack_can_sendout_data = 1;
17467                         rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
17468                 } else
17469                         rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
17470                 /*******************************************************/
17471                 /* RRS: We insert non-paced call to stats here for len */
17472                 /*******************************************************/
17473         } else {
17474                 uint64_t bw_est, res, lentim, rate_wanted;
17475                 uint32_t segs, oh;
17476                 int capped = 0;
17477                 int prev_fill;
17478
17479                 if ((rack->r_rr_config == 1) && rsm) {
17480                         return (rack->r_ctl.rc_min_to);
17481                 }
17482                 if (rack->use_fixed_rate) {
17483                         rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
17484                 } else if ((rack->r_ctl.init_rate == 0) &&
17485                            (rack->r_ctl.gp_bw == 0)) {
17486                         /* no way to yet do an estimate */
17487                         bw_est = rate_wanted = 0;
17488                 } else {
17489                         bw_est = rack_get_bw(rack);
17490                         rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
17491                 }
17492                 if ((bw_est == 0) || (rate_wanted == 0) ||
17493                     ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
17494                         /*
17495                          * No way yet to make a b/w estimate or
17496                          * our rate is set incorrectly.
17497                          */
17498                         goto old_method;
17499                 }
17500                 rack_rate_cap_bw(rack, &rate_wanted, &capped);
17501                 /* We need to account for all the overheads */
17502                 segs = (len + segsiz - 1) / segsiz;
17503                 /*
17504                  * We need the diff between 1514 bytes (e-mtu with e-hdr)
17505                  * and how much data we put in each packet. Yes this
17506                  * means we may be off if we are larger than 1500 bytes
17507                  * or smaller. But this just makes us more conservative.
17508                  */
17509                 
17510                 oh =  (tp->t_maxseg - segsiz) + sizeof(struct tcphdr);
17511                 if (rack->r_is_v6) {
17512 #ifdef INET6
17513                         oh += sizeof(struct ip6_hdr);
17514 #endif                  
17515                 } else {
17516 #ifdef INET
17517                         oh += sizeof(struct ip);
17518 #endif                  
17519                 }
17520                 /* We add a fixed 14 for the ethernet header */
17521                 oh += 14;
17522                 segs *= oh;
17523                 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
17524                 res = lentim / rate_wanted;
17525                 slot = (uint32_t)res;
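                /*
                 * Worked example (added; values hypothetical, IPv4, assuming
                 * t_maxseg == segsiz): len = 14600, segsiz = 1460 -> segs = 10;
                 * oh = 0 + 20 (tcp) + 20 (ip) + 14 (ethernet) = 54, so
                 * segs * oh = 540 bytes of overhead; lentim =
                 * (14600 + 540) * 1000000 and a rate_wanted of 1250000
                 * bytes/sec (10 Mbit/s) gives slot ~= 12112 usec.
                 */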
17526                 if (rack_hw_rate_min &&
17527                     (rate_wanted < rack_hw_rate_min)) {
17528                         can_start_hw_pacing = 0;
17529                         if (rack->r_ctl.crte) {
17530                                 /*
17531                                  * Ok we need to release it, we
17532                                  * have fallen too low.
17533                                  */
17534                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17535                                 rack->r_ctl.crte = NULL;
17536                                 rack->rack_attempt_hdwr_pace = 0;
17537                                 rack->rack_hdrw_pacing = 0;
17538                         }
17539                 }
17540                 if (rack->r_ctl.crte &&
17541                     (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17542                         /*
17543                          * We want more than the hardware can give us,
17544                          * don't start any hw pacing.
17545                          */
17546                         can_start_hw_pacing = 0;
17547                         if (rack->r_rack_hw_rate_caps == 0) {
17548                                 /*
17549                                  * Ok we need to release it, we
17550                                  * want more than the card can give us and
17551                                  * no rate cap is in place. Set it up so
17552                                  * when we want less we can retry.
17553                                  */
17554                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17555                                 rack->r_ctl.crte = NULL;
17556                                 rack->rack_attempt_hdwr_pace = 0;
17557                                 rack->rack_hdrw_pacing = 0;
17558                         }
17559                 }
17560                 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) {
17561                         /*
17562                          * We lost our rate somehow, this can happen
17563                          * if the interface changed underneath us.
17564                          */
17565                         tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17566                         rack->r_ctl.crte = NULL;
17567                         /* Lets re-allow attempting to setup pacing */
17568                         rack->rack_hdrw_pacing = 0;
17569                         rack->rack_attempt_hdwr_pace = 0;
17570                         rack_log_hdwr_pacing(rack,
17571                                              rate_wanted, bw_est, __LINE__,
17572                                              0, 6);
17573                 }
17574                 prev_fill = rack->r_via_fill_cw;
17575                 if ((rack->rc_pace_to_cwnd) &&
17576                     (capped == 0) &&
17577                     (rack->use_fixed_rate == 0) &&
17578                     (rack->in_probe_rtt == 0) &&
17579                     (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
17580                         /*
17581                          * We want to pace at our rate *or* faster to
17582                          * fill the cwnd to the max if its not full.
17583                          */
17584                         slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
17585                         /* Re-check to make sure we are not exceeding our max b/w */
17586                         if ((rack->r_ctl.crte != NULL) &&
17587                             (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17588                                 /*
17589                                  * We want more than the hardware can give us,
17590                                  * don't start any hw pacing.
17591                                  */
17592                                 can_start_hw_pacing = 0;
17593                                 if (rack->r_rack_hw_rate_caps == 0) {
17594                                         /*
17595                                          * Ok we need to release it, we
17596                                          * want more than the card can give us and
17597                                          * no rate cap is in place. Set it up so
17598                                          * when we want less we can retry.
17599                                          */
17600                                         tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17601                                         rack->r_ctl.crte = NULL;
17602                                         rack->rack_attempt_hdwr_pace = 0;
17603                                         rack->rack_hdrw_pacing = 0;
17604                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
17605                                 }
17606                         }
17607                 }
17608                 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
17609                     (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
17610                         if ((rack->rack_hdw_pace_ena) &&
17611                             (can_start_hw_pacing > 0) &&
17612                             (rack->rack_hdrw_pacing == 0) &&
17613                             (rack->rack_attempt_hdwr_pace == 0)) {
17614                                 /*
17615                                  * Lets attempt to turn on hardware pacing
17616                                  * if we can.
17617                                  */
17618                                 rack->rack_attempt_hdwr_pace = 1;
17619                                 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
17620                                                                        rack->rc_inp->inp_route.ro_nh->nh_ifp,
17621                                                                        rate_wanted,
17622                                                                        RS_PACING_GEQ,
17623                                                                        &err, &rack->r_ctl.crte_prev_rate);
17624                                 if (rack->r_ctl.crte) {
17625                                         rack->rack_hdrw_pacing = 1;
17626                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz,
17627                                                                           pace_one, rack->r_ctl.crte,
17628                                                                           NULL, rack->r_ctl.pace_len_divisor);
17629                                         rack_log_hdwr_pacing(rack,
17630                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17631                                                              err, 0);
17632                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
17633                                 } else {
17634                                         counter_u64_add(rack_hw_pace_init_fail, 1);
17635                                 }
17636                         } else if (rack->rack_hdrw_pacing &&
17637                                    (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
17638                                 /* Do we need to adjust our rate? */
17639                                 const struct tcp_hwrate_limit_table *nrte;
17640
17641                                 if (rack->r_up_only &&
17642                                     (rate_wanted < rack->r_ctl.crte->rate)) {
17643                                         /**
17644                                          * We have four possible states here
17645                                          * having to do with the previous time
17646                                          * and this time.
17647                                          *   previous  |  this-time
17648                                          * A)     0      |     0   -- fill_cw not in the picture
17649                                          * B)     1      |     0   -- we were doing a fill-cw but now are not
17650                                          * C)     1      |     1   -- all rates from fill_cw
17651                                          * D)     0      |     1   -- we were doing non-fill and now we are filling
17652                                          *
17653                                          * For cases A, C and D we don't allow a drop. But for
17654                                          * case B, where we are now on our steady rate, we do
17655                                          * allow a drop.
17656                                          *
17657                                          */
17658                                         if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
17659                                                 goto done_w_hdwr;
17660                                 }
17661                                 if ((rate_wanted > rack->r_ctl.crte->rate) ||
17662                                     (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
17663                                         if (rack_hw_rate_to_low &&
17664                                             (bw_est < rack_hw_rate_to_low)) {
17665                                                 /*
17666                                                  * The pacing rate is too low for hardware, but
17667                                                  * do allow hardware pacing to be restarted.
17668                                                  */
17669                                                 rack_log_hdwr_pacing(rack,
17670                                                              bw_est, rack->r_ctl.crte->rate, __LINE__,
17671                                                              0, 5);
17672                                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17673                                                 rack->r_ctl.crte = NULL;
17674                                                 rack->rack_attempt_hdwr_pace = 0;
17675                                                 rack->rack_hdrw_pacing = 0;
17676                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17677                                                 goto done_w_hdwr;
17678                                         }
17679                                         nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
17680                                                                             rack->rc_tp,
17681                                                                             rack->rc_inp->inp_route.ro_nh->nh_ifp,
17682                                                                             rate_wanted,
17683                                                                             RS_PACING_GEQ,
17684                                                                             &err, &rack->r_ctl.crte_prev_rate);
17685                                         if (nrte == NULL) {
17686                                                 /*
17687                                                  * Lost the rate, lets drop hardware pacing
17688                                                  * period.
17689                                                  */
17690                                                 rack->rack_hdrw_pacing = 0;
17691                                                 rack->r_ctl.crte = NULL;
17692                                                 rack_log_hdwr_pacing(rack,
17693                                                                      rate_wanted, 0, __LINE__,
17694                                                                      err, 1);
17695                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17696                                                 counter_u64_add(rack_hw_pace_lost, 1);
17697                                         } else if (nrte != rack->r_ctl.crte) {
17698                                                 rack->r_ctl.crte = nrte;
17699                                                 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted,
17700                                                                                  segsiz, pace_one, rack->r_ctl.crte,
17701                                                                                  NULL, rack->r_ctl.pace_len_divisor);
17702                                                 rack_log_hdwr_pacing(rack,
17703                                                                      rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17704                                                                      err, 2);
17705                                                 rack->r_ctl.last_hw_bw_req = rate_wanted;
17706                                         }
17707                                 } else {
17708                                         /* We just need to adjust the segment size */
17709                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17710                                         rack_log_hdwr_pacing(rack,
17711                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17712                                                              0, 4);
17713                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
17714                                 }
17715                         }
17716                 }
17717                 if (minslot && (minslot > slot)) {
17718                         rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim,
17719                                                    98, __LINE__, NULL, 0);
17720                         slot = minslot;
17721                 }
17722 done_w_hdwr:
17723                 if (rack_limit_time_with_srtt &&
17724                     (rack->use_fixed_rate == 0) &&
17725                     (rack->rack_hdrw_pacing == 0)) {
17726                         /*
17727                          * Sanity check, we do not allow the pacing delay
17728                          * to be longer than the SRTT of the path. If it is
17729                          * a slow path, then adding a packet should increase
17730                          * the RTT and compensate for this i.e. the srtt will
17731                          * be greater so the allowed pacing time will be greater.
17732                          *
17733                          * Note this restriction does not apply where a peak rate
17734                          * is set, i.e. when we are doing fixed pacing or hardware pacing.
17735                          */
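                        /*
                         * Example (added; hypothetical values): with a path
                         * srtt of 20000 usec, a computed slot of 30000 usec
                         * would be clamped to 20000 usec by the check below.
                         */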
17736                         if (rack->rc_tp->t_srtt)
17737                                 srtt = rack->rc_tp->t_srtt;
17738                         else
17739                                 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;    /* it's in ms, convert to usec */
17740                         if (srtt < (uint64_t)slot) {
17741                                 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
17742                                 slot = srtt;
17743                         }
17744                 }
17745                 /*******************************************************************/
17746                 /* RRS: We insert paced call to stats here for len and rate_wanted */
17747                 /*******************************************************************/
17748                 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
17749         }
17750         if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
17751                 /*
17752                  * If this rate is seeing enobufs when it
17753                  * goes to send then either the nic is out
17754                  * of gas or we are mis-estimating the time
17755                  * somehow and not letting the queue empty
17756                  * completely. Lets add to the pacing time.
17757                  */
17758                 int hw_boost_delay;
17759
17760                 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
17761                 if (hw_boost_delay > rack_enobuf_hw_max)
17762                         hw_boost_delay = rack_enobuf_hw_max;
17763                 else if (hw_boost_delay < rack_enobuf_hw_min)
17764                         hw_boost_delay = rack_enobuf_hw_min;
17765                 slot += hw_boost_delay;
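                /*
                 * Example (added; hypothetical values): with
                 * crte->time_between = 1000 usec and
                 * rack_enobuf_hw_boost_mult = 2, hw_boost_delay = 2000 usec
                 * is added to the slot, subject to the min/max clamps above.
                 */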
17766         }
17767         return (slot);
17768 }
17769
17770 static void
17771 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
17772     tcp_seq startseq, uint32_t sb_offset)
17773 {
17774         struct rack_sendmap *my_rsm = NULL;
17775
17776         if (tp->t_state < TCPS_ESTABLISHED) {
17777                 /*
17778                  * We don't start any measurements if we are
17779                  * not at least established.
17780                  */
17781                 return;
17782         }
17783         if (tp->t_state >= TCPS_FIN_WAIT_1) {
17784                 /*
17785                  * We will get no more data into the SB;
17786                  * this means we need to have the data available
17787                  * before we start a measurement.
17788                  */
17789
17790                 if (sbavail(&tptosocket(tp)->so_snd) <
17791                     max(rc_init_window(rack),
17792                         (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
17793                         /* Nope not enough data */
17794                         return;
17795                 }
17796         }
17797         tp->t_flags |= TF_GPUTINPROG;
17798         rack->r_ctl.rc_gp_cumack_ts = 0;
17799         rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
17800         rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
17801         tp->gput_seq = startseq;
17802         rack->app_limited_needs_set = 0;
17803         if (rack->in_probe_rtt)
17804                 rack->measure_saw_probe_rtt = 1;
17805         else if ((rack->measure_saw_probe_rtt) &&
17806                  (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
17807                 rack->measure_saw_probe_rtt = 0;
17808         if (rack->rc_gp_filled)
17809                 tp->gput_ts = rack->r_ctl.last_cumack_advance;
17810         else {
17811                 /* Special case initial measurement */
17812                 struct timeval tv;
17813
17814                 tp->gput_ts = tcp_get_usecs(&tv);
17815                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
17816         }
17817         /*
17818          * We take a guess out into the future;
17819          * if we have no measurement and no
17820          * initial rate, we measure the first
17821          * initial-window's worth of data to
17822          * speed up getting some GP measurement and
17823          * thus start pacing.
17824          */
17825         if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
17826                 rack->app_limited_needs_set = 1;
17827                 tp->gput_ack = startseq + max(rc_init_window(rack),
17828                                               (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
17829                 rack_log_pacing_delay_calc(rack,
17830                                            tp->gput_seq,
17831                                            tp->gput_ack,
17832                                            0,
17833                                            tp->gput_ts,
17834                                            (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17835                                            9,
17836                                            __LINE__, NULL, 0);
17837                 rack_tend_gp_marks(tp, rack);
17838                 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17839                 return;
17840         }
17841         if (sb_offset) {
17842                 /*
17843                  * We are out somewhere in the sb
17844                  * can we use the already outstanding data?
17845                  */
17846
17847                 if (rack->r_ctl.rc_app_limited_cnt == 0) {
17848                         /*
17849                          * Yes first one is good and in this case
17850                          * the tp->gput_ts is correctly set based on
17851                          * the last ack that arrived (no need to
17852                          * set things up when an ack comes in).
17853                          */
17854                         my_rsm = tqhash_min(rack->r_ctl.tqh);
17855                         if ((my_rsm == NULL) ||
17856                             (my_rsm->r_rtr_cnt != 1)) {
17857                                 /* retransmission? */
17858                                 goto use_latest;
17859                         }
17860                 } else {
17861                         if (rack->r_ctl.rc_first_appl == NULL) {
17862                                 /*
17863                                  * If rc_first_appl is NULL
17864                                  * then the cnt should be 0.
17865                                  * This is probably an error, maybe
17866                          * a KASSERT would be appropriate.
17867                                  */
17868                                 goto use_latest;
17869                         }
17870                         /*
17871                          * If we have a marker pointer to the last one that is
17872                          * app limited we can use that, but we need to set
17873                          * things up so that when it gets ack'ed we record
17874                          * the ack time (if its not already acked).
17875                          */
17876                         rack->app_limited_needs_set = 1;
17877                         /*
17878                          * We want to get to the rsm that is either
17879                          * the next one with space (i.e. over 1 MSS) or the
17880                          * one after that (after the app-limited one).
17881                          */
17882                         my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl);
17883                         if (my_rsm) {
17884                                 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
17885                                         /* Have to use the next one */
17886                                         my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
17887                                 else {
17888                                         /* Use after the first MSS of it is acked */
17889                                         tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
17890                                         goto start_set;
17891                                 }
17892                         }
17893                         if ((my_rsm == NULL) ||
17894                             (my_rsm->r_rtr_cnt != 1)) {
17895                                 /*
17896                                  * Either its a retransmit or
17897                                  * the last is the app-limited one.
17898                                  */
17899                                 goto use_latest;
17900                         }
17901                 }
17902                 tp->gput_seq = my_rsm->r_start;
17903 start_set:
17904                 if (my_rsm->r_flags & RACK_ACKED) {
17905                         /*
17906                          * This one has been acked use the arrival ack time
17907                          */
17908                         struct rack_sendmap *nrsm;
17909
17910                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
17911                         rack->app_limited_needs_set = 0;
17912                         /*
17913                          * Ok in this path we need to use the r_end now
17914                          * since this guy is the starting ack.
17915                          */
17916                         tp->gput_seq = my_rsm->r_end;
17917                         /*
17918                          * We also need to adjust up the sendtime
17919                          * to the send of the next data after my_rsm.
17920                          */
17921                         nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
17922                         if (nrsm != NULL)
17923                                 my_rsm = nrsm;
17924                         else {
17925                                 /*
17926                                  * The next has not been sent, that's the
17927                                  * case for using the latest.
17928                                  */
17929                                 goto use_latest;
17930                         }
17931                 }
17932                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
17933                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
17934                 rack->r_ctl.rc_gp_cumack_ts = 0;
17935                 rack_log_pacing_delay_calc(rack,
17936                                            tp->gput_seq,
17937                                            tp->gput_ack,
17938                                            (uint64_t)my_rsm,
17939                                            tp->gput_ts,
17940                                            (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17941                                            9,
17942                                            __LINE__, my_rsm, 0);
17943                 /* Now lets make sure all are marked as they should be */
17944                 rack_tend_gp_marks(tp, rack);
17945                 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17946                 return;
17947         }
17948
17949 use_latest:
17950         /*
17951          * We don't know how long we may have been
17952          * idle or if this is the first-send. Lets
17953          * setup the flag so we will trim off
17954          * the first ack'd data so we get a true
17955          * measurement.
17956          */
17957         rack->app_limited_needs_set = 1;
17958         tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
17959         rack->r_ctl.rc_gp_cumack_ts = 0;
17960         /* Find this guy so we can pull the send time */
17961         my_rsm = tqhash_find(rack->r_ctl.tqh, startseq);
17962         if (my_rsm) {
17963                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
17964                 if (my_rsm->r_flags & RACK_ACKED) {
17965                         /*
17966                          * Unlikely since its probably what was
17967                          * just transmitted (but I am paranoid).
17968                          */
17969                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
17970                         rack->app_limited_needs_set = 0;
17971                 }
17972                 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
17973                         /* This also is unlikely */
17974                         tp->gput_seq = my_rsm->r_start;
17975                 }
17976         } else {
17977                 /*
17978                  * TSNH unless we have some send-map limit,
17979                  * and even at that it should not be hitting
17980                  * that limit (we should have stopped sending).
17981                  */
17982                 struct timeval tv;
17983
17984                 microuptime(&tv);
17985                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
17986         }
17987         rack_tend_gp_marks(tp, rack);
17988         rack_log_pacing_delay_calc(rack,
17989                                    tp->gput_seq,
17990                                    tp->gput_ack,
17991                                    (uint64_t)my_rsm,
17992                                    tp->gput_ts,
17993                                    (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17994                                    9, __LINE__, NULL, 0);
17995         rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17996 }
17997
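/*
 * Return how much new data we may send right now: the smaller of cwnd and
 * the peer's receive window, minus what is already in flight, further
 * clamped so we neither exceed the rwnd nor what actually sits in the
 * socket buffer. Illustrative example (hypothetical numbers): with a
 * sendwin of 20 segments, 12 segments in flight and plenty of
 * socket-buffer data, roughly 8 segments worth of bytes would be returned.
 */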
17998 static inline uint32_t
17999 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
18000     uint32_t avail, int32_t sb_offset)
18001 {
18002         uint32_t len;
18003         uint32_t sendwin;
18004
18005         if (tp->snd_wnd > cwnd_to_use)
18006                 sendwin = cwnd_to_use;
18007         else
18008                 sendwin = tp->snd_wnd;
18009         if (ctf_outstanding(tp) >= tp->snd_wnd) {
18010                 /* We never want to go over our peers rcv-window */
18011                 len = 0;
18012         } else {
18013                 uint32_t flight;
18014
18015                 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
18016                 if (flight >= sendwin) {
18017                         /*
18018                          * We have in flight what we are allowed by cwnd (if
18019                          * it was rwnd blocking it would have hit the
18020                          * ctf_outstanding(tp) >= tp->snd_wnd check above).
18021                          */
18022                         return (0);
18023                 }
18024                 len = sendwin - flight;
18025                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
18026                         /* We would send too much (beyond the rwnd) */
18027                         len = tp->snd_wnd - ctf_outstanding(tp);
18028                 }
18029                 if ((len + sb_offset) > avail) {
18030                         /*
18031                          * We don't have that much in the SB, how much is
18032                          * there?
18033                          */
18034                         len = avail - sb_offset;
18035                 }
18036         }
18037         return (len);
18038 }
18039
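/*
 * BB-log a fast-send-block (FSB) event when black-box logging is enabled,
 * recording the error code, flags, lengths and pacing state so the
 * decision taken at "line" in the fast-output paths can be reconstructed
 * from the trace.
 */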
18040 static void
18041 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
18042              unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
18043              int rsm_is_null, int optlen, int line, uint16_t mode)
18044 {
18045         if (tcp_bblogging_on(rack->rc_tp)) {
18046                 union tcp_log_stackspecific log;
18047                 struct timeval tv;
18048
18049                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18050                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
18051                 log.u_bbr.flex1 = error;
18052                 log.u_bbr.flex2 = flags;
18053                 log.u_bbr.flex3 = rsm_is_null;
18054                 log.u_bbr.flex4 = ipoptlen;
18055                 log.u_bbr.flex5 = tp->rcv_numsacks;
18056                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18057                 log.u_bbr.flex7 = optlen;
18058                 log.u_bbr.flex8 = rack->r_fsb_inited;
18059                 log.u_bbr.applimited = rack->r_fast_output;
18060                 log.u_bbr.bw_inuse = rack_get_bw(rack);
18061                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
18062                 log.u_bbr.cwnd_gain = mode;
18063                 log.u_bbr.pkts_out = orig_len;
18064                 log.u_bbr.lt_epoch = len;
18065                 log.u_bbr.delivered = line;
18066                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
18067                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18068                 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
18069                                len, &log, false, NULL, __func__, __LINE__, &tv);
18070         }
18071 }
18072
18073
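/*
 * Copy up to *plen bytes starting at the_off within the_m into a fresh
 * mbuf chain, honoring the TSO segment count/size limits and (for
 * hardware TLS) never mixing records from different sessions. On return
 * *plen holds how much was actually copied and, when an fsb is supplied,
 * the resume point (mbuf, offset and original lengths) is saved so the
 * next fast-output call can continue where this one stopped.
 */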
18074 static struct mbuf *
18075 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
18076                    struct rack_fast_send_blk *fsb,
18077                    int32_t seglimit, int32_t segsize, int hw_tls)
18078 {
18079 #ifdef KERN_TLS
18080         struct ktls_session *tls, *ntls;
18081 #ifdef INVARIANTS
18082         struct mbuf *start;
18083 #endif
18084 #endif
18085         struct mbuf *m, *n, **np, *smb;
18086         struct mbuf *top;
18087         int32_t off, soff;
18088         int32_t len = *plen;
18089         int32_t fragsize;
18090         int32_t len_cp = 0;
18091         uint32_t mlen, frags;
18092
18093         soff = off = the_off;
18094         smb = m = the_m;
18095         np = &top;
18096         top = NULL;
18097 #ifdef KERN_TLS
18098         if (hw_tls && (m->m_flags & M_EXTPG))
18099                 tls = m->m_epg_tls;
18100         else
18101                 tls = NULL;
18102 #ifdef INVARIANTS
18103         start = m;
18104 #endif
18105 #endif
18106         while (len > 0) {
18107                 if (m == NULL) {
18108                         *plen = len_cp;
18109                         break;
18110                 }
18111 #ifdef KERN_TLS
18112                 if (hw_tls) {
18113                         if (m->m_flags & M_EXTPG)
18114                                 ntls = m->m_epg_tls;
18115                         else
18116                                 ntls = NULL;
18117
18118                         /*
18119                          * Avoid mixing TLS records with handshake
18120                          * data or TLS records from different
18121                          * sessions.
18122                          */
18123                         if (tls != ntls) {
18124                                 MPASS(m != start);
18125                                 *plen = len_cp;
18126                                 break;
18127                         }
18128                 }
18129 #endif
18130                 mlen = min(len, m->m_len - off);
18131                 if (seglimit) {
18132                         /*
18133                          * For M_EXTPG mbufs, add 3 segments
18134                          * + 1 in case we are crossing page boundaries
18135                          * + 2 in case the TLS hdr/trailer are used
18136                          * It is cheaper to just add the segments
18137                          * than it is to take the cache miss to look
18138                          * at the mbuf ext_pgs state in detail.
18139                          */
18140                         if (m->m_flags & M_EXTPG) {
18141                                 fragsize = min(segsize, PAGE_SIZE);
18142                                 frags = 3;
18143                         } else {
18144                                 fragsize = segsize;
18145                                 frags = 0;
18146                         }
18147
18148                         /* Break if we really can't fit anymore. */
18149                         if ((frags + 1) >= seglimit) {
18150                                 *plen = len_cp;
18151                                 break;
18152                         }
18153
18154                         /*
18155                          * Reduce size if you can't copy the whole
18156                          * mbuf. If we can't copy the whole mbuf, also
18157                          * adjust len so the loop will end after this
18158                          * mbuf.
18159                          */
18160                         if ((frags + howmany(mlen, fragsize)) >= seglimit) {
18161                                 mlen = (seglimit - frags - 1) * fragsize;
18162                                 len = mlen;
18163                                 *plen = len_cp + len;
18164                         }
18165                         frags += howmany(mlen, fragsize);
18166                         if (frags == 0)
18167                                 frags++;
18168                         seglimit -= frags;
18169                         KASSERT(seglimit > 0,
18170                             ("%s: seglimit went too low", __func__));
18171                 }
18172                 n = m_get(M_NOWAIT, m->m_type);
18173                 *np = n;
18174                 if (n == NULL)
18175                         goto nospace;
18176                 n->m_len = mlen;
18177                 soff += mlen;
18178                 len_cp += n->m_len;
18179                 if (m->m_flags & (M_EXT|M_EXTPG)) {
18180                         n->m_data = m->m_data + off;
18181                         mb_dupcl(n, m);
18182                 } else {
18183                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
18184                             (u_int)n->m_len);
18185                 }
18186                 len -= n->m_len;
18187                 off = 0;
18188                 m = m->m_next;
18189                 np = &n->m_next;
18190                 if (len || (soff == smb->m_len)) {
18191                         /*
18192                          * We have more so we move forward or
18193                          * we have consumed the entire mbuf and
18194                          * len has fallen to 0.
18195                          */
18196                         soff = 0;
18197                         smb = m;
18198                 }
18199
18200         }
18201         if (fsb != NULL) {
18202                 fsb->m = smb;
18203                 fsb->off = soff;
18204                 if (smb) {
18205                         /*
18206                          * Save off the size of the mbuf. We do
18207                          * this so that we can recognize when it
18208                          * has been trimmed by sbcut() as acks
18209                          * come in.
18210                          */
18211                         fsb->o_m_len = smb->m_len;
18212                         fsb->o_t_len = M_TRAILINGROOM(smb);
18213                 } else {
18214                         /*
18215                          * This is the case where the next mbuf went to NULL. This
18216                          * means with this copy we have sent everything in the sb.
18217                          * In theory we could clear the fast_output flag, but lets
18218                          * not since its possible that we could get more added
18219                          * and acks that call the extend function which would let
18220                          * us send more.
18221                          */
18222                         fsb->o_m_len = 0;
18223                         fsb->o_t_len = 0;
18224                 }
18225         }
18226         return (top);
18227 nospace:
18228         if (top)
18229                 m_freem(top);
18230         return (NULL);
18231
18232 }
18233
18234 /*
18235  * This is a copy of m_copym(), taking the TSO segment size/limit
18236  * constraints into account, and advancing the sndptr as it goes.
18237  */
18238 static struct mbuf *
18239 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
18240                 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
18241 {
18242         struct mbuf *m, *n;
18243         int32_t soff;
18244
18245         m = rack->r_ctl.fsb.m;
18246         if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) {
18247                 /*
18248                  * The trailing space changed, mbufs can grow
18249                  * at the tail but they can't shrink from
18250                  * it, KASSERT that. Adjust the orig_m_len to
18251                  * compensate for this change.
18252                  */
18253                 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)),
18254                         ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
18255                          m,
18256                          rack,
18257                          (intmax_t)M_TRAILINGROOM(m),
18258                          rack->r_ctl.fsb.o_t_len,
18259                          rack->r_ctl.fsb.o_m_len,
18260                          m->m_len));
18261                 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m));
18262                 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m);
18263         }
18264         if (m->m_len < rack->r_ctl.fsb.o_m_len) {
18265                 /*
18266                  * Mbuf shrank, trimmed off the top by an ack, our
18267                  * offset changes.
18268                  */
18269                 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)),
18270                         ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n",
18271                          m, m->m_len,
18272                          rack, rack->r_ctl.fsb.o_m_len,
18273                          rack->r_ctl.fsb.off));
18274
18275                 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len))
18276                         rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len);
18277                 else
18278                         rack->r_ctl.fsb.off = 0;
18279                 rack->r_ctl.fsb.o_m_len = m->m_len;
18280 #ifdef INVARIANTS
18281         } else if (m->m_len > rack->r_ctl.fsb.o_m_len) {
18282                 panic("rack:%p m:%p m_len grew outside of t_space compensation",
18283                       rack, m);
18284 #endif
18285         }
18286         soff = rack->r_ctl.fsb.off;
18287         KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
18288         KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
18289         KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
18290                                  __FUNCTION__,
18291                                  rack, *plen, m, m->m_len));
18292         /* Save off the right location before we copy and advance */
18293         *s_soff = soff;
18294         *s_mb = rack->r_ctl.fsb.m;
18295         n = rack_fo_base_copym(m, soff, plen,
18296                                &rack->r_ctl.fsb,
18297                                seglimit, segsize, rack->r_ctl.fsb.hw_tls);
18298         return (n);
18299 }
18300
18301 /* Log the buffer level */
18302 static void
18303 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
18304                      int len, struct timeval *tv,
18305                      uint32_t cts)
18306 {
18307         uint32_t p_rate = 0, p_queue = 0, err = 0;
18308         union tcp_log_stackspecific log;
18309
18310 #ifdef RATELIMIT
18311         err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18312         err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18313 #endif
18314         memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18315         log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
18316         log.u_bbr.flex1 = p_rate;
18317         log.u_bbr.flex2 = p_queue;
18318         log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18319         log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18320         log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18321         log.u_bbr.flex7 = 99;
18322         log.u_bbr.flex8 = 0;
18323         log.u_bbr.pkts_out = err;
18324         log.u_bbr.delRate = rack->r_ctl.crte->rate;
18325         log.u_bbr.timeStamp = cts;
18326         log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18327         tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18328                        len, &log, false, NULL, __func__, __LINE__, tv);
18329
18330 }
18331
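/*
 * Query the hardware pacing queue (when RATELIMIT is compiled in) and
 * translate its occupancy into an estimated number of microseconds it
 * would take the hardware rate to drain a full pace-max-segs send; 0
 * means the queue is empty, the query failed, or rate limiting is not
 * compiled in.
 */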
18332 static uint32_t
18333 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
18334                        struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
18335 {
18336         uint64_t lentime = 0;
18337 #ifdef RATELIMIT
18338         uint32_t p_rate = 0, p_queue = 0, err;
18339         union tcp_log_stackspecific log;
18340         uint64_t bw;
18341
18342         err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18343         /* Failed or queue is zero */
18344         if (err || (p_queue == 0)) {
18345                 lentime = 0;
18346                 goto out;
18347         }
18348         err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18349         if (err) {
18350                 lentime = 0;
18351                 goto out;
18352         }
18353         /*
18354          * If we reach here we have some bytes in
18355          * the queue. The number returned is a value
18356          * between 0 and 0xffff where ffff is full
18357          * and 0 is empty. So how best to make this into
18358          * something usable?
18359          *
18360          * The "safer" way is to take the b/w gotten
18361          * from the query (which should be our b/w rate)
18362          * and pretend that a full send (our rc_pace_max_segs)
18363          * is outstanding. We factor it so it is as if a full
18364          * number of our MSS-sized segments, in terms of full
18365          * ethernet segments, are outstanding.
18366          */
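        /*
         * Illustrative example (hypothetical numbers, assuming an
         * ETHERNET_SEGMENT_SIZE of roughly 1514 bytes): with a
         * rc_pace_max_segs of 64KB, a segsiz of 1448 and a p_rate of
         * 100Mbit/s (bw = 12.5MB/s), the math below yields about 45 wire
         * segments, or roughly 5400us of transmit time.
         */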
18367         bw = p_rate / 8;
18368         if (bw) {
18369                 lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
18370                 lentime *= ETHERNET_SEGMENT_SIZE;
18371                 lentime *= (uint64_t)HPTS_USEC_IN_SEC;
18372                 lentime /= bw;
18373         } else {
18374                 /* TSNH -- KASSERT? */
18375                 lentime = 0;
18376         }
18377 out:
18378         if (tcp_bblogging_on(tp)) {
18379                 memset(&log, 0, sizeof(log));
18380                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
18381                 log.u_bbr.flex1 = p_rate;
18382                 log.u_bbr.flex2 = p_queue;
18383                 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18384                 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18385                 log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18386                 log.u_bbr.flex7 = 99;
18387                 log.u_bbr.flex8 = 0;
18388                 log.u_bbr.pkts_out = err;
18389                 log.u_bbr.delRate = rack->r_ctl.crte->rate;
18390                 log.u_bbr.cur_del_rate = lentime;
18391                 log.u_bbr.timeStamp = cts;
18392                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18393                 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18394                                len, &log, false, NULL, __func__, __LINE__,tv);
18395         }
18396 #endif
18397         return ((uint32_t)lentime);
18398 }
18399
18400 static int
18401 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
18402                      uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
18403 {
18404         /*
18405          * Enter the fast retransmit path. We are given that a sched_pin is
18406          * in place (if accounting is compiled in) and the cycle count taken
18407          * at the entry is in the ts_val. The concept here is that the rsm
18408          * now holds the mbuf offsets and such so we can directly transmit
18409          * without a lot of overhead, the len field is already set for
18410          * us to prohibit us from sending too much (usually it's 1 MSS).
18411          */
18412         struct ip *ip = NULL;
18413         struct udphdr *udp = NULL;
18414         struct tcphdr *th = NULL;
18415         struct mbuf *m = NULL;
18416         struct inpcb *inp;
18417         uint8_t *cpto;
18418         struct tcp_log_buffer *lgb;
18419 #ifdef TCP_ACCOUNTING
18420         uint64_t crtsc;
18421         int cnt_thru = 1;
18422 #endif
18423         struct tcpopt to;
18424         u_char opt[TCP_MAXOLEN];
18425         uint32_t hdrlen, optlen;
18426         int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0;
18427         uint16_t flags;
18428         uint32_t if_hw_tsomaxsegcount = 0, startseq;
18429         uint32_t if_hw_tsomaxsegsize;
18430         int32_t ip_sendflag = IP_NO_SND_TAG_RL;
18431
18432 #ifdef INET6
18433         struct ip6_hdr *ip6 = NULL;
18434
18435         if (rack->r_is_v6) {
18436                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18437                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18438         } else
18439 #endif                          /* INET6 */
18440         {
18441                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18442                 hdrlen = sizeof(struct tcpiphdr);
18443         }
18444         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18445                 goto failed;
18446         }
18447         if (doing_tlp) {
18448                 /* Its a TLP add the flag, it may already be there but be sure */
18449                 rsm->r_flags |= RACK_TLP;
18450         } else {
18451                 /* If it was a TLP it is not one on this retransmit */
18452                 rsm->r_flags &= ~RACK_TLP;
18453         }
18454         startseq = rsm->r_start;
18455         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18456         inp = rack->rc_inp;
18457         to.to_flags = 0;
18458         flags = tcp_outflags[tp->t_state];
18459         if (flags & (TH_SYN|TH_RST)) {
18460                 goto failed;
18461         }
18462         if (rsm->r_flags & RACK_HAS_FIN) {
18463                 /* We can't send a FIN here */
18464                 goto failed;
18465         }
18466         if (flags & TH_FIN) {
18467                 /* We never send a FIN */
18468                 flags &= ~TH_FIN;
18469         }
18470         if (tp->t_flags & TF_RCVD_TSTMP) {
18471                 to.to_tsval = ms_cts + tp->ts_offset;
18472                 to.to_tsecr = tp->ts_recent;
18473                 to.to_flags = TOF_TS;
18474         }
18475         optlen = tcp_addoptions(&to, opt);
18476         hdrlen += optlen;
18477         udp = rack->r_ctl.fsb.udp;
18478         if (udp)
18479                 hdrlen += sizeof(struct udphdr);
18480         if (rack->r_ctl.rc_pace_max_segs)
18481                 max_val = rack->r_ctl.rc_pace_max_segs;
18482         else if (rack->rc_user_set_max_segs)
18483                 max_val = rack->rc_user_set_max_segs * segsiz;
18484         else
18485                 max_val = len;
18486         if ((tp->t_flags & TF_TSO) &&
18487             V_tcp_do_tso &&
18488             (len > segsiz) &&
18489             (tp->t_port == 0))
18490                 tso = 1;
18491 #ifdef INET6
18492         if (MHLEN < hdrlen + max_linkhdr)
18493                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18494         else
18495 #endif
18496                 m = m_gethdr(M_NOWAIT, MT_DATA);
18497         if (m == NULL)
18498                 goto failed;
18499         m->m_data += max_linkhdr;
18500         m->m_len = hdrlen;
18501         th = rack->r_ctl.fsb.th;
18502         /* Establish the len to send */
18503         if (len > max_val)
18504                 len = max_val;
18505         if ((tso) && (len + optlen > segsiz)) {
18506                 uint32_t if_hw_tsomax;
18507                 int32_t max_len;
18508
18509                 /* extract TSO information */
18510                 if_hw_tsomax = tp->t_tsomax;
18511                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18512                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18513                 /*
18514                  * Check if we should limit by maximum payload
18515                  * length:
18516                  */
18517                 if (if_hw_tsomax != 0) {
18518                         /* compute maximum TSO length */
18519                         max_len = (if_hw_tsomax - hdrlen -
18520                                    max_linkhdr);
18521                         if (max_len <= 0) {
18522                                 goto failed;
18523                         } else if (len > max_len) {
18524                                 len = max_len;
18525                         }
18526                 }
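                /*
                 * Illustrative example (hypothetical numbers): with an
                 * if_hw_tsomax of 65535 and hdrlen + max_linkhdr of about
                 * 90 bytes, max_len caps a single TSO burst near 65445
                 * payload bytes; anything larger is trimmed above.
                 */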
18527                 if (len <= segsiz) {
18528                         /*
18529                          * In case there are too many small fragments don't
18530                          * use TSO:
18531                          */
18532                         tso = 0;
18533                 }
18534         } else {
18535                 tso = 0;
18536         }
18537         if ((tso == 0) && (len > segsiz))
18538                 len = segsiz;
18539         (void)tcp_get_usecs(tv);
18540         if ((len == 0) ||
18541             (len <= MHLEN - hdrlen - max_linkhdr)) {
18542                 goto failed;
18543         }
18544         th->th_seq = htonl(rsm->r_start);
18545         th->th_ack = htonl(tp->rcv_nxt);
18546         /*
18547          * The PUSH bit should only be applied
18548          * if the full retransmission is made. If
18549          * we are sending less, then this is the
18550          * left hand edge and should not have
18551          * the PUSH bit.
18552          */
18553         if ((rsm->r_flags & RACK_HAD_PUSH) &&
18554             (len == (rsm->r_end - rsm->r_start)))
18555                 flags |= TH_PUSH;
18556         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
18557         if (th->th_win == 0) {
18558                 tp->t_sndzerowin++;
18559                 tp->t_flags |= TF_RXWIN0SENT;
18560         } else
18561                 tp->t_flags &= ~TF_RXWIN0SENT;
18562         if (rsm->r_flags & RACK_TLP) {
18563                 /*
18564                  * TLP should not count in retran count, but
18565                  * in its own bin
18566                  */
18567                 counter_u64_add(rack_tlp_retran, 1);
18568                 counter_u64_add(rack_tlp_retran_bytes, len);
18569         } else {
18570                 tp->t_sndrexmitpack++;
18571                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18572                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18573         }
18574 #ifdef STATS
18575         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18576                                  len);
18577 #endif
18578         if (rsm->m == NULL)
18579                 goto failed;
18580         if (rsm->m &&
18581             ((rsm->orig_m_len != rsm->m->m_len) ||
18582              (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
18583                 /* Fix up the orig_m_len and possibly the mbuf offset */
18584                 rack_adjust_orig_mlen(rsm);
18585         }
18586         m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
18587         if (len <= segsiz) {
18588                 /*
18589                  * Must have run out of mbufs for the copy,
18590                  * shorten it to no longer need tso. Lets
18591                  * not put on sendalot since we are low on
18592                  * mbufs.
18593                  */
18594                 tso = 0;
18595         }
18596         if ((m->m_next == NULL) || (len <= 0)){
18597                 goto failed;
18598         }
18599         if (udp) {
18600                 if (rack->r_is_v6)
18601                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
18602                 else
18603                         ulen = hdrlen + len - sizeof(struct ip);
18604                 udp->uh_ulen = htons(ulen);
18605         }
18606         m->m_pkthdr.rcvif = (struct ifnet *)0;
18607         if (TCPS_HAVERCVDSYN(tp->t_state) &&
18608             (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
18609                 int ect = tcp_ecn_output_established(tp, &flags, len, true);
18610                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18611                     (tp->t_flags2 & TF2_ECN_SND_ECE))
18612                     tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18613 #ifdef INET6
18614                 if (rack->r_is_v6) {
18615                     ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
18616                     ip6->ip6_flow |= htonl(ect << 20);
18617                 }
18618                 else
18619 #endif
18620                 {
18621                     ip->ip_tos &= ~IPTOS_ECN_MASK;
18622                     ip->ip_tos |= ect;
18623                 }
18624         }
18625         if (rack->r_ctl.crte != NULL) {
18626                 /* See if we can send via the hw queue */
18627                 slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
18628                 /* If there is nothing in queue (no pacing time) we can send via the hw queue */
18629                 if (slot == 0)
18630                         ip_sendflag = 0;
18631         }
18632         tcp_set_flags(th, flags);
18633         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18634 #ifdef INET6
18635         if (rack->r_is_v6) {
18636                 if (tp->t_port) {
18637                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18638                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18639                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18640                         th->th_sum = htons(0);
18641                         UDPSTAT_INC(udps_opackets);
18642                 } else {
18643                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18644                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18645                         th->th_sum = in6_cksum_pseudo(ip6,
18646                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18647                                                       0);
18648                 }
18649         }
18650 #endif
18651 #if defined(INET6) && defined(INET)
18652         else
18653 #endif
18654 #ifdef INET
18655         {
18656                 if (tp->t_port) {
18657                         m->m_pkthdr.csum_flags = CSUM_UDP;
18658                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18659                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18660                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18661                         th->th_sum = htons(0);
18662                         UDPSTAT_INC(udps_opackets);
18663                 } else {
18664                         m->m_pkthdr.csum_flags = CSUM_TCP;
18665                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18666                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
18667                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18668                                                                         IPPROTO_TCP + len + optlen));
18669                 }
18670                 /* IP version must be set here for ipv4/ipv6 checking later */
18671                 KASSERT(ip->ip_v == IPVERSION,
18672                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18673         }
18674 #endif
18675         if (tso) {
18676                 /*
18677                  * Here we use segsiz since we have no added options besides
18678                  * any standard timestamp options (no DSACKs or SACKS are sent
18679                  * via either fast-path).
18680                  */
18681                 KASSERT(len > segsiz,
18682                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
18683                 m->m_pkthdr.csum_flags |= CSUM_TSO;
18684                 m->m_pkthdr.tso_segsz = segsiz;
18685         }
18686 #ifdef INET6
18687         if (rack->r_is_v6) {
18688                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
18689                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18690                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18691                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18692                 else
18693                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18694         }
18695 #endif
18696 #if defined(INET) && defined(INET6)
18697         else
18698 #endif
18699 #ifdef INET
18700         {
18701                 ip->ip_len = htons(m->m_pkthdr.len);
18702                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
18703                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18704                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18705                         if (tp->t_port == 0 || len < V_tcp_minmss) {
18706                                 ip->ip_off |= htons(IP_DF);
18707                         }
18708                 } else {
18709                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18710                 }
18711         }
18712 #endif
18713         if (doing_tlp == 0) {
18714                 /* Set we retransmitted */
18715                 rack->rc_gp_saw_rec = 1;
18716         } else {
18717                 /* Its a TLP set ca or ss */
18718                 if (tp->snd_cwnd > tp->snd_ssthresh) {
18719                         /* Set we sent in CA */
18720                         rack->rc_gp_saw_ca = 1;
18721                 } else {
18722                         /* Set we sent in SS */
18723                         rack->rc_gp_saw_ss = 1;
18724                 }
18725         }
18726         /* Time to copy in our header */
18727         cpto = mtod(m, uint8_t *);
18728         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18729         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18730         if (optlen) {
18731                 bcopy(opt, th + 1, optlen);
18732                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18733         } else {
18734                 th->th_off = sizeof(struct tcphdr) >> 2;
18735         }
18736         if (tcp_bblogging_on(rack->rc_tp)) {
18737                 union tcp_log_stackspecific log;
18738
18739                 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
18740                         rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
18741                         counter_u64_add(rack_collapsed_win_rxt, 1);
18742                         counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
18743                 }
18744                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18745                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
18746                 if (rack->rack_no_prr)
18747                         log.u_bbr.flex1 = 0;
18748                 else
18749                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18750                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18751                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18752                 log.u_bbr.flex4 = max_val;
18753                 /* Save off the early/late values */
18754                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18755                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18756                 log.u_bbr.bw_inuse = rack_get_bw(rack);
18757                 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
18758                 if (doing_tlp == 0)
18759                         log.u_bbr.flex8 = 1;
18760                 else
18761                         log.u_bbr.flex8 = 2;
18762                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
18763                 log.u_bbr.flex7 = 55;
18764                 log.u_bbr.pkts_out = tp->t_maxseg;
18765                 log.u_bbr.timeStamp = cts;
18766                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18767                 if (rsm && (rsm->r_rtr_cnt > 0)) {
18768                         /*
18769                          * When we have a retransmit we want to log the
18770                          * burst at send and flight at send from before.
18771                          */
18772                         log.u_bbr.flex5 = rsm->r_fas;
18773                         log.u_bbr.bbr_substate = rsm->r_bas;
18774                 } else {
18775                         /*
18776                          * This is currently unlikely until we do the
18777                          * packet pair probes but I will add it for completeness.
18778                          */
18779                         log.u_bbr.flex5 = log.u_bbr.inflight;
18780                         log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
18781                 }
18782                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
18783                 log.u_bbr.delivered = 0;
18784                 log.u_bbr.rttProp = (uint64_t)rsm;
18785                 log.u_bbr.delRate = rsm->r_flags;
18786                 log.u_bbr.delRate <<= 31;
18787                 log.u_bbr.delRate |= rack->r_must_retran;
18788                 log.u_bbr.delRate <<= 1;
18789                 log.u_bbr.delRate |= 1;
18790                 log.u_bbr.pkt_epoch = __LINE__;
18791                 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
18792                                      len, &log, false, NULL, __func__, __LINE__, tv);
18793         } else
18794                 lgb = NULL;
18795         if ((rack->r_ctl.crte != NULL) &&
18796             tcp_bblogging_on(tp)) {
18797                 rack_log_queue_level(tp, rack, len, tv, cts);
18798         }
18799 #ifdef INET6
18800         if (rack->r_is_v6) {
18801                 error = ip6_output(m, NULL,
18802                                    &inp->inp_route6,
18803                                    ip_sendflag, NULL, NULL, inp);
18804         }
18805         else
18806 #endif
18807 #ifdef INET
18808         {
18809                 error = ip_output(m, NULL,
18810                                   &inp->inp_route,
18811                                   ip_sendflag, 0, inp);
18812         }
18813 #endif
18814         m = NULL;
18815         if (lgb) {
18816                 lgb->tlb_errno = error;
18817                 lgb = NULL;
18818         }
18819         if (error) {
18820                 goto failed;
18821         } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) {
18822                 rack->rc_hw_nobuf = 0;
18823                 rack->r_ctl.rc_agg_delayed = 0;
18824                 rack->r_early = 0;
18825                 rack->r_late = 0;
18826                 rack->r_ctl.rc_agg_early = 0;
18827         }
18828
18829         rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
18830                         rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
18831         if (doing_tlp) {
18832                 rack->rc_tlp_in_progress = 1;
18833                 rack->r_ctl.rc_tlp_cnt_out++;
18834         }
18835         if (error == 0) {
18836                 counter_u64_add(rack_total_bytes, len);
18837                 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
18838                 if (doing_tlp) {
18839                         rack->rc_last_sent_tlp_past_cumack = 0;
18840                         rack->rc_last_sent_tlp_seq_valid = 1;
18841                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18842                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18843                 }
18844         }
18845         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
18846         rack->forced_ack = 0;   /* If we send something zap the FA flag */
18847         if (IN_FASTRECOVERY(tp->t_flags) && rsm)
18848                 rack->r_ctl.retran_during_recovery += len;
18849         {
18850                 int idx;
18851
18852                 idx = (len / segsiz) + 3;
18853                 if (idx >= TCP_MSS_ACCT_ATIMER)
18854                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18855                 else
18856                         counter_u64_add(rack_out_size[idx], 1);
18857         }
18858         if (tp->t_rtttime == 0) {
18859                 tp->t_rtttime = ticks;
18860                 tp->t_rtseq = startseq;
18861                 KMOD_TCPSTAT_INC(tcps_segstimed);
18862         }
18863         counter_u64_add(rack_fto_rsm_send, 1);
18864         if (error && (error == ENOBUFS)) {
18865                 if (rack->r_ctl.crte != NULL) {
18866                         tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
18867                         if (tcp_bblogging_on(rack->rc_tp))
18868                                 rack_log_queue_level(tp, rack, len, tv, cts);
18869                 } else
18870                         tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
18871                 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
18872                 if (rack->rc_enobuf < 0x7f)
18873                         rack->rc_enobuf++;
18874                 if (slot < (10 * HPTS_USEC_IN_MSEC))
18875                         slot = 10 * HPTS_USEC_IN_MSEC;
18876                 if (rack->r_ctl.crte != NULL) {
18877                         counter_u64_add(rack_saw_enobuf_hw, 1);
18878                         tcp_rl_log_enobuf(rack->r_ctl.crte);
18879                 }
18880                 counter_u64_add(rack_saw_enobuf, 1);
18881         } else
18882                 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
18883         if ((slot == 0) ||
18884             (rack->rc_always_pace == 0) ||
18885             (rack->r_rr_config == 1)) {
18886                 /*
18887                  * We have no pacing set or we
18888                  * are using old-style rack or
18889                  * we are overridden to use the old 1ms pacing.
18890                  */
18891                 slot = rack->r_ctl.rc_min_to;
18892         }
18893         rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
18894 #ifdef TCP_ACCOUNTING
18895         crtsc = get_cyclecount();
18896         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18897                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
18898         }
18899         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18900                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
18901         }
18902         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18903                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
18904         }
18905         sched_unpin();
18906 #endif
18907         return (0);
18908 failed:
18909         if (m)
18910                 m_free(m);
18911         return (-1);
18912 }
18913
18914 static void
18915 rack_sndbuf_autoscale(struct tcp_rack *rack)
18916 {
18917         /*
18918          * Automatic sizing of send socket buffer.  Often the send buffer
18919          * size is not optimally adjusted to the actual network conditions
18920          * at hand (delay bandwidth product).  Setting the buffer size too
18921          * small limits throughput on links with high bandwidth and high
18922          * delay (eg. trans-continental/oceanic links).  Setting the
18923          * buffer size too big consumes too much real kernel memory,
18924          * especially with many connections on busy servers.
18925          *
18926          * The criteria to step up the send buffer one notch are:
18927          *  1. receive window of remote host is larger than send buffer
18928          *     (with a fudge factor of 5/4th);
18929          *  2. send buffer is filled to 7/8th with data (so we actually
18930          *     have data to make use of it);
18931          *  3. send buffer fill has not hit maximal automatic size;
18932          *  4. our send window (slow start and congestion controlled) is
18933          *     larger than sent but unacknowledged data in send buffer.
18934          *
18935          * Note that the rack version moves things much faster since
18936          * we want to avoid hitting cache lines in the rack_fast_output()
18937          * path so this is called much less often and thus moves
18938          * the SB forward by a percentage.
18939          */
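        /*
         * Illustrative example (hypothetical numbers): with
         * rack_autosndbuf_inc set to 20 (percent) and an sb_hiwat of
         * 64KB, scaleup below becomes ~12.8KB + 64KB = ~76.8KB, which is
         * then bounded by V_tcp_autosndbuf_max before the reserve.
         */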
18940         struct socket *so;
18941         struct tcpcb *tp;
18942         uint32_t sendwin, scaleup;
18943
18944         tp = rack->rc_tp;
18945         so = rack->rc_inp->inp_socket;
18946         sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
18947         if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
18948                 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
18949                     sbused(&so->so_snd) >=
18950                     (so->so_snd.sb_hiwat / 8 * 7) &&
18951                     sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
18952                     sendwin >= (sbused(&so->so_snd) -
18953                     (tp->snd_nxt - tp->snd_una))) {
18954                         if (rack_autosndbuf_inc)
18955                                 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
18956                         else
18957                                 scaleup = V_tcp_autosndbuf_inc;
18958                         if (scaleup < V_tcp_autosndbuf_inc)
18959                                 scaleup = V_tcp_autosndbuf_inc;
18960                         scaleup += so->so_snd.sb_hiwat;
18961                         if (scaleup > V_tcp_autosndbuf_max)
18962                                 scaleup = V_tcp_autosndbuf_max;
18963                         if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
18964                                 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
18965                 }
18966         }
18967 }
18968
18969 static int
18970 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
18971                  uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
18972 {
18973         /*
18974          * Enter to do fast output. We are given that the sched_pin is
18975          * in place (if accounting is compiled in) and the cycle count taken
18976          * at entry is in place in ts_val. The idea here is that
18977          * we know how many more bytes need to be sent (presumably either
18978          * during pacing or to fill the cwnd and that was greater than
18979          * the max-burst). We have how much to send and all the info we
18980          * need to just send.
18981          */
18982 #ifdef INET
18983         struct ip *ip = NULL;
18984 #endif
18985         struct udphdr *udp = NULL;
18986         struct tcphdr *th = NULL;
18987         struct mbuf *m, *s_mb;
18988         struct inpcb *inp;
18989         uint8_t *cpto;
18990         struct tcp_log_buffer *lgb;
18991 #ifdef TCP_ACCOUNTING
18992         uint64_t crtsc;
18993 #endif
18994         struct tcpopt to;
18995         u_char opt[TCP_MAXOLEN];
18996         uint32_t hdrlen, optlen;
18997 #ifdef TCP_ACCOUNTING
18998         int cnt_thru = 1;
18999 #endif
19000         int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
19001         uint16_t flags;
19002         uint32_t s_soff;
19003         uint32_t if_hw_tsomaxsegcount = 0, startseq;
19004         uint32_t if_hw_tsomaxsegsize;
19005         uint16_t add_flag = RACK_SENT_FP;
19006 #ifdef INET6
19007         struct ip6_hdr *ip6 = NULL;
19008
19009         if (rack->r_is_v6) {
19010                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
19011                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
19012         } else
19013 #endif                          /* INET6 */
19014         {
19015 #ifdef INET
19016                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
19017                 hdrlen = sizeof(struct tcpiphdr);
19018 #endif
19019         }
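              /*
               * A UDP-tunneled connection without a configured tunneling
               * port cannot be handled by the fast path; bail out.
               */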
19020         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
19021                 m = NULL;
19022                 goto failed;
19023         }
19024         startseq = tp->snd_max;
19025         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
19026         inp = rack->rc_inp;
19027         len = rack->r_ctl.fsb.left_to_send;
19028         to.to_flags = 0;
19029         flags = rack->r_ctl.fsb.tcp_flags;
19030         if (tp->t_flags & TF_RCVD_TSTMP) {
19031                 to.to_tsval = ms_cts + tp->ts_offset;
19032                 to.to_tsecr = tp->ts_recent;
19033                 to.to_flags = TOF_TS;
19034         }
19035         optlen = tcp_addoptions(&to, opt);
19036         hdrlen += optlen;
19037         udp = rack->r_ctl.fsb.udp;
19038         if (udp)
19039                 hdrlen += sizeof(struct udphdr);
19040         if (rack->r_ctl.rc_pace_max_segs)
19041                 max_val = rack->r_ctl.rc_pace_max_segs;
19042         else if (rack->rc_user_set_max_segs)
19043                 max_val = rack->rc_user_set_max_segs * segsiz;
19044         else
19045                 max_val = len;
19046         if ((tp->t_flags & TF_TSO) &&
19047             V_tcp_do_tso &&
19048             (len > segsiz) &&
19049             (tp->t_port == 0))
19050                 tso = 1;
19051 again:
19052 #ifdef INET6
19053         if (MHLEN < hdrlen + max_linkhdr)
19054                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
19055         else
19056 #endif
19057                 m = m_gethdr(M_NOWAIT, MT_DATA);
19058         if (m == NULL)
19059                 goto failed;
19060         m->m_data += max_linkhdr;
19061         m->m_len = hdrlen;
19062         th = rack->r_ctl.fsb.th;
19063         /* Establish the len to send */
19064         if (len > max_val)
19065                 len = max_val;
19066         if ((tso) && (len + optlen > segsiz)) {
19067                 uint32_t if_hw_tsomax;
19068                 int32_t max_len;
19069
19070                 /* extract TSO information */
19071                 if_hw_tsomax = tp->t_tsomax;
19072                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
19073                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
19074                 /*
19075                  * Check if we should limit by maximum payload
19076                  * length:
19077                  */
19078                 if (if_hw_tsomax != 0) {
19079                         /* compute maximum TSO length */
19080                         max_len = (if_hw_tsomax - hdrlen -
19081                                    max_linkhdr);
19082                         if (max_len <= 0) {
19083                                 goto failed;
19084                         } else if (len > max_len) {
19085                                 len = max_len;
19086                         }
19087                 }
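                      /*
                       * Illustrative numbers: with if_hw_tsomax = 65535,
                       * hdrlen = 52 (IPv4 + TCP + timestamps) and
                       * max_linkhdr = 16, max_len works out to 65467 bytes.
                       */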
19088                 if (len <= segsiz) {
19089                         /*
19090                          * In case there are too many small fragments don't
19091                          * use TSO:
19092                          */
19093                         tso = 0;
19094                 }
19095         } else {
19096                 tso = 0;
19097         }
19098         if ((tso == 0) && (len > segsiz))
19099                 len = segsiz;
19100         (void)tcp_get_usecs(tv);
19101         if ((len == 0) ||
19102             (len <= MHLEN - hdrlen - max_linkhdr)) {
19103                 goto failed;
19104         }
19105         sb_offset = tp->snd_max - tp->snd_una;
19106         th->th_seq = htonl(tp->snd_max);
19107         th->th_ack = htonl(tp->rcv_nxt);
19108         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
19109         if (th->th_win == 0) {
19110                 tp->t_sndzerowin++;
19111                 tp->t_flags |= TF_RXWIN0SENT;
19112         } else
19113                 tp->t_flags &= ~TF_RXWIN0SENT;
19114         tp->snd_up = tp->snd_una;       /* drag it along, it's deprecated */
19115         KMOD_TCPSTAT_INC(tcps_sndpack);
19116         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
19117 #ifdef STATS
19118         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
19119                                  len);
19120 #endif
19121         if (rack->r_ctl.fsb.m == NULL)
19122                 goto failed;
19123
19124         /* s_mb and s_soff are saved for rack_log_output */
19125         m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
19126                                     &s_mb, &s_soff);
19127         if (len <= segsiz) {
19128                 /*
19129                  * Must have run out of mbufs for the copy;
19130                  * shorten it so we no longer need TSO. Let's
19131                  * not set sendalot since we are low on
19132                  * mbufs.
19133                  */
19134                 tso = 0;
19135         }
19136         if (rack->r_ctl.fsb.rfo_apply_push &&
19137             (len == rack->r_ctl.fsb.left_to_send)) {
19138                 tcp_set_flags(th, flags | TH_PUSH);
19139                 add_flag |= RACK_HAD_PUSH;
19140         }
19141         if ((m->m_next == NULL) || (len <= 0)){
19142                 goto failed;
19143         }
19144         if (udp) {
19145                 if (rack->r_is_v6)
19146                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
19147                 else
19148                         ulen = hdrlen + len - sizeof(struct ip);
19149                 udp->uh_ulen = htons(ulen);
19150         }
19151         m->m_pkthdr.rcvif = (struct ifnet *)0;
19152         if (TCPS_HAVERCVDSYN(tp->t_state) &&
19153             (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
19154                 int ect = tcp_ecn_output_established(tp, &flags, len, false);
19155                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
19156                     (tp->t_flags2 & TF2_ECN_SND_ECE))
19157                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
19158 #ifdef INET6
19159                 if (rack->r_is_v6) {
19160                         ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
19161                         ip6->ip6_flow |= htonl(ect << 20);
19162                 }
19163                 else
19164 #endif
19165                 {
19166 #ifdef INET
19167                         ip->ip_tos &= ~IPTOS_ECN_MASK;
19168                         ip->ip_tos |= ect;
19169 #endif
19170                 }
19171         }
19172         tcp_set_flags(th, flags);
19173         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
19174 #ifdef INET6
19175         if (rack->r_is_v6) {
19176                 if (tp->t_port) {
19177                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
19178                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
19179                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
19180                         th->th_sum = htons(0);
19181                         UDPSTAT_INC(udps_opackets);
19182                 } else {
19183                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
19184                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
19185                         th->th_sum = in6_cksum_pseudo(ip6,
19186                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
19187                                                       0);
19188                 }
19189         }
19190 #endif
19191 #if defined(INET6) && defined(INET)
19192         else
19193 #endif
19194 #ifdef INET
19195         {
19196                 if (tp->t_port) {
19197                         m->m_pkthdr.csum_flags = CSUM_UDP;
19198                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
19199                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
19200                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
19201                         th->th_sum = htons(0);
19202                         UDPSTAT_INC(udps_opackets);
19203                 } else {
19204                         m->m_pkthdr.csum_flags = CSUM_TCP;
19205                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
19206                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
19207                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
19208                                                                         IPPROTO_TCP + len + optlen));
19209                 }
19210                 /* IP version must be set here for ipv4/ipv6 checking later */
19211                 KASSERT(ip->ip_v == IPVERSION,
19212                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
19213         }
19214 #endif
19215         if (tso) {
19216                 /*
19217                  * Here we use segsiz since we have no added options besides
19218                  * any standard timestamp options (no DSACKs or SACKS are sent
19219                  * via either fast-path).
19220                  */
19221                 KASSERT(len > segsiz,
19222                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
19223                 m->m_pkthdr.csum_flags |= CSUM_TSO;
19224                 m->m_pkthdr.tso_segsz = segsiz;
19225         }
19226 #ifdef INET6
19227         if (rack->r_is_v6) {
19228                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
19229                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
19230                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
19231                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
19232                 else
19233                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
19234         }
19235 #endif
19236 #if defined(INET) && defined(INET6)
19237         else
19238 #endif
19239 #ifdef INET
19240         {
19241                 ip->ip_len = htons(m->m_pkthdr.len);
19242                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
19243                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
19244                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
19245                         if (tp->t_port == 0 || len < V_tcp_minmss) {
19246                                 ip->ip_off |= htons(IP_DF);
19247                         }
19248                 } else {
19249                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
19250                 }
19251         }
19252 #endif
19253         if (tp->snd_cwnd > tp->snd_ssthresh) {
19254                 /* Set we sent in CA */
19255                 rack->rc_gp_saw_ca = 1;
19256         } else {
19257                 /* Set we sent in SS */
19258                 rack->rc_gp_saw_ss = 1;
19259         }
19260         /* Time to copy in our header */
19261         cpto = mtod(m, uint8_t *);
19262         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
19263         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
19264         if (optlen) {
19265                 bcopy(opt, th + 1, optlen);
19266                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
19267         } else {
19268                 th->th_off = sizeof(struct tcphdr) >> 2;
19269         }
19270         if ((rack->r_ctl.crte != NULL) &&
19271             tcp_bblogging_on(tp)) {
19272                 rack_log_queue_level(tp, rack, len, tv, cts);
19273         }
19274         if (tcp_bblogging_on(rack->rc_tp)) {
19275                 union tcp_log_stackspecific log;
19276
19277                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
19278                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
19279                 if (rack->rack_no_prr)
19280                         log.u_bbr.flex1 = 0;
19281                 else
19282                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
19283                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
19284                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
19285                 log.u_bbr.flex4 = max_val;
19286                 /* Save off the early/late values */
19287                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
19288                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
19289                 log.u_bbr.bw_inuse = rack_get_bw(rack);
19290                 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
19291                 log.u_bbr.flex8 = 0;
19292                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
19293                 log.u_bbr.flex7 = 44;
19294                 log.u_bbr.pkts_out = tp->t_maxseg;
19295                 log.u_bbr.timeStamp = cts;
19296                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
19297                 log.u_bbr.flex5 = log.u_bbr.inflight;
19298                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
19299                 log.u_bbr.delivered = 0;
19300                 log.u_bbr.rttProp = 0;
19301                 log.u_bbr.delRate = rack->r_must_retran;
19302                 log.u_bbr.delRate <<= 1;
19303                 log.u_bbr.pkt_epoch = __LINE__;
19304                 /* For fast output there are no retransmits, so log just inflight and how many MSS we send */
19305                 log.u_bbr.flex5 = log.u_bbr.inflight;
19306                 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
19307                 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
19308                                      len, &log, false, NULL, __func__, __LINE__, tv);
19309         } else
19310                 lgb = NULL;
19311 #ifdef INET6
19312         if (rack->r_is_v6) {
19313                 error = ip6_output(m, NULL,
19314                                    &inp->inp_route6,
19315                                    0, NULL, NULL, inp);
19316         }
19317 #endif
19318 #if defined(INET) && defined(INET6)
19319         else
19320 #endif
19321 #ifdef INET
19322         {
19323                 error = ip_output(m, NULL,
19324                                   &inp->inp_route,
19325                                   0, 0, inp);
19326         }
19327 #endif
19328         if (lgb) {
19329                 lgb->tlb_errno = error;
19330                 lgb = NULL;
19331         }
19332         if (error) {
19333                 *send_err = error;
19334                 m = NULL;
19335                 goto failed;
19336         } else if (rack->rc_hw_nobuf) {
19337                 rack->rc_hw_nobuf = 0;
19338                 rack->r_ctl.rc_agg_delayed = 0;
19339                 rack->r_early = 0;
19340                 rack->r_late = 0;
19341                 rack->r_ctl.rc_agg_early = 0;
19342         }
19343         if ((error == 0) && (rack->lt_bw_up == 0)) {
19344                 /* Unlikely */
19345                 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv);
19346                 rack->r_ctl.lt_seq = tp->snd_una;
19347                 rack->lt_bw_up = 1;
19348         }
19349         rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
19350                         NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
19351         m = NULL;
19352         if (tp->snd_una == tp->snd_max) {
19353                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
19354                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
19355                 tp->t_acktime = ticks;
19356         }
19357         counter_u64_add(rack_total_bytes, len);
19358         tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
19359
19360         rack->forced_ack = 0;   /* If we send something zap the FA flag */
19361         tot_len += len;
19362         if ((tp->t_flags & TF_GPUTINPROG) == 0)
19363                 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
19364         tp->snd_max += len;
19365         tp->snd_nxt = tp->snd_max;
19366         if (rack->rc_new_rnd_needed) {
19367                 /*
19368                  * Update the round to start ticking; note
19369                  * that from a time perspective all of
19370                  * the preceding idle time is "in the round".
19371                  */
19372                 rack->rc_new_rnd_needed = 0;
19373                 rack->r_ctl.roundends = tp->snd_max;
19374         }
19375         {
19376                 int idx;
19377
19378                 idx = (len / segsiz) + 3;
19379                 if (idx >= TCP_MSS_ACCT_ATIMER)
19380                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
19381                 else
19382                         counter_u64_add(rack_out_size[idx], 1);
19383         }
19384         if (len <= rack->r_ctl.fsb.left_to_send)
19385                 rack->r_ctl.fsb.left_to_send -= len;
19386         else
19387                 rack->r_ctl.fsb.left_to_send = 0;
19388         if (rack->r_ctl.fsb.left_to_send < segsiz) {
19389                 rack->r_fast_output = 0;
19390                 rack->r_ctl.fsb.left_to_send = 0;
19391                 /* At the end of fast_output scale up the sb */
19392                 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
19393                 rack_sndbuf_autoscale(rack);
19394                 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
19395         }
19396         if (tp->t_rtttime == 0) {
19397                 tp->t_rtttime = ticks;
19398                 tp->t_rtseq = startseq;
19399                 KMOD_TCPSTAT_INC(tcps_segstimed);
19400         }
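              /*
               * When not using TSO, and there is still at least a full
               * segment of fast-send data left plus room under max_val,
               * loop back and build the next segment within this call
               * (cnt_thru counts the passes when TCP_ACCOUNTING is on).
               */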
19401         if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
19402             (max_val > len) &&
19403             (tso == 0)) {
19404                 max_val -= len;
19405                 len = segsiz;
19406                 th = rack->r_ctl.fsb.th;
19407 #ifdef TCP_ACCOUNTING
19408                 cnt_thru++;
19409 #endif
19410                 goto again;
19411         }
19412         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
19413         counter_u64_add(rack_fto_send, 1);
19414         slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
19415         rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
19416 #ifdef TCP_ACCOUNTING
19417         crtsc = get_cyclecount();
19418         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19419                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
19420         }
19421         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19422                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
19423         }
19424         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19425                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
19426         }
19427         sched_unpin();
19428 #endif
19429         return (0);
19430 failed:
19431         if (m)
19432                 m_free(m);
19433         rack->r_fast_output = 0;
19434         return (-1);
19435 }
19436
19437 static inline void
19438 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack,
19439                        struct sockbuf *sb,
19440                        int len, int orig_len, int segsiz, uint32_t pace_max_seg,
19441                        bool hw_tls,
19442                        uint16_t flags)
19443 {
19444         rack->r_fast_output = 1;
19445         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19446         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19447         rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
19448         rack->r_ctl.fsb.tcp_flags = flags;
19449         rack->r_ctl.fsb.left_to_send = orig_len - len;
19450         if (rack->r_ctl.fsb.left_to_send < pace_max_seg) {
19451                 /* Less than a full-sized pace, let's not */
19452                 rack->r_fast_output = 0;
19453                 return;
19454         } else {
19455                 /* Round down to the nearest pace_max_seg */
19456                 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg);
19457         }
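              /*
               * Rounding down keeps fast output emitting whole pacing
               * bursts; e.g. (illustrative numbers) with 100000 bytes left
               * and pace_max_seg = 43800, left_to_send becomes 87600 and
               * the remainder is handled by the regular output path later.
               */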
19458         if (hw_tls)
19459                 rack->r_ctl.fsb.hw_tls = 1;
19460         else
19461                 rack->r_ctl.fsb.hw_tls = 0;
19462         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19463                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19464                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19465                  (tp->snd_max - tp->snd_una)));
19466         if (rack->r_ctl.fsb.left_to_send < segsiz)
19467                 rack->r_fast_output = 0;
19468         else {
19469                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19470                         rack->r_ctl.fsb.rfo_apply_push = 1;
19471                 else
19472                         rack->r_ctl.fsb.rfo_apply_push = 0;
19473         }
19474 }
19475
19476 static uint32_t
19477 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz)
19478 {
19479         uint64_t min_time;
19480         uint32_t maxlen;
19481
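              /*
               * Compute how many bytes the estimated goodput bandwidth
               * (gp_bw) would move during the minimum hpts sleep time,
               * rounded up to whole segments.  For example (illustrative
               * numbers), at gp_bw = 12,500,000 bytes/sec and a 250 usec
               * minimum sleep, this is 3125 bytes before rounding.
               */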
19482         min_time = (uint64_t)get_hpts_min_sleep_time();
19483         maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC);
19484         maxlen = roundup(maxlen, segsiz);
19485         return (maxlen);
19486 }
19487
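      /*
       * Walk forward from the last collapse point looking for a segment that
       * was caught by a receive-window collapse, now fits back inside the
       * peer's window, is not already SACKed, and has aged past the RACK
       * reordering threshold.  Return the rsm to retransmit, or NULL if
       * nothing is eligible yet.
       */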
19488 static struct rack_sendmap *
19489 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
19490 {
19491         struct rack_sendmap *rsm = NULL;
19492         int thresh;
19493
19494 restart:
19495         rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
19496         if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
19497                 /* Nothing there, strange; turn off validity */
19498                 rack->r_collapse_point_valid = 0;
19499                 return (NULL);
19500         }
19501         /* Can we send it yet? */
19502         if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
19503                 /*
19504                  * Receiver window has not grown enough for
19505                  * the segment to be put on the wire.
19506                  */
19507                 return (NULL);
19508         }
19509         if (rsm->r_flags & RACK_ACKED) {
19510                 /*
19511                  * It has been sacked, let's move to the
19512                  * next one if possible.
19513                  */
19514                 rack->r_ctl.last_collapse_point = rsm->r_end;
19515                 /* Are we done? */
19516                 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19517                             rack->r_ctl.high_collapse_point)) {
19518                         rack->r_collapse_point_valid = 0;
19519                         return (NULL);
19520                 }
19521                 goto restart;
19522         }
19523         /* Now, has it been long enough? */
19524         thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
19525         if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
19526                 rack_log_collapse(rack, rsm->r_start,
19527                                   (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19528                                   thresh, __LINE__, 6, rsm->r_flags, rsm);
19529                 return (rsm);
19530         }
19531         /* Not enough time */
19532         rack_log_collapse(rack, rsm->r_start,
19533                           (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19534                           thresh, __LINE__, 7, rsm->r_flags, rsm);
19535         return (NULL);
19536 }
19537
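      /*
       * Clamp a retransmit length: normally down to a single segment, to the
       * pacing-minimum-derived length when shape_rxt_to_pacing_min is enabled
       * and goodput estimation is ready, or to pace_max_seg when full-size
       * retransmits are allowed.
       */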
19538 static inline void
19539 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
19540 {
19541         if ((rack->full_size_rxt == 0) &&
19542             (rack->shape_rxt_to_pacing_min == 0) &&
19543             (*len >= segsiz)) {
19544                 *len = segsiz;
19545         } else if (rack->shape_rxt_to_pacing_min &&
19546                  rack->gp_ready) {
19547                 /* We use pacing min as shaping len req */
19548                 uint32_t maxlen;
19549
19550                 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
19551                 if (*len > maxlen)
19552                         *len = maxlen;
19553         } else {
19554                 /*
19555                  * The else means full_size_rxt is on, so send it all.
19556                  * Note that we still need to check for exceeding
19557                  * our max segment size, because we do sometimes
19558                  * merge chunks together, i.e. we cannot just assume
19559                  * that we will never have a chunk greater than
19560                  * pace_max_seg.
19561                  */
19562                 if (*len > pace_max_seg)
19563                         *len = pace_max_seg;
19564         }
19565 }
19566
19567 static int
19568 rack_output(struct tcpcb *tp)
19569 {
19570         struct socket *so;
19571         uint32_t recwin;
19572         uint32_t sb_offset, s_moff = 0;
19573         int32_t len, error = 0;
19574         uint16_t flags;
19575         struct mbuf *m, *s_mb = NULL;
19576         struct mbuf *mb;
19577         uint32_t if_hw_tsomaxsegcount = 0;
19578         uint32_t if_hw_tsomaxsegsize;
19579         int32_t segsiz, minseg;
19580         long tot_len_this_send = 0;
19581 #ifdef INET
19582         struct ip *ip = NULL;
19583 #endif
19584         struct udphdr *udp = NULL;
19585         struct tcp_rack *rack;
19586         struct tcphdr *th;
19587         uint8_t pass = 0;
19588         uint8_t mark = 0;
19589         uint8_t check_done = 0;
19590         uint8_t wanted_cookie = 0;
19591         u_char opt[TCP_MAXOLEN];
19592         unsigned ipoptlen, optlen, hdrlen, ulen=0;
19593         uint32_t rack_seq;
19594
19595 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
19596         unsigned ipsec_optlen = 0;
19597
19598 #endif
19599         int32_t idle, sendalot;
19600         int32_t sub_from_prr = 0;
19601         volatile int32_t sack_rxmit;
19602         struct rack_sendmap *rsm = NULL;
19603         int32_t tso, mtu;
19604         struct tcpopt to;
19605         int32_t slot = 0;
19606         int32_t sup_rack = 0;
19607         uint32_t cts, ms_cts, delayed, early;
19608         uint16_t add_flag = RACK_SENT_SP;
19609         /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
19610         uint8_t hpts_calling,  doing_tlp = 0;
19611         uint32_t cwnd_to_use, pace_max_seg;
19612         int32_t do_a_prefetch = 0;
19613         int32_t prefetch_rsm = 0;
19614         int32_t orig_len = 0;
19615         struct timeval tv;
19616         int32_t prefetch_so_done = 0;
19617         struct tcp_log_buffer *lgb;
19618         struct inpcb *inp = tptoinpcb(tp);
19619         struct sockbuf *sb;
19620         uint64_t ts_val = 0;
19621 #ifdef TCP_ACCOUNTING
19622         uint64_t crtsc;
19623 #endif
19624 #ifdef INET6
19625         struct ip6_hdr *ip6 = NULL;
19626         int32_t isipv6;
19627 #endif
19628         bool hw_tls = false;
19629
19630         NET_EPOCH_ASSERT();
19631         INP_WLOCK_ASSERT(inp);
19632
19633         /* setup and take the cache hits here */
19634         rack = (struct tcp_rack *)tp->t_fb_ptr;
19635 #ifdef TCP_ACCOUNTING
19636         sched_pin();
19637         ts_val = get_cyclecount();
19638 #endif
19639         hpts_calling = inp->inp_hpts_calls;
19640 #ifdef TCP_OFFLOAD
19641         if (tp->t_flags & TF_TOE) {
19642 #ifdef TCP_ACCOUNTING
19643                 sched_unpin();
19644 #endif
19645                 return (tcp_offload_output(tp));
19646         }
19647 #endif
19648         if (rack->rack_deferred_inited == 0) {
19649                 /*
19650                  * If we are the connecting socket we will
19651                  * hit rack_init() before any sequence numbers
19652                  * are set up, so some initialization must be
19653                  * deferred. Do that now.
19654                  */
19655                 rack_deferred_init(tp, rack);
19656         }
19657         /*
19658          * For TFO connections in SYN_RECEIVED, only allow the initial
19659          * SYN|ACK and those sent by the retransmit timer.
19660          */
19661         if (IS_FASTOPEN(tp->t_flags) &&
19662             (tp->t_state == TCPS_SYN_RECEIVED) &&
19663             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
19664             (rack->r_ctl.rc_resend == NULL)) {         /* not a retransmit */
19665 #ifdef TCP_ACCOUNTING
19666                 sched_unpin();
19667 #endif
19668                 return (0);
19669         }
19670 #ifdef INET6
19671         if (rack->r_state) {
19672                 /* Use the cache line loaded if possible */
19673                 isipv6 = rack->r_is_v6;
19674         } else {
19675                 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
19676         }
19677 #endif
19678         early = 0;
19679         cts = tcp_get_usecs(&tv);
19680         ms_cts = tcp_tv_to_mssectick(&tv);
19681         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
19682             tcp_in_hpts(rack->rc_inp)) {
19683                 /*
19684                  * We are on the hpts for some timer but not hptsi output.
19685                  * Remove from the hpts unconditionally.
19686                  */
19687                 rack_timer_cancel(tp, rack, cts, __LINE__);
19688         }
19689         /* Are we pacing and late? */
19690         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19691             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
19692                 /* We are delayed */
19693                 delayed = cts - rack->r_ctl.rc_last_output_to;
19694         } else {
19695                 delayed = 0;
19696         }
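              /*
               * For example (illustrative numbers), if the pacer was due at
               * rc_last_output_to and we woke 200 usec after it, delayed is
               * 200 and is added to rc_agg_delayed in the late accounting
               * further below.
               */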
19697         /* Do the timers, which may override the pacer */
19698         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
19699                 int retval;
19700
19701                 retval = rack_process_timers(tp, rack, cts, hpts_calling,
19702                                              &doing_tlp);
19703                 if (retval != 0) {
19704                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
19705 #ifdef TCP_ACCOUNTING
19706                         sched_unpin();
19707 #endif
19708                         /*
19709                          * If timers want tcp_drop(), then pass error out,
19710                          * otherwise suppress it.
19711                          */
19712                         return (retval < 0 ? retval : 0);
19713                 }
19714         }
19715         if (rack->rc_in_persist) {
19716                 if (tcp_in_hpts(rack->rc_inp) == 0) {
19717                         /* Timer is not running */
19718                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19719                 }
19720 #ifdef TCP_ACCOUNTING
19721                 sched_unpin();
19722 #endif
19723                 return (0);
19724         }
19725         if ((rack->rc_ack_required == 1) &&
19726             (rack->r_timer_override == 0)){
19727                 /* A timeout occurred and no ack has arrived */
19728                 if (tcp_in_hpts(rack->rc_inp) == 0) {
19729                         /* Timer is not running */
19730                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19731                 }
19732 #ifdef TCP_ACCOUNTING
19733                 sched_unpin();
19734 #endif
19735                 return (0);
19736         }
19737         if ((rack->r_timer_override) ||
19738             (rack->rc_ack_can_sendout_data) ||
19739             (delayed) ||
19740             (tp->t_state < TCPS_ESTABLISHED)) {
19741                 rack->rc_ack_can_sendout_data = 0;
19742                 if (tcp_in_hpts(rack->rc_inp))
19743                         tcp_hpts_remove(rack->rc_inp);
19744         } else if (tcp_in_hpts(rack->rc_inp)) {
19745                 /*
19746                  * While on the hpts you can't pass here even if ACKNOW is
19747                  * on; we will send when the hpts fires.
19748                  */
19749 #ifdef TCP_ACCOUNTING
19750                 crtsc = get_cyclecount();
19751                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19752                         tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
19753                 }
19754                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19755                         tp->tcp_cnt_counters[SND_BLOCKED]++;
19756                 }
19757                 sched_unpin();
19758 #endif
19759                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
19760                 return (0);
19761         }
19762         rack->rc_inp->inp_hpts_calls = 0;
19763         /* Finish out both pacing early and late accounting */
19764         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19765             TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
19766                 early = rack->r_ctl.rc_last_output_to - cts;
19767         } else
19768                 early = 0;
19769         if (delayed) {
19770                 rack->r_ctl.rc_agg_delayed += delayed;
19771                 rack->r_late = 1;
19772         } else if (early) {
19773                 rack->r_ctl.rc_agg_early += early;
19774                 rack->r_early = 1;
19775         }
19776         /* Now that early/late accounting is done turn off the flag */
19777         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
19778         rack->r_wanted_output = 0;
19779         rack->r_timer_override = 0;
19780         if ((tp->t_state != rack->r_state) &&
19781             TCPS_HAVEESTABLISHED(tp->t_state)) {
19782                 rack_set_state(tp, rack);
19783         }
19784         if ((rack->r_fast_output) &&
19785             (doing_tlp == 0) &&
19786             (tp->rcv_numsacks == 0)) {
19787                 int ret;
19788
19789                 error = 0;
19790                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
19791                 if (ret >= 0)
19792                         return(ret);
19793                 else if (error) {
19794                         inp = rack->rc_inp;
19795                         so = inp->inp_socket;
19796                         sb = &so->so_snd;
19797                         goto nomore;
19798                 }
19799         }
19800         inp = rack->rc_inp;
19801         /*
19802          * For TFO connections in SYN_SENT or SYN_RECEIVED,
19803          * only allow the initial SYN or SYN|ACK and those sent
19804          * by the retransmit timer.
19805          */
19806         if (IS_FASTOPEN(tp->t_flags) &&
19807             ((tp->t_state == TCPS_SYN_RECEIVED) ||
19808              (tp->t_state == TCPS_SYN_SENT)) &&
19809             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
19810             (tp->t_rxtshift == 0)) {              /* not a retransmit */
19811                 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
19812                 so = inp->inp_socket;
19813                 sb = &so->so_snd;
19814                 goto just_return_nolock;
19815         }
19816         /*
19817          * Determine length of data that should be transmitted, and flags
19818          * that will be used. If there is some data or critical controls
19819          * (SYN, RST) to send, then transmit; otherwise, investigate
19820          * further.
19821          */
19822         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
19823         if (tp->t_idle_reduce) {
19824                 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
19825                         rack_cc_after_idle(rack, tp);
19826         }
19827         tp->t_flags &= ~TF_LASTIDLE;
19828         if (idle) {
19829                 if (tp->t_flags & TF_MORETOCOME) {
19830                         tp->t_flags |= TF_LASTIDLE;
19831                         idle = 0;
19832                 }
19833         }
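              /*
               * If everything is acked and we have an idle timestamp, reuse
               * 'idle' as the idle duration in usec; an idle spell longer
               * than rack_min_probertt_hold is credited as a probe-rtt
               * (entering or exiting probe-rtt bookkeeping as appropriate),
               * after which 'idle' is cleared again.
               */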
19834         if ((tp->snd_una == tp->snd_max) &&
19835             rack->r_ctl.rc_went_idle_time &&
19836             TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
19837                 idle = cts - rack->r_ctl.rc_went_idle_time;
19838                 if (idle > rack_min_probertt_hold) {
19839                         /* Count as a probe rtt */
19840                         if (rack->in_probe_rtt == 0) {
19841                                 rack->r_ctl.rc_lower_rtt_us_cts = cts;
19842                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
19843                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
19844                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
19845                         } else {
19846                                 rack_exit_probertt(rack, cts);
19847                         }
19848                 }
19849                 idle = 0;
19850         }
19851         if (rack_use_fsb &&
19852             (rack->r_ctl.fsb.tcp_ip_hdr) &&
19853             (rack->r_fsb_inited == 0) &&
19854             (rack->r_state != TCPS_CLOSED))
19855                 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
19856 again:
19857         /*
19858          * If we've recently taken a timeout, snd_max will be greater than
19859          * snd_nxt.  There may be SACK information that allows us to avoid
19860          * resending already delivered data.  Adjust snd_nxt accordingly.
19861          */
19862         sendalot = 0;
19863         cts = tcp_get_usecs(&tv);
19864         ms_cts = tcp_tv_to_mssectick(&tv);
19865         tso = 0;
19866         mtu = 0;
19867         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
19868         minseg = segsiz;
19869         if (rack->r_ctl.rc_pace_max_segs == 0)
19870                 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
19871         else
19872                 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
19873         sb_offset = tp->snd_max - tp->snd_una;
19874         cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
19875         flags = tcp_outflags[tp->t_state];
19876         while (rack->rc_free_cnt < rack_free_cache) {
19877                 rsm = rack_alloc(rack);
19878                 if (rsm == NULL) {
19879                         if (inp->inp_hpts_calls)
19880                                 /* Retry in a ms */
19881                                 slot = (1 * HPTS_USEC_IN_MSEC);
19882                         so = inp->inp_socket;
19883                         sb = &so->so_snd;
19884                         goto just_return_nolock;
19885                 }
19886                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
19887                 rack->rc_free_cnt++;
19888                 rsm = NULL;
19889         }
19890         if (inp->inp_hpts_calls)
19891                 inp->inp_hpts_calls = 0;
19892         sack_rxmit = 0;
19893         len = 0;
19894         rsm = NULL;
19895         if (flags & TH_RST) {
19896                 SOCKBUF_LOCK(&inp->inp_socket->so_snd);
19897                 so = inp->inp_socket;
19898                 sb = &so->so_snd;
19899                 goto send;
19900         }
19901         if (rack->r_ctl.rc_resend) {
19902                 /* Retransmit timer */
19903                 rsm = rack->r_ctl.rc_resend;
19904                 rack->r_ctl.rc_resend = NULL;
19905                 len = rsm->r_end - rsm->r_start;
19906                 sack_rxmit = 1;
19907                 sendalot = 0;
19908                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19909                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19910                          __func__, __LINE__,
19911                          rsm->r_start, tp->snd_una, tp, rack, rsm));
19912                 sb_offset = rsm->r_start - tp->snd_una;
19913                 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19914         } else if (rack->r_collapse_point_valid &&
19915                    ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
19916                 /*
19917                  * If an RSM is returned then enough time has passed
19918                  * for us to retransmit it. Move up the collapse point,
19919                  * since this rsm has its chance to retransmit now.
19920                  */
19921                 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT);
19922                 rack->r_ctl.last_collapse_point = rsm->r_end;
19923                 /* Are we done? */
19924                 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19925                             rack->r_ctl.high_collapse_point))
19926                         rack->r_collapse_point_valid = 0;
19927                 sack_rxmit = 1;
19928                 /* We are not doing a TLP */
19929                 doing_tlp = 0;
19930                 len = rsm->r_end - rsm->r_start;
19931                 sb_offset = rsm->r_start - tp->snd_una;
19932                 sendalot = 0;
19933                 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19934         } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
19935                 /* We have a retransmit that takes precedence */
19936                 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
19937                     ((rsm->r_flags & RACK_MUST_RXT) == 0) &&
19938                     ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
19939                         /* Enter recovery if not induced by a time-out */
19940                         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
19941                 }
19942 #ifdef INVARIANTS
19943                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
19944                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
19945                               tp, rack, rsm, rsm->r_start, tp->snd_una);
19946                 }
19947 #endif
19948                 len = rsm->r_end - rsm->r_start;
19949                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19950                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19951                          __func__, __LINE__,
19952                          rsm->r_start, tp->snd_una, tp, rack, rsm));
19953                 sb_offset = rsm->r_start - tp->snd_una;
19954                 sendalot = 0;
19955                 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19956                 if (len > 0) {
19957                         sack_rxmit = 1;
19958                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
19959                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
19960                                          min(len, segsiz));
19961                 }
19962         } else if (rack->r_ctl.rc_tlpsend) {
19963                 /* Tail loss probe */
19964                 long cwin;
19965                 long tlen;
19966
19967                 /*
19968                  * Check if we can do a TLP with a RACK'd packet.
19969                  * This can happen if we are not doing the rack
19970                  * cheat and we skipped to a TLP and it
19971                  * went off.
19972                  */
19973                 rsm = rack->r_ctl.rc_tlpsend;
19974                 /* We are doing a TLP; make sure the flag is present */
19975                 rsm->r_flags |= RACK_TLP;
19976                 rack->r_ctl.rc_tlpsend = NULL;
19977                 sack_rxmit = 1;
19978                 tlen = rsm->r_end - rsm->r_start;
19979                 if (tlen > segsiz)
19980                         tlen = segsiz;
19981                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19982                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19983                          __func__, __LINE__,
19984                          rsm->r_start, tp->snd_una, tp, rack, rsm));
19985                 sb_offset = rsm->r_start - tp->snd_una;
19986                 cwin = min(tp->snd_wnd, tlen);
19987                 len = cwin;
19988         }
19989         if (rack->r_must_retran &&
19990             (doing_tlp == 0) &&
19991             (SEQ_GT(tp->snd_max, tp->snd_una)) &&
19992             (rsm == NULL)) {
19993                 /*
19994                  * There are two different ways that we
19995                  * can get into this block:
19996                  * a) This is a non-sack connection, we had a time-out
19997                  *    and thus r_must_retran was set and everything
19998                  *    left outstanding has been marked for retransmit.
19999                  * b) The MTU of the path shrank, so that everything
20000                  *    was marked to be retransmitted with the smaller
20001                  *    mtu and r_must_retran was set.
20002                  *
20003                  * This means that we expect the sendmap (outstanding)
20004                  * to all be marked must. We can use the tmap to
20005                  * look at them.
20006                  *
20007                  */
20008                 int sendwin, flight;
20009
20010                 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
20011                 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
20012                 if (flight >= sendwin) {
20013                         /*
20014                          * We can't send yet.
20015                          */
20016                         so = inp->inp_socket;
20017                         sb = &so->so_snd;
20018                         goto just_return_nolock;
20019                 }
20020                 /*
20021                  * This is the case a/b mentioned above. All
20022                  * outstanding/not-acked should be marked.
20023                  * We can use the tmap to find them.
20024                  */
20025                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
20026                 if (rsm == NULL) {
20027                         /* TSNH */
20028                         rack->r_must_retran = 0;
20029                         rack->r_ctl.rc_out_at_rto = 0;
20030                         so = inp->inp_socket;
20031                         sb = &so->so_snd;
20032                         goto just_return_nolock;
20033                 }
20034                 if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
20035                         /*
20036                          * The first one does not have the flag, did we collapse
20037                          * further up in our list?
20038                          */
20039                         rack->r_must_retran = 0;
20040                         rack->r_ctl.rc_out_at_rto = 0;
20041                         rsm = NULL;
20042                         sack_rxmit = 0;
20043                 } else {
20044                         sack_rxmit = 1;
20045                         len = rsm->r_end - rsm->r_start;
20046                         sb_offset = rsm->r_start - tp->snd_una;
20047                         sendalot = 0;
20048                         if ((rack->full_size_rxt == 0) &&
20049                             (rack->shape_rxt_to_pacing_min == 0) &&
20050                             (len >= segsiz))
20051                                 len = segsiz;
20052                         else if (rack->shape_rxt_to_pacing_min &&
20053                                  rack->gp_ready) {
20054                                 /* We use pacing min as shaping len req */
20055                                 uint32_t maxlen;
20056
20057                                 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20058                                 if (len > maxlen)
20059                                         len = maxlen;
20060                         }
20061                         /*
20062                          * Delay removing the flag RACK_MUST_RXT so
20063                          * that the fastpath for retransmit will
20064                          * work with this rsm.
20065                          */
20066                 }
20067         }
20068         /*
20069          * Enforce a connection sendmap count limit if set
20070          * as long as we are not retransmitting.
20071          */
20072         if ((rsm == NULL) &&
20073             (rack->do_detection == 0) &&
20074             (V_tcp_map_entries_limit > 0) &&
20075             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
20076                 counter_u64_add(rack_to_alloc_limited, 1);
20077                 if (!rack->alloc_limit_reported) {
20078                         rack->alloc_limit_reported = 1;
20079                         counter_u64_add(rack_alloc_limited_conns, 1);
20080                 }
20081                 so = inp->inp_socket;
20082                 sb = &so->so_snd;
20083                 goto just_return_nolock;
20084         }
20085         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
20086                 /* we are retransmitting the fin */
20087                 len--;
20088                 if (len) {
20089                         /*
20090                          * When retransmitting data do *not* include the
20091                          * FIN. This could happen from a TLP probe.
20092                          */
20093                         flags &= ~TH_FIN;
20094                 }
20095         }
20096         if (rsm && rack->r_fsb_inited &&
20097             rack_use_rsm_rfo &&
20098             ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
20099                 int ret;
20100
20101                 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
20102                 if (ret == 0)
20103                         return (0);
20104         }
20105         so = inp->inp_socket;
20106         sb = &so->so_snd;
20107         if (do_a_prefetch == 0) {
20108                 kern_prefetch(sb, &do_a_prefetch);
20109                 do_a_prefetch = 1;
20110         }
20111 #ifdef NETFLIX_SHARED_CWND
20112         if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
20113             rack->rack_enable_scwnd) {
20114                 /* We are doing cwnd sharing */
20115                 if (rack->gp_ready &&
20116                     (rack->rack_attempted_scwnd == 0) &&
20117                     (rack->r_ctl.rc_scw == NULL) &&
20118                     tp->t_lib) {
20119                         /* The pcbid is in, let's make an attempt */
20120                         counter_u64_add(rack_try_scwnd, 1);
20121                         rack->rack_attempted_scwnd = 1;
20122                         rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
20123                                                                    &rack->r_ctl.rc_scw_index,
20124                                                                    segsiz);
20125                 }
20126                 if (rack->r_ctl.rc_scw &&
20127                     (rack->rack_scwnd_is_idle == 1) &&
20128                     sbavail(&so->so_snd)) {
20129                         /* we are no longer out of data */
20130                         tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20131                         rack->rack_scwnd_is_idle = 0;
20132                 }
20133                 if (rack->r_ctl.rc_scw) {
20134                         /* First let's update and get the cwnd */
20135                         rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
20136                                                                                        rack->r_ctl.rc_scw_index,
20137                                                                                        tp->snd_cwnd, tp->snd_wnd, segsiz);
20138                 }
20139         }
20140 #endif
20141         /*
20142          * Get standard flags, and add SYN or FIN if requested by 'hidden'
20143          * state flags.
20144          */
20145         if (tp->t_flags & TF_NEEDFIN)
20146                 flags |= TH_FIN;
20147         if (tp->t_flags & TF_NEEDSYN)
20148                 flags |= TH_SYN;
20149         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
20150                 void *end_rsm;
20151                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
20152                 if (end_rsm)
20153                         kern_prefetch(end_rsm, &prefetch_rsm);
20154                 prefetch_rsm = 1;
20155         }
20156         SOCKBUF_LOCK(sb);
20157         /*
20158          * If snd_nxt == snd_max and we have transmitted a FIN, the
20159          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
20160          * negative length.  This can also occur when TCP opens up its
20161          * congestion window while receiving additional duplicate acks after
20162          * fast-retransmit because TCP will reset snd_nxt to snd_max after
20163          * the fast-retransmit.
20164          *
20165          * In the normal retransmit-FIN-only case, however, snd_nxt will be
20166          * set to snd_una, the sb_offset will be 0, and the length may wind
20167          * up 0.
20168          *
20169          * If sack_rxmit is true we are retransmitting from the scoreboard
20170          * in which case len is already set.
20171          */
20172         if ((sack_rxmit == 0) &&
20173             (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
20174                 uint32_t avail;
20175
20176                 avail = sbavail(sb);
20177                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
20178                         sb_offset = tp->snd_nxt - tp->snd_una;
20179                 else
20180                         sb_offset = 0;
20181                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
20182                         if (rack->r_ctl.rc_tlp_new_data) {
20183                                 /* TLP is forcing out new data */
20184                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
20185                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
20186                                 }
20187                                 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
20188                                         if (tp->snd_wnd > sb_offset)
20189                                                 len = tp->snd_wnd - sb_offset;
20190                                         else
20191                                                 len = 0;
20192                                 } else {
20193                                         len = rack->r_ctl.rc_tlp_new_data;
20194                                 }
20195                                 rack->r_ctl.rc_tlp_new_data = 0;
20196                         }  else {
20197                                 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
20198                         }
20199                         if ((rack->r_ctl.crte == NULL) &&
20200                             IN_FASTRECOVERY(tp->t_flags) &&
20201                             (rack->full_size_rxt == 0) &&
20202                             (rack->shape_rxt_to_pacing_min == 0) &&
20203                             (len > segsiz)) {
20204                                 /*
20205                                  * For prr=off, we need to send only 1 MSS
20206                                  * at a time. We do this because another sack could
20207                                  * be arriving that causes us to send retransmits and
20208                                  * we don't want to be on a long pace due to a larger send
20209                                  * that keeps us from sending out the retransmit.
20210                                  */
20211                                 len = segsiz;
20212                         } else if (rack->shape_rxt_to_pacing_min &&
20213                                    rack->gp_ready) {
20214                                 /* We use the pacing min as the shaping length requirement */
20215                                 uint32_t maxlen;
20216
20217                                 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20218                                 if (len > maxlen)
20219                                         len = maxlen;
20220                         } /* else: full_size_rxt is on, so send it all */
20221                 } else {
20222                         uint32_t outstanding;
20223                         /*
20224                          * We are inside a fast recovery episode caused
20225                          * by a SACK or 3 duplicate ACKs. At this point
20226                          * we have sent all the retransmissions and we rely
20227                          * on PRR to dictate what we will send in the form of
20228                          * new data.
20229                          */
20230
20231                         outstanding = tp->snd_max - tp->snd_una;
20232                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
20233                                 if (tp->snd_wnd > outstanding) {
20234                                         len = tp->snd_wnd - outstanding;
20235                                         /* Check to see if we have the data */
20236                                         if ((sb_offset + len) > avail) {
20237                                                 /* It does not all fit */
20238                                                 if (avail > sb_offset)
20239                                                         len = avail - sb_offset;
20240                                                 else
20241                                                         len = 0;
20242                                         }
20243                                 } else {
20244                                         len = 0;
20245                                 }
20246                         } else if (avail > sb_offset) {
20247                                 len = avail - sb_offset;
20248                         } else {
20249                                 len = 0;
20250                         }
20251                         if (len > 0) {
20252                                 if (len > rack->r_ctl.rc_prr_sndcnt) {
20253                                         len = rack->r_ctl.rc_prr_sndcnt;
20254                                 }
20255                                 if (len > 0) {
20256                                         sub_from_prr = 1;
20257                                 }
20258                         }
20259                         if (len > segsiz) {
20260                                 /*
20261                                  * We should never send more than one MSS when
20262                                  * retransmitting or sending new data in PRR
20263                                  * mode unless the override flag is on. Most
20264                                  * likely the PRR algorithm will not let us
20265                                  * send much anyway :-)
20266                                  */
20267                                 if (rack->r_ctl.rc_prr_sendalot == 0) {
20268                                         len = segsiz;
20269                                 }
20270                         } else if (len < segsiz) {
20271                                 /*
20272                                  * Do we send any? The idea here is that if the
20273                                  * send empties the socket buffer we want to
20274                                  * do it. However, if not, then let's just wait
20275                                  * for our prr_sndcnt to get bigger.
20276                                  */
20277                                 long leftinsb;
20278
20279                                 leftinsb = sbavail(sb) - sb_offset;
20280                                 if (leftinsb > len) {
20281                                         /* This send does not empty the sb */
20282                                         len = 0;
20283                                 }
20284                         }
20285                 }
20286         } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
20287                 /*
20288                  * If the connection has not been established
20289                  * and we are not doing FAST OPEN,
20290                  * send no data.
20291                  */
20292                 if ((sack_rxmit == 0) &&
20293                     (!IS_FASTOPEN(tp->t_flags))){
20294                         len = 0;
20295                         sb_offset = 0;
20296                 }
20297         }
20298         if (prefetch_so_done == 0) {
20299                 kern_prefetch(so, &prefetch_so_done);
20300                 prefetch_so_done = 1;
20301         }
20302         /*
20303          * Lop off SYN bit if it has already been sent.  However, if this is
20304          * SYN-SENT state and if segment contains data and if we don't know
20305          * that foreign host supports TAO, suppress sending segment.
20306          */
20307         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
20308             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
20309                 /*
20310                  * When sending additional segments following a TFO SYN|ACK,
20311                  * do not include the SYN bit.
20312                  */
20313                 if (IS_FASTOPEN(tp->t_flags) &&
20314                     (tp->t_state == TCPS_SYN_RECEIVED))
20315                         flags &= ~TH_SYN;
20316         }
20317         /*
20318          * Be careful not to send data and/or FIN on SYN segments. This
20319          * measure is needed to prevent interoperability problems with not
20320          * fully conformant TCP implementations.
20321          */
20322         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
20323                 len = 0;
20324                 flags &= ~TH_FIN;
20325         }
20326         /*
20327          * On TFO sockets, ensure no data is sent in the following cases:
20328          *
20329          *  - When retransmitting SYN|ACK on a passively-created socket
20330          *
20331          *  - When retransmitting SYN on an actively created socket
20332          *
20333          *  - When sending a zero-length cookie (cookie request) on an
20334          *    actively created socket
20335          *
20336          *  - When the socket is in the CLOSED state (RST is being sent)
20337          */
20338         if (IS_FASTOPEN(tp->t_flags) &&
20339             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
20340              ((tp->t_state == TCPS_SYN_SENT) &&
20341               (tp->t_tfo_client_cookie_len == 0)) ||
20342              (flags & TH_RST))) {
20343                 sack_rxmit = 0;
20344                 len = 0;
20345         }
20346         /* Without fast-open there should never be data sent on a SYN */
20347         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
20348                 tp->snd_nxt = tp->iss;
20349                 len = 0;
20350         }
20351         if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
20352                 /* We only send 1 MSS if we have a DSACK block */
20353                 add_flag |= RACK_SENT_W_DSACK;
20354                 len = segsiz;
20355         }
20356         orig_len = len;
20357         if (len <= 0) {
20358                 /*
20359                  * If FIN has been sent but not acked, but we haven't been
20360                  * called to retransmit, len will be < 0.  Otherwise, window
20361                  * shrank after we sent into it.  If window shrank to 0,
20362                  * cancel pending retransmit, pull snd_nxt back to (closed)
20363                  * window, and set the persist timer if it isn't already
20364                  * going.  If the window didn't close completely, just wait
20365                  * for an ACK.
20366                  *
20367                  * We also do a general check here to ensure that we will
20368                  * set the persist timer when we have data to send, but a
20369                  * 0-byte window. This makes sure the persist timer is set
20370                  * even if the packet hits one of the "goto send" lines
20371                  * below.
20372                  */
20373                 len = 0;
20374                 if ((tp->snd_wnd == 0) &&
20375                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20376                     (tp->snd_una == tp->snd_max) &&
20377                     (sb_offset < (int)sbavail(sb))) {
20378                         rack_enter_persist(tp, rack, cts, tp->snd_una);
20379                 }
20380         } else if ((rsm == NULL) &&
20381                    (doing_tlp == 0) &&
20382                    (len < pace_max_seg)) {
20383                 /*
20384                  * We are not sending a maximum sized segment for
20385                  * some reason. Should we not send anything (think
20386                  * sws or persists)?
20387                  */
20388                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20389                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20390                     (len < minseg) &&
20391                     (len < (int)(sbavail(sb) - sb_offset))) {
20392                         /*
20393                          * Here the rwnd is less than
20394                          * the minimum pacing size, this is not a retransmit,
20395                          * we are established and
20396                          * the send is not the last data in the socket buffer;
20397                          * we send nothing, and we may enter persists
20398                          * if nothing is outstanding.
20399                          */
20400                         len = 0;
20401                         if (tp->snd_max == tp->snd_una) {
20402                                 /*
20403                                  * Nothing is outstanding, so we can
20404                                  * go into persists.
20405                                  */
20406                                 rack_enter_persist(tp, rack, cts, tp->snd_una);
20407                         }
20408                 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
20409                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20410                            (len < (int)(sbavail(sb) - sb_offset)) &&
20411                            (len < minseg)) {
20412                         /*
20413                          * Here we are not retransmitting, and
20414                          * the cwnd is not so small that we could
20415                          * not send at least a min size (rxt timer
20416                          * not having gone off), we have 2 segments or
20417                          * more already in flight, it's not the tail end
20418                          * of the socket buffer and the cwnd is blocking
20419                          * us from sending out a minimum pacing segment size.
20420                          * Let's not send anything.
20421                          */
20422                         len = 0;
20423                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
20424                             min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20425                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20426                            (len < (int)(sbavail(sb) - sb_offset)) &&
20427                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
20428                         /*
20429                          * Here we have a send window but we have
20430                          * filled it up and we can't send another pacing segment.
20431                          * We also have more than 2 segments in flight
20432                          * and we are not completing the sb, i.e. we allow
20433                          * the last bytes of the sb to go out even if
20434                          * it's not a full pacing segment.
20435                          */
20436                         len = 0;
20437                 } else if ((rack->r_ctl.crte != NULL) &&
20438                            (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
20439                            (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
20440                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
20441                            (len < (int)(sbavail(sb) - sb_offset))) {
20442                         /*
20443                          * Here we are doing hardware pacing, this is not a TLP,
20444                          * we are not sending a pace max segment size, there is rwnd
20445                          * room to send at least N pace_max_seg, the cwnd is greater
20446                          * than or equal to a full pacing segment plus 4 MSS, and we have 2 or
20447                          * more segments in flight and it's not the tail of the socket buffer.
20448                          *
20449                          * We don't want to send; instead we need to get more ACKs in to
20450                          * allow us to send a full pacing segment. Normally, if we are pacing
20451                          * at about the right speed, we should have finished our pacing
20452                          * send since most of the acks will have come back if we are at the
20453                          * right rate. This is a bit fuzzy since return path delay
20454                          * can delay the acks, which is why we want to make sure we
20455                          * have cwnd space for a bit more than a max pace segment in flight.
20456                          *
20457                          * If we have not gotten our acks back, we are pacing at too high a
20458                          * rate; delaying will not hurt and will bring our GP estimate down by
20459                          * injecting the delay. If we don't do this we will send
20460                          * 2 MSS out in response to the acks being clocked in, which
20461                          * defeats the point of hw-pacing (i.e. to help us get
20462                          * larger TSOs out).
20463                          */
20464                         len = 0;
20465                 }
20466
20467         }
20468         /* len will be >= 0 after this point. */
20469         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
20470         rack_sndbuf_autoscale(rack);
20471         /*
20472          * Decide if we can use TCP Segmentation Offloading (if supported by
20473          * hardware).
20474          *
20475          * TSO may only be used if we are in a pure bulk sending state.  The
20476          * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
20477          * options prevents using TSO.  With TSO the TCP header is the same
20478          * (except for the sequence number) for all generated packets.  This
20479          * makes it impossible to transmit any options which vary per
20480          * generated segment or packet.
20481          *
20482          * IPv4 handling has a clear separation of ip options and ip header
20483          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
20484          * the right thing below to provide length of just ip options and thus
20485          * checking for ipoptlen is enough to decide if ip options are present.
20486          */
20487         ipoptlen = 0;
20488 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20489         /*
20490          * Pre-calculate here as we save another lookup into the darknesses
20491          * of IPsec that way and can actually decide if TSO is ok.
20492          */
20493 #ifdef INET6
20494         if (isipv6 && IPSEC_ENABLED(ipv6))
20495                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
20496 #ifdef INET
20497         else
20498 #endif
20499 #endif                          /* INET6 */
20500 #ifdef INET
20501                 if (IPSEC_ENABLED(ipv4))
20502                         ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
20503 #endif                          /* INET */
20504 #endif
20505
20506 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20507         ipoptlen += ipsec_optlen;
20508 #endif
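        /*
         * TSO is usable only for a plain bulk send: TSO enabled on the
         * connection, more than one segment of payload, no UDP tunneling
         * port, no MD5 signature, no SACK blocks to advertise, not a SACK
         * retransmission and no IP options.
         */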
20509         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
20510             (tp->t_port == 0) &&
20511             ((tp->t_flags & TF_SIGNATURE) == 0) &&
20512             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
20513             ipoptlen == 0)
20514                 tso = 1;
20515         {
20516                 uint32_t outstanding __unused;
20517
20518                 outstanding = tp->snd_max - tp->snd_una;
20519                 if (tp->t_flags & TF_SENTFIN) {
20520                         /*
20521                          * If we sent a fin, snd_max is 1 higher than
20522                          * snd_una
20523                          */
20524                         outstanding--;
20525                 }
20526                 if (sack_rxmit) {
20527                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
20528                                 flags &= ~TH_FIN;
20529                 } else {
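                        /*
                         * Keep the FIN only if this send reaches the end of
                         * the data queued in the socket buffer.
                         */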
20530                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
20531                                    sbused(sb)))
20532                                 flags &= ~TH_FIN;
20533                 }
20534         }
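        /*
         * Advertise a receive window equal to the free space in the receive
         * socket buffer, clamped to [0, TCP_MAXWIN << rcv_scale].
         */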
20535         recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
20536                       (long)TCP_MAXWIN << tp->rcv_scale);
20537
20538         /*
20539          * Sender silly window avoidance.   We transmit under the following
20540          * conditions when len is non-zero:
20541          *
20542          * - We have a full segment (or more with TSO) - This is the last
20543          * buffer in a write()/send() and we are either idle or running
20544          * NODELAY - we've timed out (e.g. persist timer) - we have more
20545          * than 1/2 the maximum send window's worth of data (receiver may be
20546          * limiting the window size) - we need to retransmit
20547          */
20548         if (len) {
20549                 if (len >= segsiz) {
20550                         goto send;
20551                 }
20552                 /*
20553                  * NOTE! on localhost connections an 'ack' from the remote
20554                  * end may occur synchronously with the output and cause us
20555                  * to flush a buffer queued with moretocome.  XXX
20556                  *
20557                  */
20558                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
20559                     (idle || (tp->t_flags & TF_NODELAY)) &&
20560                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20561                     (tp->t_flags & TF_NOPUSH) == 0) {
20562                         pass = 2;
20563                         goto send;
20564                 }
20565                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
20566                         pass = 22;
20567                         goto send;
20568                 }
20569                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
20570                         pass = 4;
20571                         goto send;
20572                 }
20573                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
20574                         pass = 5;
20575                         goto send;
20576                 }
20577                 if (sack_rxmit) {
20578                         pass = 6;
20579                         goto send;
20580                 }
20581                 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
20582                     (ctf_outstanding(tp) < (segsiz * 2))) {
20583                         /*
20584                          * and our rwnd will not let us send a full-sized
20585                          * MSS. Let's go ahead and let this small segment
20586                          * out because we want to try to have at least two
20587                          * packets in flight to not be caught by delayed ack.
20588                          * packets inflight to not be caught by delayed ack.
20589                          */
20590                         pass = 12;
20591                         goto send;
20592                 }
20593         }
20594         /*
20595          * Sending of standalone window updates.
20596          *
20597          * Window updates are important when we close our window due to a
20598          * full socket buffer and are opening it again after the application
20599          * reads data from it.  Once the window has opened again and the
20600          * remote end starts to send again the ACK clock takes over and
20601          * provides the most current window information.
20602          *
20603          * We must avoid the silly window syndrome whereby every read from
20604          * the receive buffer, no matter how small, causes a window update
20605          * to be sent.  We also should avoid sending a flurry of window
20606          * updates when the socket buffer had queued a lot of data and the
20607          * application is doing small reads.
20608          *
20609          * Prevent a flurry of pointless window updates by only sending an
20610          * update when we can increase the advertized window by more than
20611          * 1/4th of the socket buffer capacity.  When the buffer is getting
20612          * full or is very small be more aggressive and send an update
20613          * whenever we can increase by two mss sized segments. In all other
20614          * situations the ACK's to new incoming data will carry further
20615          * window increases.
20616          *
20617          * Don't send an independent window update if a delayed ACK is
20618          * pending (it will get piggy-backed on it) or the remote side
20619          * already has done a half-close and won't send more data.  Skip
20620          * this if the connection is in T/TCP half-open state.
20621          */
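        /*
         * For example (illustrative numbers, not from the code): with a
         * 64 kB receive buffer and a 1448-byte segment size, the checks
         * below send a standalone update once the window can grow by at
         * least 16 kB (1/4 of the buffer), or by two segments while the
         * advertisable window has shrunk to 8 kB or less (buffer nearly
         * full).
         */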
20622         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
20623             !(tp->t_flags & TF_DELACK) &&
20624             !TCPS_HAVERCVDFIN(tp->t_state)) {
20625                 /*
20626                  * "adv" is the amount we could increase the window, taking
20627                  * into account that we are limited by TCP_MAXWIN <<
20628                  * tp->rcv_scale.
20629                  */
20630                 int32_t adv;
20631                 int oldwin;
20632
20633                 adv = recwin;
20634                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
20635                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
20636                         if (adv > oldwin)
20637                                 adv -= oldwin;
20638                         else {
20639                                 /* We can't increase the window */
20640                                 adv = 0;
20641                         }
20642                 } else
20643                         oldwin = 0;
20644
20645                 /*
20646                  * If the new window size ends up being the same as or less
20647                  * than the old size when it is scaled, then don't force
20648                  * a window update.
20649                  */
20650                 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
20651                         goto dontupdate;
20652
20653                 if (adv >= (int32_t)(2 * segsiz) &&
20654                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
20655                      recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
20656                      so->so_rcv.sb_hiwat <= 8 * segsiz)) {
20657                         pass = 7;
20658                         goto send;
20659                 }
20660                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
20661                         pass = 23;
20662                         goto send;
20663                 }
20664         }
20665 dontupdate:
20666
20667         /*
20668          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
20669          * is also a catch-all for the retransmit timer timeout case.
20670          */
20671         if (tp->t_flags & TF_ACKNOW) {
20672                 pass = 8;
20673                 goto send;
20674         }
20675         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
20676                 pass = 9;
20677                 goto send;
20678         }
20679         /*
20680          * If our state indicates that FIN should be sent and we have not
20681          * yet done so, then we need to send.
20682          */
20683         if ((flags & TH_FIN) &&
20684             (tp->snd_nxt == tp->snd_una)) {
20685                 pass = 11;
20686                 goto send;
20687         }
20688         /*
20689          * No reason to send a segment, just return.
20690          */
20691 just_return:
20692         SOCKBUF_UNLOCK(sb);
20693 just_return_nolock:
20694         {
20695                 int app_limited = CTF_JR_SENT_DATA;
20696
20697                 if (tot_len_this_send > 0) {
20698                         /* Make sure snd_nxt is up to max */
20699                         rack->r_ctl.fsb.recwin = recwin;
20700                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
20701                         if ((error == 0) &&
20702                             rack_use_rfo &&
20703                             ((flags & (TH_SYN|TH_FIN)) == 0) &&
20704                             (ipoptlen == 0) &&
20705                             (tp->snd_nxt == tp->snd_max) &&
20706                             (tp->rcv_numsacks == 0) &&
20707                             rack->r_fsb_inited &&
20708                             TCPS_HAVEESTABLISHED(tp->t_state) &&
20709                             ((IN_RECOVERY(tp->t_flags)) == 0) &&
20710                             (rack->r_must_retran == 0) &&
20711                             ((tp->t_flags & TF_NEEDFIN) == 0) &&
20712                             (len > 0) && (orig_len > 0) &&
20713                             (orig_len > len) &&
20714                             ((orig_len - len) >= segsiz) &&
20715                             ((optlen == 0) ||
20716                              ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
20717                                 /* We can send at least one more MSS using our fsb */
20718                                 rack_setup_fast_output(tp, rack, sb, len, orig_len,
20719                                                        segsiz, pace_max_seg, hw_tls, flags);
20720                         } else
20721                                 rack->r_fast_output = 0;
20722
20723
20724                         rack_log_fsb(rack, tp, so, flags,
20725                                      ipoptlen, orig_len, len, 0,
20726                                      1, optlen, __LINE__, 1);
20727                         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
20728                                 tp->snd_nxt = tp->snd_max;
20729                 } else {
20730                         int end_window = 0;
20731                         uint32_t seq = tp->gput_ack;
20732
20733                         rsm = tqhash_max(rack->r_ctl.tqh);
20734                         if (rsm) {
20735                                 /*
20736                                  * Mark the last rsm sent as just-returned (hinting
20737                                  * that delayed ack may play a role in any rtt measurement).
20738                                  */
20739                                 rsm->r_just_ret = 1;
20740                         }
20741                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
20742                         rack->r_ctl.rc_agg_delayed = 0;
20743                         rack->r_early = 0;
20744                         rack->r_late = 0;
20745                         rack->r_ctl.rc_agg_early = 0;
20746                         if ((ctf_outstanding(tp) +
20747                              min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
20748                                  minseg)) >= tp->snd_wnd) {
20749                                 /* We are limited by the rwnd */
20750                                 app_limited = CTF_JR_RWND_LIMITED;
20751                                 if (IN_FASTRECOVERY(tp->t_flags))
20752                                         rack->r_ctl.rc_prr_sndcnt = 0;
20753                         } else if (ctf_outstanding(tp) >= sbavail(sb)) {
20754                                 /* We are limited by what's available -- app limited */
20755                                 app_limited = CTF_JR_APP_LIMITED;
20756                                 if (IN_FASTRECOVERY(tp->t_flags))
20757                                         rack->r_ctl.rc_prr_sndcnt = 0;
20758                         } else if ((idle == 0) &&
20759                                    ((tp->t_flags & TF_NODELAY) == 0) &&
20760                                    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20761                                    (len < segsiz)) {
20762                                 /*
20763                                  * TF_NODELAY is not on and the
20764                                  * user is sending less than 1 MSS. This
20765                                  * brings out SWS avoidance so we
20766                                  * don't send. Another app-limited case.
20767                                  */
20768                                 app_limited = CTF_JR_APP_LIMITED;
20769                         } else if (tp->t_flags & TF_NOPUSH) {
20770                                 /*
20771                                  * The user has requested no push of
20772                                  * the last segment and we are
20773                                  * at the last segment. Another app
20774                                  * limited case.
20775                                  */
20776                                 app_limited = CTF_JR_APP_LIMITED;
20777                         } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
20778                                 /* It's the cwnd */
20779                                 app_limited = CTF_JR_CWND_LIMITED;
20780                         } else if (IN_FASTRECOVERY(tp->t_flags) &&
20781                                    (rack->rack_no_prr == 0) &&
20782                                    (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
20783                                 app_limited = CTF_JR_PRR;
20784                         } else {
20785                                 /* Now, why are we not sending here? */
20786 #ifdef NOW
20787 #ifdef INVARIANTS
20788                                 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
20789 #endif
20790 #endif
20791                                 app_limited = CTF_JR_ASSESSING;
20792                         }
20793                         /*
20794                          * App limited in some fashion; for our pacing GP
20795                          * measurements we don't want any gap (even cwnd).
20796                          * Close down the measurement window.
20797                          */
20798                         if (rack_cwnd_block_ends_measure &&
20799                             ((app_limited == CTF_JR_CWND_LIMITED) ||
20800                              (app_limited == CTF_JR_PRR))) {
20801                                 /*
20802                                  * The reason we are not sending is
20803                                  * the cwnd (or prr). We have been configured
20804                                  * to end the measurement window in
20805                                  * this case.
20806                                  */
20807                                 end_window = 1;
20808                         } else if (rack_rwnd_block_ends_measure &&
20809                                    (app_limited == CTF_JR_RWND_LIMITED)) {
20810                                 /*
20811                                  * We are rwnd limited and have been
20812                                  * configured to end the measurement
20813                                  * window in this case.
20814                                  */
20815                                 end_window = 1;
20816                         } else if (app_limited == CTF_JR_APP_LIMITED) {
20817                                 /*
20818                                  * A true application limited period: we have
20819                                  * run out of data.
20820                                  */
20821                                 end_window = 1;
20822                         } else if (app_limited == CTF_JR_ASSESSING) {
20823                                  * In the assessing case we hit the end of
20824                                  * the if/else chain and had no known reason.
20825                                  * This will panic us under INVARIANTS.
20826                                  *
20827                                  * If we see this in the logs we need to
20828                                  * investigate which reason we missed.
20829                                  * investagate which reason we missed.
20830                                  */
20831                                 end_window = 1;
20832                         }
20833                         if (end_window) {
20834                                 uint8_t log = 0;
20835
20836                                 /* Adjust the Gput measurement */
20837                                 if ((tp->t_flags & TF_GPUTINPROG) &&
20838                                     SEQ_GT(tp->gput_ack, tp->snd_max)) {
20839                                         tp->gput_ack = tp->snd_max;
20840                                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
20841                                                 /*
20842                                                  * There is not enough to measure.
20843                                                  */
20844                                                 tp->t_flags &= ~TF_GPUTINPROG;
20845                                                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
20846                                                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
20847                                                                            tp->gput_seq,
20848                                                                            0, 0, 18, __LINE__, NULL, 0);
20849                                         } else
20850                                                 log = 1;
20851                                 }
20852                                 /* Mark the last packet as app limited */
20853                                 rsm = tqhash_max(rack->r_ctl.tqh);
20854                                 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
20855                                         if (rack->r_ctl.rc_app_limited_cnt == 0)
20856                                                 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
20857                                         else {
20858                                                 /*
20859                                                  * Go out to the end of the app limited chain, mark
20860                                                  * this new one as the next, and move end_appl up
20861                                                  * to this guy.
20862                                                  */
20863                                                 if (rack->r_ctl.rc_end_appl)
20864                                                         rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
20865                                                 rack->r_ctl.rc_end_appl = rsm;
20866                                         }
20867                                         rsm->r_flags |= RACK_APP_LIMITED;
20868                                         rack->r_ctl.rc_app_limited_cnt++;
20869                                 }
20870                                 if (log)
20871                                         rack_log_pacing_delay_calc(rack,
20872                                                                    rack->r_ctl.rc_app_limited_cnt, seq,
20873                                                                    tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
20874                         }
20875                 }
20876                 /* Check if we need to go into persists or not */
20877                 if ((tp->snd_max == tp->snd_una) &&
20878                     TCPS_HAVEESTABLISHED(tp->t_state) &&
20879                     sbavail(sb) &&
20880                     (sbavail(sb) > tp->snd_wnd) &&
20881                     (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
20882                         /* Yes, let's make sure to move to persist before timer-start */
20883                         rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
20884                 }
20885                 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
20886                 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
20887         }
20888 #ifdef NETFLIX_SHARED_CWND
20889         if ((sbavail(sb) == 0) &&
20890             rack->r_ctl.rc_scw) {
20891                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20892                 rack->rack_scwnd_is_idle = 1;
20893         }
20894 #endif
20895 #ifdef TCP_ACCOUNTING
20896         if (tot_len_this_send > 0) {
20897                 crtsc = get_cyclecount();
20898                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20899                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
20900                 }
20901                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20902                         tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
20903                 }
20904                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20905                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
20906                 }
20907         } else {
20908                 crtsc = get_cyclecount();
20909                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20910                         tp->tcp_cnt_counters[SND_LIMITED]++;
20911                 }
20912                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20913                         tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
20914                 }
20915         }
20916         sched_unpin();
20917 #endif
20918         return (0);
20919
20920 send:
20921         if ((rack->r_ctl.crte != NULL) &&
20922             (rsm == NULL) &&
20923             ((rack->rc_hw_nobuf == 1) ||
20924              (rack_hw_check_queue && (check_done == 0)))) {
20925                 /*
20926                  * We only want to do this once with the hw_check_queue;
20927                  * for the enobuf case we would also only do it once, since
20928                  * if we come around again the flag will be clear.
20929                  */
20930                 check_done = 1;
20931                 slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
20932                 if (slot) {
20933                         rack->r_ctl.rc_agg_delayed = 0;
20934                         rack->r_ctl.rc_agg_early = 0;
20935                         rack->r_early = 0;
20936                         rack->r_late = 0;
20937                         SOCKBUF_UNLOCK(&so->so_snd);
20938                         goto skip_all_send;
20939                 }
20940         }
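        /* Count this send as either a retransmission (rsm/SACK driven) or new data. */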
20941         if (rsm || sack_rxmit)
20942                 counter_u64_add(rack_nfto_resend, 1);
20943         else
20944                 counter_u64_add(rack_non_fto_send, 1);
20945         if ((flags & TH_FIN) &&
20946             sbavail(sb)) {
20947                 /*
20948                  * We do not transmit a FIN
20949                  * with data outstanding. We
20950                  * need to make it so all data
20951                  * is acked first.
20952                  */
20953                 flags &= ~TH_FIN;
20954         }
20955         /* Enforce stack imposed max seg size if we have one */
20956         if (rack->r_ctl.rc_pace_max_segs &&
20957             (len > rack->r_ctl.rc_pace_max_segs)) {
20958                 mark = 1;
20959                 len = rack->r_ctl.rc_pace_max_segs;
20960         }
20961         SOCKBUF_LOCK_ASSERT(sb);
20962         if (len > 0) {
20963                 if (len >= segsiz)
20964                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
20965                 else
20966                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
20967         }
20968         /*
20969          * Before ESTABLISHED, force sending of initial options unless TCP
20970          * set not to do any options. NOTE: we assume that the IP/TCP header
20971          * plus TCP options always fit in a single mbuf, leaving room for a
20972          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
20973          * + optlen <= MCLBYTES
20974          */
20975         optlen = 0;
20976 #ifdef INET6
20977         if (isipv6)
20978                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
20979         else
20980 #endif
20981                 hdrlen = sizeof(struct tcpiphdr);
20982
20983         /*
20984          * Compute options for segment. We only have to care about SYN and
20985          * established connection segments.  Options for SYN-ACK segments
20986          * are handled in TCP syncache.
20987          */
20988         to.to_flags = 0;
20989         if ((tp->t_flags & TF_NOOPT) == 0) {
20990                 /* Maximum segment size. */
20991                 if (flags & TH_SYN) {
20992                         tp->snd_nxt = tp->iss;
20993                         to.to_mss = tcp_mssopt(&inp->inp_inc);
20994                         if (tp->t_port)
20995                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
20996                         to.to_flags |= TOF_MSS;
20997
20998                         /*
20999                          * On SYN or SYN|ACK transmits on TFO connections,
21000                          * only include the TFO option if it is not a
21001                          * retransmit, as the presence of the TFO option may
21002                          * have caused the original SYN or SYN|ACK to have
21003                          * been dropped by a middlebox.
21004                          */
21005                         if (IS_FASTOPEN(tp->t_flags) &&
21006                             (tp->t_rxtshift == 0)) {
21007                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
21008                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
21009                                         to.to_tfo_cookie =
21010                                                 (u_int8_t *)&tp->t_tfo_cookie.server;
21011                                         to.to_flags |= TOF_FASTOPEN;
21012                                         wanted_cookie = 1;
21013                                 } else if (tp->t_state == TCPS_SYN_SENT) {
21014                                         to.to_tfo_len =
21015                                                 tp->t_tfo_client_cookie_len;
21016                                         to.to_tfo_cookie =
21017                                                 tp->t_tfo_cookie.client;
21018                                         to.to_flags |= TOF_FASTOPEN;
21019                                         wanted_cookie = 1;
21020                                         /*
21021                                          * If we wind up having more data to
21022                                          * send with the SYN than can fit in
21023                                          * one segment, don't send any more
21024                                          * until the SYN|ACK comes back from
21025                                          * the other end.
21026                                          */
21027                                         sendalot = 0;
21028                                 }
21029                         }
21030                 }
21031                 /* Window scaling. */
21032                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
21033                         to.to_wscale = tp->request_r_scale;
21034                         to.to_flags |= TOF_SCALE;
21035                 }
21036                 /* Timestamps. */
21037                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
21038                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
21039                         to.to_tsval = ms_cts + tp->ts_offset;
21040                         to.to_tsecr = tp->ts_recent;
21041                         to.to_flags |= TOF_TS;
21042                 }
21043                 /* Set receive buffer autosizing timestamp. */
21044                 if (tp->rfbuf_ts == 0 &&
21045                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
21046                         tp->rfbuf_ts = tcp_ts_getticks();
21047                 /* Selective ACK's. */
21048                 if (tp->t_flags & TF_SACK_PERMIT) {
21049                         if (flags & TH_SYN)
21050                                 to.to_flags |= TOF_SACKPERM;
21051                         else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21052                                  tp->rcv_numsacks > 0) {
21053                                 to.to_flags |= TOF_SACK;
21054                                 to.to_nsacks = tp->rcv_numsacks;
21055                                 to.to_sacks = (u_char *)tp->sackblks;
21056                         }
21057                 }
21058 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21059                 /* TCP-MD5 (RFC2385). */
21060                 if (tp->t_flags & TF_SIGNATURE)
21061                         to.to_flags |= TOF_SIGNATURE;
21062 #endif                          /* TCP_SIGNATURE */
21063
21064                 /* Processing the options. */
21065                 hdrlen += optlen = tcp_addoptions(&to, opt);
21066                 /*
21067                  * If we wanted a TFO option to be added, but it was unable
21068                  * to fit, ensure no data is sent.
21069                  */
21070                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
21071                     !(to.to_flags & TOF_FASTOPEN))
21072                         len = 0;
21073         }
21074         if (tp->t_port) {
21075                 if (V_tcp_udp_tunneling_port == 0) {
21076                         /* The port was removed?? */
21077                         SOCKBUF_UNLOCK(&so->so_snd);
21078 #ifdef TCP_ACCOUNTING
21079                         crtsc = get_cyclecount();
21080                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
21081                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
21082                         }
21083                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
21084                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
21085                         }
21086                         sched_unpin();
21087 #endif
21088                         return (EHOSTUNREACH);
21089                 }
21090                 hdrlen += sizeof(struct udphdr);
21091         }
21092 #ifdef INET6
21093         if (isipv6)
21094                 ipoptlen = ip6_optlen(inp);
21095         else
21096 #endif
21097                 if (inp->inp_options)
21098                         ipoptlen = inp->inp_options->m_len -
21099                                 offsetof(struct ipoption, ipopt_list);
21100                 else
21101                         ipoptlen = 0;
21102 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21103         ipoptlen += ipsec_optlen;
21104 #endif
21105
21106         /*
21107          * Adjust data length if insertion of options will bump the packet
21108          * length beyond the t_maxseg length. Clear the FIN bit because we
21109          * cut off the tail of the segment.
21110          */
21111         if (len + optlen + ipoptlen > tp->t_maxseg) {
21112                 if (tso) {
21113                         uint32_t if_hw_tsomax;
21114                         uint32_t moff;
21115                         int32_t max_len;
21116
21117                         /* extract TSO information */
21118                         if_hw_tsomax = tp->t_tsomax;
21119                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
21120                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
21121                         KASSERT(ipoptlen == 0,
21122                                 ("%s: TSO can't do IP options", __func__));
21123
21124                         /*
21125                          * Check if we should limit by maximum payload
21126                          * length:
21127                          */
21128                         if (if_hw_tsomax != 0) {
21129                                 /* compute maximum TSO length */
21130                                 max_len = (if_hw_tsomax - hdrlen -
21131                                            max_linkhdr);
21132                                 if (max_len <= 0) {
21133                                         len = 0;
21134                                 } else if (len > max_len) {
21135                                         sendalot = 1;
21136                                         len = max_len;
21137                                         mark = 2;
21138                                 }
21139                         }
21140                         /*
21141                          * Prevent the last segment from being fractional
21142                          * unless the send sockbuf can be emptied:
21143                          */
21144                         max_len = (tp->t_maxseg - optlen);
21145                         if ((sb_offset + len) < sbavail(sb)) {
21146                                 moff = len % (u_int)max_len;
21147                                 if (moff != 0) {
21148                                         mark = 3;
21149                                         len -= moff;
21150                                 }
21151                         }
21152                         /*
21153                          * In case there are too many small fragments don't
21154                          * use TSO:
21155                          */
21156                         if (len <= max_len) {
21157                                 mark = 4;
21158                                 tso = 0;
21159                         }
21160                         /*
21161                          * Send the FIN in a separate segment after the bulk
21162                          * sending is done. We don't trust the TSO
21163                          * implementations to clear the FIN flag on all but
21164                          * the last segment.
21165                          */
21166                         if (tp->t_flags & TF_NEEDFIN) {
21167                                 sendalot = 4;
21168                         }
21169                 } else {
21170                         mark = 5;
21171                         if (optlen + ipoptlen >= tp->t_maxseg) {
21172                                 /*
21173                                  * Since we don't have enough space to put
21174                                  * the IP header chain and the TCP header in
21175                                  * one packet as required by RFC 7112, don't
21176                                  * send it. Also ensure that at least one
21177                                  * byte of the payload can be put into the
21178                                  * TCP segment.
21179                                  */
21180                                 SOCKBUF_UNLOCK(&so->so_snd);
21181                                 error = EMSGSIZE;
21182                                 sack_rxmit = 0;
21183                                 goto out;
21184                         }
21185                         len = tp->t_maxseg - optlen - ipoptlen;
21186                         sendalot = 5;
21187                 }
21188         } else {
21189                 tso = 0;
21190                 mark = 6;
21191         }
21192         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
21193                 ("%s: len > IP_MAXPACKET", __func__));
21194 #ifdef DIAGNOSTIC
21195 #ifdef INET6
21196         if (max_linkhdr + hdrlen > MCLBYTES)
21197 #else
21198                 if (max_linkhdr + hdrlen > MHLEN)
21199 #endif
21200                         panic("tcphdr too big");
21201 #endif
21202
21203         /*
21204          * This KASSERT is here to catch edge cases at a well defined place.
21205          * Before, those had triggered (random) panic conditions further
21206          * down.
21207          */
21208         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
21209         if ((len == 0) &&
21210             (flags & TH_FIN) &&
21211             (sbused(sb))) {
21212                 /*
21213                  * We have outstanding data; don't send a FIN by itself!
21214                  */
21215                 goto just_return;
21216         }
21217         /*
21218          * Grab a header mbuf, attaching a copy of data to be transmitted,
21219          * and initialize the header from the template for sends on this
21220          * connection.
21221          */
21222         hw_tls = tp->t_nic_ktls_xmit != 0;
21223         if (len) {
21224                 uint32_t max_val;
21225                 uint32_t moff;
21226
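                /*
                 * Pick the cap for this send: the stack's pacing max segment
                 * size if set, else the user-configured segment count, else
                 * just len itself.
                 */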
21227                 if (rack->r_ctl.rc_pace_max_segs)
21228                         max_val = rack->r_ctl.rc_pace_max_segs;
21229                 else if (rack->rc_user_set_max_segs)
21230                         max_val = rack->rc_user_set_max_segs * segsiz;
21231                 else
21232                         max_val = len;
21233                 /*
21234                  * We allow a limit on sending with hptsi.
21235                  */
21236                 if (len > max_val) {
21237                         mark = 7;
21238                         len = max_val;
21239                 }
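                /*
                 * Use a cluster mbuf when the protocol headers plus the link
                 * header will not fit in a plain packet header mbuf.
                 */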
21240 #ifdef INET6
21241                 if (MHLEN < hdrlen + max_linkhdr)
21242                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
21243                 else
21244 #endif
21245                         m = m_gethdr(M_NOWAIT, MT_DATA);
21246
21247                 if (m == NULL) {
21248                         SOCKBUF_UNLOCK(sb);
21249                         error = ENOBUFS;
21250                         sack_rxmit = 0;
21251                         goto out;
21252                 }
21253                 m->m_data += max_linkhdr;
21254                 m->m_len = hdrlen;
21255
21256                 /*
21257                  * Start the m_copy functions from the closest mbuf to the
21258                  * sb_offset in the socket buffer chain.
21259                  */
21260                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
21261                 s_mb = mb;
21262                 s_moff = moff;
21263                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
21264                         m_copydata(mb, moff, (int)len,
21265                                    mtod(m, caddr_t)+hdrlen);
21266                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
21267                                 sbsndptr_adv(sb, mb, len);
21268                         m->m_len += len;
21269                 } else {
21270                         struct sockbuf *msb;
21271
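                        /*
                         * Hand the socket buffer to tcp_m_copym() only when
                         * sending new data so the send pointer advances;
                         * for a retransmission pass NULL.
                         */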
21272                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
21273                                 msb = NULL;
21274                         else
21275                                 msb = sb;
21276                         m->m_next = tcp_m_copym(
21277                                 mb, moff, &len,
21278                                 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
21279                                 ((rsm == NULL) ? hw_tls : 0)
21280 #ifdef NETFLIX_COPY_ARGS
21281                                 , &s_mb, &s_moff
21282 #endif
21283                                 );
21284                         if (len <= (tp->t_maxseg - optlen)) {
21285                                 /*
21286                                  * We must have run out of mbufs for the copy;
21287                                  * shorten it so that TSO is no longer needed. Let's
21288                                  * not set sendalot since we are low on
21289                                  * mbufs.
21290                                  */
21291                                 tso = 0;
21292                         }
21293                         if (m->m_next == NULL) {
21294                                 SOCKBUF_UNLOCK(sb);
21295                                 (void)m_free(m);
21296                                 error = ENOBUFS;
21297                                 sack_rxmit = 0;
21298                                 goto out;
21299                         }
21300                 }
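                /*
                 * Account for what we are about to send: anything below
                 * snd_max, or a SACK-driven send, is a retransmission (TLPs
                 * are kept in their own counters); everything else counts as
                 * new data.
                 */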
21301                 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
21302                         if (rsm && (rsm->r_flags & RACK_TLP)) {
21303                                 /*
21304                                  * A TLP should not count in the retransmit
21305                                  * count, but in its own bin.
21306                                  */
21307                                 counter_u64_add(rack_tlp_retran, 1);
21308                                 counter_u64_add(rack_tlp_retran_bytes, len);
21309                         } else {
21310                                 tp->t_sndrexmitpack++;
21311                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
21312                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
21313                         }
21314 #ifdef STATS
21315                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
21316                                                  len);
21317 #endif
21318                 } else {
21319                         KMOD_TCPSTAT_INC(tcps_sndpack);
21320                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
21321 #ifdef STATS
21322                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
21323                                                  len);
21324 #endif
21325                 }
21326                 /*
21327                  * If we're sending everything we've got, set PUSH. (This
21328                  * will keep happy those implementations which only give
21329                  * data to the user when a buffer fills or a PUSH comes in.)
21330                  */
21331                 if (sb_offset + len == sbused(sb) &&
21332                     sbused(sb) &&
21333                     !(flags & TH_SYN)) {
21334                         flags |= TH_PUSH;
21335                         add_flag |= RACK_HAD_PUSH;
21336                 }
21337
21338                 SOCKBUF_UNLOCK(sb);
21339         } else {
21340                 SOCKBUF_UNLOCK(sb);
21341                 if (tp->t_flags & TF_ACKNOW)
21342                         KMOD_TCPSTAT_INC(tcps_sndacks);
21343                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
21344                         KMOD_TCPSTAT_INC(tcps_sndctrl);
21345                 else
21346                         KMOD_TCPSTAT_INC(tcps_sndwinup);
21347
21348                 m = m_gethdr(M_NOWAIT, MT_DATA);
21349                 if (m == NULL) {
21350                         error = ENOBUFS;
21351                         sack_rxmit = 0;
21352                         goto out;
21353                 }
21354 #ifdef INET6
21355                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
21356                     MHLEN >= hdrlen) {
21357                         M_ALIGN(m, hdrlen);
21358                 } else
21359 #endif
21360                         m->m_data += max_linkhdr;
21361                 m->m_len = hdrlen;
21362         }
21363         SOCKBUF_UNLOCK_ASSERT(sb);
21364         m->m_pkthdr.rcvif = (struct ifnet *)0;
21365 #ifdef MAC
21366         mac_inpcb_create_mbuf(inp, m);
21367 #endif
21368         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) &&  rack->r_fsb_inited) {
21369 #ifdef INET6
21370                 if (isipv6)
21371                         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
21372                 else
21373 #endif                          /* INET6 */
21374 #ifdef INET
21375                         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
21376 #endif
21377                 th = rack->r_ctl.fsb.th;
21378                 udp = rack->r_ctl.fsb.udp;
21379                 if (udp) {
21380 #ifdef INET6
21381                         if (isipv6)
21382                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21383                         else
21384 #endif                          /* INET6 */
21385                                 ulen = hdrlen + len - sizeof(struct ip);
21386                         udp->uh_ulen = htons(ulen);
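                        /*
                         * hdrlen counts the IP, UDP and TCP headers (plus any
                         * TCP options), so removing the IP header leaves the
                         * UDP length: UDP header + TCP header/options +
                         * payload.  E.g. with no options and 100 data bytes
                         * that is 8 + 20 + 100 = 128.
                         */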
21387                 }
21388         } else {
21389 #ifdef INET6
21390                 if (isipv6) {
21391                         ip6 = mtod(m, struct ip6_hdr *);
21392                         if (tp->t_port) {
21393                                 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
21394                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21395                                 udp->uh_dport = tp->t_port;
21396                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21397                                 udp->uh_ulen = htons(ulen);
21398                                 th = (struct tcphdr *)(udp + 1);
21399                         } else
21400                                 th = (struct tcphdr *)(ip6 + 1);
21401                         tcpip_fillheaders(inp, tp->t_port, ip6, th);
21402                 } else
21403 #endif                          /* INET6 */
21404                 {
21405 #ifdef INET
21406                         ip = mtod(m, struct ip *);
21407                         if (tp->t_port) {
21408                                 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
21409                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21410                                 udp->uh_dport = tp->t_port;
21411                                 ulen = hdrlen + len - sizeof(struct ip);
21412                                 udp->uh_ulen = htons(ulen);
21413                                 th = (struct tcphdr *)(udp + 1);
21414                         } else
21415                                 th = (struct tcphdr *)(ip + 1);
21416                         tcpip_fillheaders(inp, tp->t_port, ip, th);
21417 #endif
21418                 }
21419         }
21420         /*
21421          * Fill in fields, remembering maximum advertised window for use in
21422          * delaying messages about window sizes. If resending a FIN, be sure
21423          * not to use a new sequence number.
21424          */
21425         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
21426             tp->snd_nxt == tp->snd_max)
21427                 tp->snd_nxt--;
21428         /*
21429          * If we are starting a connection, send ECN setup SYN packet. If we
21430          * are on a retransmit, we may resend those bits a number of times
21431          * as per RFC 3168.
21432          */
21433         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
21434                 flags |= tcp_ecn_output_syn_sent(tp);
21435         }
21436         /* Also handle parallel SYN for ECN */
21437         if (TCPS_HAVERCVDSYN(tp->t_state) &&
21438             (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
21439                 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
21440                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
21441                     (tp->t_flags2 & TF2_ECN_SND_ECE))
21442                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
21443 #ifdef INET6
21444                 if (isipv6) {
21445                         ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
21446                         ip6->ip6_flow |= htonl(ect << 20);
21447                 }
21448                 else
21449 #endif
21450                 {
21451 #ifdef INET
21452                         ip->ip_tos &= ~IPTOS_ECN_MASK;
21453                         ip->ip_tos |= ect;
21454 #endif
21455                 }
21456         }
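        /*
         * The ECT codepoint chosen above sits in the low two (ECN) bits of
         * the IPv4 TOS byte, or in bits 20-21 of the IPv6 flow word, which is
         * why the IPv6 case shifts the value left by 20.
         */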
21457         /*
21458          * If we are doing retransmissions, then snd_nxt will not reflect
21459          * the first unsent octet.  For ACK only packets, we do not want the
21460          * sequence number of the retransmitted packet, we want the sequence
21461          * number of the next unsent octet.  So, if there is no data (and no
21462          * SYN or FIN), use snd_max instead of snd_nxt when filling in
21463          * ti_seq.  But if we are in persist state, snd_max might reflect
21464          * one byte beyond the right edge of the window, so use snd_nxt in
21465          * that case, since we know we aren't doing a retransmission.
21466          * (retransmit and persist are mutually exclusive...)
21467          */
21468         if (sack_rxmit == 0) {
21469                 if (len || (flags & (TH_SYN | TH_FIN))) {
21470                         th->th_seq = htonl(tp->snd_nxt);
21471                         rack_seq = tp->snd_nxt;
21472                 } else {
21473                         th->th_seq = htonl(tp->snd_max);
21474                         rack_seq = tp->snd_max;
21475                 }
21476         } else {
21477                 th->th_seq = htonl(rsm->r_start);
21478                 rack_seq = rsm->r_start;
21479         }
21480         th->th_ack = htonl(tp->rcv_nxt);
21481         tcp_set_flags(th, flags);
21482         /*
21483          * Calculate receive window.  Don't shrink window, but avoid silly
21484          * window syndrome.
21485          * If a RST segment is sent, advertise a window of zero.
21486          */
21487         if (flags & TH_RST) {
21488                 recwin = 0;
21489         } else {
21490                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
21491                     recwin < (long)segsiz) {
21492                         recwin = 0;
21493                 }
21494                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
21495                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
21496                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
21497         }
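        /*
         * Example of the silly-window check above: with a 64 kB receive
         * buffer (sb_hiwat) and segsiz 1460, a computed window of 1000 bytes
         * is below both sb_hiwat / 4 (16384) and one segment, so we advertise
         * zero instead of dribbling out a tiny window.  We still never offer
         * less than what was already advertised (rcv_adv - rcv_nxt).
         */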
21498
21499         /*
21500          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
21501          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
21502          * handled in syncache.
21503          */
21504         if (flags & TH_SYN)
21505                 th->th_win = htons((u_short)
21506                                    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
21507         else {
21508                 /* Avoid shrinking window with window scaling. */
21509                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
21510                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
21511         }
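        /*
         * Example of the scaling round-up above: with rcv_scale = 3 a recwin
         * of 100001 becomes roundup2(100001, 8) = 100008, so th_win is
         * 100008 >> 3 = 12501.  Rounding up keeps the scaled advertisement
         * from landing below the previous right edge.
         */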
21512         /*
21513          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
21514          * window.  This may cause the remote transmitter to stall.  This
21515          * flag tells soreceive() to disable delayed acknowledgements when
21516          * draining the buffer.  This can occur if the receiver is
21517          * attempting to read more data than can be buffered prior to
21518          * transmitting on the connection.
21519          */
21520         if (th->th_win == 0) {
21521                 tp->t_sndzerowin++;
21522                 tp->t_flags |= TF_RXWIN0SENT;
21523         } else
21524                 tp->t_flags &= ~TF_RXWIN0SENT;
21525         tp->snd_up = tp->snd_una;       /* drag it along, it's deprecated */
21526         /* Now, are we using fsb?  If so copy the template data to the mbuf */
21527         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21528                 uint8_t *cpto;
21529
21530                 cpto = mtod(m, uint8_t *);
21531                 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
21532                 /*
21533                  * We have just copied in:
21534                  * IP/IP6
21535                  * <optional udphdr>
21536                  * tcphdr (no options)
21537                  *
21538                  * We need to grab the correct pointers into the mbuf
21539                  * for both the tcp header, and possibly the udp header (if tunneling).
21540                  * We do this by using the offset in the copy buffer and adding it
21541                  * to the mbuf base pointer (cpto).
21542                  */
21543 #ifdef INET6
21544                 if (isipv6)
21545                         ip6 = mtod(m, struct ip6_hdr *);
21546                 else
21547 #endif                          /* INET6 */
21548 #ifdef INET
21549                         ip = mtod(m, struct ip *);
21550 #endif
21551                 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
21552                 /* If we have a udp header let's set it into the mbuf as well */
21553                 if (udp)
21554                         udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
21555         }
21556 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21557         if (to.to_flags & TOF_SIGNATURE) {
21558                 /*
21559                  * Calculate MD5 signature and put it into the place
21560                  * determined before.
21561                  * NOTE: since TCP options buffer doesn't point into
21562                  * mbuf's data, calculate offset and use it.
21563                  */
21564                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
21565                                                        (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
21566                         /*
21567                          * Do not send segment if the calculation of MD5
21568                          * digest has failed.
21569                          */
21570                         goto out;
21571                 }
21572         }
21573 #endif
21574         if (optlen) {
21575                 bcopy(opt, th + 1, optlen);
21576                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
21577         }
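        /*
         * th_off is expressed in 32-bit words; e.g. the 20 byte base TCP
         * header plus 12 bytes of timestamp options gives (20 + 12) >> 2 = 8.
         */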
21578         /*
21579          * Put TCP length in extended header, and then checksum extended
21580          * header and data.
21581          */
21582         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
21583 #ifdef INET6
21584         if (isipv6) {
21585                 /*
21586                  * ip6_plen need not be filled in now; it will be filled
21587                  * in by ip6_output.
21588                  */
21589                 if (tp->t_port) {
21590                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
21591                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21592                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
21593                         th->th_sum = htons(0);
21594                         UDPSTAT_INC(udps_opackets);
21595                 } else {
21596                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
21597                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21598                         th->th_sum = in6_cksum_pseudo(ip6,
21599                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
21600                                                       0);
21601                 }
21602         }
21603 #endif
21604 #if defined(INET6) && defined(INET)
21605         else
21606 #endif
21607 #ifdef INET
21608         {
21609                 if (tp->t_port) {
21610                         m->m_pkthdr.csum_flags = CSUM_UDP;
21611                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21612                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
21613                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
21614                         th->th_sum = htons(0);
21615                         UDPSTAT_INC(udps_opackets);
21616                 } else {
21617                         m->m_pkthdr.csum_flags = CSUM_TCP;
21618                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21619                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
21620                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
21621                                                                         IPPROTO_TCP + len + optlen));
21622                 }
21623                 /* IP version must be set here for ipv4/ipv6 checking later */
21624                 KASSERT(ip->ip_v == IPVERSION,
21625                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
21626         }
21627 #endif
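        /*
         * In both cases above only the pseudo-header part of the checksum
         * (addresses, protocol and TCP/UDP length) is filled in here;
         * csum_flags and csum_data tell the NIC, or the software fallback in
         * ip_output()/ip6_output(), where to complete the sum over the real
         * header and payload.
         */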
21628         /*
21629          * Enable TSO and specify the size of the segments. The TCP pseudo
21630          * header checksum is always provided. XXX: Fixme: This is currently
21631          * not the case for IPv6.
21632          */
21633         if (tso) {
21634                 /*
21635                  * Here we must use t_maxseg and the optlen since
21636                  * the optlen may include SACK's (or DSACK).
21637                  */
21638                 KASSERT(len > tp->t_maxseg - optlen,
21639                         ("%s: len <= tso_segsz", __func__));
21640                 m->m_pkthdr.csum_flags |= CSUM_TSO;
21641                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
21642         }
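        /*
         * tso_segsz set above is the payload carried per TSO segment; e.g.
         * with t_maxseg 1460 and 12 bytes of options, each segment the NIC
         * emits carries 1448 data bytes plus a full set of headers.
         */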
21643         KASSERT(len + hdrlen == m_length(m, NULL),
21644                 ("%s: mbuf chain different than expected: %d + %u != %u",
21645                  __func__, len, hdrlen, m_length(m, NULL)));
21646
21647 #ifdef TCP_HHOOK
21648         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
21649         hhook_run_tcp_est_out(tp, th, &to, len, tso);
21650 #endif
21651         if ((rack->r_ctl.crte != NULL) &&
21652             (rack->rc_hw_nobuf == 0) &&
21653             tcp_bblogging_on(tp)) {
21654                 rack_log_queue_level(tp, rack, len, &tv, cts);
21655         }
21656         /* We're getting ready to send; log now. */
21657         if (tcp_bblogging_on(rack->rc_tp)) {
21658                 union tcp_log_stackspecific log;
21659
21660                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
21661                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
21662                 if (rack->rack_no_prr)
21663                         log.u_bbr.flex1 = 0;
21664                 else
21665                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
21666                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
21667                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
21668                 log.u_bbr.flex4 = orig_len;
21669                 /* Save off the early/late values */
21670                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
21671                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
21672                 log.u_bbr.bw_inuse = rack_get_bw(rack);
21673                 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
21674                 log.u_bbr.flex8 = 0;
21675                 if (rsm) {
21676                         if (rsm->r_flags & RACK_RWND_COLLAPSED) {
21677                                 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
21678                                 counter_u64_add(rack_collapsed_win_rxt, 1);
21679                                 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
21680                         }
21681                         if (doing_tlp)
21682                                 log.u_bbr.flex8 = 2;
21683                         else
21684                                 log.u_bbr.flex8 = 1;
21685                 } else {
21686                         if (doing_tlp)
21687                                 log.u_bbr.flex8 = 3;
21688                 }
21689                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
21690                 log.u_bbr.flex7 = mark;
21691                 log.u_bbr.flex7 <<= 8;
21692                 log.u_bbr.flex7 |= pass;
21693                 log.u_bbr.pkts_out = tp->t_maxseg;
21694                 log.u_bbr.timeStamp = cts;
21695                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
21696                 if (rsm && (rsm->r_rtr_cnt > 0)) {
21697                         /*
21698                          * When we have a retransmit we want to log the
21699                          * burst at send and flight at send from before.
21700                          */
21701                         log.u_bbr.flex5 = rsm->r_fas;
21702                         log.u_bbr.bbr_substate = rsm->r_bas;
21703                 } else {
21704                         /*
21705                          * For new transmits we log the inflight again in flex5,
21706                          * as well as the number of segments in our send in the
21707                          * substate field.
21708                          */
21709                         log.u_bbr.flex5 = log.u_bbr.inflight;
21710                         log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
21711                 }
21712                 log.u_bbr.lt_epoch = cwnd_to_use;
21713                 log.u_bbr.delivered = sendalot;
21714                 log.u_bbr.rttProp = (uint64_t)rsm;
21715                 log.u_bbr.pkt_epoch = __LINE__;
21716                 if (rsm) {
21717                         log.u_bbr.delRate = rsm->r_flags;
21718                         log.u_bbr.delRate <<= 31;
21719                         log.u_bbr.delRate |= rack->r_must_retran;
21720                         log.u_bbr.delRate <<= 1;
21721                         log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21722                 } else {
21723                         log.u_bbr.delRate = rack->r_must_retran;
21724                         log.u_bbr.delRate <<= 1;
21725                         log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21726                 }
21727                 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
21728                                     len, &log, false, NULL, __func__, __LINE__, &tv);
21729         } else
21730                 lgb = NULL;
21731
21732         /*
21733          * Fill in IP length and desired time to live and send to IP level.
21734          * There should be a better way to handle ttl and tos; we could keep
21735          * them in the template, but need a way to checksum without them.
21736          */
21737         /*
21738          * m->m_pkthdr.len should have been set before the cksum calculation,
21739          * because in6_cksum() needs it.
21740          */
21741 #ifdef INET6
21742         if (isipv6) {
21743                 /*
21744                  * We set the hop limit separately for every segment, since the
21745                  * user might want to change the value via setsockopt. Also,
21746                  * the desired default hop limit might be changed via Neighbor
21747                  * Discovery.
21748                  */
21749                 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
21750
21751                 /*
21752                  * Set the packet size here for the benefit of DTrace
21753                  * probes. ip6_output() will set it properly; it's supposed
21754                  * to include the option header lengths as well.
21755                  */
21756                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
21757
21758                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
21759                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21760                 else
21761                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21762
21763                 if (tp->t_state == TCPS_SYN_SENT)
21764                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
21765
21766                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
21767                 /* TODO: IPv6 IP6TOS_ECT bit on */
21768                 error = ip6_output(m,
21769 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21770                                    inp->in6p_outputopts,
21771 #else
21772                                    NULL,
21773 #endif
21774                                    &inp->inp_route6,
21775                                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
21776                                    NULL, NULL, inp);
21777
21778                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
21779                         mtu = inp->inp_route6.ro_nh->nh_mtu;
21780         }
21781 #endif                          /* INET6 */
21782 #if defined(INET) && defined(INET6)
21783         else
21784 #endif
21785 #ifdef INET
21786         {
21787                 ip->ip_len = htons(m->m_pkthdr.len);
21788 #ifdef INET6
21789                 if (inp->inp_vflag & INP_IPV6PROTO)
21790                         ip->ip_ttl = in6_selecthlim(inp, NULL);
21791 #endif                          /* INET6 */
21792                 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
21793                 /*
21794                  * If we do path MTU discovery, then we set DF on every
21795                  * packet. This might not be the best thing to do according
21796                  * to RFC3390 Section 2. However, the tcp hostcache mitigates
21797                  * the problem so it affects only the first tcp connection
21798                  * with a host.
21799                  *
21800                  * NB: Don't set DF on small MTU/MSS to have a safe
21801                  * fallback.
21802                  */
21803                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
21804                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21805                         if (tp->t_port == 0 || len < V_tcp_minmss) {
21806                                 ip->ip_off |= htons(IP_DF);
21807                         }
21808                 } else {
21809                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21810                 }
21811
21812                 if (tp->t_state == TCPS_SYN_SENT)
21813                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
21814
21815                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
21816
21817                 error = ip_output(m,
21818 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21819                                   inp->inp_options,
21820 #else
21821                                   NULL,
21822 #endif
21823                                   &inp->inp_route,
21824                                   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
21825                                   inp);
21826                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
21827                         mtu = inp->inp_route.ro_nh->nh_mtu;
21828         }
21829 #endif                          /* INET */
21830
21831 out:
21832         if (lgb) {
21833                 lgb->tlb_errno = error;
21834                 lgb = NULL;
21835         }
21836         /*
21837          * In transmit state, time the transmission and arrange for the
21838          * retransmit.  In persist state, just set snd_max.
21839          */
21840         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
21841                         rack_to_usec_ts(&tv),
21842                         rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
21843         if (error == 0) {
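                /*
                 * For new (non-retransmit) sends the block below maintains the
                 * long-term bandwidth sample: lt_bw_bytes and lt_bw_time
                 * accumulate acked bytes (snd_una - lt_seq) and elapsed time,
                 * and the running sample is folded in before the sequence
                 * delta could grow past 2^31 and wrap.
                 */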
21844                 if (rsm == NULL) {
21845                         if (rack->lt_bw_up == 0) {
21846                                 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv);
21847                                 rack->r_ctl.lt_seq = tp->snd_una;
21848                                 rack->lt_bw_up = 1;
21849                         } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) {
21850                                 /*
21851                                  * Need to record what we have since we are
21852                                  * approaching seq wrap.
21853                                  */
21854                                 uint64_t tmark;
21855
21856                                 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
21857                                 rack->r_ctl.lt_seq = tp->snd_una;
21858                                 tmark = tcp_tv_to_lusectick(&tv);
21859                                 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
21860                                 rack->r_ctl.lt_timemark = tmark;
21861                         }
21862                 }
21863                 rack->forced_ack = 0;   /* If we send something zap the FA flag */
21864                 counter_u64_add(rack_total_bytes, len);
21865                 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
21866                 if (rsm && doing_tlp) {
21867                         rack->rc_last_sent_tlp_past_cumack = 0;
21868                         rack->rc_last_sent_tlp_seq_valid = 1;
21869                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
21870                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
21871                 }
21872                 if (rack->rc_hw_nobuf) {
21873                         rack->rc_hw_nobuf = 0;
21874                         rack->r_ctl.rc_agg_delayed = 0;
21875                         rack->r_early = 0;
21876                         rack->r_late = 0;
21877                         rack->r_ctl.rc_agg_early = 0;
21878                 }
21879                 if (rsm && (doing_tlp == 0)) {
21880                         /* Set we retransmitted */
21881                         rack->rc_gp_saw_rec = 1;
21882                 } else {
21883                         if (cwnd_to_use > tp->snd_ssthresh) {
21884                                 /* Set we sent in CA */
21885                                 rack->rc_gp_saw_ca = 1;
21886                         } else {
21887                                 /* Set we sent in SS */
21888                                 rack->rc_gp_saw_ss = 1;
21889                         }
21890                 }
21891                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21892                     (tp->t_flags & TF_SACK_PERMIT) &&
21893                     tp->rcv_numsacks > 0)
21894                         tcp_clean_dsack_blocks(tp);
21895                 tot_len_this_send += len;
21896                 if (len == 0) {
21897                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
21898                 } else {
21899                         int idx;
21900
21901                         idx = (len / segsiz) + 3;
21902                         if (idx >= TCP_MSS_ACCT_ATIMER)
21903                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
21904                         else
21905                                 counter_u64_add(rack_out_size[idx], 1);
21906                 }
21907         }
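        /*
         * If PRR (proportional rate reduction) is active and this send was
         * meant to draw from it, charge the bytes just sent against the
         * rc_prr_sndcnt budget below, clamping at zero.
         */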
21908         if ((rack->rack_no_prr == 0) &&
21909             sub_from_prr &&
21910             (error == 0)) {
21911                 if (rack->r_ctl.rc_prr_sndcnt >= len)
21912                         rack->r_ctl.rc_prr_sndcnt -= len;
21913                 else
21914                         rack->r_ctl.rc_prr_sndcnt = 0;
21915         }
21916         sub_from_prr = 0;
21917         if (doing_tlp) {
21918                 /* Make sure the TLP is added */
21919                 add_flag |= RACK_TLP;
21920         } else if (rsm) {
21921                 /* If it's a resend without TLP then it must not have the flag */
21922                 rsm->r_flags &= ~RACK_TLP;
21923         }
21924
21925
21926         if ((error == 0) &&
21927             (len > 0) &&
21928             (tp->snd_una == tp->snd_max))
21929                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
21930         {
21931                 tcp_seq startseq = tp->snd_nxt;
21932
21933                 /* Track our lost count */
21934                 if (rsm && (doing_tlp == 0))
21935                         rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
21936                 /*
21937                  * Advance snd_nxt over sequence space of this segment.
21938                  */
21939                 if (error)
21940                         /* We don't log or do anything with errors */
21941                         goto nomore;
21942                 if (doing_tlp == 0) {
21943                         if (rsm == NULL) {
21944                                 /*
21945                                  * Not a retransmission of some
21946                                  * sort, new data is going out so
21947                                  * clear our TLP count and flag.
21948                                  */
21949                                 rack->rc_tlp_in_progress = 0;
21950                                 rack->r_ctl.rc_tlp_cnt_out = 0;
21951                         }
21952                 } else {
21953                         /*
21954                          * We have just sent a TLP, mark that it is true
21955                          * and make sure our in progress is set so we
21956                          * continue to check the count.
21957                          */
21958                         rack->rc_tlp_in_progress = 1;
21959                         rack->r_ctl.rc_tlp_cnt_out++;
21960                 }
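                /*
                 * SYN and FIN each consume one unit of sequence space, so
                 * snd_nxt is bumped for them in addition to any data bytes
                 * added below.
                 */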
21961                 if (flags & (TH_SYN | TH_FIN)) {
21962                         if (flags & TH_SYN)
21963                                 tp->snd_nxt++;
21964                         if (flags & TH_FIN) {
21965                                 tp->snd_nxt++;
21966                                 tp->t_flags |= TF_SENTFIN;
21967                         }
21968                 }
21969                 /* In the ENOBUFS case we do *not* update snd_max */
21970                 if (sack_rxmit)
21971                         goto nomore;
21972
21973                 tp->snd_nxt += len;
21974                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
21975                         if (tp->snd_una == tp->snd_max) {
21976                                 /*
21977                                  * Update the time we just added data since
21978                                  * none was outstanding.
21979                                  */
21980                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
21981                                 tp->t_acktime = ticks;
21982                         }
21983                         tp->snd_max = tp->snd_nxt;
21984                         if (rack->rc_new_rnd_needed) {
21985                                 /*
21986                                  * Update the rnd to start ticking; note
21987                                  * that from a time perspective all of
21988                                  * the preceding idle time is "in the round".
21989                                  */
21990                                 rack->rc_new_rnd_needed = 0;
21991                                 rack->r_ctl.roundends = tp->snd_max;
21992                         }
21993                         /*
21994                          * Time this transmission if not a retransmission and
21995                          * not currently timing anything.
21996                          * This is only relevant in case of switching back to
21997                          * the base stack.
21998                          */
21999                         if (tp->t_rtttime == 0) {
22000                                 tp->t_rtttime = ticks;
22001                                 tp->t_rtseq = startseq;
22002                                 KMOD_TCPSTAT_INC(tcps_segstimed);
22003                         }
22004                         if (len &&
22005                             ((tp->t_flags & TF_GPUTINPROG) == 0))
22006                                 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
22007                 }
22008                 /*
22009                  * If we are doing FO (fast output) we need to update the mbuf position
22010                  * and subtract what we just sent from the amount left to send; this also
22011                  * happens when the peer sends us duplicate information and we thus want
22012                  * to send a DSACK.
22013                  *
22014                  * XXXRRS: This brings to mind a question: when we send a DSACK block, is
22015                  * TSO turned off? If not then we are going to echo multiple DSACK blocks
22016                  * out (with TSO), which we should not be doing.
22016                  */
22017                 if (rack->r_fast_output && len) {
22018                         if (rack->r_ctl.fsb.left_to_send > len)
22019                                 rack->r_ctl.fsb.left_to_send -= len;
22020                         else
22021                                 rack->r_ctl.fsb.left_to_send = 0;
22022                         if (rack->r_ctl.fsb.left_to_send < segsiz)
22023                                 rack->r_fast_output = 0;
22024                         if (rack->r_fast_output) {
22025                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
22026                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
22027                                 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
22028                         }
22029                 }
22030         }
22031 nomore:
22032         if (error) {
22033                 rack->r_ctl.rc_agg_delayed = 0;
22034                 rack->r_early = 0;
22035                 rack->r_late = 0;
22036                 rack->r_ctl.rc_agg_early = 0;
22037                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
22038                 /*
22039                  * Failures do not advance the seq counter above. For the
22040                  * case of ENOBUFS we will fall out and retry in 1ms with
22041                  * the hpts. Everything else will just have to retransmit
22042                  * with the timer.
22043                  *
22044                  * In any case, we do not want to loop around for another
22045                  * send without a good reason.
22046                  */
22047                 sendalot = 0;
22048                 switch (error) {
22049                 case EPERM:
22050                         tp->t_softerror = error;
22051 #ifdef TCP_ACCOUNTING
22052                         crtsc = get_cyclecount();
22053                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22054                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22055                         }
22056                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22057                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22058                         }
22059                         sched_unpin();
22060 #endif
22061                         return (error);
22062                 case ENOBUFS:
22063                         /*
22064                          * Pace us right away to retry in some short
22065                          * time.
22066                          */
22067                         if (rack->r_ctl.crte != NULL) {
22068                                 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
22069                                 if (tcp_bblogging_on(rack->rc_tp))
22070                                         rack_log_queue_level(tp, rack, len, &tv, cts);
22071                         } else
22072                                 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
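                        /*
                         * Back the retry off by roughly one extra millisecond
                         * per consecutive ENOBUFS (rc_enobuf is capped at
                         * 0x7f), but never reschedule sooner than 10 ms out.
                         */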
22073                         slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
22074                         if (rack->rc_enobuf < 0x7f)
22075                                 rack->rc_enobuf++;
22076                         if (slot < (10 * HPTS_USEC_IN_MSEC))
22077                                 slot = 10 * HPTS_USEC_IN_MSEC;
22078                         if (rack->r_ctl.crte != NULL) {
22079                                 counter_u64_add(rack_saw_enobuf_hw, 1);
22080                                 tcp_rl_log_enobuf(rack->r_ctl.crte);
22081                         }
22082                         counter_u64_add(rack_saw_enobuf, 1);
22083                         goto enobufs;
22084                 case EMSGSIZE:
22085                         /*
22086                          * For some reason the interface we used initially
22087                          * to send segments changed to another or lowered
22088                          * its MTU. If TSO was active we either got an
22089                          * interface without TSO capabilities or TSO was
22090                          * turned off. If we obtained mtu from ip_output()
22091                          * then update it and try again.
22092                          */
22093                         if (tso)
22094                                 tp->t_flags &= ~TF_TSO;
22095                         if (mtu != 0) {
22096                                 int saved_mtu;
22097
22098                                 saved_mtu = tp->t_maxseg;
22099                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
22100                                 if (saved_mtu > tp->t_maxseg) {
22101                                         goto again;
22102                                 }
22103                         }
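                        /*
                         * We only loop back for an immediate resend when the
                         * MSS actually shrank (saved_mtu > t_maxseg); otherwise
                         * we get here and let the hpts timer below retry.
                         */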
22104                         slot = 10 * HPTS_USEC_IN_MSEC;
22105                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
22106 #ifdef TCP_ACCOUNTING
22107                         crtsc = get_cyclecount();
22108                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22109                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22110                         }
22111                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22112                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22113                         }
22114                         sched_unpin();
22115 #endif
22116                         return (error);
22117                 case ENETUNREACH:
22118                         counter_u64_add(rack_saw_enetunreach, 1);
22119                 case EHOSTDOWN:
22120                 case EHOSTUNREACH:
22121                 case ENETDOWN:
22122                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
22123                                 tp->t_softerror = error;
22124                         }
22125                         /* FALLTHROUGH */
22126                 default:
22127                         slot = 10 * HPTS_USEC_IN_MSEC;
22128                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
22129 #ifdef TCP_ACCOUNTING
22130                         crtsc = get_cyclecount();
22131                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22132                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22133                         }
22134                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22135                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22136                         }
22137                         sched_unpin();
22138 #endif
22139                         return (error);
22140                 }
22141         } else {
22142                 rack->rc_enobuf = 0;
22143                 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
22144                         rack->r_ctl.retran_during_recovery += len;
22145         }
22146         KMOD_TCPSTAT_INC(tcps_sndtotal);
22147
22148         /*
22149          * Data sent (as far as we can tell). If this advertises a larger
22150          * window than any other segment, then remember the size of the
22151          * advertised window. Any pending ACK has now been sent.
22152          */
22153         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
22154                 tp->rcv_adv = tp->rcv_nxt + recwin;
22155
22156         tp->last_ack_sent = tp->rcv_nxt;
22157         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
22158 enobufs:
22159         if (sendalot) {
22160                 /* Do we need to turn off sendalot? */
22161                 if (rack->r_ctl.rc_pace_max_segs &&
22162                     (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
22163                         /* We hit our max. */
22164                         sendalot = 0;
22165                 } else if ((rack->rc_user_set_max_segs) &&
22166                            (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
22167                         /* We hit the user defined max */
22168                         sendalot = 0;
22169                 }
22170         }
22171         if ((error == 0) && (flags & TH_FIN))
22172                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
22173         if (flags & TH_RST) {
22174                 /*
22175                  * We don't send again after sending a RST.
22176                  */
22177                 slot = 0;
22178                 sendalot = 0;
22179                 if (error == 0)
22180                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
22181         } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
22182                 /*
22183                  * Get our pacing rate; if an error
22184                  * occurred in sending (ENOBUFS) we would
22185                  * hit this else-if with slot preset. Other
22186                  * errors return.
22187                  */
22188                 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
22189         }
22190         if (rsm &&
22191             (rsm->r_flags & RACK_HAS_SYN) == 0 &&
22192             rack->use_rack_rr) {
22193                 /* It's a retransmit and we use the rack cheat? */
22194                 if ((slot == 0) ||
22195                     (rack->rc_always_pace == 0) ||
22196                     (rack->r_rr_config == 1)) {
22197                         /*
22198                          * We have no pacing set or we
22199                          * are using old-style rack or
22200                          * we are overridden to use the old 1ms pacing.
22201                          */
22202                         slot = rack->r_ctl.rc_min_to;
22203                 }
22204         }
22205         /* We have sent, clear the flag */
22206         rack->r_ent_rec_ns = 0;
22207         if (rack->r_must_retran) {
22208                 if (rsm) {
22209                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
22210                         if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
22211                                 /*
22212                                  * We have retransmitted all.
22213                                  */
22214                                 rack->r_must_retran = 0;
22215                                 rack->r_ctl.rc_out_at_rto = 0;
22216                         }
22217                 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22218                         /*
22219                          * Sending new data will also kill
22220                          * the loop.
22221                          */
22222                         rack->r_must_retran = 0;
22223                         rack->r_ctl.rc_out_at_rto = 0;
22224                 }
22225         }
22226         rack->r_ctl.fsb.recwin = recwin;
22227         if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
22228             SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22229                 /*
22230                  * We hit an RTO and have now passed snd_max at the RTO;
22231                  * clear all the WAS flags.
22232                  */
22233                 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
22234         }
22235         if (slot) {
22236                 /* set the rack tcb into the slot N */
22237                 if ((error == 0) &&
22238                     rack_use_rfo &&
22239                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
22240                     (rsm == NULL) &&
22241                     (tp->snd_nxt == tp->snd_max) &&
22242                     (ipoptlen == 0) &&
22243                     (tp->rcv_numsacks == 0) &&
22244                     rack->r_fsb_inited &&
22245                     TCPS_HAVEESTABLISHED(tp->t_state) &&
22246                     ((IN_RECOVERY(tp->t_flags)) == 0) &&
22247                     (rack->r_must_retran == 0) &&
22248                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
22249                     (len > 0) && (orig_len > 0) &&
22250                     (orig_len > len) &&
22251                     ((orig_len - len) >= segsiz) &&
22252                     ((optlen == 0) ||
22253                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22254                         /* We can send at least one more MSS using our fsb */
22255                         rack_setup_fast_output(tp, rack, sb, len, orig_len,
22256                                                segsiz, pace_max_seg, hw_tls, flags);
22257                 } else
22258                         rack->r_fast_output = 0;
22259                 rack_log_fsb(rack, tp, so, flags,
22260                              ipoptlen, orig_len, len, error,
22261                              (rsm == NULL), optlen, __LINE__, 2);
22262         } else if (sendalot) {
22263                 int ret;
22264
22265                 sack_rxmit = 0;
22266                 if ((error == 0) &&
22267                     rack_use_rfo &&
22268                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
22269                     (rsm == NULL) &&
22270                     (ipoptlen == 0) &&
22271                     (tp->rcv_numsacks == 0) &&
22272                     (tp->snd_nxt == tp->snd_max) &&
22273                     (rack->r_must_retran == 0) &&
22274                     rack->r_fsb_inited &&
22275                     TCPS_HAVEESTABLISHED(tp->t_state) &&
22276                     ((IN_RECOVERY(tp->t_flags)) == 0) &&
22277                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
22278                     (len > 0) && (orig_len > 0) &&
22279                     (orig_len > len) &&
22280                     ((orig_len - len) >= segsiz) &&
22281                     ((optlen == 0) ||
22282                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22283                         /* we can use fast_output for more */
22284                         rack_setup_fast_output(tp, rack, sb, len, orig_len,
22285                                                segsiz, pace_max_seg, hw_tls, flags);
22286                         if (rack->r_fast_output) {
22287                                 error = 0;
22288                                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
22289                                 if (ret >= 0)
22290                                         return (ret);
22291                                 else if (error)
22292                                         goto nomore;
22293
22294                         }
22295                 }
22296                 goto again;
22297         }
22298         /* Assure when we leave that snd_nxt will point to top */
22299 skip_all_send:
22300         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
22301                 tp->snd_nxt = tp->snd_max;
22302         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
22303 #ifdef TCP_ACCOUNTING
22304         crtsc = get_cyclecount() - ts_val;
22305         if (tot_len_this_send) {
22306                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22307                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
22308                 }
22309                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22310                         tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
22311                 }
22312                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22313                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
22314                 }
22315         } else {
22316                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22317                         tp->tcp_cnt_counters[SND_OUT_ACK]++;
22318                 }
22319                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22320                         tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
22321                 }
22322         }
22323         sched_unpin();
22324 #endif
22325         if (error == ENOBUFS)
22326                 error = 0;
22327         return (error);
22328 }
22329
22330 static void
22331 rack_update_seg(struct tcp_rack *rack)
22332 {
22333         uint32_t orig_val;
22334
22335         orig_val = rack->r_ctl.rc_pace_max_segs;
22336         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
22337         if (orig_val != rack->r_ctl.rc_pace_max_segs)
22338                 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
22339 }
22340
22341 static void
22342 rack_mtu_change(struct tcpcb *tp)
22343 {
22344         /*
22345          * The MSS may have changed
22346          */
22347         struct tcp_rack *rack;
22348         struct rack_sendmap *rsm;
22349
22350         rack = (struct tcp_rack *)tp->t_fb_ptr;
22351         if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
22352                 /*
22353                  * The MTU has changed; we need to resend everything
22354                  * since all we have sent is lost. We first fix
22355                  * up the mtu though.
22356                  */
22357                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
22358                 /* We treat this like a full retransmit timeout without the cwnd adjustment */
22359                 rack_remxt_tmr(tp);
22360                 rack->r_fast_output = 0;
22361                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
22362                                                 rack->r_ctl.rc_sacked);
22363                 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
22364                 rack->r_must_retran = 1;
22365                 /* Mark all inflight to needing to be rxt'd */
22366                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
22367                         rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG);
22368                 }
22369         }
22370         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
22371         /* We don't use snd_nxt to retransmit */
22372         tp->snd_nxt = tp->snd_max;
22373 }
22374
22375 static int
22376 rack_set_dgp(struct tcp_rack *rack)
22377 {
22378         /* pace_always=1 */
22379         if (rack->rc_always_pace == 0) {
22380                 if (tcp_can_enable_pacing() == 0)
22381                         return (EBUSY);
22382         }
22383         rack->dgp_on = 1;
22384         rack->rc_always_pace = 1;
22385         rack->use_fixed_rate = 0;
22386         if (rack->gp_ready)
22387                 rack_set_cc_pacing(rack);
22388         rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
22389         rack->rack_attempt_hdwr_pace = 0;
22390         /* rxt settings */
22391         rack->full_size_rxt = 1;
22392         rack->shape_rxt_to_pacing_min  = 0;
22393         /* cmpack=1 */
22394         rack->r_use_cmp_ack = 1;
22395         if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
22396             rack->r_use_cmp_ack)
22397                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
22398         /* scwnd=1 */
22399         rack->rack_enable_scwnd = 1;
22400         /* dynamic=100 */
22401         rack->rc_gp_dyn_mul = 1;
22402         /* gp_inc_ca */
22403         rack->r_ctl.rack_per_of_gp_ca = 100;
22404         /* rrr_conf=3 */
22405         rack->r_rr_config = 3;
22406         /* npush=2 */
22407         rack->r_ctl.rc_no_push_at_mrtt = 2;
22408         /* fillcw=1 */
22409         if (rack->r_cwnd_was_clamped == 0) {
22410                 rack->rc_pace_to_cwnd = 1;
22411         } else {
22412                 rack->rc_pace_to_cwnd = 0;
22413                 /* Reset all multipliers to 100.0 so only the measured bw is used */
22414                 rack->r_ctl.rack_per_of_gp_ss = 100;
22415                 rack->r_ctl.rack_per_of_gp_ca = 100;
22416         }
22417         rack->rc_pace_fill_if_rttin_range = 0;
22418         rack->rtt_limit_mul = 0;
22419         /* noprr=1 */
22420         rack->rack_no_prr = 1;
22421         /* lscwnd=1 */
22422         rack->r_limit_scw = 1;
22423         /* gp_inc_rec */
22424         rack->r_ctl.rack_per_of_gp_rec = 90;
22425         rack_client_buffer_level_set(rack);
22426         return (0);
22427 }
22428
22429
22430
22431 static int
22432 rack_set_profile(struct tcp_rack *rack, int prof)
22433 {
22434         int err = EINVAL;
22435         if (prof == 1) {
22436                 /*
22437                  * Profile 1 is "standard" DGP. It ignores
22438                  * client buffer level.
22439                  */
22440                 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0;
22441                 err = rack_set_dgp(rack);
22442                 if (err)
22443                         return (err);
22444         } else if (prof == 2) {
22445                 /*
22446                  * Profile 2 is DGP. Less aggressive with
22447                  * respect to client buffer level.
22448                  */
22449                 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1;
22450                 err = rack_set_dgp(rack);
22451                 if (err)
22452                         return (err);
22453         } else if (prof == 3) {
22454                 /*
22455                  * Profile 3 is DGP. Even less aggressive with
22456                  * respect to client buffer level.
22457                  */
22458                 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2;
22459                 err = rack_set_dgp(rack);
22460                 if (err)
22461                         return (err);
22462         } else if (prof == 4) {
22463                 /*
22464                  * Profile 4 is DGP with the most responsiveness
22465                  * to client buffer level.
22466                  */
22467                 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3;
22468                 err = rack_set_dgp(rack);
22469                 if (err)
22470                         return (err);
22471         } else if (prof == 0) {
22472                 /* This changes things back to the default settings */
22473                 rack->dgp_on = 0;
22474                 rack->rc_hybrid_mode = 0;
22475                 err = 0;
22476                 if (rack_fill_cw_state)
22477                         rack->rc_pace_to_cwnd = 1;
22478                 else
22479                         rack->rc_pace_to_cwnd = 0;
22480                 if (rack->rc_always_pace) {
22481                         tcp_decrement_paced_conn();
22482                         rack_undo_cc_pacing(rack);
22483                         rack->rc_always_pace = 0;
22484                 }
22485                 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
22486                         rack->rc_always_pace = 1;
22487                         if ((rack->gp_ready) && (rack->use_fixed_rate == 0))
22488                                 rack_set_cc_pacing(rack);
22489                 } else
22490                         rack->rc_always_pace = 0;
22491                 if (rack_dsack_std_based & 0x1) {
22492                         /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
22493                         rack->rc_rack_tmr_std_based = 1;
22494                 }
22495                 if (rack_dsack_std_based & 0x2) {
22496                         /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
22497                         rack->rc_rack_use_dsack = 1;
22498                 }
22499                 if (rack_use_cmp_acks)
22500                         rack->r_use_cmp_ack = 1;
22501                 else
22502                         rack->r_use_cmp_ack = 0;
22503                 if (rack_disable_prr)
22504                         rack->rack_no_prr = 1;
22505                 else
22506                         rack->rack_no_prr = 0;
22507                 if (rack_gp_no_rec_chg)
22508                         rack->rc_gp_no_rec_chg = 1;
22509                 else
22510                         rack->rc_gp_no_rec_chg = 0;
22511                 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
22512                         rack->r_mbuf_queue = 1;
22513                         if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
22514                                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
22515                         rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
22516                 } else {
22517                         rack->r_mbuf_queue = 0;
22518                         rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
22519                 }
22520                 if (rack_enable_shared_cwnd)
22521                         rack->rack_enable_scwnd = 1;
22522                 else
22523                         rack->rack_enable_scwnd = 0;
22524                 if (rack_do_dyn_mul) {
22525                         /* When dynamic adjustment is on, CA needs to start at 100% */
22526                         rack->rc_gp_dyn_mul = 1;
22527                         if (rack_do_dyn_mul >= 100)
22528                                 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
22529                 } else {
22530                         rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
22531                         rack->rc_gp_dyn_mul = 0;
22532                 }
22533                 rack->r_rr_config = 0;
22534                 rack->r_ctl.rc_no_push_at_mrtt = 0;
22535                 rack->rc_pace_to_cwnd = 0;
22536                 rack->rc_pace_fill_if_rttin_range = 0;
22537                 rack->rtt_limit_mul = 0;
22538
22539                 if (rack_enable_hw_pacing)
22540                         rack->rack_hdw_pace_ena = 1;
22541                 else
22542                         rack->rack_hdw_pace_ena = 0;
22543                 if (rack_disable_prr)
22544                         rack->rack_no_prr = 1;
22545                 else
22546                         rack->rack_no_prr = 0;
22547                 if (rack_limits_scwnd)
22548                         rack->r_limit_scw  = 1;
22549                 else
22550                         rack->r_limit_scw  = 0;
22551                 rack_init_retransmit_value(rack, rack_rxt_controls);
22552                 err = 0;
22553         }
22554         return (err);
22555 }
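
/*
 * Illustrative sketch, not part of the stack: an application whose
 * connection is already using the rack stack can select one of the
 * profiles handled above from user space with the TCP_RACK_PROFILE
 * socket option, roughly:
 *
 *      int prof = 1;   -- "standard" DGP, ignores client buffer level
 *
 *      if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
 *          &prof, sizeof(prof)) == -1)
 *              warn("TCP_RACK_PROFILE");
 *
 * Profile 0 restores the stack defaults; profiles 2-4 are DGP with
 * progressively more response to the client buffer level.
 */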
22556
22557 static int
22558 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
22559 {
22560         struct deferred_opt_list *dol;
22561
22562         dol = malloc(sizeof(struct deferred_opt_list),
22563                      M_TCPFSB, M_NOWAIT|M_ZERO);
22564         if (dol == NULL) {
22565                 /*
22566                  * No space, yikes -- fail out.
22567                  */
22568                 return (0);
22569         }
22570         dol->optname = sopt_name;
22571         dol->optval = loptval;
22572         TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
22573         return (1);
22574 }
22575
22576 static int
22577 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
22578 {
22579 #ifdef TCP_REQUEST_TRK
22580         struct http_sendfile_track *sft;
22581         struct timeval tv;
22582         tcp_seq seq;
22583         int err;
22584
22585         microuptime(&tv);
22586
22587         /*
22588          * If BB logging is not on we need to look at the DTL flag.
22589          * If it's on already, then those reasons override the DTL input.
22590          * We do this with any request: you can turn DTL on, but it does
22591          * not turn off, at least not from hybrid pacing requests.
22592          */
22593         if (tcp_bblogging_on(rack->rc_tp) == 0) {
22594                 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) {
22595                         /* Turn on BB point logging  */
22596                         tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS,
22597                                             TCP_BBPOINT_REQ_LEVEL_LOGGING);
22598                 }
22599         }
22600         /* Make sure no fixed rate is on */
22601         rack->use_fixed_rate = 0;
22602         rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
22603         rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
22604         rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
22605         /* Now allocate or find our entry that will have these settings */
22606         sft = tcp_http_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0);
22607         if (sft == NULL) {
22608                 rack->rc_tp->tcp_hybrid_error++;
22609                 /* no space, where would it have gone? */
22610                 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc;
22611                 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
22612                 return (ENOSPC);
22613         }
22614         /* The seq will be snd_una + everything in the buffer */
22615         seq = sft->start_seq;
22616         if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
22617                 /* Disabling hybrid pacing */
22618                 if (rack->rc_hybrid_mode) {
22619                         rack_set_profile(rack, 0);
22620                         rack->rc_tp->tcp_hybrid_stop++;
22621                 }
22622                 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0);
22623                 return (0);
22624         }
22625         if (rack->dgp_on == 0) {
22626                 /*
22627                  * If we have not yet turned DGP on, do so
22628                  * now setting pure DGP mode, no buffer level
22629                  * response.
22630                  */
22631                 if ((err = rack_set_profile(rack, 1)) != 0){
22632                         /* Failed to turn pacing on */
22633                         rack->rc_tp->tcp_hybrid_error++;
22634                         rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0);
22635                         return (err);
22636                 }
22637         }
22638         /* Now set in our flags */
22639         sft->hybrid_flags = hybrid->hybrid_flags;
22640         if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
22641                 sft->cspr = hybrid->cspr;
22642         else
22643                 sft->cspr = 0;
22644         if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS)
22645                 sft->hint_maxseg = hybrid->hint_maxseg;
22646         else
22647                 sft->hint_maxseg = 0;
22648         rack->rc_hybrid_mode = 1;
22649         rack->rc_tp->tcp_hybrid_start++;
22650         rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
22651         return (0);
22652 #else
22653         return (ENOTSUP);
22654 #endif
22655 }
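
/*
 * Illustrative sketch (user-space side; variable names here are
 * examples only): with TCP_REQUEST_TRK compiled in, a sender could
 * ask for hybrid pacing of the data it is about to queue roughly as
 * below.  The layout of the embedded request (hybrid.req) is defined
 * in netinet/tcp.h and is not spelled out here.
 *
 *      struct tcp_hybrid_req hybrid;
 *
 *      memset(&hybrid, 0, sizeof(hybrid));
 *      hybrid.hybrid_flags = TCP_HYBRID_PACING_ENABLE |
 *          TCP_HYBRID_PACING_CSPR;
 *      hybrid.cspr = client_rate;      -- client supplied pacing rate
 *      -- fill in hybrid.req describing the request/byte range
 *      setsockopt(fd, IPPROTO_TCP, TCP_HYBRID_PACING,
 *          &hybrid, sizeof(hybrid));
 *
 * Sending the same request with TCP_HYBRID_PACING_ENABLE clear turns
 * hybrid pacing off again, falling back to profile 0 as done above.
 */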
22656
22657 static int
22658 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
22659                     uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
22660
22661 {
22662         struct epoch_tracker et;
22663         struct sockopt sopt;
22664         struct cc_newreno_opts opt;
22665         struct inpcb *inp = tptoinpcb(tp);
22666         uint64_t val;
22667         int error = 0;
22668         uint16_t ca, ss;
22669
22670         switch (sopt_name) {
22671         case TCP_RACK_SET_RXT_OPTIONS:
22672                 if ((optval >= 0) && (optval <= 2)) {
22673                         rack_init_retransmit_value(rack, optval);
22674                 } else {
22675                         /*
22676                          * You must send in 0, 1 or 2; all else is
22677                          * invalid.
22678                          */
22679                         error = EINVAL;
22680                 }
22681                 break;
22682         case TCP_RACK_DSACK_OPT:
22683                 RACK_OPTS_INC(tcp_rack_dsack_opt);
22684                 if (optval & 0x1) {
22685                         rack->rc_rack_tmr_std_based = 1;
22686                 } else {
22687                         rack->rc_rack_tmr_std_based = 0;
22688                 }
22689                 if (optval & 0x2) {
22690                         rack->rc_rack_use_dsack = 1;
22691                 } else {
22692                         rack->rc_rack_use_dsack = 0;
22693                 }
22694                 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
22695                 break;
22696         case TCP_RACK_PACING_DIVISOR:
22697                 RACK_OPTS_INC(tcp_rack_pacing_divisor);
22698                 if (optval == 0) {
22699                         rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
22700                 } else {
22701                         if (optval < RL_MIN_DIVISOR)
22702                                 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR;
22703                         else
22704                                 rack->r_ctl.pace_len_divisor = optval;
22705                 }
22706                 break;
22707         case TCP_RACK_HI_BETA:
22708                 RACK_OPTS_INC(tcp_rack_hi_beta);
22709                 if (optval)
22710                         rack->rack_hibeta = 1;
22711                 else
22712                         rack->rack_hibeta = 0;
22713                 break;
22714         case TCP_RACK_PACING_BETA:
22715                 RACK_OPTS_INC(tcp_rack_beta);
22716                 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
22717                         /* This only works for newreno. */
22718                         error = EINVAL;
22719                         break;
22720                 }
22721                 if (rack->rc_pacing_cc_set) {
22722                         /*
22723                          * Set them into the real CC module;
22724                          * what's in the rack pcb is the old values
22725                          * to be used on restoral.
22726                          */
22727                         sopt.sopt_dir = SOPT_SET;
22728                         opt.name = CC_NEWRENO_BETA;
22729                         opt.val = optval;
22730                         if (CC_ALGO(tp)->ctl_output != NULL)
22731                                 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
22732                         else {
22733                                 error = ENOENT;
22734                                 break;
22735                         }
22736                 } else {
22737                         /*
22738                          * Not pacing yet so set it into our local
22739                          * rack pcb storage.
22740                          */
22741                         rack->r_ctl.rc_saved_beta.beta = optval;
22742                 }
22743                 break;
22744         case TCP_RACK_TIMER_SLOP:
22745                 RACK_OPTS_INC(tcp_rack_timer_slop);
22746                 rack->r_ctl.timer_slop = optval;
22747                 if (rack->rc_tp->t_srtt) {
22748                         /*
22749                          * If we have an SRTT lets update t_rxtcur
22750                          * to have the new slop.
22751                          */
22752                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
22753                                            rack_rto_min, rack_rto_max,
22754                                            rack->r_ctl.timer_slop);
22755                 }
22756                 break;
22757         case TCP_RACK_PACING_BETA_ECN:
22758                 RACK_OPTS_INC(tcp_rack_beta_ecn);
22759                 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
22760                         /* This only works for newreno. */
22761                         error = EINVAL;
22762                         break;
22763                 }
22764                 if (rack->rc_pacing_cc_set) {
22765                         /*
22766                          * Set them into the real CC module;
22767                          * what's in the rack pcb is the old values
22768                          * to be used on restoral.
22769                          */
22770                         sopt.sopt_dir = SOPT_SET;
22771                         opt.name = CC_NEWRENO_BETA_ECN;
22772                         opt.val = optval;
22773                         if (CC_ALGO(tp)->ctl_output != NULL)
22774                                 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
22775                         else
22776                                 error = ENOENT;
22777                 } else {
22778                         /*
22779                          * Not pacing yet so set it into our local
22780                          * rack pcb storage.
22781                          */
22782                         rack->r_ctl.rc_saved_beta.beta_ecn = optval;
22783                         rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED;
22784                 }
22785                 break;
22786         case TCP_DEFER_OPTIONS:
22787                 RACK_OPTS_INC(tcp_defer_opt);
22788                 if (optval) {
22789                         if (rack->gp_ready) {
22790                                 /* Too late */
22791                                 error = EINVAL;
22792                                 break;
22793                         }
22794                         rack->defer_options = 1;
22795                 } else
22796                         rack->defer_options = 0;
22797                 break;
22798         case TCP_RACK_MEASURE_CNT:
22799                 RACK_OPTS_INC(tcp_rack_measure_cnt);
22800                 if (optval && (optval <= 0xff)) {
22801                         rack->r_ctl.req_measurements = optval;
22802                 } else
22803                         error = EINVAL;
22804                 break;
22805         case TCP_REC_ABC_VAL:
22806                 RACK_OPTS_INC(tcp_rec_abc_val);
22807                 if (optval > 0)
22808                         rack->r_use_labc_for_rec = 1;
22809                 else
22810                         rack->r_use_labc_for_rec = 0;
22811                 break;
22812         case TCP_RACK_ABC_VAL:
22813                 RACK_OPTS_INC(tcp_rack_abc_val);
22814                 if ((optval > 0) && (optval < 255))
22815                         rack->rc_labc = optval;
22816                 else
22817                         error = EINVAL;
22818                 break;
22819         case TCP_HDWR_UP_ONLY:
22820                 RACK_OPTS_INC(tcp_pacing_up_only);
22821                 if (optval)
22822                         rack->r_up_only = 1;
22823                 else
22824                         rack->r_up_only = 0;
22825                 break;
22826         case TCP_PACING_RATE_CAP:
22827                 RACK_OPTS_INC(tcp_pacing_rate_cap);
22828                 rack->r_ctl.bw_rate_cap = loptval;
22829                 break;
22830         case TCP_HYBRID_PACING:
22831                 if (hybrid == NULL) {
22832                         error = EINVAL;
22833                         break;
22834                 }
22835                 error = process_hybrid_pacing(rack, hybrid);
22836                 break;
22837         case TCP_RACK_PROFILE:
22838                 RACK_OPTS_INC(tcp_profile);
22839                 error = rack_set_profile(rack, optval);
22840                 break;
22841         case TCP_USE_CMP_ACKS:
22842                 RACK_OPTS_INC(tcp_use_cmp_acks);
22843                 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
22844                         /* You can't turn it off once it's on! */
22845                         error = EINVAL;
22846                 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
22847                         rack->r_use_cmp_ack = 1;
22848                         rack->r_mbuf_queue = 1;
22849                         inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
22850                 }
22851                 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
22852                         inp->inp_flags2 |= INP_MBUF_ACKCMP;
22853                 break;
22854         case TCP_SHARED_CWND_TIME_LIMIT:
22855                 RACK_OPTS_INC(tcp_lscwnd);
22856                 if (optval)
22857                         rack->r_limit_scw = 1;
22858                 else
22859                         rack->r_limit_scw = 0;
22860                 break;
22861         case TCP_RACK_DGP_IN_REC:
22862                 RACK_OPTS_INC(tcp_dgp_in_rec);
22863                 if (optval)
22864                         rack->r_ctl.full_dgp_in_rec = 1;
22865                 else
22866                         rack->r_ctl.full_dgp_in_rec = 0;
22867                 break;
22868         case TCP_RXT_CLAMP:
22869                 RACK_OPTS_INC(tcp_rxt_clamp);
22870                 rack_translate_clamp_value(rack, optval);
22871                 break;
22872         case TCP_RACK_PACE_TO_FILL:
22873                 RACK_OPTS_INC(tcp_fillcw);
22874                 if (optval == 0)
22875                         rack->rc_pace_to_cwnd = 0;
22876                 else {
22877                         rack->rc_pace_to_cwnd = 1;
22878                         if (optval > 1)
22879                                 rack->r_fill_less_agg = 1;
22880                 }
22881                 if ((optval >= rack_gp_rtt_maxmul) &&
22882                     rack_gp_rtt_maxmul &&
22883                     (optval < 0xf)) {
22884                         rack->rc_pace_fill_if_rttin_range = 1;
22885                         rack->rtt_limit_mul = optval;
22886                 } else {
22887                         rack->rc_pace_fill_if_rttin_range = 0;
22888                         rack->rtt_limit_mul = 0;
22889                 }
22890                 break;
22891         case TCP_RACK_NO_PUSH_AT_MAX:
22892                 RACK_OPTS_INC(tcp_npush);
22893                 if (optval == 0)
22894                         rack->r_ctl.rc_no_push_at_mrtt = 0;
22895                 else if (optval < 0xff)
22896                         rack->r_ctl.rc_no_push_at_mrtt = optval;
22897                 else
22898                         error = EINVAL;
22899                 break;
22900         case TCP_SHARED_CWND_ENABLE:
22901                 RACK_OPTS_INC(tcp_rack_scwnd);
22902                 if (optval == 0)
22903                         rack->rack_enable_scwnd = 0;
22904                 else
22905                         rack->rack_enable_scwnd = 1;
22906                 break;
22907         case TCP_RACK_MBUF_QUEUE:
22908                 /* Now, do we use the LRO mbuf-queue feature? */
22909                 RACK_OPTS_INC(tcp_rack_mbufq);
22910                 if (optval || rack->r_use_cmp_ack)
22911                         rack->r_mbuf_queue = 1;
22912                 else
22913                         rack->r_mbuf_queue = 0;
22914                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
22915                         inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
22916                 else
22917                         inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
22918                 break;
22919         case TCP_RACK_NONRXT_CFG_RATE:
22920                 RACK_OPTS_INC(tcp_rack_cfg_rate);
22921                 if (optval == 0)
22922                         rack->rack_rec_nonrxt_use_cr = 0;
22923                 else
22924                         rack->rack_rec_nonrxt_use_cr = 1;
22925                 break;
22926         case TCP_NO_PRR:
22927                 RACK_OPTS_INC(tcp_rack_noprr);
22928                 if (optval == 0)
22929                         rack->rack_no_prr = 0;
22930                 else if (optval == 1)
22931                         rack->rack_no_prr = 1;
22932                 else if (optval == 2)
22933                         rack->no_prr_addback = 1;
22934                 else
22935                         error = EINVAL;
22936                 break;
22937         case TCP_TIMELY_DYN_ADJ:
22938                 RACK_OPTS_INC(tcp_timely_dyn);
22939                 if (optval == 0)
22940                         rack->rc_gp_dyn_mul = 0;
22941                 else {
22942                         rack->rc_gp_dyn_mul = 1;
22943                         if (optval >= 100) {
22944                                 /*
22945                                  * If the user sets something 100 or more,
22946                                  * it's the gp_ca value.
22947                                  */
22948                                 rack->r_ctl.rack_per_of_gp_ca  = optval;
22949                         }
22950                 }
22951                 break;
22952         case TCP_RACK_DO_DETECTION:
22953                 RACK_OPTS_INC(tcp_rack_do_detection);
22954                 if (optval == 0)
22955                         rack->do_detection = 0;
22956                 else
22957                         rack->do_detection = 1;
22958                 break;
22959         case TCP_RACK_TLP_USE:
22960                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
22961                         error = EINVAL;
22962                         break;
22963                 }
22964                 RACK_OPTS_INC(tcp_tlp_use);
22965                 rack->rack_tlp_threshold_use = optval;
22966                 break;
22967         case TCP_RACK_TLP_REDUCE:
22968                 /* RACK TLP cwnd reduction (bool) */
22969                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
22970                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
22971                 break;
22972                 /*  Pacing related ones */
22973         case TCP_RACK_PACE_ALWAYS:
22974                 /*
22975                  * zero is old rack method, 1 is new
22976                  * method using a pacing rate.
22977                  */
22978                 RACK_OPTS_INC(tcp_rack_pace_always);
22979                 if (optval > 0) {
22980                         if (rack->rc_always_pace) {
22981                                 error = EALREADY;
22982                                 break;
22983                         } else if (tcp_can_enable_pacing()) {
22984                                 rack->rc_always_pace = 1;
22985                                 if ((rack->gp_ready) && (rack->use_fixed_rate == 0))
22986                                         rack_set_cc_pacing(rack);
22987                         }
22988                         else {
22989                                 error = ENOSPC;
22990                                 break;
22991                         }
22992                 } else {
22993                         if (rack->rc_always_pace) {
22994                                 tcp_decrement_paced_conn();
22995                                 rack->rc_always_pace = 0;
22996                                 rack_undo_cc_pacing(rack);
22997                         }
22998                 }
22999                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
23000                         inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
23001                 else
23002                         inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
23003                 /* A rate may be set (irate or other); if so, set the seg size */
23004                 rack_update_seg(rack);
23005                 break;
23006         case TCP_BBR_RACK_INIT_RATE:
23007                 RACK_OPTS_INC(tcp_initial_rate);
23008                 val = optval;
23009                 /* Change from kbits per second to bytes per second */
23010                 val *= 1000;
23011                 val /= 8;
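                /*
                 * Worked example: optval = 10000 (10,000 kbit/s, i.e.
                 * 10 Mbit/s) becomes 10000 * 1000 / 8 = 1,250,000
                 * bytes per second in init_rate.
                 */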
23012                 rack->r_ctl.init_rate = val;
23013                 if (rack->rc_init_win != rack_default_init_window) {
23014                         uint32_t win, snt;
23015
23016                         /*
23017                          * Options don't always get applied
23018                          * in the order you think. So in order
23019                          * to ensure we update a cwnd we need
23020                          * to check and see if we are still
23021                          * where we should raise the cwnd.
23022                          */
23023                         win = rc_init_window(rack);
23024                         if (SEQ_GT(tp->snd_max, tp->iss))
23025                                 snt = tp->snd_max - tp->iss;
23026                         else
23027                                 snt = 0;
23028                         if ((snt < win) &&
23029                             (tp->snd_cwnd < win))
23030                                 tp->snd_cwnd = win;
23031                 }
23032                 if (rack->rc_always_pace)
23033                         rack_update_seg(rack);
23034                 break;
23035         case TCP_BBR_IWINTSO:
23036                 RACK_OPTS_INC(tcp_initial_win);
23037                 if (optval && (optval <= 0xff)) {
23038                         uint32_t win, snt;
23039
23040                         rack->rc_init_win = optval;
23041                         win = rc_init_window(rack);
23042                         if (SEQ_GT(tp->snd_max, tp->iss))
23043                                 snt = tp->snd_max - tp->iss;
23044                         else
23045                                 snt = 0;
23046                         if ((snt < win) &&
23047                             (tp->t_srtt |
23048                              rack->r_ctl.init_rate)) {
23049                                 /*
23050                                  * We are not past the initial window
23051                                  * and we have some bases for pacing,
23052                                  * so we need to possibly adjust up
23053                                  * the cwnd. Note even if we don't set
23054                                  * the cwnd, it's still ok to raise the rc_init_win
23055                                  * which can be used coming out of idle when we
23056                                  * would have a rate.
23057                                  */
23058                                 if (tp->snd_cwnd < win)
23059                                         tp->snd_cwnd = win;
23060                         }
23061                         if (rack->rc_always_pace)
23062                                 rack_update_seg(rack);
23063                 } else
23064                         error = EINVAL;
23065                 break;
23066         case TCP_RACK_FORCE_MSEG:
23067                 RACK_OPTS_INC(tcp_rack_force_max_seg);
23068                 if (optval)
23069                         rack->rc_force_max_seg = 1;
23070                 else
23071                         rack->rc_force_max_seg = 0;
23072                 break;
23073         case TCP_RACK_PACE_MIN_SEG:
23074                 RACK_OPTS_INC(tcp_rack_min_seg);
23075                 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval);
23076                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23077                 break;
23078         case TCP_RACK_PACE_MAX_SEG:
23079                 /* Max size of the segments in a pace, in bytes */
23080                 RACK_OPTS_INC(tcp_rack_max_seg);
23081                 rack->rc_user_set_max_segs = optval;
23082                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23083                 break;
23084         case TCP_RACK_PACE_RATE_REC:
23085                 /* Set the fixed pacing rate in Bytes per second for recovery (rec) */
23086                 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
23087                 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23088                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23089                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23090                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23091                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23092                 rack->use_fixed_rate = 1;
23093                 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
23094                         rack_set_cc_pacing(rack);
23095                 rack_log_pacing_delay_calc(rack,
23096                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
23097                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
23098                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23099                                            __LINE__, NULL,0);
23100                 break;
23101
23102         case TCP_RACK_PACE_RATE_SS:
23103                 /* Set the fixed pacing rate in Bytes per second for slow start (ss) */
23104                 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
23105                 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23106                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23107                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23108                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23109                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23110                 rack->use_fixed_rate = 1;
23111                 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
23112                         rack_set_cc_pacing(rack);
23113                 rack_log_pacing_delay_calc(rack,
23114                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
23115                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
23116                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23117                                            __LINE__, NULL, 0);
23118                 break;
23119
23120         case TCP_RACK_PACE_RATE_CA:
23121                 /* Set the fixed pacing rate in Bytes per second for congestion avoidance (ca) */
23122                 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
23123                 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23124                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23125                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23126                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23127                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23128                 rack->use_fixed_rate = 1;
23129                 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta)
23130                         rack_set_cc_pacing(rack);
23131                 rack_log_pacing_delay_calc(rack,
23132                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
23133                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
23134                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23135                                            __LINE__, NULL, 0);
23136                 break;
23137         case TCP_RACK_GP_INCREASE_REC:
23138                 RACK_OPTS_INC(tcp_gp_inc_rec);
23139                 rack->r_ctl.rack_per_of_gp_rec = optval;
23140                 rack_log_pacing_delay_calc(rack,
23141                                            rack->r_ctl.rack_per_of_gp_ss,
23142                                            rack->r_ctl.rack_per_of_gp_ca,
23143                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23144                                            __LINE__, NULL, 0);
23145                 break;
23146         case TCP_RACK_GP_INCREASE_CA:
23147                 RACK_OPTS_INC(tcp_gp_inc_ca);
23148                 ca = optval;
23149                 if (ca < 100) {
23150                         /*
23151                          * We don't allow any reduction
23152                          * over the GP b/w.
23153                          */
23154                         error = EINVAL;
23155                         break;
23156                 }
23157                 rack->r_ctl.rack_per_of_gp_ca = ca;
23158                 rack_log_pacing_delay_calc(rack,
23159                                            rack->r_ctl.rack_per_of_gp_ss,
23160                                            rack->r_ctl.rack_per_of_gp_ca,
23161                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23162                                            __LINE__, NULL, 0);
23163                 break;
23164         case TCP_RACK_GP_INCREASE_SS:
23165                 RACK_OPTS_INC(tcp_gp_inc_ss);
23166                 ss = optval;
23167                 if (ss < 100) {
23168                         /*
23169                          * We don't allow any reduction
23170                          * over the GP b/w.
23171                          */
23172                         error = EINVAL;
23173                         break;
23174                 }
23175                 rack->r_ctl.rack_per_of_gp_ss = ss;
23176                 rack_log_pacing_delay_calc(rack,
23177                                            rack->r_ctl.rack_per_of_gp_ss,
23178                                            rack->r_ctl.rack_per_of_gp_ca,
23179                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23180                                            __LINE__, NULL, 0);
23181                 break;
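                /*
                 * The GP increase knobs above are percentages of the measured
                 * goodput bandwidth (values below 100 would be a reduction and
                 * are rejected); e.g. setting TCP_RACK_GP_INCREASE_SS to 120
                 * paces slow start at 120% of the estimate.
                 */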
23182         case TCP_RACK_RR_CONF:
23183                 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
23184                 if (optval && optval <= 3)
23185                         rack->r_rr_config = optval;
23186                 else
23187                         rack->r_rr_config = 0;
23188                 break;
23189         case TCP_PACING_DND:                    /*  URL:dnd */
23190                 if (optval > 0)
23191                         rack->rc_pace_dnd = 1;
23192                 else
23193                         rack->rc_pace_dnd = 0;
23194                 break;
23195         case TCP_HDWR_RATE_CAP:
23196                 RACK_OPTS_INC(tcp_hdwr_rate_cap);
23197                 if (optval) {
23198                         if (rack->r_rack_hw_rate_caps == 0)
23199                                 rack->r_rack_hw_rate_caps = 1;
23200                         else
23201                                 error = EALREADY;
23202                 } else {
23203                         rack->r_rack_hw_rate_caps = 0;
23204                 }
23205                 break;
23206         case TCP_RACK_SPLIT_LIMIT:
23207                 RACK_OPTS_INC(tcp_split_limit);
23208                 rack->r_ctl.rc_split_limit = optval;
23209                 break;
23210         case TCP_BBR_HDWR_PACE:
23211                 RACK_OPTS_INC(tcp_hdwr_pacing);
23212                 if (optval){
23213                         if (rack->rack_hdrw_pacing == 0) {
23214                                 rack->rack_hdw_pace_ena = 1;
23215                                 rack->rack_attempt_hdwr_pace = 0;
23216                         } else
23217                                 error = EALREADY;
23218                 } else {
23219                         rack->rack_hdw_pace_ena = 0;
23220 #ifdef RATELIMIT
23221                         if (rack->r_ctl.crte != NULL) {
23222                                 rack->rack_hdrw_pacing = 0;
23223                                 rack->rack_attempt_hdwr_pace = 0;
23224                                 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
23225                                 rack->r_ctl.crte = NULL;
23226                         }
23227 #endif
23228                 }
23229                 break;
23230                 /*  End Pacing related ones */
23231         case TCP_RACK_PRR_SENDALOT:
23232                 /* Allow PRR to send more than one seg */
23233                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
23234                 rack->r_ctl.rc_prr_sendalot = optval;
23235                 break;
23236         case TCP_RACK_MIN_TO:
23237                 /* Minimum time between rack t-o's in ms */
23238                 RACK_OPTS_INC(tcp_rack_min_to);
23239                 rack->r_ctl.rc_min_to = optval;
23240                 break;
23241         case TCP_RACK_EARLY_SEG:
23242                 /* Max segments to send in early recovery */
23243                 RACK_OPTS_INC(tcp_rack_early_seg);
23244                 rack->r_ctl.rc_early_recovery_segs = optval;
23245                 break;
23246         case TCP_RACK_ENABLE_HYSTART:
23247         {
23248                 if (optval) {
23249                         tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
23250                         if (rack_do_hystart > RACK_HYSTART_ON)
23251                                 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
23252                         if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
23253                                 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
23254                 } else {
23255                         tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
23256                 }
23257         }
23258         break;
23259         case TCP_RACK_REORD_THRESH:
23260                 /* RACK reorder threshold (shift amount) */
23261                 RACK_OPTS_INC(tcp_rack_reord_thresh);
23262                 if ((optval > 0) && (optval < 31))
23263                         rack->r_ctl.rc_reorder_shift = optval;
23264                 else
23265                         error = EINVAL;
23266                 break;
23267         case TCP_RACK_REORD_FADE:
23268                 /* Time in ms after which detected reordering fades */
23269                 RACK_OPTS_INC(tcp_rack_reord_fade);
23270                 rack->r_ctl.rc_reorder_fade = optval;
23271                 break;
23272         case TCP_RACK_TLP_THRESH:
23273                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
23274                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
23275                 if (optval)
23276                         rack->r_ctl.rc_tlp_threshold = optval;
23277                 else
23278                         error = EINVAL;
23279                 break;
23280         case TCP_BBR_USE_RACK_RR:
23281                 RACK_OPTS_INC(tcp_rack_rr);
23282                 if (optval)
23283                         rack->use_rack_rr = 1;
23284                 else
23285                         rack->use_rack_rr = 0;
23286                 break;
23287         case TCP_RACK_PKT_DELAY:
23288                 /* RACK added ms i.e. rack-rtt + reord + N */
23289                 RACK_OPTS_INC(tcp_rack_pkt_delay);
23290                 rack->r_ctl.rc_pkt_delay = optval;
23291                 break;
23292         case TCP_DELACK:
23293                 RACK_OPTS_INC(tcp_rack_delayed_ack);
23294                 if (optval == 0)
23295                         tp->t_delayed_ack = 0;
23296                 else
23297                         tp->t_delayed_ack = 1;
23298                 if (tp->t_flags & TF_DELACK) {
23299                         tp->t_flags &= ~TF_DELACK;
23300                         tp->t_flags |= TF_ACKNOW;
23301                         NET_EPOCH_ENTER(et);
23302                         rack_output(tp);
23303                         NET_EPOCH_EXIT(et);
23304                 }
23305                 break;
23306
23307         case TCP_BBR_RACK_RTT_USE:
23308                 RACK_OPTS_INC(tcp_rack_rtt_use);
23309                 if ((optval != USE_RTT_HIGH) &&
23310                     (optval != USE_RTT_LOW) &&
23311                     (optval != USE_RTT_AVG))
23312                         error = EINVAL;
23313                 else
23314                         rack->r_ctl.rc_rate_sample_method = optval;
23315                 break;
23316         case TCP_DATA_AFTER_CLOSE:
23317                 RACK_OPTS_INC(tcp_data_after_close);
23318                 if (optval)
23319                         rack->rc_allow_data_af_clo = 1;
23320                 else
23321                         rack->rc_allow_data_af_clo = 0;
23322                 break;
23323         default:
23324                 break;
23325         }
23326         tcp_log_socket_option(tp, sopt_name, optval, error);
23327         return (error);
23328 }
23329
23330
23331 static void
23332 rack_apply_deferred_options(struct tcp_rack *rack)
23333 {
23334         struct deferred_opt_list *dol, *sdol;
23335         uint32_t s_optval;
23336
23337         TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
23338                 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
23339                 /* Disadvantage of deferral is you lose the error return */
23340                 s_optval = (uint32_t)dol->optval;
23341                 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL);
23342                 free(dol, M_TCPDO);
23343         }
23344 }
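
/*
 * Illustrative flow (user space; see rack_set_sockopt() below): once
 * TCP_DEFER_OPTIONS is enabled and while gp_ready is still 0, most
 * rack options are queued on opt_list instead of being applied, and
 * this function later replays them, e.g.:
 *
 *      int one = 1;
 *      uint32_t rate = 125000;         -- bytes per second
 *
 *      setsockopt(fd, IPPROTO_TCP, TCP_DEFER_OPTIONS, &one, sizeof(one));
 *      -- queued for deferral, applied by rack_apply_deferred_options()
 *      setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_RATE_CA, &rate, sizeof(rate));
 *
 * As noted above, the individual error return of a deferred option is
 * lost.
 */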
23345
23346 static void
23347 rack_hw_tls_change(struct tcpcb *tp, int chg)
23348 {
23349         /* Update HW tls state */
23350         struct tcp_rack *rack;
23351
23352         rack = (struct tcp_rack *)tp->t_fb_ptr;
23353         if (chg)
23354                 rack->r_ctl.fsb.hw_tls = 1;
23355         else
23356                 rack->r_ctl.fsb.hw_tls = 0;
23357 }
23358
23359 static int
23360 rack_pru_options(struct tcpcb *tp, int flags)
23361 {
23362         if (flags & PRUS_OOB)
23363                 return (EOPNOTSUPP);
23364         return (0);
23365 }
23366
23367 static bool
23368 rack_wake_check(struct tcpcb *tp)
23369 {
23370         struct tcp_rack *rack;
23371         struct timeval tv;
23372         uint32_t cts;
23373
23374         rack = (struct tcp_rack *)tp->t_fb_ptr;
23375         if (rack->r_ctl.rc_hpts_flags) {
23376                 cts = tcp_get_usecs(&tv);
23377                 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){
23378                         /*
23379                          * Pacing timer is up, check if we are ready.
23380                          */
23381                         if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to))
23382                                 return (true);
23383                 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) {
23384                         /*
23385                          * A timer is up, check if we are ready.
23386                          */
23387                         if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp))
23388                                 return (true);
23389                 }
23390         }
23391         return (false);
23392 }
23393
23394 static struct tcp_function_block __tcp_rack = {
23395         .tfb_tcp_block_name = __XSTRING(STACKNAME),
23396         .tfb_tcp_output = rack_output,
23397         .tfb_do_queued_segments = ctf_do_queued_segments,
23398         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
23399         .tfb_tcp_do_segment = rack_do_segment,
23400         .tfb_tcp_ctloutput = rack_ctloutput,
23401         .tfb_tcp_fb_init = rack_init,
23402         .tfb_tcp_fb_fini = rack_fini,
23403         .tfb_tcp_timer_stop_all = rack_stopall,
23404         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
23405         .tfb_tcp_handoff_ok = rack_handoff_ok,
23406         .tfb_tcp_mtu_chg = rack_mtu_change,
23407         .tfb_pru_options = rack_pru_options,
23408         .tfb_hwtls_change = rack_hw_tls_change,
23409         .tfb_chg_query = rack_chg_query,
23410         .tfb_switch_failed = rack_switch_failed,
23411         .tfb_early_wake_check = rack_wake_check,
23412         .tfb_compute_pipe = rack_compute_pipe,
23413         .tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
23414 };
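
/*
 * For context (the registration itself is outside this excerpt): a
 * connection is switched onto this function block either by making
 * rack the system default ("sysctl net.inet.tcp.functions_default=rack",
 * assuming the module is loaded) or per socket with the generic
 * TCP_FUNCTION_BLK socket option naming the block above.
 */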
23415
23416 /*
23417  * rack_ctloutput() must drop the inpcb lock before performing copyin on
23418  * socket option arguments.  When it re-acquires the lock after the copy, it
23419  * has to revalidate that the connection is still valid for the socket
23420  * option.
23421  */
23422 static int
23423 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt)
23424 {
23425         struct inpcb *inp = tptoinpcb(tp);
23426 #ifdef INET6
23427         struct ip6_hdr *ip6;
23428         int32_t mask, tclass;
23429 #endif
23430 #ifdef INET
23431         struct ip *ip;
23432 #endif
23433         struct tcp_rack *rack;
23434         struct tcp_hybrid_req hybrid;
23435         uint64_t loptval;
23436         int32_t error = 0, optval;
23437
23438         rack = (struct tcp_rack *)tp->t_fb_ptr;
23439         if (rack == NULL) {
23440                 INP_WUNLOCK(inp);
23441                 return (EINVAL);
23442         }
23443 #ifdef INET6
23444         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
23445 #endif
23446 #ifdef INET
23447         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
23448 #endif
23449
23450         switch (sopt->sopt_level) {
23451 #ifdef INET6
23452         case IPPROTO_IPV6:
23453                 MPASS(inp->inp_vflag & INP_IPV6PROTO);
23454                 switch (sopt->sopt_name) {
23455                 case IPV6_USE_MIN_MTU:
23456                         tcp6_use_min_mtu(tp);
23457                         break;
23458                 case IPV6_TCLASS:
23459                         /*
23460                          * The DSCP codepoint has changed, update the fsb
23461                          * by overwriting any previous traffic class.
23462                          */
23463                         if (inp->in6p_outputopts) {
23464                                 mask = 0xfc;
23465                                 tclass = inp->in6p_outputopts->ip6po_tclass;
23466                                 ip6->ip6_flow &= htonl((~mask) << 20);
23467                                 ip6->ip6_flow |= htonl((tclass & mask) << 20);
23468                         }
23469                         break;
23470                 }
23471                 INP_WUNLOCK(inp);
23472                 return (0);
23473 #endif
23474 #ifdef INET
23475         case IPPROTO_IP:
23476                 switch (sopt->sopt_name) {
23477                 case IP_TOS:
23478                         /*
23479                          * The DSCP codepoint has changed, update the fsb.
23480                          */
23481                         ip->ip_tos = rack->rc_inp->inp_ip_tos;
23482                         break;
23483                 case IP_TTL:
23484                         /*
23485                          * The TTL has changed, update the fsb.
23486                          */
23487                         ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
23488                         break;
23489                 }
23490                 INP_WUNLOCK(inp);
23491                 return (0);
23492 #endif
23493 #ifdef SO_PEERPRIO
23494         case SOL_SOCKET:
23495                 switch (sopt->sopt_name) {
23496                 case SO_PEERPRIO:                       /*  SC-URL:bs */
23497                         /* Already read in and sanity checked in sosetopt(). */
23498                         if (inp->inp_socket) {
23499                                 rack->client_bufferlvl = inp->inp_socket->so_peerprio;
23500                                 rack_client_buffer_level_set(rack);
23501                         }
23502                         break;
23503                 }
23504                 INP_WUNLOCK(inp);
23505                 return (0);
23506 #endif
23507         case IPPROTO_TCP:
23508                 switch (sopt->sopt_name) {
23509                 case TCP_RACK_TLP_REDUCE:               /*  URL:tlp_reduce */
23510                 /*  Pacing related ones */
23511                 case TCP_RACK_PACE_ALWAYS:              /*  URL:pace_always */
23512                 case TCP_BBR_RACK_INIT_RATE:            /*  URL:irate */
23513                 case TCP_BBR_IWINTSO:                   /*  URL:tso_iwin */
23514                 case TCP_RACK_PACE_MIN_SEG:             /*  URL:pace_min_seg */
23515                 case TCP_RACK_PACE_MAX_SEG:             /*  URL:pace_max_seg */
23516                 case TCP_RACK_FORCE_MSEG:               /*  URL:force_max_seg */
23517                 case TCP_RACK_PACE_RATE_CA:             /*  URL:pr_ca */
23518                 case TCP_RACK_PACE_RATE_SS:             /*  URL:pr_ss*/
23519                 case TCP_RACK_PACE_RATE_REC:            /*  URL:pr_rec */
23520                 case TCP_RACK_GP_INCREASE_CA:           /*  URL:gp_inc_ca */
23521                 case TCP_RACK_GP_INCREASE_SS:           /*  URL:gp_inc_ss */
23522                 case TCP_RACK_GP_INCREASE_REC:          /*  URL:gp_inc_rec */
23523                 case TCP_RACK_RR_CONF:                  /*  URL:rrr_conf */
23524                 case TCP_BBR_HDWR_PACE:                 /*  URL:hdwrpace */
23525                 case TCP_HDWR_RATE_CAP:                 /*  URL:hdwrcap boolean */
23526                 case TCP_PACING_RATE_CAP:               /*  URL:cap  -- used by side-channel */
23527                 case TCP_HDWR_UP_ONLY:                  /*  URL:uponly -- hardware pacing  boolean */
23528                 case TCP_RACK_PACING_BETA:              /*  URL:pacing_beta */
23529                 case TCP_RACK_PACING_BETA_ECN:          /*  URL:pacing_beta_ecn */
23530                 case TCP_RACK_PACE_TO_FILL:             /*  URL:fillcw */
23531                 case TCP_RACK_DGP_IN_REC:               /*  URL:dgpinrec */
23532                         /* End pacing related */
23533                 case TCP_RXT_CLAMP:                     /*  URL:rxtclamp */
23534                 case TCP_DELACK:                        /*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
23535                 case TCP_RACK_PRR_SENDALOT:             /*  URL:prr_sendalot */
23536                 case TCP_RACK_MIN_TO:                   /*  URL:min_to */
23537                 case TCP_RACK_EARLY_SEG:                /*  URL:early_seg */
23538                 case TCP_RACK_REORD_THRESH:             /*  URL:reord_thresh */
23539                 case TCP_RACK_REORD_FADE:               /*  URL:reord_fade */
23540                 case TCP_RACK_TLP_THRESH:               /*  URL:tlp_thresh */
23541                 case TCP_RACK_PKT_DELAY:                /*  URL:pkt_delay */
23542                 case TCP_RACK_TLP_USE:                  /*  URL:tlp_use */
23543                 case TCP_BBR_RACK_RTT_USE:              /*  URL:rttuse */
23544                 case TCP_BBR_USE_RACK_RR:               /*  URL:rackrr */
23545                 case TCP_RACK_DO_DETECTION:             /*  URL:detect */
23546                 case TCP_NO_PRR:                        /*  URL:noprr */
23547                 case TCP_TIMELY_DYN_ADJ:                /*  URL:dynamic */
23548                 case TCP_DATA_AFTER_CLOSE:              /*  no URL */
23549                 case TCP_RACK_NONRXT_CFG_RATE:          /*  URL:nonrxtcr */
23550                 case TCP_SHARED_CWND_ENABLE:            /*  URL:scwnd */
23551                 case TCP_RACK_MBUF_QUEUE:               /*  URL:mqueue */
23552                 case TCP_RACK_NO_PUSH_AT_MAX:           /*  URL:npush */
23553                 case TCP_SHARED_CWND_TIME_LIMIT:        /*  URL:lscwnd */
23554                 case TCP_RACK_PROFILE:                  /*  URL:profile */
23555                 case TCP_HYBRID_PACING:                 /*  URL:hybrid */
23556                 case TCP_USE_CMP_ACKS:                  /*  URL:cmpack */
23557                 case TCP_RACK_ABC_VAL:                  /*  URL:labc */
23558                 case TCP_REC_ABC_VAL:                   /*  URL:reclabc */
23559                 case TCP_RACK_MEASURE_CNT:              /*  URL:measurecnt */
23560                 case TCP_DEFER_OPTIONS:                 /*  URL:defer */
23561                 case TCP_RACK_DSACK_OPT:                /*  URL:dsack */
23562                 case TCP_RACK_TIMER_SLOP:               /*  URL:timer_slop */
23563                 case TCP_RACK_ENABLE_HYSTART:           /*  URL:hystart */
23564                 case TCP_RACK_SET_RXT_OPTIONS:          /*  URL:rxtsz */
23565                 case TCP_RACK_HI_BETA:                  /*  URL:hibeta */
23566                 case TCP_RACK_SPLIT_LIMIT:              /*  URL:split */
23567                 case TCP_RACK_PACING_DIVISOR:           /*  URL:divisor */
23568                 case TCP_PACING_DND:                    /*  URL:dnd */
23569                         goto process_opt;
23570                         break;
23571                 default:
23572                         /* Filter off all unknown options to the base stack */
23573                         return (tcp_default_ctloutput(tp, sopt));
23574                         break;
23575                 }
23576
23577         default:
23578                 INP_WUNLOCK(inp);
23579                 return (0);
23580         }
23581 process_opt:
23582         INP_WUNLOCK(inp);
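        /*
         * Copy the option value in from userland.  TCP_PACING_RATE_CAP copies
         * in a 64-bit rate, TCP_HYBRID_PACING copies in its own hybrid pacing
         * request, and everything else is a 32-bit int that is also mirrored
         * into loptval for the common option-processing path below.
         */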
23583         if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
23584                 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
23585                 /*
23586                  * We truncate it down to 32 bits for the socket-option trace; this
23587                  * means rates > ~34Gbps (2^32 bytes/sec * 8) won't show right, but that's probably ok.
23588                  */
23589                 optval = (uint32_t)loptval;
23590         } else if (sopt->sopt_name == TCP_HYBRID_PACING) {
23591                 error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
23592         } else {
23593                 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
23594                 /* Save it in 64 bit form too */
23595                 loptval = optval;
23596         }
23597         if (error)
23598                 return (error);
23599         INP_WLOCK(inp);
23600         if (tp->t_fb != &__tcp_rack) {
23601                 INP_WUNLOCK(inp);
23602                 return (ENOPROTOOPT);
23603         }
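        /*
         * With TCP_DEFER_OPTIONS enabled and no goodput measurement ready yet
         * (gp_ready == 0), most options are queued via rack_add_deferred_option()
         * and applied later; the handful excluded below must take effect
         * immediately.
         */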
23604         if (rack->defer_options && (rack->gp_ready == 0) &&
23605             (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
23606             (sopt->sopt_name != TCP_HYBRID_PACING) &&
23607             (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
23608             (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
23609             (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
23610             (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
23611                 /* Options are being deferred */
23612                 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
23613                         INP_WUNLOCK(inp);
23614                         return (0);
23615                 } else {
23616                         /* No memory to defer, fail */
23617                         INP_WUNLOCK(inp);
23618                         return (ENOMEM);
23619                 }
23620         }
23621         error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
23622         INP_WUNLOCK(inp);
23623         return (error);
23624 }
23625
23626 static void
23627 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
23628 {
23629
23630         INP_WLOCK_ASSERT(tptoinpcb(tp));
23631         bzero(ti, sizeof(*ti));
23632
23633         ti->tcpi_state = tp->t_state;
23634         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
23635                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
23636         if (tp->t_flags & TF_SACK_PERMIT)
23637                 ti->tcpi_options |= TCPI_OPT_SACK;
23638         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
23639                 ti->tcpi_options |= TCPI_OPT_WSCALE;
23640                 ti->tcpi_snd_wscale = tp->snd_scale;
23641                 ti->tcpi_rcv_wscale = tp->rcv_scale;
23642         }
23643         if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
23644                 ti->tcpi_options |= TCPI_OPT_ECN;
23645         if (tp->t_flags & TF_FASTOPEN)
23646                 ti->tcpi_options |= TCPI_OPT_TFO;
23647         /* t_rcvtime is still kept in ticks */
23648         ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
23649         /* Since we hold everything in precise useconds this is easy */
23650         ti->tcpi_rtt = tp->t_srtt;
23651         ti->tcpi_rttvar = tp->t_rttvar;
23652         ti->tcpi_rto = tp->t_rxtcur;
23653         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
23654         ti->tcpi_snd_cwnd = tp->snd_cwnd;
23655         /*
23656          * FreeBSD-specific extension fields for tcp_info.
23657          */
23658         ti->tcpi_rcv_space = tp->rcv_wnd;
23659         ti->tcpi_rcv_nxt = tp->rcv_nxt;
23660         ti->tcpi_snd_wnd = tp->snd_wnd;
23661         ti->tcpi_snd_bwnd = 0;          /* Unused, kept for compat. */
23662         ti->tcpi_snd_nxt = tp->snd_nxt;
23663         ti->tcpi_snd_mss = tp->t_maxseg;
23664         ti->tcpi_rcv_mss = tp->t_maxseg;
23665         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
23666         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
23667         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
23668         ti->tcpi_total_tlp = tp->t_sndtlppack;
23669         ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
23670 #ifdef NETFLIX_STATS
23671         memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
23672 #endif
23673 #ifdef TCP_OFFLOAD
23674         if (tp->t_flags & TF_TOE) {
23675                 ti->tcpi_options |= TCPI_OPT_TOE;
23676                 tcp_offload_tcp_info(tp, ti);
23677         }
23678 #endif
23679 }
23680
23681 static int
23682 rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt)
23683 {
23684         struct inpcb *inp = tptoinpcb(tp);
23685         struct tcp_rack *rack;
23686         int32_t error, optval;
23687         uint64_t val, loptval;
23688         struct  tcp_info ti;
23689         /*
23690          * Because all our options are either boolean or an int, we can just
23691          * pull everything into optval and then unlock and copy. If we ever
23692          * add an option that is not an int, then this will have quite an
23693          * impact on this routine.
23694          */
23695         error = 0;
23696         rack = (struct tcp_rack *)tp->t_fb_ptr;
23697         if (rack == NULL) {
23698                 INP_WUNLOCK(inp);
23699                 return (EINVAL);
23700         }
23701         switch (sopt->sopt_name) {
23702         case TCP_INFO:
23703                 /* First get the info filled */
23704                 rack_fill_info(tp, &ti);
23705                 /* Fix up the rtt related fields if needed */
23706                 INP_WUNLOCK(inp);
23707                 error = sooptcopyout(sopt, &ti, sizeof ti);
23708                 return (error);
23709         /*
23710          * Beta is the congestion control value for NewReno that influences how
23711          * much of a backoff happens when loss is detected. It is normally set
23712          * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
23713          * when you exit recovery.
23714          */
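        /*
         * Worked example (illustrative): with beta = 50 and a cwnd of 100
         * segments when loss is detected, the cwnd comes out of recovery at
         * 100 * 50 / 100 = 50 segments.
         */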
23715         case TCP_RACK_PACING_BETA:
23716                 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
23717                         error = EINVAL;
23718                 else if (rack->rc_pacing_cc_set == 0)
23719                         optval = rack->r_ctl.rc_saved_beta.beta;
23720                 else {
23721                         /*
23722                          * Reach out into the CC data and report back what
23723                          * I have previously set. Yeah it looks hackish but
23724                          * we don't want to report the saved values.
23725                          */
23726                         if (tp->t_ccv.cc_data)
23727                                 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
23728                         else
23729                                 error = EINVAL;
23730                 }
23731                 break;
23732                 /*
23733                  * Beta_ecn is the congestion control value for NewReno that influences how
23734                  * much of a backoff happens when an ECN mark is detected. It is normally set
23735                  * to 80 for 80%, i.e. the cwnd is reduced to 80% of its previous value when
23736                  * you exit recovery. Note that classic ECN has a beta of 50; it is only
23737                  * ABE ECN that uses this "less" value, but we do too with pacing :)
23738                  */
23739
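        /*
         * Worked example (illustrative): with beta_ecn = 80 and a cwnd of 100
         * segments when the ECN mark is handled, the cwnd comes out of
         * recovery at 100 * 80 / 100 = 80 segments.
         */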
23740         case TCP_RACK_PACING_BETA_ECN:
23741                 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
23742                         error = EINVAL;
23743                 else if (rack->rc_pacing_cc_set == 0)
23744                         optval = rack->r_ctl.rc_saved_beta.beta_ecn;
23745                 else {
23746                         /*
23747                          * Reach out into the CC data and report back what
23748                          * I have previously set. Yeah it looks hackish but
23749                          * we don't want to report the saved values.
23750                          */
23751                         if (tp->t_ccv.cc_data)
23752                                 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn;
23753                         else
23754                                 error = EINVAL;
23755                 }
23756                 break;
23757         case TCP_RACK_DSACK_OPT:
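                /*
                 * Reported as a two-bit mask: 0x1 is set when the
                 * standards-based RACK timer (rc_rack_tmr_std_based) is in
                 * use, 0x2 when DSACK handling (rc_rack_use_dsack) is enabled.
                 */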
23758                 optval = 0;
23759                 if (rack->rc_rack_tmr_std_based) {
23760                         optval |= 1;
23761                 }
23762                 if (rack->rc_rack_use_dsack) {
23763                         optval |= 2;
23764                 }
23765                 break;
23766         case TCP_RACK_ENABLE_HYSTART:
23767         {
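                /*
                 * Report the hystart mode in effect: off, on, on with
                 * CCF_HYSTART_CAN_SH_CWND also set, or on with
                 * CCF_HYSTART_CONS_SSTH set as well.
                 */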
23768                 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
23769                         optval = RACK_HYSTART_ON;
23770                         if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND)
23771                                 optval = RACK_HYSTART_ON_W_SC;
23772                         if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH)
23773                                 optval = RACK_HYSTART_ON_W_SC_C;
23774                 } else {
23775                         optval = RACK_HYSTART_OFF;
23776                 }
23777         }
23778         break;
23779         case TCP_RACK_DGP_IN_REC:
23780                 optval = rack->r_ctl.full_dgp_in_rec;
23781                 break;
23782         case TCP_RACK_HI_BETA:
23783                 optval = rack->rack_hibeta;
23784                 break;
23785         case TCP_RXT_CLAMP:
23786                 optval = rack->r_ctl.saved_rxt_clamp_val;
23787                 break;
23788         case TCP_DEFER_OPTIONS:
23789                 optval = rack->defer_options;
23790                 break;
23791         case TCP_RACK_MEASURE_CNT:
23792                 optval = rack->r_ctl.req_measurements;
23793                 break;
23794         case TCP_REC_ABC_VAL:
23795                 optval = rack->r_use_labc_for_rec;
23796                 break;
23797         case TCP_RACK_ABC_VAL:
23798                 optval = rack->rc_labc;
23799                 break;
23800         case TCP_HDWR_UP_ONLY:
23801                 optval = rack->r_up_only;
23802                 break;
23803         case TCP_PACING_RATE_CAP:
23804                 loptval = rack->r_ctl.bw_rate_cap;
23805                 break;
23806         case TCP_RACK_PROFILE:
23807                 /* You cannot retrieve a profile, it's write only */
23808                 error = EINVAL;
23809                 break;
23810         case TCP_HYBRID_PACING:
23811                 /* You cannot retrieve hybrid pacing information, it's write only */
23812                 error = EINVAL;
23813                 break;
23814         case TCP_USE_CMP_ACKS:
23815                 optval = rack->r_use_cmp_ack;
23816                 break;
23817         case TCP_RACK_PACE_TO_FILL:
23818                 optval = rack->rc_pace_to_cwnd;
23819                 if (optval && rack->r_fill_less_agg)
23820                         optval++;
23821                 break;
23822         case TCP_RACK_NO_PUSH_AT_MAX:
23823                 optval = rack->r_ctl.rc_no_push_at_mrtt;
23824                 break;
23825         case TCP_SHARED_CWND_ENABLE:
23826                 optval = rack->rack_enable_scwnd;
23827                 break;
23828         case TCP_RACK_NONRXT_CFG_RATE:
23829                 optval = rack->rack_rec_nonrxt_use_cr;
23830                 break;
23831         case TCP_NO_PRR:
23832                 if (rack->rack_no_prr == 1)
23833                         optval = 1;
23834                 else if (rack->no_prr_addback == 1)
23835                         optval = 2;
23836                 else
23837                         optval = 0;
23838                 break;
23839         case TCP_RACK_DO_DETECTION:
23840                 optval = rack->do_detection;
23841                 break;
23842         case TCP_RACK_MBUF_QUEUE:
23843                 /* Now do we use the LRO mbuf-queue feature */
23844                 optval = rack->r_mbuf_queue;
23845                 break;
23846         case TCP_TIMELY_DYN_ADJ:
23847                 optval = rack->rc_gp_dyn_mul;
23848                 break;
23849         case TCP_BBR_IWINTSO:
23850                 optval = rack->rc_init_win;
23851                 break;
23852         case TCP_RACK_TLP_REDUCE:
23853                 /* RACK TLP cwnd reduction (bool) */
23854                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
23855                 break;
23856         case TCP_BBR_RACK_INIT_RATE:
23857                 val = rack->r_ctl.init_rate;
23858                 /* convert to kbits per sec */
23859                 val *= 8;
23860                 val /= 1000;
23861                 optval = (uint32_t)val;
23862                 break;
23863         case TCP_RACK_FORCE_MSEG:
23864                 optval = rack->rc_force_max_seg;
23865                 break;
23866         case TCP_RACK_PACE_MIN_SEG:
23867                 optval = rack->r_ctl.rc_user_set_min_segs;
23868                 break;
23869         case TCP_RACK_PACE_MAX_SEG:
23870                 /* Max segments in a pace */
23871                 optval = rack->rc_user_set_max_segs;
23872                 break;
23873         case TCP_RACK_PACE_ALWAYS:
23874                 /* Use the always pace method */
23875                 optval = rack->rc_always_pace;
23876                 break;
23877         case TCP_RACK_PRR_SENDALOT:
23878                 /* Allow PRR to send more than one seg */
23879                 optval = rack->r_ctl.rc_prr_sendalot;
23880                 break;
23881         case TCP_RACK_MIN_TO:
23882                 /* Minimum time between rack t-o's in ms */
23883                 optval = rack->r_ctl.rc_min_to;
23884                 break;
23885         case TCP_RACK_SPLIT_LIMIT:
23886                 optval = rack->r_ctl.rc_split_limit;
23887                 break;
23888         case TCP_RACK_EARLY_SEG:
23889                 /* If early recovery max segments */
23890                 optval = rack->r_ctl.rc_early_recovery_segs;
23891                 break;
23892         case TCP_RACK_REORD_THRESH:
23893                 /* RACK reorder threshold (shift amount) */
23894                 optval = rack->r_ctl.rc_reorder_shift;
23895                 break;
23896         case TCP_RACK_REORD_FADE:
23897                 /* Does reordering fade after ms time */
23898                 optval = rack->r_ctl.rc_reorder_fade;
23899                 break;
23900         case TCP_BBR_USE_RACK_RR:
23901                 /* Do we use the rack cheat for rxt */
23902                 optval = rack->use_rack_rr;
23903                 break;
23904         case TCP_RACK_RR_CONF:
23905                 optval = rack->r_rr_config;
23906                 break;
23907         case TCP_HDWR_RATE_CAP:
23908                 optval = rack->r_rack_hw_rate_caps;
23909                 break;
23910         case TCP_BBR_HDWR_PACE:
23911                 optval = rack->rack_hdw_pace_ena;
23912                 break;
23913         case TCP_RACK_TLP_THRESH:
23914                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
23915                 optval = rack->r_ctl.rc_tlp_threshold;
23916                 break;
23917         case TCP_RACK_PKT_DELAY:
23918                 /* RACK added ms i.e. rack-rtt + reord + N */
23919                 optval = rack->r_ctl.rc_pkt_delay;
23920                 break;
23921         case TCP_RACK_TLP_USE:
23922                 optval = rack->rack_tlp_threshold_use;
23923                 break;
23924         case TCP_PACING_DND:
23925                 optval = rack->rc_pace_dnd;
23926                 break;
23927         case TCP_RACK_PACE_RATE_CA:
23928                 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
23929                 break;
23930         case TCP_RACK_PACE_RATE_SS:
23931                 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
23932                 break;
23933         case TCP_RACK_PACE_RATE_REC:
23934                 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
23935                 break;
23936         case TCP_RACK_GP_INCREASE_SS:
23937                 optval = rack->r_ctl.rack_per_of_gp_ss;
23938                 break;
23939         case TCP_RACK_GP_INCREASE_CA:
23940                 optval = rack->r_ctl.rack_per_of_gp_ca;
23941                 break;
23942         case TCP_RACK_PACING_DIVISOR:
23943                 optval = rack->r_ctl.pace_len_divisor;
23944                 break;
23945         case TCP_BBR_RACK_RTT_USE:
23946                 optval = rack->r_ctl.rc_rate_sample_method;
23947                 break;
23948         case TCP_DELACK:
23949                 optval = tp->t_delayed_ack;
23950                 break;
23951         case TCP_DATA_AFTER_CLOSE:
23952                 optval = rack->rc_allow_data_af_clo;
23953                 break;
23954         case TCP_SHARED_CWND_TIME_LIMIT:
23955                 optval = rack->r_limit_scw;
23956                 break;
23957         case TCP_RACK_TIMER_SLOP:
23958                 optval = rack->r_ctl.timer_slop;
23959                 break;
23960         default:
23961                 return (tcp_default_ctloutput(tp, sopt));
23962                 break;
23963         }
23964         INP_WUNLOCK(inp);
23965         if (error == 0) {
23966                 if (sopt->sopt_name == TCP_PACING_RATE_CAP)
23967                         error = sooptcopyout(sopt, &loptval, sizeof loptval);
23968                 else
23969                         error = sooptcopyout(sopt, &optval, sizeof optval);
23970         }
23971         return (error);
23972 }
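/*
 * Illustrative userland sketch (not part of this file): reading one of the
 * boolean/int options handled above is a plain getsockopt() call against a
 * socket that is already using the rack stack.  This assumes the option
 * constant (here TCP_NO_PRR) is exported through <netinet/tcp.h> on the
 * build in question.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <err.h>
 *
 *	int val;
 *	socklen_t len = sizeof(val);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_NO_PRR, &val, &len) == -1)
 *		err(1, "getsockopt(TCP_NO_PRR)");
 *	// val is 0 (PRR), 1 (no PRR) or 2 (no PRR add-back), per the case above.
 */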
23973
23974 static int
23975 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
23976 {
23977         if (sopt->sopt_dir == SOPT_SET) {
23978                 return (rack_set_sockopt(tp, sopt));
23979         } else if (sopt->sopt_dir == SOPT_GET) {
23980                 return (rack_get_sockopt(tp, sopt));
23981         } else {
23982                 panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
23983         }
23984 }
23985
23986 static const char *rack_stack_names[] = {
23987         __XSTRING(STACKNAME),
23988 #ifdef STACKALIAS
23989         __XSTRING(STACKALIAS),
23990 #endif
23991 };
23992
23993 static int
23994 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
23995 {
23996         memset(mem, 0, size);
23997         return (0);
23998 }
23999
24000 static void
24001 rack_dtor(void *mem, int32_t size, void *arg)
24002 {
24003
24004 }
24005
24006 static bool rack_mod_inited = false;
24007
24008 static int
24009 tcp_addrack(module_t mod, int32_t type, void *data)
24010 {
24011         int32_t err = 0;
24012         int num_stacks;
24013
24014         switch (type) {
24015         case MOD_LOAD:
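                /*
                 * Module load: create the sendmap and pcb UMA zones, hang the
                 * stack's sysctl tree off net.inet.tcp, register the stack
                 * name(s) with the TCP function-block framework, and note with
                 * LRO that an mbuf-queueing stack is now present.
                 */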
24016                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
24017                     sizeof(struct rack_sendmap),
24018                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
24019
24020                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
24021                     sizeof(struct tcp_rack),
24022                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
24023
24024                 sysctl_ctx_init(&rack_sysctl_ctx);
24025                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
24026                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
24027                     OID_AUTO,
24028 #ifdef STACKALIAS
24029                     __XSTRING(STACKALIAS),
24030 #else
24031                     __XSTRING(STACKNAME),
24032 #endif
24033                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
24034                     "");
24035                 if (rack_sysctl_root == NULL) {
24036                         printf("Failed to add sysctl node\n");
24037                         err = EFAULT;
24038                         goto free_uma;
24039                 }
24040                 rack_init_sysctls();
24041                 num_stacks = nitems(rack_stack_names);
24042                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
24043                     rack_stack_names, &num_stacks);
24044                 if (err) {
24045                         printf("Failed to register %s stack name for "
24046                             "%s module\n", rack_stack_names[num_stacks],
24047                             __XSTRING(MODNAME));
24048                         sysctl_ctx_free(&rack_sysctl_ctx);
24049 free_uma:
24050                         uma_zdestroy(rack_zone);
24051                         uma_zdestroy(rack_pcb_zone);
24052                         rack_counter_destroy();
24053                         printf("Failed to register rack module -- err:%d\n", err);
24054                         return (err);
24055                 }
24056                 tcp_lro_reg_mbufq();
24057                 rack_mod_inited = true;
24058                 break;
24059         case MOD_QUIESCE:
24060                 err = deregister_tcp_functions(&__tcp_rack, true, false);
24061                 break;
24062         case MOD_UNLOAD:
24063                 err = deregister_tcp_functions(&__tcp_rack, false, true);
24064                 if (err == EBUSY)
24065                         break;
24066                 if (rack_mod_inited) {
24067                         uma_zdestroy(rack_zone);
24068                         uma_zdestroy(rack_pcb_zone);
24069                         sysctl_ctx_free(&rack_sysctl_ctx);
24070                         rack_counter_destroy();
24071                         rack_mod_inited = false;
24072                 }
24073                 tcp_lro_dereg_mbufq();
24074                 err = 0;
24075                 break;
24076         default:
24077                 return (EOPNOTSUPP);
24078         }
24079         return (err);
24080 }
24081
24082 static moduledata_t tcp_rack = {
24083         .name = __XSTRING(MODNAME),
24084         .evhand = tcp_addrack,
24085         .priv = 0
24086 };
24087
24088 MODULE_VERSION(MODNAME, 1);
24089 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
24090 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
24091
24092 #endif /* #if defined(INET) || defined(INET6) */