/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>           /* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES               /* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>    /* required for icmp_var.h */
#include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif                          /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif                          /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)


MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2
/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *    the congestion window so that the ack clock can
 *    be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
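/*
 * A minimal sketch of the decomposition described above, using the
 * per-state handler names declared later in this file.  The table is
 * illustrative only; the stack actually dispatches through a function
 * pointer that rack_set_state() installs for the current state:
 *
 *      static int (*rack_do_state[TCP_NSTATES])() = {
 *              [TCPS_SYN_SENT]         = rack_do_syn_sent,
 *              [TCPS_SYN_RECEIVED]     = rack_do_syn_recv,
 *              [TCPS_ESTABLISHED]      = rack_do_established,
 *              [TCPS_CLOSE_WAIT]       = rack_do_close_wait,
 *              [TCPS_FIN_WAIT_1]       = rack_do_fin_wait_1,
 *              [TCPS_CLOSING]          = rack_do_closing,
 *              [TCPS_LAST_ACK]         = rack_do_lastack,
 *              [TCPS_FIN_WAIT_2]       = rack_do_fin_wait_2,
 *      };
 *      ...
 *      retval = (*rack_do_state[tp->t_state])(m, th, so, tp, &to,
 *          drop_hdrlen, tlen, tiwin, thflags, nxt_pkt, iptos);
 */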
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;      /* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;    /* 0 - never fade, def 60,000,000
                                                 * - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;    /* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 1; /* 1; */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;      /* Number of microseconds min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;      /* bit field: bit 1 (0x1) sets rc_rack_tmr_std_based and bit 2 (0x2) sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;       /* 250ms in usecs */
static int32_t rack_persist_max = 2000000;      /* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1;      /* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;    /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;        /* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;   /* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;      /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;      /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;         /* How many max_segs the rwnd must be before we hold off sending */
/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
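/*
 * Worked out, as a check on the figure above: 12 doublings of a 30ms
 * base gives 30ms * (2^0 + 2^1 + ... + 2^11) = 30ms * 4095 = 122.850
 * seconds of total waiting before the connection is dropped.
 */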
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;    /* 10ms */
static int32_t rack_rto_min = 30000;    /* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;  /* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;   /* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;            /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;        /* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;        /* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;       /* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;   /* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;  /* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;     /* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;        /* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;            /* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;    /* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;    /* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;   /* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;       /* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;       /* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;         /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;           /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;          /* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;             /* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;         /* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* i.e. the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;       /* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;     /* 4% */
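/*
 * Rough example with the defaults above (a sketch of the intent, not a
 * quote of the comparison code): a new goodput b/w sample lying within
 * roughly [96%, 102%] of the current estimate, i.e. between
 * (bw * (100 - rack_gp_per_bw_mul_down)) / 100 and
 * (bw * (100 + rack_gp_per_bw_mul_up)) / 100, is treated as "no change".
 */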
static int32_t rack_gp_rtt_maxmul = 3;          /* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;          /* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;          /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;       /* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;        /* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;       /* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;     /* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;     /* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;             /* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;          /* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;       /* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;   /* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;   /* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;        /* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {  \
        (tv) = (value) + slop;   \
        if ((u_long)(tv) < (u_long)(tvmin)) \
                (tv) = (tvmin); \
        if ((u_long)(tv) > (u_long)(tvmax)) \
                (tv) = (tvmax); \
} while (0)
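/*
 * Example of how the two macros above compose (a sketch of the usual
 * call pattern, not a quote of any one call site): derive the
 * retransmit value from srtt/rttvar and clamp it into
 * [rack_rto_min, rack_rto_max] with no slop:
 *
 *      uint32_t tov;
 *
 *      RACK_TCPT_RANGESET(tov, RACK_REXMTVAL(tp),
 *          rack_rto_min, rack_rto_max, 0);
 */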

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
   uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
                 uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);

static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
   struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
                            tcp_seq th_ack, int line, uint8_t quality);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter = 0;

static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
        struct sockopt sopt;
        struct cc_newreno_opts opt;
        struct newreno old, *ptr;
        struct tcpcb *tp;
        int error;

        if (rack->rc_pacing_cc_set)
                return;

        tp = rack->rc_tp;
        if (tp->cc_algo == NULL) {
                /* Tcb is leaving */
                printf("No cc algorithm?\n");
                return;
        }
        rack->rc_pacing_cc_set = 1;
        if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
                /* Not new-reno; we can't play games with beta! */
                goto out;
        }
        ptr = ((struct newreno *)tp->ccv->cc_data);
        if (CC_ALGO(tp)->ctl_output == NULL) {
                /* Huh, why does new_reno no longer have a set function? */
                goto out;
        }
        if (ptr == NULL) {
                /* Just the default values */
                old.beta = V_newreno_beta;
                old.beta_ecn = V_newreno_beta_ecn;
                old.newreno_flags = 0;
        } else {
                old.beta = ptr->beta;
                old.beta_ecn = ptr->beta_ecn;
                old.newreno_flags = ptr->newreno_flags;
        }
        sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
        sopt.sopt_dir = SOPT_SET;
        opt.name = CC_NEWRENO_BETA;
        opt.val = rack->r_ctl.rc_saved_beta.beta;
        error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
        if (error) {
                goto out;
        }
        /*
         * Hack alert: we need to set in our newreno_flags
         * so that ABE (Alternative Backoff with ECN) behavior
         * is also applied.
         */
        ((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
        opt.name = CC_NEWRENO_BETA_ECN;
        opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
        error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
        if (error) {
                goto out;
        }
        /* Save off the original values for restoral */
        memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
        if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
                union tcp_log_stackspecific log;
                struct timeval tv;

                ptr = ((struct newreno *)tp->ccv->cc_data);
                memset(&log.u_bbr, 0, sizeof(log.u_bbr));
                log.u_bbr.timeStamp = tcp_get_usecs(&tv);
                if (ptr) {
                        log.u_bbr.flex1 = ptr->beta;
                        log.u_bbr.flex2 = ptr->beta_ecn;
                        log.u_bbr.flex3 = ptr->newreno_flags;
                }
                log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
                log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
                log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
                log.u_bbr.flex7 = rack->gp_ready;
                log.u_bbr.flex7 <<= 1;
                log.u_bbr.flex7 |= rack->use_fixed_rate;
                log.u_bbr.flex7 <<= 1;
                log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
                log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
                log.u_bbr.flex8 = 3;
                tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
                               0, &log, false, NULL, NULL, 0, &tv);
        }
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
        struct newreno old, *ptr;
        struct tcpcb *tp;

        if (rack->rc_pacing_cc_set == 0)
                return;
        tp = rack->rc_tp;
        rack->rc_pacing_cc_set = 0;
        if (tp->cc_algo == NULL)
                /* Tcb is leaving */
                return;
        if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
                /* Not new-reno; nothing to do! */
                return;
        }
        ptr = ((struct newreno *)tp->ccv->cc_data);
        if (ptr == NULL) {
                /*
                 * This happens at rack_fini() if the
                 * cc module gets freed on us. In that
                 * case we lose our "new" settings but
                 * that's ok, since the tcb is going away anyway.
                 */
                return;
        }
        /* Grab out our set values */
        memcpy(&old, ptr, sizeof(struct newreno));
        /* Copy back in the original values */
        memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
        /* Now save back the values we had set in (for when pacing is restored) */
        memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
        if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
                union tcp_log_stackspecific log;
                struct timeval tv;

                ptr = ((struct newreno *)tp->ccv->cc_data);
                memset(&log.u_bbr, 0, sizeof(log.u_bbr));
                log.u_bbr.timeStamp = tcp_get_usecs(&tv);
                log.u_bbr.flex1 = ptr->beta;
                log.u_bbr.flex2 = ptr->beta_ecn;
                log.u_bbr.flex3 = ptr->newreno_flags;
                log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
                log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
                log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
                log.u_bbr.flex7 = rack->gp_ready;
                log.u_bbr.flex7 <<= 1;
                log.u_bbr.flex7 |= rack->use_fixed_rate;
                log.u_bbr.flex7 <<= 1;
                log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
                log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
                log.u_bbr.flex8 = 4;
                tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
                               0, &log, false, NULL, NULL, 0, &tv);
        }
}

#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
        /* Keep in mind that t_maxpeakrate is in B/s. */
        uint64_t peak;
        peak = uqmax((tp->t_maxseg * 2),
                     (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
        tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
}
#endif
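/*
 * Rough numbers for the calculation above (illustrative values only,
 * taking t_srtt in microseconds as the code does): with
 * t_maxpeakrate = 1,250,000 B/s (10 Mb/s) and t_srtt = 40,000 usec,
 * peak = max(2 * t_maxseg, 1250000 * 40000 / 1000000) = 50,000 bytes,
 * i.e. roughly one srtt's worth of data at the configured peak rate.
 */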

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
        uint32_t stat;
        int32_t error;
        int i;

        error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
        if (error || req->newptr == NULL)
                return (error);

        error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
        if (error)
                return (error);
        if (stat == 1) {
#ifdef INVARIANTS
                printf("Clearing RACK counters\n");
#endif
                counter_u64_zero(rack_badfr);
                counter_u64_zero(rack_badfr_bytes);
                counter_u64_zero(rack_rtm_prr_retran);
                counter_u64_zero(rack_rtm_prr_newdata);
                counter_u64_zero(rack_timestamp_mismatch);
                counter_u64_zero(rack_reorder_seen);
                counter_u64_zero(rack_tlp_tot);
                counter_u64_zero(rack_tlp_newdata);
                counter_u64_zero(rack_tlp_retran);
                counter_u64_zero(rack_tlp_retran_bytes);
                counter_u64_zero(rack_tlp_retran_fail);
                counter_u64_zero(rack_to_tot);
                counter_u64_zero(rack_to_arm_rack);
                counter_u64_zero(rack_to_arm_tlp);
                counter_u64_zero(rack_paced_segments);
                counter_u64_zero(rack_calc_zero);
                counter_u64_zero(rack_calc_nonzero);
                counter_u64_zero(rack_unpaced_segments);
                counter_u64_zero(rack_saw_enobuf);
                counter_u64_zero(rack_saw_enobuf_hw);
                counter_u64_zero(rack_saw_enetunreach);
                counter_u64_zero(rack_per_timer_hole);
                counter_u64_zero(rack_large_ackcmp);
                counter_u64_zero(rack_small_ackcmp);
                counter_u64_zero(rack_persists_sends);
                counter_u64_zero(rack_persists_acks);
                counter_u64_zero(rack_persists_loss);
                counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
                counter_u64_zero(rack_adjust_map_bw);
#endif
                counter_u64_zero(rack_to_alloc_hard);
                counter_u64_zero(rack_to_alloc_emerg);
                counter_u64_zero(rack_sack_proc_all);
                counter_u64_zero(rack_fto_send);
                counter_u64_zero(rack_fto_rsm_send);
                counter_u64_zero(rack_extended_rfo);
                counter_u64_zero(rack_hw_pace_init_fail);
                counter_u64_zero(rack_hw_pace_lost);
                counter_u64_zero(rack_sbsndptr_wrong);
                counter_u64_zero(rack_sbsndptr_right);
                counter_u64_zero(rack_non_fto_send);
                counter_u64_zero(rack_nfto_resend);
                counter_u64_zero(rack_sack_proc_short);
                counter_u64_zero(rack_sack_proc_restart);
                counter_u64_zero(rack_to_alloc);
                counter_u64_zero(rack_to_alloc_limited);
                counter_u64_zero(rack_alloc_limited_conns);
                counter_u64_zero(rack_split_limited);
                for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
                        counter_u64_zero(rack_proc_comp_ack[i]);
                }
                counter_u64_zero(rack_multi_single_eq);
                counter_u64_zero(rack_proc_non_comp_ack);
                counter_u64_zero(rack_find_high);
                counter_u64_zero(rack_sack_attacks_detected);
                counter_u64_zero(rack_sack_attacks_reversed);
                counter_u64_zero(rack_sack_used_next_merge);
                counter_u64_zero(rack_sack_used_prev_merge);
                counter_u64_zero(rack_sack_splits);
                counter_u64_zero(rack_sack_skipped_acked);
                counter_u64_zero(rack_ack_total);
                counter_u64_zero(rack_express_sack);
                counter_u64_zero(rack_sack_total);
                counter_u64_zero(rack_move_none);
                counter_u64_zero(rack_move_some);
                counter_u64_zero(rack_used_tlpmethod);
                counter_u64_zero(rack_used_tlpmethod2);
                counter_u64_zero(rack_enter_tlp_calc);
                counter_u64_zero(rack_progress_drops);
                counter_u64_zero(rack_tlp_does_nada);
                counter_u64_zero(rack_try_scwnd);
                counter_u64_zero(rack_collapsed_win);
        }
        rack_clear_counter = 0;
        return (0);
}
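/*
 * For reference, the handler above is reached through a read/write
 * proc OID; writing 1 zeroes every counter.  Assuming the handler is
 * registered as "clear" under the stats node (the exact root depends
 * on how the stack's sysctl tree is attached), usage would look like:
 *
 *      # sysctl net.inet.tcp.rack.stats.clear=1
 */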

static void
rack_init_sysctls(void)
{
        int i;
        struct sysctl_oid *rack_counters;
        struct sysctl_oid *rack_attack;
        struct sysctl_oid *rack_pacing;
        struct sysctl_oid *rack_timely;
        struct sysctl_oid *rack_timers;
        struct sysctl_oid *rack_tlp;
        struct sysctl_oid *rack_misc;
        struct sysctl_oid *rack_features;
        struct sysctl_oid *rack_measure;
        struct sysctl_oid *rack_probertt;
        struct sysctl_oid *rack_hw_pacing;

        rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "sack_attack",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Rack Sack Attack Counters and Controls");
        rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "stats",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Rack Counters");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO, "rate_sample_method", CTLFLAG_RW,
            &rack_rate_sample_method, USE_RTT_LOW,
            "What method should we use for rate sampling 0=high, 1=low");
        /* Probe rtt related controls */
        rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "probertt",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "ProbeRTT related Controls");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
            &rack_atexit_prtt_hbp, 130,
            "What percentage above goodput do we clamp CA/SS to at exit on a high-BDP path (130%)");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
            &rack_atexit_prtt, 130,
            "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path (130%)");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "gp_per_mul", CTLFLAG_RW,
            &rack_per_of_gp_probertt, 60,
            "What percentage of goodput do we pace at in probertt");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
            &rack_per_of_gp_probertt_reduce, 10,
            "What percentage of goodput do we reduce every gp_srtt");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "gp_per_low", CTLFLAG_RW,
            &rack_per_of_gp_lowthresh, 40,
            "What percentage of goodput do we allow the multiplier to fall to");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "time_between", CTLFLAG_RW,
            &rack_time_between_probertt, 96000000,
            "How many useconds after the lowest rtt falls must pass before we enter probertt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "safety", CTLFLAG_RW,
            &rack_probe_rtt_safety_val, 2000000,
            "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "sets_cwnd", CTLFLAG_RW,
            &rack_probe_rtt_sets_cwnd, 0,
            "Do we set the cwnd too (if always_lower is on)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
            &rack_max_drain_wait, 2,
            "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
            &rack_must_drain, 1,
            "We must drain this many gp_srtt's waiting for flight to reach goal");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
            &rack_probertt_use_min_rtt_entry, 1,
            "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
            &rack_probertt_use_min_rtt_exit, 0,
            "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "length_div", CTLFLAG_RW,
            &rack_probertt_gpsrtt_cnt_div, 0,
            "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "length_mul", CTLFLAG_RW,
            &rack_probertt_gpsrtt_cnt_mul, 0,
            "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
            &rack_min_probertt_hold, 200000,
            "What is the minimum time we hold probertt at target");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "filter_life", CTLFLAG_RW,
            &rack_probertt_filter_life, 10000000,
            "What is the time for the filters life in useconds");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "lower_within", CTLFLAG_RW,
            &rack_probertt_lower_within, 10,
            "If the rtt goes lower within this percentage of the time, go into probe-rtt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "must_move", CTLFLAG_RW,
            &rack_min_rtt_movement, 250,
            "How much is the minimum movement in rtt to count as a drop for probertt purposes");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
            &rack_probertt_clear_is, 1,
            "Do we clear I/S counts on exiting probe-rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
            &rack_max_drain_hbp, 1,
            "How many extra drain gpsrtt's do we get in highly buffered paths");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "hbp_threshold", CTLFLAG_RW,
            &rack_hbp_thresh, 3,
            "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
        /* Pacing related sysctls */
        rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "pacing",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Pacing related Controls");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "max_pace_over", CTLFLAG_RW,
            &rack_max_per_above, 30,
            "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "pace_to_one", CTLFLAG_RW,
            &rack_pace_one_seg, 0,
            "Do we allow low b/w pacing of 1MSS instead of two");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
            &rack_limit_time_with_srtt, 0,
            "Do we limit pacing time based on srtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "init_win", CTLFLAG_RW,
            &rack_default_init_window, 0,
            "Do we have a rack initial window 0 = system default");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "gp_per_ss", CTLFLAG_RW,
            &rack_per_of_gp_ss, 250,
            "If non zero, what percentage of goodput to pace at in slow start");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "gp_per_ca", CTLFLAG_RW,
            &rack_per_of_gp_ca, 150,
            "If non zero, what percentage of goodput to pace at in congestion avoidance");
        SYSCTL_ADD_U16(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "gp_per_rec", CTLFLAG_RW,
            &rack_per_of_gp_rec, 200,
            "If non zero, what percentage of goodput to pace at in recovery");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "pace_max_seg", CTLFLAG_RW,
            &rack_hptsi_segments, 40,
            "What size is the max for TSO segments in pacing and burst mitigation");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "burst_reduces", CTLFLAG_RW,
            &rack_slot_reduction, 4,
            "When doing only burst mitigation what is the reduce divisor");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO, "use_pacing", CTLFLAG_RW,
            &rack_pace_every_seg, 0,
            "If set we use pacing, if clear we use only the original burst mitigation");
        SYSCTL_ADD_U64(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_pacing),
            OID_AUTO, "rate_cap", CTLFLAG_RW,
            &rack_bw_rate_cap, 0,
            "If set we apply this value to the absolute rate cap used by pacing");
        SYSCTL_ADD_U8(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
            &rack_req_measurements, 1,
            "If doing dynamic pacing, how many measurements must be in before we start pacing?");
        /* Hardware pacing */
        rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "hdwr_pacing",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Pacing related Controls");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "rwnd_factor", CTLFLAG_RW,
            &rack_hw_rwnd_factor, 2,
            "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
            &rack_enobuf_hw_boost_mult, 2,
            "By how many time_betweens should we boost the pacing time if we see an ENOBUFS?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
            &rack_enobuf_hw_max, 2,
            "What is the max boost of the pacing time if we see an ENOBUFS?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
            &rack_enobuf_hw_min, 2,
            "What is the min boost of the pacing time if we see an ENOBUFS?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "enable", CTLFLAG_RW,
            &rack_enable_hw_pacing, 0,
            "Should RACK attempt to use hw pacing?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "rate_cap", CTLFLAG_RW,
            &rack_hw_rate_caps, 1,
            "Does the highest hardware pacing rate cap the rate we will send at?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "rate_min", CTLFLAG_RW,
            &rack_hw_rate_min, 0,
            "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "rate_to_low", CTLFLAG_RW,
            &rack_hw_rate_to_low, 0,
            "If we fall below this rate, dis-engage hw pacing?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "up_only", CTLFLAG_RW,
            &rack_hw_up_only, 1,
            "Do we allow hw pacing to lower the rate selected?");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_hw_pacing),
            OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
            &rack_hw_pace_extra_slots, 2,
            "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
        rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "timely",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Rack Timely RTT Controls");
        /* Timely based GP dynamics */
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upper", CTLFLAG_RW,
            &rack_gp_per_bw_mul_up, 2,
            "Rack timely upper range for equal b/w (in percentage)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "lower", CTLFLAG_RW,
            &rack_gp_per_bw_mul_down, 4,
            "Rack timely lower range for equal b/w (in percentage)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
            &rack_gp_rtt_maxmul, 3,
            "Rack timely multiplier of lowest rtt for rtt_max");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_min_div", CTLFLAG_RW,
            &rack_gp_rtt_mindiv, 4,
            "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
            &rack_gp_rtt_minmul, 1,
            "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "decrease", CTLFLAG_RW,
            &rack_gp_decrease_per, 20,
            "Rack timely decrease percentage of our GP multiplication factor");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "increase", CTLFLAG_RW,
            &rack_gp_increase_per, 2,
            "Rack timely increase percentage of our GP multiplication factor");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "lowerbound", CTLFLAG_RW,
            &rack_per_lower_bound, 50,
            "Rack timely lowest percentage we allow GP multiplier to fall to");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upperboundss", CTLFLAG_RW,
            &rack_per_upper_bound_ss, 0,
            "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upperboundca", CTLFLAG_RW,
            &rack_per_upper_bound_ca, 0,
            "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "dynamicgp", CTLFLAG_RW,
            &rack_do_dyn_mul, 0,
            "Rack timely do we enable dynamic timely goodput by default");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "no_rec_red", CTLFLAG_RW,
            &rack_gp_no_rec_chg, 1,
            "Rack timely do we prohibit the recovery multiplier from being lowered");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
            &rack_timely_dec_clear, 6,
            "Rack timely what threshold do we count to before another boost during b/w descent");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "max_push_rise", CTLFLAG_RW,
            &rack_timely_max_push_rise, 3,
            "Rack timely how many times do we push up with b/w increase");
1180         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1181             SYSCTL_CHILDREN(rack_timely),
1182             OID_AUTO, "max_push_drop", CTLFLAG_RW,
1183             &rack_timely_max_push_drop, 3,
1184             "Rack timely how many times do we push back on b/w decent");
1185         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1186             SYSCTL_CHILDREN(rack_timely),
1187             OID_AUTO, "min_segs", CTLFLAG_RW,
1188             &rack_timely_min_segs, 4,
1189             "Rack timely when setting the cwnd what is the min num segments");
1190         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1191             SYSCTL_CHILDREN(rack_timely),
1192             OID_AUTO, "noback_max", CTLFLAG_RW,
1193             &rack_use_max_for_nobackoff, 0,
1194             "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
1195         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1196             SYSCTL_CHILDREN(rack_timely),
1197             OID_AUTO, "interim_timely_only", CTLFLAG_RW,
1198             &rack_timely_int_timely_only, 0,
1199             "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
1200         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1201             SYSCTL_CHILDREN(rack_timely),
1202             OID_AUTO, "nonstop", CTLFLAG_RW,
1203             &rack_timely_no_stopping, 0,
1204             "Rack timely don't stop increase");
1205         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1206             SYSCTL_CHILDREN(rack_timely),
1207             OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
1208             &rack_down_raise_thresh, 100,
1209             "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
1210         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1211             SYSCTL_CHILDREN(rack_timely),
1212             OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
1213             &rack_req_segs, 1,
1214             "Bottom dragging if not these many segments outstanding and room");
1215
1216         /* TLP and Rack related parameters */
1217         rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1218             SYSCTL_CHILDREN(rack_sysctl_root),
1219             OID_AUTO,
1220             "tlp",
1221             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1222             "TLP and Rack related Controls");
1223         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1224             SYSCTL_CHILDREN(rack_tlp),
1225             OID_AUTO, "use_rrr", CTLFLAG_RW,
1226             &use_rack_rr, 1,
1227             "Do we use Rack Rapid Recovery");
1228         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1229             SYSCTL_CHILDREN(rack_tlp),
1230             OID_AUTO, "post_rec_labc", CTLFLAG_RW,
1231             &rack_max_abc_post_recovery, 2,
1232             "Since we do early recovery, do we override the l_abc to a value, if so what?");
1233         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1234             SYSCTL_CHILDREN(rack_tlp),
1235             OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
1236             &rack_non_rxt_use_cr, 0,
1237             "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
1238         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1239             SYSCTL_CHILDREN(rack_tlp),
1240             OID_AUTO, "tlpmethod", CTLFLAG_RW,
1241             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
1242             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
1243         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1244             SYSCTL_CHILDREN(rack_tlp),
1245             OID_AUTO, "limit", CTLFLAG_RW,
1246             &rack_tlp_limit, 2,
1247             "How many TLP's can be sent without sending new data");
1248         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1249             SYSCTL_CHILDREN(rack_tlp),
1250             OID_AUTO, "use_greater", CTLFLAG_RW,
1251             &rack_tlp_use_greater, 1,
1252             "Should we use the rack_rtt time if its greater than srtt");
1253         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1254             SYSCTL_CHILDREN(rack_tlp),
1255             OID_AUTO, "tlpminto", CTLFLAG_RW,
1256             &rack_tlp_min, 10000,
1257             "TLP minimum timeout per the specification (in microseconds)");
1258         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1259             SYSCTL_CHILDREN(rack_tlp),
1260             OID_AUTO, "send_oldest", CTLFLAG_RW,
1261             &rack_always_send_oldest, 0,
1262             "Should we always send the oldest TLP and RACK-TLP");
1263         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1264             SYSCTL_CHILDREN(rack_tlp),
1265             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
1266             &rack_limited_retran, 0,
1267             "How many times can a rack timeout drive out sends");
1268         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1269             SYSCTL_CHILDREN(rack_tlp),
1270             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
1271             &rack_lower_cwnd_at_tlp, 0,
1272             "When a TLP completes a retran should we enter recovery");
1273         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1274             SYSCTL_CHILDREN(rack_tlp),
1275             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1276             &rack_reorder_thresh, 2,
1277             "What factor for rack will be added when seeing reordering (shift right)");
1278         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1279             SYSCTL_CHILDREN(rack_tlp),
1280             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1281             &rack_tlp_thresh, 1,
1282             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1283         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1284             SYSCTL_CHILDREN(rack_tlp),
1285             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1286             &rack_reorder_fade, 60000000,
1287             "Does reorder detection fade, if so how many microseconds (0 means never)");
1288         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1289             SYSCTL_CHILDREN(rack_tlp),
1290             OID_AUTO, "pktdelay", CTLFLAG_RW,
1291             &rack_pkt_delay, 1000,
1292             "Extra RACK time (in microseconds) besides reordering thresh");
1293
1294         /* Timer related controls */
1295         rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1296             SYSCTL_CHILDREN(rack_sysctl_root),
1297             OID_AUTO,
1298             "timers",
1299             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1300             "Timer related controls");
1301         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1302             SYSCTL_CHILDREN(rack_timers),
1303             OID_AUTO, "persmin", CTLFLAG_RW,
1304             &rack_persist_min, 250000,
1305             "What is the minimum time in microseconds between persists");
1306         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1307             SYSCTL_CHILDREN(rack_timers),
1308             OID_AUTO, "persmax", CTLFLAG_RW,
1309             &rack_persist_max, 2000000,
1310             "What is the largest delay in microseconds between persists");
1311         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1312             SYSCTL_CHILDREN(rack_timers),
1313             OID_AUTO, "delayed_ack", CTLFLAG_RW,
1314             &rack_delayed_ack_time, 40000,
1315             "Delayed ack time (40ms in microseconds)");
1316         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1317             SYSCTL_CHILDREN(rack_timers),
1318             OID_AUTO, "minrto", CTLFLAG_RW,
1319             &rack_rto_min, 30000,
1320             "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
1321         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1322             SYSCTL_CHILDREN(rack_timers),
1323             OID_AUTO, "maxrto", CTLFLAG_RW,
1324             &rack_rto_max, 4000000,
1325             "Maximum RTO in microseconds -- should be at least as large as min_rto");
1326         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1327             SYSCTL_CHILDREN(rack_timers),
1328             OID_AUTO, "minto", CTLFLAG_RW,
1329             &rack_min_to, 1000,
1330             "Minimum rack timeout in microseconds");
1331         /* Measure controls */
1332         rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1333             SYSCTL_CHILDREN(rack_sysctl_root),
1334             OID_AUTO,
1335             "measure",
1336             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1337             "Measure related controls");
1338         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1339             SYSCTL_CHILDREN(rack_measure),
1340             OID_AUTO, "wma_divisor", CTLFLAG_RW,
1341             &rack_wma_divisor, 8,
1342             "When doing b/w calculation what is the  divisor for the WMA");
1343         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1344             SYSCTL_CHILDREN(rack_measure),
1345             OID_AUTO, "end_cwnd", CTLFLAG_RW,
1346             &rack_cwnd_block_ends_measure, 0,
1347             "Does a cwnd just-return end the measurement window (app limited)");
1348         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1349             SYSCTL_CHILDREN(rack_measure),
1350             OID_AUTO, "end_rwnd", CTLFLAG_RW,
1351             &rack_rwnd_block_ends_measure, 0,
1352             "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1353         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1354             SYSCTL_CHILDREN(rack_measure),
1355             OID_AUTO, "min_target", CTLFLAG_RW,
1356             &rack_def_data_window, 20,
1357             "What is the minimum target window (in mss) for a GP measurements");
1358         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1359             SYSCTL_CHILDREN(rack_measure),
1360             OID_AUTO, "goal_bdp", CTLFLAG_RW,
1361             &rack_goal_bdp, 2,
1362             "What is the goal BDP to measure");
1363         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1364             SYSCTL_CHILDREN(rack_measure),
1365             OID_AUTO, "min_srtts", CTLFLAG_RW,
1366             &rack_min_srtts, 1,
1367             "What is the goal BDP to measure");
1368         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1369             SYSCTL_CHILDREN(rack_measure),
1370             OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1371             &rack_min_measure_usec, 0,
1372             "What is the Minimum time time for a measurement if 0, this is off");
1373         /* Features */
1374         rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1375             SYSCTL_CHILDREN(rack_sysctl_root),
1376             OID_AUTO,
1377             "features",
1378             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1379             "Feature controls");
1380         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1381             SYSCTL_CHILDREN(rack_features),
1382             OID_AUTO, "cmpack", CTLFLAG_RW,
1383             &rack_use_cmp_acks, 1,
1384             "Should RACK have LRO send compressed acks");
1385         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1386             SYSCTL_CHILDREN(rack_features),
1387             OID_AUTO, "fsb", CTLFLAG_RW,
1388             &rack_use_fsb, 1,
1389             "Should RACK use the fast send block?");
1390         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1391             SYSCTL_CHILDREN(rack_features),
1392             OID_AUTO, "rfo", CTLFLAG_RW,
1393             &rack_use_rfo, 1,
1394             "Should RACK use rack_fast_output()?");
1395         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1396             SYSCTL_CHILDREN(rack_features),
1397             OID_AUTO, "rsmrfo", CTLFLAG_RW,
1398             &rack_use_rsm_rfo, 1,
1399             "Should RACK use rack_fast_rsm_output()?");
1400         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1401             SYSCTL_CHILDREN(rack_features),
1402             OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1403             &rack_enable_mqueue_for_nonpaced, 0,
1404             "Should RACK use mbuf queuing for non-paced connections");
1405         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1406             SYSCTL_CHILDREN(rack_features),
1407             OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1408             &rack_do_hystart, 0,
1409             "Should RACK enable HyStart++ on connections?");
1410         /* Misc rack controls */
1411         rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1412             SYSCTL_CHILDREN(rack_sysctl_root),
1413             OID_AUTO,
1414             "misc",
1415             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1416             "Misc related controls");
1417 #ifdef TCP_ACCOUNTING
1418         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1419             SYSCTL_CHILDREN(rack_misc),
1420             OID_AUTO, "tcp_acct", CTLFLAG_RW,
1421             &rack_tcp_accounting, 0,
1422             "Should we turn on TCP accounting for all rack sessions?");
1423 #endif
1424         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1425             SYSCTL_CHILDREN(rack_misc),
1426             OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
1427             &rack_apply_rtt_with_reduced_conf, 0,
1428             "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
1429         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1430             SYSCTL_CHILDREN(rack_misc),
1431             OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1432             &rack_dsack_std_based, 3,
1433             "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1434         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1435             SYSCTL_CHILDREN(rack_misc),
1436             OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1437             &rack_prr_addbackmax, 2,
1438             "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1439         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1440             SYSCTL_CHILDREN(rack_misc),
1441             OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1442             &rack_stats_gets_ms_rtt, 1,
1443             "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1444         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1445             SYSCTL_CHILDREN(rack_misc),
1446             OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1447             &rack_client_low_buf, 0,
1448             "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
1449         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1450             SYSCTL_CHILDREN(rack_misc),
1451             OID_AUTO, "defprofile", CTLFLAG_RW,
1452             &rack_def_profile, 0,
1453             "Should RACK use a default profile (0=no, num == profile num)?");
1454         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1455             SYSCTL_CHILDREN(rack_misc),
1456             OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1457             &rack_enable_shared_cwnd, 1,
1458             "Should RACK try to use the shared cwnd on connections where allowed");
1459         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1460             SYSCTL_CHILDREN(rack_misc),
1461             OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1462             &rack_limits_scwnd, 1,
1463             "Should RACK place low end time limits on the shared cwnd feature");
1464         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1465             SYSCTL_CHILDREN(rack_misc),
1466             OID_AUTO, "iMac_dack", CTLFLAG_RW,
1467             &rack_use_imac_dack, 0,
1468             "Should RACK try to emulate iMac delayed ack");
1469         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1470             SYSCTL_CHILDREN(rack_misc),
1471             OID_AUTO, "no_prr", CTLFLAG_RW,
1472             &rack_disable_prr, 0,
1473             "Should RACK not use prr and only pace (must have pacing on)");
1474         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1475             SYSCTL_CHILDREN(rack_misc),
1476             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1477             &rack_verbose_logging, 0,
1478             "Should RACK black box logging be verbose");
1479         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1480             SYSCTL_CHILDREN(rack_misc),
1481             OID_AUTO, "data_after_close", CTLFLAG_RW,
1482             &rack_ignore_data_after_close, 1,
1483             "Do we hold off sending a RST until all pending data is ack'd");
1484         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1485             SYSCTL_CHILDREN(rack_misc),
1486             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1487             &rack_sack_not_required, 1,
1488             "Do we allow rack to run on connections not supporting SACK");
1489         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1490             SYSCTL_CHILDREN(rack_misc),
1491             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1492             &rack_send_a_lot_in_prr, 1,
1493             "Send a lot in prr");
1494         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1495             SYSCTL_CHILDREN(rack_misc),
1496             OID_AUTO, "autoscale", CTLFLAG_RW,
1497             &rack_autosndbuf_inc, 20,
1498             "What percentage should rack scale up its snd buffer by?");
1499         /* Sack Attacker detection stuff */
1500         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1501             SYSCTL_CHILDREN(rack_attack),
1502             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
1503             &rack_highest_sack_thresh_seen, 0,
1504             "Highest sack to ack ratio seen");
1505         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1506             SYSCTL_CHILDREN(rack_attack),
1507             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
1508             &rack_highest_move_thresh_seen, 0,
1509             "Highest move to non-move ratio seen");
1510         rack_ack_total = counter_u64_alloc(M_WAITOK);
1511         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1512             SYSCTL_CHILDREN(rack_attack),
1513             OID_AUTO, "acktotal", CTLFLAG_RD,
1514             &rack_ack_total,
1515             "Total number of Ack's");
1516         rack_express_sack = counter_u64_alloc(M_WAITOK);
1517         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1518             SYSCTL_CHILDREN(rack_attack),
1519             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
1520             &rack_express_sack,
1521             "Total expresss number of Sack's");
1522         rack_sack_total = counter_u64_alloc(M_WAITOK);
1523         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1524             SYSCTL_CHILDREN(rack_attack),
1525             OID_AUTO, "sacktotal", CTLFLAG_RD,
1526             &rack_sack_total,
1527             "Total number of SACKs");
1528         rack_move_none = counter_u64_alloc(M_WAITOK);
1529         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1530             SYSCTL_CHILDREN(rack_attack),
1531             OID_AUTO, "move_none", CTLFLAG_RD,
1532             &rack_move_none,
1533             "Total number of SACK index reuse of postions under threshold");
1534         rack_move_some = counter_u64_alloc(M_WAITOK);
1535         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1536             SYSCTL_CHILDREN(rack_attack),
1537             OID_AUTO, "move_some", CTLFLAG_RD,
1538             &rack_move_some,
1539             "Total number of SACK index reuse of postions over threshold");
1540         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
1541         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1542             SYSCTL_CHILDREN(rack_attack),
1543             OID_AUTO, "attacks", CTLFLAG_RD,
1544             &rack_sack_attacks_detected,
1545             "Total number of SACK attackers that had sack disabled");
1546         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
1547         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1548             SYSCTL_CHILDREN(rack_attack),
1549             OID_AUTO, "reversed", CTLFLAG_RD,
1550             &rack_sack_attacks_reversed,
1551             "Total number of SACK attackers that were later determined false positive");
1552         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
1553         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1554             SYSCTL_CHILDREN(rack_attack),
1555             OID_AUTO, "nextmerge", CTLFLAG_RD,
1556             &rack_sack_used_next_merge,
1557             "Total number of times we used the next merge");
1558         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
1559         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1560             SYSCTL_CHILDREN(rack_attack),
1561             OID_AUTO, "prevmerge", CTLFLAG_RD,
1562             &rack_sack_used_prev_merge,
1563             "Total number of times we used the prev merge");
1564         /* Counters */
1565         rack_fto_send = counter_u64_alloc(M_WAITOK);
1566         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1567             SYSCTL_CHILDREN(rack_counters),
1568             OID_AUTO, "fto_send", CTLFLAG_RD,
1569             &rack_fto_send, "Total number of rack_fast_output sends");
1570         rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1571         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1572             SYSCTL_CHILDREN(rack_counters),
1573             OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1574             &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1575         rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1576         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1577             SYSCTL_CHILDREN(rack_counters),
1578             OID_AUTO, "nfto_resend", CTLFLAG_RD,
1579             &rack_nfto_resend, "Total number of rack_output retransmissions");
1580         rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1581         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1582             SYSCTL_CHILDREN(rack_counters),
1583             OID_AUTO, "nfto_send", CTLFLAG_RD,
1584             &rack_non_fto_send, "Total number of rack_output first sends");
1585         rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1586         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1587             SYSCTL_CHILDREN(rack_counters),
1588             OID_AUTO, "rfo_extended", CTLFLAG_RD,
1589             &rack_extended_rfo, "Total number of times we extended rfo");
1590
1591         rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1592         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1593             SYSCTL_CHILDREN(rack_counters),
1594             OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1595             &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1596         rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1597
1598         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1599             SYSCTL_CHILDREN(rack_counters),
1600             OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1601             &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
1602         rack_badfr = counter_u64_alloc(M_WAITOK);
1603         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1604             SYSCTL_CHILDREN(rack_counters),
1605             OID_AUTO, "badfr", CTLFLAG_RD,
1606             &rack_badfr, "Total number of bad FRs");
1607         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
1608         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1609             SYSCTL_CHILDREN(rack_counters),
1610             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
1611             &rack_badfr_bytes, "Total number of bad FRs");
1612         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
1613         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1614             SYSCTL_CHILDREN(rack_counters),
1615             OID_AUTO, "prrsndret", CTLFLAG_RD,
1616             &rack_rtm_prr_retran,
1617             "Total number of prr based retransmits");
1618         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
1619         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1620             SYSCTL_CHILDREN(rack_counters),
1621             OID_AUTO, "prrsndnew", CTLFLAG_RD,
1622             &rack_rtm_prr_newdata,
1623             "Total number of prr based new transmits");
1624         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
1625         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1626             SYSCTL_CHILDREN(rack_counters),
1627             OID_AUTO, "tsnf", CTLFLAG_RD,
1628             &rack_timestamp_mismatch,
1629             "Total number of timestamps that we could not find the reported ts");
1630         rack_find_high = counter_u64_alloc(M_WAITOK);
1631         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1632             SYSCTL_CHILDREN(rack_counters),
1633             OID_AUTO, "findhigh", CTLFLAG_RD,
1634             &rack_find_high,
1635             "Total number of FIN causing find-high");
1636         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
1637         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1638             SYSCTL_CHILDREN(rack_counters),
1639             OID_AUTO, "reordering", CTLFLAG_RD,
1640             &rack_reorder_seen,
1641             "Total number of times we added delay due to reordering");
1642         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1643         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1644             SYSCTL_CHILDREN(rack_counters),
1645             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1646             &rack_tlp_tot,
1647             "Total number of tail loss probe expirations");
1648         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1649         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1650             SYSCTL_CHILDREN(rack_counters),
1651             OID_AUTO, "tlp_new", CTLFLAG_RD,
1652             &rack_tlp_newdata,
1653             "Total number of tail loss probe sending new data");
1654         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1655         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1656             SYSCTL_CHILDREN(rack_counters),
1657             OID_AUTO, "tlp_retran", CTLFLAG_RD,
1658             &rack_tlp_retran,
1659             "Total number of tail loss probe sending retransmitted data");
1660         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1661         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1662             SYSCTL_CHILDREN(rack_counters),
1663             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1664             &rack_tlp_retran_bytes,
1665             "Total bytes of tail loss probe sending retransmitted data");
1666         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
1667         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1668             SYSCTL_CHILDREN(rack_counters),
1669             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
1670             &rack_tlp_retran_fail,
1671             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
1672         rack_to_tot = counter_u64_alloc(M_WAITOK);
1673         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1674             SYSCTL_CHILDREN(rack_counters),
1675             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1676             &rack_to_tot,
1677             "Total number of times the rack to expired");
1678         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
1679         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1680             SYSCTL_CHILDREN(rack_counters),
1681             OID_AUTO, "arm_rack", CTLFLAG_RD,
1682             &rack_to_arm_rack,
1683             "Total number of times the rack timer armed");
1684         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
1685         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1686             SYSCTL_CHILDREN(rack_counters),
1687             OID_AUTO, "arm_tlp", CTLFLAG_RD,
1688             &rack_to_arm_tlp,
1689             "Total number of times the tlp timer armed");
1690         rack_calc_zero = counter_u64_alloc(M_WAITOK);
1691         rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
1692         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1693             SYSCTL_CHILDREN(rack_counters),
1694             OID_AUTO, "calc_zero", CTLFLAG_RD,
1695             &rack_calc_zero,
1696             "Total number of times pacing time worked out to zero");
1697         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1698             SYSCTL_CHILDREN(rack_counters),
1699             OID_AUTO, "calc_nonzero", CTLFLAG_RD,
1700             &rack_calc_nonzero,
1701             "Total number of times pacing time worked out to non-zero");
1702         rack_paced_segments = counter_u64_alloc(M_WAITOK);
1703         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1704             SYSCTL_CHILDREN(rack_counters),
1705             OID_AUTO, "paced", CTLFLAG_RD,
1706             &rack_paced_segments,
1707             "Total number of times a segment send caused hptsi");
1708         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
1709         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1710             SYSCTL_CHILDREN(rack_counters),
1711             OID_AUTO, "unpaced", CTLFLAG_RD,
1712             &rack_unpaced_segments,
1713             "Total number of times a segment did not cause hptsi");
1714         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1715         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1716             SYSCTL_CHILDREN(rack_counters),
1717             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1718             &rack_saw_enobuf,
1719             "Total number of times a sends returned enobuf for non-hdwr paced connections");
1720         rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1721         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1722             SYSCTL_CHILDREN(rack_counters),
1723             OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1724             &rack_saw_enobuf_hw,
1725             "Total number of times a send returned enobuf for hdwr paced connections");
1726         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1727         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1728             SYSCTL_CHILDREN(rack_counters),
1729             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1730             &rack_saw_enetunreach,
1731             "Total number of times a send received a enetunreachable");
1732         rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1733         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1734             SYSCTL_CHILDREN(rack_counters),
1735             OID_AUTO, "alloc_hot", CTLFLAG_RD,
1736             &rack_hot_alloc,
1737             "Total allocations from the top of our list");
1738         rack_to_alloc = counter_u64_alloc(M_WAITOK);
1739         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1740             SYSCTL_CHILDREN(rack_counters),
1741             OID_AUTO, "allocs", CTLFLAG_RD,
1742             &rack_to_alloc,
1743             "Total allocations of tracking structures");
1744         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1745         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1746             SYSCTL_CHILDREN(rack_counters),
1747             OID_AUTO, "allochard", CTLFLAG_RD,
1748             &rack_to_alloc_hard,
1749             "Total allocations done with sleeping the hard way");
1750         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1751         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1752             SYSCTL_CHILDREN(rack_counters),
1753             OID_AUTO, "allocemerg", CTLFLAG_RD,
1754             &rack_to_alloc_emerg,
1755             "Total allocations done from emergency cache");
1756         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1757         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1758             SYSCTL_CHILDREN(rack_counters),
1759             OID_AUTO, "alloc_limited", CTLFLAG_RD,
1760             &rack_to_alloc_limited,
1761             "Total allocations dropped due to limit");
1762         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1763         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1764             SYSCTL_CHILDREN(rack_counters),
1765             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1766             &rack_alloc_limited_conns,
1767             "Connections with allocations dropped due to limit");
1768         rack_split_limited = counter_u64_alloc(M_WAITOK);
1769         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1770             SYSCTL_CHILDREN(rack_counters),
1771             OID_AUTO, "split_limited", CTLFLAG_RD,
1772             &rack_split_limited,
1773             "Split allocations dropped due to limit");
1774
1775         for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
1776                 char name[32];
1777                 snprintf(name, sizeof(name), "cmp_ack_cnt_%d", i);
1778                 rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK);
1779                 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1780                                        SYSCTL_CHILDREN(rack_counters),
1781                                        OID_AUTO, name, CTLFLAG_RD,
1782                                        &rack_proc_comp_ack[i],
1783                                        "Number of compressed acks we processed");
1784         }
1785         rack_large_ackcmp = counter_u64_alloc(M_WAITOK);
1786         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1787             SYSCTL_CHILDREN(rack_counters),
1788             OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
1789             &rack_large_ackcmp,
1790             "Number of TCP connections with large mbuf's for compressed acks");
1791         rack_persists_sends = counter_u64_alloc(M_WAITOK);
1792         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1793             SYSCTL_CHILDREN(rack_counters),
1794             OID_AUTO, "persist_sends", CTLFLAG_RD,
1795             &rack_persists_sends,
1796             "Number of times we sent a persist probe");
1797         rack_persists_acks = counter_u64_alloc(M_WAITOK);
1798         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1799             SYSCTL_CHILDREN(rack_counters),
1800             OID_AUTO, "persist_acks", CTLFLAG_RD,
1801             &rack_persists_acks,
1802             "Number of times a persist probe was acked");
1803         rack_persists_loss = counter_u64_alloc(M_WAITOK);
1804         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1805             SYSCTL_CHILDREN(rack_counters),
1806             OID_AUTO, "persist_loss", CTLFLAG_RD,
1807             &rack_persists_loss,
1808             "Number of times we detected a lost persist probe (no ack)");
1809         rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
1810         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1811             SYSCTL_CHILDREN(rack_counters),
1812             OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
1813             &rack_persists_lost_ends,
1814             "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
1815         rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
1816         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1817             SYSCTL_CHILDREN(rack_counters),
1818             OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD,
1819             &rack_small_ackcmp,
1820             "Number of TCP connections with small mbuf's for compressed acks");
1821 #ifdef INVARIANTS
1822         rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1823         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1824             SYSCTL_CHILDREN(rack_counters),
1825             OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1826             &rack_adjust_map_bw,
1827             "Number of times we hit the case where the sb went up and down on a sendmap entry");
1828 #endif
1829         rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1830         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1831             SYSCTL_CHILDREN(rack_counters),
1832             OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1833             &rack_multi_single_eq,
1834             "Number of compressed acks total represented");
1835         rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1836         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1837             SYSCTL_CHILDREN(rack_counters),
1838             OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1839             &rack_proc_non_comp_ack,
1840             "Number of non compresseds acks that we processed");
1841
1842
1843         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1844         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1845             SYSCTL_CHILDREN(rack_counters),
1846             OID_AUTO, "sack_long", CTLFLAG_RD,
1847             &rack_sack_proc_all,
1848             "Total times we had to walk whole list for sack processing");
1849         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1850         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1851             SYSCTL_CHILDREN(rack_counters),
1852             OID_AUTO, "sack_restart", CTLFLAG_RD,
1853             &rack_sack_proc_restart,
1854             "Total times we had to walk whole list due to a restart");
1855         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1856         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1857             SYSCTL_CHILDREN(rack_counters),
1858             OID_AUTO, "sack_short", CTLFLAG_RD,
1859             &rack_sack_proc_short,
1860             "Total times we took shortcut for sack processing");
1861         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
1862         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1863             SYSCTL_CHILDREN(rack_counters),
1864             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
1865             &rack_enter_tlp_calc,
1866             "Total times we called calc-tlp");
1867         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
1868         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1869             SYSCTL_CHILDREN(rack_counters),
1870             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
1871             &rack_used_tlpmethod,
1872             "Total number of runt sacks");
1873         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
1874         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1875             SYSCTL_CHILDREN(rack_counters),
1876             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
1877             &rack_used_tlpmethod2,
1878             "Total number of times we hit TLP method 2");
1879         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
1880         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1881             SYSCTL_CHILDREN(rack_attack),
1882             OID_AUTO, "skipacked", CTLFLAG_RD,
1883             &rack_sack_skipped_acked,
1884             "Total number of times we skipped previously sacked");
1885         rack_sack_splits = counter_u64_alloc(M_WAITOK);
1886         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1887             SYSCTL_CHILDREN(rack_attack),
1888             OID_AUTO, "ofsplit", CTLFLAG_RD,
1889             &rack_sack_splits,
1890             "Total number of times we did the old fashion tree split");
1891         rack_progress_drops = counter_u64_alloc(M_WAITOK);
1892         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1893             SYSCTL_CHILDREN(rack_counters),
1894             OID_AUTO, "prog_drops", CTLFLAG_RD,
1895             &rack_progress_drops,
1896             "Total number of progress drops");
1897         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1898         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1899             SYSCTL_CHILDREN(rack_counters),
1900             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1901             &rack_input_idle_reduces,
1902             "Total number of idle reductions on input");
1903         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1904         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1905             SYSCTL_CHILDREN(rack_counters),
1906             OID_AUTO, "collapsed_win", CTLFLAG_RD,
1907             &rack_collapsed_win,
1908             "Total number of collapsed windows");
1909         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
1910         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1911             SYSCTL_CHILDREN(rack_counters),
1912             OID_AUTO, "tlp_nada", CTLFLAG_RD,
1913             &rack_tlp_does_nada,
1914             "Total number of nada tlp calls");
1915         rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1916         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1917             SYSCTL_CHILDREN(rack_counters),
1918             OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1919             &rack_try_scwnd,
1920             "Total number of scwnd attempts");
1921
1922         rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1923         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1924             SYSCTL_CHILDREN(rack_counters),
1925             OID_AUTO, "timer_hole", CTLFLAG_RD,
1926             &rack_per_timer_hole,
1927             "Total persists start in timer hole");
1928
1929         rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK);
1930         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1931             SYSCTL_CHILDREN(rack_counters),
1932             OID_AUTO, "sndptr_wrong", CTLFLAG_RD,
1933             &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorret");
1934         rack_sbsndptr_right = counter_u64_alloc(M_WAITOK);
1935         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1936             SYSCTL_CHILDREN(rack_counters),
1937             OID_AUTO, "sndptr_right", CTLFLAG_RD,
1938             &rack_sbsndptr_right, "Total number of times the saved sbsndptr was corret");
1939
1940         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1941         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1942             OID_AUTO, "outsize", CTLFLAG_RD,
1943             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1944         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1945         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1946             OID_AUTO, "opts", CTLFLAG_RD,
1947             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1948         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1949             SYSCTL_CHILDREN(rack_sysctl_root),
1950             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1951             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1952 }
1953
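/*
 * Illustrative userland sketch (not part of this file): reading one of the
 * counters registered above via sysctlbyname(3).  The OID path is an
 * assumption; it presumes the rack stack's root node is published as
 * net.inet.tcp.rack, with the "counters" child added above.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t sends;
	size_t len = sizeof(sends);

	/* Hypothetical OID path; a counter_u64 reads back as a uint64_t. */
	if (sysctlbyname("net.inet.tcp.rack.counters.fto_send",
	    &sends, &len, NULL, 0) == 0)
		printf("rack_fast_output sends: %ju\n", (uintmax_t)sends);
	return (0);
}
#endif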
1954 static __inline int
1955 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1956 {
1957         if (SEQ_GEQ(b->r_start, a->r_start) &&
1958             SEQ_LT(b->r_start, a->r_end)) {
1959                 /*
1960                  * The entry b is within the
1961                  * block a. i.e.:
1962                  * a --   |-------------|
1963                  * b --   |----|
1964                  * <or>
1965                  * b --       |------|
1966                  * <or>
1967                  * b --       |-----------|
1968                  */
1969                 return (0);
1970         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1971                 /*
1972                  * b falls at or beyond the end
1973                  * of a, so a is said to be
1974                  * smaller than b.
1975                  * i.e.:
1976                  * a --   |------|
1977                  * b --          |--------|
1978                  * or
1979                  * b --              |-----|
1980                  */
1981                 return (1);
1982         }
1983         /*
1984          * What's left is where a is
1985          * larger than b. i.e:
1986          * a --         |-------|
1987          * b --  |---|
1988          * or even possibly
1989          * b --   |--------------|
1990          */
1991         return (-1);
1992 }
1993
1994 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1995 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
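/*
 * A minimal sketch (hypothetical helper, not part of the original file) of
 * how the comparator above is typically used: rb_map_cmp() returns 0 when
 * the key's r_start falls within an entry's [r_start, r_end), so a lookup
 * key only needs r_start filled in.
 */
#if 0
static struct rack_sendmap *
rack_find_rsm_sketch(struct rack_rb_tree_head *head, uint32_t seq)
{
	struct rack_sendmap key;

	memset(&key, 0, sizeof(key));
	key.r_start = seq;	/* rb_map_cmp() never reads the key's r_end */
	return (RB_FIND(rack_rb_tree_head, head, &key));
}
#endif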
1996
1997 static uint32_t
1998 rc_init_window(struct tcp_rack *rack)
1999 {
2000         uint32_t win;
2001
2002         if (rack->rc_init_win == 0) {
2003                 /*
2004                  * Nothing set by the user, use the system stack
2005                  * default.
2006                  */
2007                 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
2008         }
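	/* For example, rc_init_win = 20 with a 1448-byte fixed maxseg gives win = 28960 bytes. */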
2009         win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
2010         return (win);
2011 }
2012
2013 static uint64_t
2014 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
2015 {
2016         if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
2017                 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
2018         else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2019                 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
2020         else
2021                 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
2022 }
2023
2024 static uint64_t
2025 rack_get_bw(struct tcp_rack *rack)
2026 {
2027         if (rack->use_fixed_rate) {
2028                 /* Return the fixed pacing rate */
2029                 return (rack_get_fixed_pacing_bw(rack));
2030         }
2031         if (rack->r_ctl.gp_bw == 0) {
2032                 /*
2033                  * We have no b/w measurement yet;
2034                  * if we have a user set initial bw
2035                  * return it. If we don't have that and
2036                  * we have an srtt, use the tcp IW (10) to
2037                  * calculate a fictional b/w over the SRTT
2038                  * which is more or less a guess. Note
2039                  * we don't use our IW from rack on purpose
2040                  * so if we have, say, IW=30, we are not
2041                  * calculating a "huge" b/w.
2042                  */
2043                 uint64_t bw, srtt;
2044                 if (rack->r_ctl.init_rate)
2045                         return (rack->r_ctl.init_rate);
2046
2047                 /* Has the user set a max peak rate? */
2048 #ifdef NETFLIX_PEAKRATE
2049                 if (rack->rc_tp->t_maxpeakrate)
2050                         return (rack->rc_tp->t_maxpeakrate);
2051 #endif
2052                 /* Ok let's come up with the IW guess, if we have an srtt */
2053                 if (rack->rc_tp->t_srtt == 0) {
2054                         /*
2055                          * Go with old pacing method
2056                          * i.e. burst mitigation only.
2057                          */
2058                         return (0);
2059                 }
2060                 /* Ok let's get the initial TCP win (not rack's) */
2061                 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
2062                 srtt = (uint64_t)rack->rc_tp->t_srtt;
2063                 bw *= (uint64_t)USECS_IN_SECOND;
2064                 bw /= srtt;
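		/*
		 * Worked example: a 14600-byte initial window (10 * 1460)
		 * over a 50000-usec srtt gives 14600 * 1000000 / 50000 =
		 * 292000 bytes/sec (roughly 2.3 Mb/s).
		 */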
2065                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
2066                         bw = rack->r_ctl.bw_rate_cap;
2067                 return (bw);
2068         } else {
2069                 uint64_t bw;
2070
2071                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
2072                         /* Averaging is done, we can return the value */
2073                         bw = rack->r_ctl.gp_bw;
2074                 } else {
2075                         /* Still doing initial average, must calculate */
2076                         bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
2077                 }
2078 #ifdef NETFLIX_PEAKRATE
2079                 if ((rack->rc_tp->t_maxpeakrate) &&
2080                     (bw > rack->rc_tp->t_maxpeakrate)) {
2081                         /* The user has set a peak rate to pace at;
2082                          * don't allow us to pace faster than that.
2083                          */
2084                         return (rack->rc_tp->t_maxpeakrate);
2085                 }
2086 #endif
2087                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
2088                         bw = rack->r_ctl.bw_rate_cap;
2089                 return (bw);
2090         }
2091 }
2092
2093 static uint16_t
2094 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
2095 {
2096         if (rack->use_fixed_rate) {
2097                 return (100);
2098         } else if (rack->in_probe_rtt && (rsm == NULL))
2099                 return (rack->r_ctl.rack_per_of_gp_probertt);
2100         else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
2101                   rack->r_ctl.rack_per_of_gp_rec)) {
2102                 if (rsm) {
2103                         /* a retransmission always uses the recovery rate */
2104                         return (rack->r_ctl.rack_per_of_gp_rec);
2105                 } else if (rack->rack_rec_nonrxt_use_cr) {
2106                         /* Directed to use the configured rate */
2107                         goto configured_rate;
2108                 } else if (rack->rack_no_prr &&
2109                            (rack->r_ctl.rack_per_of_gp_rec > 100)) {
2110                         /* No PRR, lets just use the b/w estimate only */
2111                         return (100);
2112                 } else {
2113                         /*
2114                          * Here we may have a non-retransmit but we
2115                          * have no overrides, so just use the recovery
2116                          * rate (prr is in effect).
2117                          */
2118                         return (rack->r_ctl.rack_per_of_gp_rec);
2119                 }
2120         }
2121 configured_rate:
2122         /* For the configured rate we look at our cwnd vs the ssthresh */
2123         if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2124                 return (rack->r_ctl.rack_per_of_gp_ss);
2125         else
2126                 return (rack->r_ctl.rack_per_of_gp_ca);
2127 }
2128
2129 static void
2130 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
2131 {
2132         /*
2133          * Types of logs (mod value)
2134          * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
2135          * 2 = a dsack round begins, persist is reset to 16.
2136          * 3 = a dsack round ends
2137          * 4 = Dsack option increases rack rtt; flex5 is the srtt input, flex6 is thresh
2138          * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
2139          * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
2140          */
2141         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2142                 union tcp_log_stackspecific log;
2143                 struct timeval tv;
2144
2145                 memset(&log, 0, sizeof(log));
2146                 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
2147                 log.u_bbr.flex1 <<= 1;
2148                 log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
2149                 log.u_bbr.flex1 <<= 1;
2150                 log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
2151                 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
2152                 log.u_bbr.flex3 = rack->r_ctl.num_dsack;
2153                 log.u_bbr.flex4 = flex4;
2154                 log.u_bbr.flex5 = flex5;
2155                 log.u_bbr.flex6 = flex6;
2156                 log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
2157                 log.u_bbr.flex8 = mod;
2158                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2159                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2160                     &rack->rc_inp->inp_socket->so_rcv,
2161                     &rack->rc_inp->inp_socket->so_snd,
2162                     RACK_DSACK_HANDLING, 0,
2163                     0, &log, false, &tv);
2164         }
2165 }
2166
2167 static void
2168 rack_log_hdwr_pacing(struct tcp_rack *rack,
2169                      uint64_t rate, uint64_t hw_rate, int line,
2170                      int error, uint16_t mod)
2171 {
2172         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2173                 union tcp_log_stackspecific log;
2174                 struct timeval tv;
2175                 const struct ifnet *ifp;
2176
2177                 memset(&log, 0, sizeof(log));
2178                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2179                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2180                 if (rack->r_ctl.crte) {
2181                         ifp = rack->r_ctl.crte->ptbl->rs_ifp;
2182                 } else if (rack->rc_inp->inp_route.ro_nh &&
2183                            rack->rc_inp->inp_route.ro_nh->nh_ifp) {
2184                         ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
2185                 } else
2186                         ifp = NULL;
2187                 if (ifp) {
2188                         log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
2189                         log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2190                 }
2191                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2192                 log.u_bbr.bw_inuse = rate;
2193                 log.u_bbr.flex5 = line;
2194                 log.u_bbr.flex6 = error;
2195                 log.u_bbr.flex7 = mod;
2196                 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
2197                 log.u_bbr.flex8 = rack->use_fixed_rate;
2198                 log.u_bbr.flex8 <<= 1;
2199                 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
2200                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
2201                 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
2202                 if (rack->r_ctl.crte)
2203                         log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
2204                 else
2205                         log.u_bbr.cur_del_rate = 0;
2206                 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
2207                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2208                     &rack->rc_inp->inp_socket->so_rcv,
2209                     &rack->rc_inp->inp_socket->so_snd,
2210                     BBR_LOG_HDWR_PACE, 0,
2211                     0, &log, false, &tv);
2212         }
2213 }
2214
2215 static uint64_t
2216 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2217 {
2218         /*
2219          * We allow rack_per_of_gp_xx to dictate the b/w rate we want.
2220          */
2221         uint64_t bw_est, high_rate;
2222         uint64_t gain;
2223
2224         gain = (uint64_t)rack_get_output_gain(rack, rsm);
2225         bw_est = bw * gain;
2226         bw_est /= (uint64_t)100;
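	/*
	 * Example: bw = 1000000 bytes/sec with a gain of 250 (percent)
	 * yields bw_est = 1000000 * 250 / 100 = 2500000 bytes/sec.
	 */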
2227         /* Never fall below the minimum (def 64kbps) */
2228         if (bw_est < RACK_MIN_BW)
2229                 bw_est = RACK_MIN_BW;
2230         if (rack->r_rack_hw_rate_caps) {
2231                 /* Rate caps are in place */
2232                 if (rack->r_ctl.crte != NULL) {
2233                         /* We have a hdwr rate already */
2234                         high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2235                         if (bw_est >= high_rate) {
2236                                 /* We are capping bw at the highest rate table entry */
2237                                 rack_log_hdwr_pacing(rack,
2238                                                      bw_est, high_rate, __LINE__,
2239                                                      0, 3);
2240                                 bw_est = high_rate;
2241                                 if (capped)
2242                                         *capped = 1;
2243                         }
2244                 } else if ((rack->rack_hdrw_pacing == 0) &&
2245                            (rack->rack_hdw_pace_ena) &&
2246                            (rack->rack_attempt_hdwr_pace == 0) &&
2247                            (rack->rc_inp->inp_route.ro_nh != NULL) &&
2248                            (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2249                         /*
2250                          * Special case: we have not yet attempted hardware
2251                          * pacing, but when we do we may find we are above
2252                          * the highest rate. We need to know the max bw for
2253                          * the interface in question (if it supports
2254                          * ratelimiting). We get back 0 if the interface is
2255                          * not found in the RL lists.
2255                          */
2256                         high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2257                         if (high_rate) {
2258                                 /* Yep, we have a rate; is bw_est above it? */
2259                                 if (bw_est > high_rate) {
2260                                         bw_est = high_rate;
2261                                         if (capped)
2262                                                 *capped = 1;
2263                                 }
2264                         }
2265                 }
2266         }
2267         return (bw_est);
2268 }
2269
2270 static void
2271 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
2272 {
2273         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2274                 union tcp_log_stackspecific log;
2275                 struct timeval tv;
2276
2277                 if ((mod != 1) && (rack_verbose_logging == 0)) {
2278                         /*
2279                          * We get 3 values currently for mod
2280                          * 1 - We are retransmitting and this tells the reason.
2281                          * 2 - We are clearing a dup-ack count.
2282                          * 3 - We are incrementing a dup-ack count.
2283                          *
2284                          * The clear/increment are only logged
2285                          * if you have BBverbose on.
2286                          */
2287                         return;
2288                 }
2289                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2290                 log.u_bbr.flex1 = tsused;
2291                 log.u_bbr.flex2 = thresh;
2292                 log.u_bbr.flex3 = rsm->r_flags;
2293                 log.u_bbr.flex4 = rsm->r_dupack;
2294                 log.u_bbr.flex5 = rsm->r_start;
2295                 log.u_bbr.flex6 = rsm->r_end;
2296                 log.u_bbr.flex8 = mod;
2297                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2298                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2299                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2300                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2301                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2302                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2303                 log.u_bbr.pacing_gain = rack->r_must_retran;
2304                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2305                     &rack->rc_inp->inp_socket->so_rcv,
2306                     &rack->rc_inp->inp_socket->so_snd,
2307                     BBR_LOG_SETTINGS_CHG, 0,
2308                     0, &log, false, &tv);
2309         }
2310 }
2311
2312 static void
2313 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2314 {
2315         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2316                 union tcp_log_stackspecific log;
2317                 struct timeval tv;
2318
2319                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2320                 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2321                 log.u_bbr.flex2 = to;
2322                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2323                 log.u_bbr.flex4 = slot;
2324                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
2325                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2326                 log.u_bbr.flex7 = rack->rc_in_persist;
2327                 log.u_bbr.flex8 = which;
2328                 if (rack->rack_no_prr)
2329                         log.u_bbr.pkts_out = 0;
2330                 else
2331                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2332                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2333                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2334                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2335                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
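                /* Note: this overwrites the PRR sndcnt stored in pkts_out above */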
2336                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2337                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2338                 log.u_bbr.pacing_gain = rack->r_must_retran;
2339                 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2340                 log.u_bbr.lost = rack_rto_min;
2341                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2342                     &rack->rc_inp->inp_socket->so_rcv,
2343                     &rack->rc_inp->inp_socket->so_snd,
2344                     BBR_LOG_TIMERSTAR, 0,
2345                     0, &log, false, &tv);
2346         }
2347 }
2348
2349 static void
2350 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2351 {
2352         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2353                 union tcp_log_stackspecific log;
2354                 struct timeval tv;
2355
2356                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2357                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2358                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2359                 log.u_bbr.flex8 = to_num;
2360                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2361                 log.u_bbr.flex2 = rack->rc_rack_rtt;
2362                 if (rsm == NULL)
2363                         log.u_bbr.flex3 = 0;
2364                 else
2365                         log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2366                 if (rack->rack_no_prr)
2367                         log.u_bbr.flex5 = 0;
2368                 else
2369                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2370                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2371                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2372                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2373                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2374                 log.u_bbr.pacing_gain = rack->r_must_retran;
2375                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2376                     &rack->rc_inp->inp_socket->so_rcv,
2377                     &rack->rc_inp->inp_socket->so_snd,
2378                     BBR_LOG_RTO, 0,
2379                     0, &log, false, &tv);
2380         }
2381 }
2382
2383 static void
2384 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
2385                  struct rack_sendmap *prev,
2386                  struct rack_sendmap *rsm,
2387                  struct rack_sendmap *next,
2388                  int flag, uint32_t th_ack, int line)
2389 {
2390         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2391                 union tcp_log_stackspecific log;
2392                 struct timeval tv;
2393
2394                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2395                 log.u_bbr.flex8 = flag;
2396                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2397                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2398                 log.u_bbr.cur_del_rate = (uint64_t)prev;
2399                 log.u_bbr.delRate = (uint64_t)rsm;
2400                 log.u_bbr.rttProp = (uint64_t)next;
2401                 log.u_bbr.flex7 = 0;
2402                 if (prev) {
2403                         log.u_bbr.flex1 = prev->r_start;
2404                         log.u_bbr.flex2 = prev->r_end;
2405                         log.u_bbr.flex7 |= 0x4;
2406                 }
2407                 if (rsm) {
2408                         log.u_bbr.flex3 = rsm->r_start;
2409                         log.u_bbr.flex4 = rsm->r_end;
2410                         log.u_bbr.flex7 |= 0x2;
2411                 }
2412                 if (next) {
2413                         log.u_bbr.flex5 = next->r_start;
2414                         log.u_bbr.flex6 = next->r_end;
2415                         log.u_bbr.flex7 |= 0x1;
2416                 }
2417                 log.u_bbr.applimited = line;
2418                 log.u_bbr.pkts_out = th_ack;
2419                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2420                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2421                 if (rack->rack_no_prr)
2422                         log.u_bbr.lost = 0;
2423                 else
2424                         log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
2425                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2426                     &rack->rc_inp->inp_socket->so_rcv,
2427                     &rack->rc_inp->inp_socket->so_snd,
2428                     TCP_LOG_MAPCHG, 0,
2429                     0, &log, false, &tv);
2430         }
2431 }
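/*
 * For readers of TCP_LOG_MAPCHG records: flex7 above is a presence mask
 * for the map pointers logged (0x4 = prev, 0x2 = rsm, 0x1 = next), with
 * the matching start/end sequence numbers in flex1/2, flex3/4 and
 * flex5/6 respectively.
 */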
2432
2433 static void
2434 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
2435                  struct rack_sendmap *rsm, int conf)
2436 {
2437         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2438                 union tcp_log_stackspecific log;
2439                 struct timeval tv;
2440                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2441                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2442                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2443                 log.u_bbr.flex1 = t;
2444                 log.u_bbr.flex2 = len;
2445                 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
2446                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
2447                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
2448                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2449                 log.u_bbr.flex7 = conf;
2450                 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
2451                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
2452                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2453                 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2454                 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
2455                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2456                 if (rsm) {
2457                         log.u_bbr.pkt_epoch = rsm->r_start;
2458                         log.u_bbr.lost = rsm->r_end;
2459                         log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
2460                         /* We lose any flag bits above the low 16 */
2461                         log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
2462                 } else {
2463                         /* It's a SYN */
2464                         log.u_bbr.pkt_epoch = rack->rc_tp->iss;
2465                         log.u_bbr.lost = 0;
2466                         log.u_bbr.cwnd_gain = 0;
2467                         log.u_bbr.pacing_gain = 0;
2468                 }
2469                 /* Write out general bits of interest rrs here */
2470                 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
2471                 log.u_bbr.use_lt_bw <<= 1;
2472                 log.u_bbr.use_lt_bw |= rack->forced_ack;
2473                 log.u_bbr.use_lt_bw <<= 1;
2474                 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
2475                 log.u_bbr.use_lt_bw <<= 1;
2476                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
2477                 log.u_bbr.use_lt_bw <<= 1;
2478                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
2479                 log.u_bbr.use_lt_bw <<= 1;
2480                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
2481                 log.u_bbr.use_lt_bw <<= 1;
2482                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
2483                 log.u_bbr.use_lt_bw <<= 1;
2484                 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
2485                 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
2486                 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
2487                 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
2488                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
2489                 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
2490                 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
2491                 log.u_bbr.bw_inuse <<= 32;
2492                 if (rsm)
2493                         log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
2494                 TCP_LOG_EVENTP(tp, NULL,
2495                     &rack->rc_inp->inp_socket->so_rcv,
2496                     &rack->rc_inp->inp_socket->so_snd,
2497                     BBR_LOG_BBRRTT, 0,
2498                     0, &log, false, &tv);
2499
2500
2501         }
2502 }
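/*
 * For readers of the BBRRTT records: the eight flag bits packed into
 * use_lt_bw above decode as (bit 7 is the first one stored):
 *   bit 7: rc_highly_buffered      bit 3: measure_saw_probe_rtt
 *   bit 6: forced_ack              bit 2: app_limited_needs_set
 *   bit 5: rc_gp_dyn_mul           bit 1: rc_gp_filled
 *   bit 4: in_probe_rtt            bit 0: rc_dragged_bottom
 */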
2503
2504 static void
2505 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
2506 {
2507         /*
2508          * Log the rtt sample we are
2509          * applying to the srtt algorithm in
2510          * useconds.
2511          */
2512         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2513                 union tcp_log_stackspecific log;
2514                 struct timeval tv;
2515
2516                 /* Note: the rtt passed in is already in microseconds */
2517                 memset(&log, 0, sizeof(log));
2518                 log.u_bbr.flex1 = rtt;
2519                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2520                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
2521                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2522                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
2523                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2524                 log.u_bbr.flex7 = 1;
2525                 log.u_bbr.flex8 = rack->sack_attack_disable;
2526                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2527                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2528                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2529                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2530                 log.u_bbr.pacing_gain = rack->r_must_retran;
2531                 /*
2532                  * We capture in delRate the upper 32 bits as
2533                  * the confidence level we had declared, and the
2534                  * lower 32 bits as the actual RTT using the arrival
2535                  * timestamp.
2536                  */
2537                 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
2538                 log.u_bbr.delRate <<= 32;
2539                 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
2540                 /* Let's capture all the things that make up t_rxtcur */
2541                 log.u_bbr.applimited = rack_rto_min;
2542                 log.u_bbr.epoch = rack_rto_max;
2543                 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
2544                 log.u_bbr.lost = rack_rto_min;
2545                 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
2546                 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
2547                 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
2548                 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
2549                 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
2550                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2551                     &rack->rc_inp->inp_socket->so_rcv,
2552                     &rack->rc_inp->inp_socket->so_snd,
2553                     TCP_LOG_RTT, 0,
2554                     0, &log, false, &tv);
2555         }
2556 }
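/*
 * A consumer of a TCP_LOG_RTT record can unpack delRate as described in
 * the comment above, e.g. (hypothetical decoding):
 *
 *     uint32_t conf   = (uint32_t)(delRate >> 32);        // confidence
 *     uint32_t rtt_us = (uint32_t)(delRate & 0xffffffff); // rs_us_rtt
 */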
2557
2558 static void
2559 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
2560 {
2561         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2562                 union tcp_log_stackspecific log;
2563                 struct timeval tv;
2564
2565                 /* Note: these values are already in microseconds */
2566                 memset(&log, 0, sizeof(log));
2567                 log.u_bbr.flex1 = rtt;
2568                 log.u_bbr.flex2 = send_time;
2569                 log.u_bbr.flex3 = ack_time;
2570                 log.u_bbr.flex4 = where;
2571                 log.u_bbr.flex7 = 2;
2572                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2573                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2574                     &rack->rc_inp->inp_socket->so_rcv,
2575                     &rack->rc_inp->inp_socket->so_snd,
2576                     TCP_LOG_RTT, 0,
2577                     0, &log, false, &tv);
2578         }
2579 }
2580
2581
2582
2583 static inline void
2584 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
2585 {
2586         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2587                 union tcp_log_stackspecific log;
2588                 struct timeval tv;
2589
2590                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2591                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2592                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2593                 log.u_bbr.flex1 = line;
2594                 log.u_bbr.flex2 = tick;
2595                 log.u_bbr.flex3 = tp->t_maxunacktime;
2596                 log.u_bbr.flex4 = tp->t_acktime;
2597                 log.u_bbr.flex8 = event;
2598                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2599                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2600                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2601                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2602                 log.u_bbr.pacing_gain = rack->r_must_retran;
2603                 TCP_LOG_EVENTP(tp, NULL,
2604                     &rack->rc_inp->inp_socket->so_rcv,
2605                     &rack->rc_inp->inp_socket->so_snd,
2606                     BBR_LOG_PROGRESS, 0,
2607                     0, &log, false, &tv);
2608         }
2609 }
2610
2611 static void
2612 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
2613 {
2614         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2615                 union tcp_log_stackspecific log;
2616
2617                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2618                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2619                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2620                 log.u_bbr.flex1 = slot;
2621                 if (rack->rack_no_prr)
2622                         log.u_bbr.flex2 = 0;
2623                 else
2624                         log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
2625                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
2626                 log.u_bbr.flex8 = rack->rc_in_persist;
2627                 log.u_bbr.timeStamp = cts;
2628                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2629                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2630                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2631                 log.u_bbr.pacing_gain = rack->r_must_retran;
2632                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2633                     &rack->rc_inp->inp_socket->so_rcv,
2634                     &rack->rc_inp->inp_socket->so_snd,
2635                     BBR_LOG_BBRSND, 0,
2636                     0, &log, false, tv);
2637         }
2638 }
2639
2640 static void
2641 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
2642 {
2643         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2644                 union tcp_log_stackspecific log;
2645                 struct timeval tv;
2646
2647                 memset(&log, 0, sizeof(log));
2648                 log.u_bbr.flex1 = did_out;
2649                 log.u_bbr.flex2 = nxt_pkt;
2650                 log.u_bbr.flex3 = way_out;
2651                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2652                 if (rack->rack_no_prr)
2653                         log.u_bbr.flex5 = 0;
2654                 else
2655                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2656                 log.u_bbr.flex6 = nsegs;
2657                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
2658                 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;        /* Do we have ack-can-send set */
2659                 log.u_bbr.flex7 <<= 1;
2660                 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */
2661                 log.u_bbr.flex7 <<= 1;
2662                 log.u_bbr.flex7 |= rack->r_wanted_output;       /* Do we want output */
2663                 log.u_bbr.flex8 = rack->rc_in_persist;
2664                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2665                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2666                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2667                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2668                 log.u_bbr.use_lt_bw <<= 1;
2669                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2670                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2671                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2672                 log.u_bbr.pacing_gain = rack->r_must_retran;
2673                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2674                     &rack->rc_inp->inp_socket->so_rcv,
2675                     &rack->rc_inp->inp_socket->so_snd,
2676                     BBR_LOG_DOSEG_DONE, 0,
2677                     0, &log, false, &tv);
2678         }
2679 }
2680
2681 static void
2682 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
2683 {
2684         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2685                 union tcp_log_stackspecific log;
2686                 struct timeval tv;
2687                 uint32_t cts;
2688
2689                 memset(&log, 0, sizeof(log));
2690                 cts = tcp_get_usecs(&tv);
2691                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
2692                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
2693                 log.u_bbr.flex4 = arg1;
2694                 log.u_bbr.flex5 = arg2;
2695                 log.u_bbr.flex6 = arg3;
2696                 log.u_bbr.flex8 = frm;
2697                 log.u_bbr.timeStamp = cts;
2698                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2699                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2700                 log.u_bbr.applimited = rack->r_ctl.rc_sacked;
2701                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2702                 log.u_bbr.pacing_gain = rack->r_must_retran;
2703                 TCP_LOG_EVENTP(tp, NULL,
2704                     &tp->t_inpcb->inp_socket->so_rcv,
2705                     &tp->t_inpcb->inp_socket->so_snd,
2706                     TCP_HDWR_PACE_SIZE, 0,
2707                     0, &log, false, &tv);
2708         }
2709 }
2710
2711 static void
2712 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
2713                           uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
2714 {
2715         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2716                 union tcp_log_stackspecific log;
2717                 struct timeval tv;
2718
2719                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2720                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2721                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2722                 log.u_bbr.flex1 = slot;
2723                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
2724                 log.u_bbr.flex4 = reason;
2725                 if (rack->rack_no_prr)
2726                         log.u_bbr.flex5 = 0;
2727                 else
2728                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2729                 log.u_bbr.flex7 = hpts_calling;
2730                 log.u_bbr.flex8 = rack->rc_in_persist;
2731                 log.u_bbr.lt_epoch = cwnd_to_use;
2732                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2733                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2734                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2735                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2736                 log.u_bbr.pacing_gain = rack->r_must_retran;
2737                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2738                     &rack->rc_inp->inp_socket->so_rcv,
2739                     &rack->rc_inp->inp_socket->so_snd,
2740                     BBR_LOG_JUSTRET, 0,
2741                     tlen, &log, false, &tv);
2742         }
2743 }
2744
2745 static void
2746 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
2747                    struct timeval *tv, uint32_t flags_on_entry)
2748 {
2749         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2750                 union tcp_log_stackspecific log;
2751
2752                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2753                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2754                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2755                 log.u_bbr.flex1 = line;
2756                 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
2757                 log.u_bbr.flex3 = flags_on_entry;
2758                 log.u_bbr.flex4 = us_cts;
2759                 if (rack->rack_no_prr)
2760                         log.u_bbr.flex5 = 0;
2761                 else
2762                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2763                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2764                 log.u_bbr.flex7 = hpts_removed;
2765                 log.u_bbr.flex8 = 1;
2766                 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
2767                 log.u_bbr.timeStamp = us_cts;
2768                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2769                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2770                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2771                 log.u_bbr.pacing_gain = rack->r_must_retran;
2772                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2773                     &rack->rc_inp->inp_socket->so_rcv,
2774                     &rack->rc_inp->inp_socket->so_snd,
2775                     BBR_LOG_TIMERCANC, 0,
2776                     0, &log, false, tv);
2777         }
2778 }
2779
2780 static void
2781 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
2782                           uint32_t flex1, uint32_t flex2,
2783                           uint32_t flex3, uint32_t flex4,
2784                           uint32_t flex5, uint32_t flex6,
2785                           uint16_t flex7, uint8_t mod)
2786 {
2787         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2788                 union tcp_log_stackspecific log;
2789                 struct timeval tv;
2790
2791                 if (mod == 1) {
2792                         /* No, you can't use 1; it's reserved for the real timeout cancel */
2793                         return;
2794                 }
2795                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2796                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2797                 log.u_bbr.flex1 = flex1;
2798                 log.u_bbr.flex2 = flex2;
2799                 log.u_bbr.flex3 = flex3;
2800                 log.u_bbr.flex4 = flex4;
2801                 log.u_bbr.flex5 = flex5;
2802                 log.u_bbr.flex6 = flex6;
2803                 log.u_bbr.flex7 = flex7;
2804                 log.u_bbr.flex8 = mod;
2805                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2806                     &rack->rc_inp->inp_socket->so_rcv,
2807                     &rack->rc_inp->inp_socket->so_snd,
2808                     BBR_LOG_TIMERCANC, 0,
2809                     0, &log, false, &tv);
2810         }
2811 }
2812
2813 static void
2814 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
2815 {
2816         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2817                 union tcp_log_stackspecific log;
2818                 struct timeval tv;
2819
2820                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2821                 log.u_bbr.flex1 = timers;
2822                 log.u_bbr.flex2 = ret;
2823                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
2824                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2825                 log.u_bbr.flex5 = cts;
2826                 if (rack->rack_no_prr)
2827                         log.u_bbr.flex6 = 0;
2828                 else
2829                         log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
2830                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2831                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2832                 log.u_bbr.pacing_gain = rack->r_must_retran;
2833                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2834                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2835                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2836                     &rack->rc_inp->inp_socket->so_rcv,
2837                     &rack->rc_inp->inp_socket->so_snd,
2838                     BBR_LOG_TO_PROCESS, 0,
2839                     0, &log, false, &tv);
2840         }
2841 }
2842
2843 static void
2844 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd)
2845 {
2846         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2847                 union tcp_log_stackspecific log;
2848                 struct timeval tv;
2849
2850                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2851                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
2852                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
2853                 if (rack->rack_no_prr)
2854                         log.u_bbr.flex3 = 0;
2855                 else
2856                         log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
2857                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
2858                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
2859                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
2860                 log.u_bbr.flex8 = frm;
2861                 log.u_bbr.pkts_out = orig_cwnd;
2862                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2863                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2864                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2865                 log.u_bbr.use_lt_bw <<= 1;
2866                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2867                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2868                     &rack->rc_inp->inp_socket->so_rcv,
2869                     &rack->rc_inp->inp_socket->so_snd,
2870                     BBR_LOG_BBRUPD, 0,
2871                     0, &log, false, &tv);
2872         }
2873 }
2874
2875 #ifdef NETFLIX_EXP_DETECTION
2876 static void
2877 rack_log_sad(struct tcp_rack *rack, int event)
2878 {
2879         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2880                 union tcp_log_stackspecific log;
2881                 struct timeval tv;
2882
2883                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2884                 log.u_bbr.flex1 = rack->r_ctl.sack_count;
2885                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2886                 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
2887                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2888                 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
2889                 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
2890                 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
2891                 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
2892                 log.u_bbr.lt_epoch |= rack->do_detection;
2893                 log.u_bbr.applimited = tcp_map_minimum;
2894                 log.u_bbr.flex7 = rack->sack_attack_disable;
2895                 log.u_bbr.flex8 = event;
2896                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2897                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2898                 log.u_bbr.delivered = tcp_sad_decay_val;
2899                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2900                     &rack->rc_inp->inp_socket->so_rcv,
2901                     &rack->rc_inp->inp_socket->so_snd,
2902                     TCP_SAD_DETECTION, 0,
2903                     0, &log, false, &tv);
2904         }
2905 }
2906 #endif
2907
2908 static void
2909 rack_counter_destroy(void)
2910 {
2911         int i;
2912
2913         counter_u64_free(rack_fto_send);
2914         counter_u64_free(rack_fto_rsm_send);
2915         counter_u64_free(rack_nfto_resend);
2916         counter_u64_free(rack_hw_pace_init_fail);
2917         counter_u64_free(rack_hw_pace_lost);
2918         counter_u64_free(rack_non_fto_send);
2919         counter_u64_free(rack_extended_rfo);
2920         counter_u64_free(rack_ack_total);
2921         counter_u64_free(rack_express_sack);
2922         counter_u64_free(rack_sack_total);
2923         counter_u64_free(rack_move_none);
2924         counter_u64_free(rack_move_some);
2925         counter_u64_free(rack_sack_attacks_detected);
2926         counter_u64_free(rack_sack_attacks_reversed);
2927         counter_u64_free(rack_sack_used_next_merge);
2928         counter_u64_free(rack_sack_used_prev_merge);
2929         counter_u64_free(rack_badfr);
2930         counter_u64_free(rack_badfr_bytes);
2931         counter_u64_free(rack_rtm_prr_retran);
2932         counter_u64_free(rack_rtm_prr_newdata);
2933         counter_u64_free(rack_timestamp_mismatch);
2934         counter_u64_free(rack_find_high);
2935         counter_u64_free(rack_reorder_seen);
2936         counter_u64_free(rack_tlp_tot);
2937         counter_u64_free(rack_tlp_newdata);
2938         counter_u64_free(rack_tlp_retran);
2939         counter_u64_free(rack_tlp_retran_bytes);
2940         counter_u64_free(rack_tlp_retran_fail);
2941         counter_u64_free(rack_to_tot);
2942         counter_u64_free(rack_to_arm_rack);
2943         counter_u64_free(rack_to_arm_tlp);
2944         counter_u64_free(rack_calc_zero);
2945         counter_u64_free(rack_calc_nonzero);
2946         counter_u64_free(rack_paced_segments);
2947         counter_u64_free(rack_unpaced_segments);
2948         counter_u64_free(rack_saw_enobuf);
2949         counter_u64_free(rack_saw_enobuf_hw);
2950         counter_u64_free(rack_saw_enetunreach);
2951         counter_u64_free(rack_hot_alloc);
2952         counter_u64_free(rack_to_alloc);
2953         counter_u64_free(rack_to_alloc_hard);
2954         counter_u64_free(rack_to_alloc_emerg);
2955         counter_u64_free(rack_to_alloc_limited);
2956         counter_u64_free(rack_alloc_limited_conns);
2957         counter_u64_free(rack_split_limited);
2958         for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
2959                 counter_u64_free(rack_proc_comp_ack[i]);
2960         }
2961         counter_u64_free(rack_multi_single_eq);
2962         counter_u64_free(rack_proc_non_comp_ack);
2963         counter_u64_free(rack_sack_proc_all);
2964         counter_u64_free(rack_sack_proc_restart);
2965         counter_u64_free(rack_sack_proc_short);
2966         counter_u64_free(rack_enter_tlp_calc);
2967         counter_u64_free(rack_used_tlpmethod);
2968         counter_u64_free(rack_used_tlpmethod2);
2969         counter_u64_free(rack_sack_skipped_acked);
2970         counter_u64_free(rack_sack_splits);
2971         counter_u64_free(rack_progress_drops);
2972         counter_u64_free(rack_input_idle_reduces);
2973         counter_u64_free(rack_collapsed_win);
2974         counter_u64_free(rack_tlp_does_nada);
2975         counter_u64_free(rack_try_scwnd);
2976         counter_u64_free(rack_per_timer_hole);
2977         counter_u64_free(rack_large_ackcmp);
2978         counter_u64_free(rack_small_ackcmp);
2979         counter_u64_free(rack_persists_sends);
2980         counter_u64_free(rack_persists_acks);
2981         counter_u64_free(rack_persists_loss);
2982         counter_u64_free(rack_persists_lost_ends);
2983 #ifdef INVARIANTS
2984         counter_u64_free(rack_adjust_map_bw);
2985 #endif
2986         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
2987         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
2988 }
2989
2990 static struct rack_sendmap *
2991 rack_alloc(struct tcp_rack *rack)
2992 {
2993         struct rack_sendmap *rsm;
2994
2995         /*
2996          * First get the top of the list; in
2997          * theory it is the "hottest" rsm we have,
2998          * possibly just freed by ack processing.
2999          */
3000         if (rack->rc_free_cnt > rack_free_cache) {
3001                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3002                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3003                 counter_u64_add(rack_hot_alloc, 1);
3004                 rack->rc_free_cnt--;
3005                 return (rsm);
3006         }
3007         /*
3008          * Once we get under our free cache we probably
3009          * no longer have a "hot" one available. Let's
3010          * get one from UMA.
3011          */
3012         rsm = uma_zalloc(rack_zone, M_NOWAIT);
3013         if (rsm) {
3014                 rack->r_ctl.rc_num_maps_alloced++;
3015                 counter_u64_add(rack_to_alloc, 1);
3016                 return (rsm);
3017         }
3018         /*
3019          * Dig into our aux rsms (the last two) since
3020          * UMA failed to get us one.
3021          */
3022         if (rack->rc_free_cnt) {
3023                 counter_u64_add(rack_to_alloc_emerg, 1);
3024                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3025                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3026                 rack->rc_free_cnt--;
3027                 return (rsm);
3028         }
3029         return (NULL);
3030 }
3031
3032 static struct rack_sendmap *
3033 rack_alloc_full_limit(struct tcp_rack *rack)
3034 {
3035         if ((V_tcp_map_entries_limit > 0) &&
3036             (rack->do_detection == 0) &&
3037             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
3038                 counter_u64_add(rack_to_alloc_limited, 1);
3039                 if (!rack->alloc_limit_reported) {
3040                         rack->alloc_limit_reported = 1;
3041                         counter_u64_add(rack_alloc_limited_conns, 1);
3042                 }
3043                 return (NULL);
3044         }
3045         return (rack_alloc(rack));
3046 }
3047
3048 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3049 static struct rack_sendmap *
3050 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
3051 {
3052         struct rack_sendmap *rsm;
3053
3054         if (limit_type) {
3055                 /* currently there is only one limit type */
3056                 if (V_tcp_map_split_limit > 0 &&
3057                     (rack->do_detection == 0) &&
3058                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
3059                         counter_u64_add(rack_split_limited, 1);
3060                         if (!rack->alloc_limit_reported) {
3061                                 rack->alloc_limit_reported = 1;
3062                                 counter_u64_add(rack_alloc_limited_conns, 1);
3063                         }
3064                         return (NULL);
3065                 }
3066         }
3067
3068         /* allocate and mark in the limit type, if set */
3069         rsm = rack_alloc(rack);
3070         if (rsm != NULL && limit_type) {
3071                 rsm->r_limit_type = limit_type;
3072                 rack->r_ctl.rc_num_split_allocs++;
3073         }
3074         return (rsm);
3075 }
3076
3077 static void
3078 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
3079 {
3080         if (rsm->r_flags & RACK_APP_LIMITED) {
3081                 if (rack->r_ctl.rc_app_limited_cnt > 0) {
3082                         rack->r_ctl.rc_app_limited_cnt--;
3083                 }
3084         }
3085         if (rsm->r_limit_type) {
3086                 /* currently there is only one limit type */
3087                 rack->r_ctl.rc_num_split_allocs--;
3088         }
3089         if (rsm == rack->r_ctl.rc_first_appl) {
3090                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3091                         rack->r_ctl.rc_first_appl = NULL;
3092                 else {
3093                         /* Follow the next one out */
3094                         struct rack_sendmap fe;
3095
3096                         fe.r_start = rsm->r_nseq_appl;
3097                         rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3098                 }
3099         }
3100         if (rsm == rack->r_ctl.rc_resend)
3101                 rack->r_ctl.rc_resend = NULL;
3102         if (rsm == rack->r_ctl.rc_rsm_at_retran)
3103                 rack->r_ctl.rc_rsm_at_retran = NULL;
3104         if (rsm == rack->r_ctl.rc_end_appl)
3105                 rack->r_ctl.rc_end_appl = NULL;
3106         if (rack->r_ctl.rc_tlpsend == rsm)
3107                 rack->r_ctl.rc_tlpsend = NULL;
3108         if (rack->r_ctl.rc_sacklast == rsm)
3109                 rack->r_ctl.rc_sacklast = NULL;
3110         memset(rsm, 0, sizeof(struct rack_sendmap));
3111         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
3112         rack->rc_free_cnt++;
3113 }
3114
3115 static void
3116 rack_free_trim(struct tcp_rack *rack)
3117 {
3118         struct rack_sendmap *rsm;
3119
3120         /*
3121          * Free up all the tail entries until
3122          * we get our list down to the limit.
3123          */
3124         while (rack->rc_free_cnt > rack_free_cache) {
3125                 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
3126                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3127                 rack->rc_free_cnt--;
3128                 uma_zfree(rack_zone, rsm);
3129         }
3130 }
3131
3132
3133 static uint32_t
3134 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
3135 {
3136         uint64_t srtt, bw, len, tim;
3137         uint32_t segsiz, def_len, minl;
3138
3139         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3140         def_len = rack_def_data_window * segsiz;
3141         if (rack->rc_gp_filled == 0) {
3142                 /*
3143                  * We have no measurement (IW is in flight?) so
3144                  * we can only guess using our data_window sysctl
3145                  * value (usually 20MSS).
3146                  */
3147                 return (def_len);
3148         }
3149         /*
3150          * Now we have a number of factors to consider.
3151          *
3152          * 1) We have a desired BDP which is usually
3153          *    at least 2.
3154          * 2) We have a minimum number of rtt's, usually 1 SRTT,
3155          *    but we allow it to be more.
3156          * 3) We want to make sure a measurement lasts N useconds (if
3157          *    we have set rack_min_measure_usec).
3158          *
3159          * We handle the first concern here by trying to create a data
3160          * window of max(rack_def_data_window, DesiredBDP). The
3161          * second concern we handle in not letting the measurement
3162          * window end normally until at least the required SRTT's
3163          * have gone by which is done further below in
3164          * rack_enough_for_measurement(). Finally the third concern
3165          * we also handle here by calculating how long that time
3166          * would take at the current BW and then return the
3167          * max of our first calculation and that length. Note
3168          * that if rack_min_measure_usec is 0, we don't deal
3169          * with concern 3. Also, for both concerns 1 and 3, an
3170          * application limited period could end the measurement
3171          * earlier.
3172          *
3173          * So let's calculate the BDP with the "known" b/w using
3174          * the SRTT as our rtt and then multiply it by the
3175          * goal.
3176          */
3177         bw = rack_get_bw(rack);
3178         srtt = (uint64_t)tp->t_srtt;
3179         len = bw * srtt;
3180         len /= (uint64_t)HPTS_USEC_IN_SEC;
3181         len *= max(1, rack_goal_bdp);
3182         /* Now we need to round up to the nearest MSS */
3183         len = roundup(len, segsiz);
3184         if (rack_min_measure_usec) {
3185                 /* Now calculate our min length for this b/w */
3186                 tim = rack_min_measure_usec;
3187                 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
3188                 if (minl == 0)
3189                         minl = 1;
3190                 minl = roundup(minl, segsiz);
3191                 if (len < minl)
3192                         len = minl;
3193         }
3194         /*
3195          * Now if we have a very small window we want
3196          * to keep the measurement window
3197          * as small as possible. This happens on
3198          * low b/w connections and we don't want to
3199          * span huge numbers of rtt's between measurements.
3200          *
3201          * We basically include 2 over our "MIN window" so
3202          * that the measurement can be shortened (possibly) by
3203          * an ack'ed packet.
3204          */
3205         if (len < def_len)
3206                 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
3207         else
3208                 return (max((uint32_t)len, def_len));
3209
3210 }
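/*
 * Worked example (hypothetical numbers): bw = 1,250,000 bytes/sec
 * (10 Mbps), srtt = 100,000 usec and rack_goal_bdp = 2 give
 *
 *     len = (1250000 * 100000) / 1000000 = 125000 bytes  (one BDP)
 *     len *= 2                           = 250000 bytes  (the goal)
 *
 * which is rounded up to a segment multiple and, being larger than the
 * default 20-MSS window, is returned as the measurement window.
 */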
3211
3212 static int
3213 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
3214 {
3215         uint32_t tim, srtts, segsiz;
3216
3217         /*
3218          * Has enough time passed for the GP measurement to be valid?
3219          */
3220         if ((tp->snd_max == tp->snd_una) ||
3221             (th_ack == tp->snd_max)){
3222                 /* All is acked */
3223                 *quality = RACK_QUALITY_ALLACKED;
3224                 return (1);
3225         }
3226         if (SEQ_LT(th_ack, tp->gput_seq)) {
3227                 /* Not enough bytes yet */
3228                 return (0);
3229         }
3230         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3231         if (SEQ_LT(th_ack, tp->gput_ack) &&
3232             ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
3233                 /* Not enough bytes yet */
3234                 return (0);
3235         }
3236         if (rack->r_ctl.rc_first_appl &&
3237             (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
3238                 /*
3239                  * We are up to the app limited send point
3240                  * we have to measure irrespective of the time..
3241                  */
3242                 *quality = RACK_QUALITY_APPLIMITED;
3243                 return (1);
3244         }
3245         /* Now what about time? */
3246         srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
3247         tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
3248         if (tim >= srtts) {
3249                 *quality = RACK_QUALITY_HIGH;
3250                 return (1);
3251         }
3252         /* Nope not even a full SRTT has passed */
3253         return (0);
3254 }
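/*
 * For the time check above: with rc_gp_srtt = 50,000 usec and
 * rack_min_srtts at an assumed value of 1, the measurement only reaches
 * RACK_QUALITY_HIGH once at least 50 ms have elapsed since gput_ts,
 * unless one of the all-acked or app-limited early returns fires first.
 */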
3255
3256 static void
3257 rack_log_timely(struct tcp_rack *rack,
3258                 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
3259                 uint64_t up_bnd, int line, uint8_t method)
3260 {
3261         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3262                 union tcp_log_stackspecific log;
3263                 struct timeval tv;
3264
3265                 memset(&log, 0, sizeof(log));
3266                 log.u_bbr.flex1 = logged;
3267                 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
3268                 log.u_bbr.flex2 <<= 4;
3269                 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
3270                 log.u_bbr.flex2 <<= 4;
3271                 log.u_bbr.flex2 |= rack->rc_gp_incr;
3272                 log.u_bbr.flex2 <<= 4;
3273                 log.u_bbr.flex2 |= rack->rc_gp_bwred;
3274                 log.u_bbr.flex3 = rack->rc_gp_incr;
3275                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3276                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
3277                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
3278                 log.u_bbr.flex7 = rack->rc_gp_bwred;
3279                 log.u_bbr.flex8 = method;
3280                 log.u_bbr.cur_del_rate = cur_bw;
3281                 log.u_bbr.delRate = low_bnd;
3282                 log.u_bbr.bw_inuse = up_bnd;
3283                 log.u_bbr.rttProp = rack_get_bw(rack);
3284                 log.u_bbr.pkt_epoch = line;
3285                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3286                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3287                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3288                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3289                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3290                 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
3291                 log.u_bbr.cwnd_gain <<= 1;
3292                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
3293                 log.u_bbr.cwnd_gain <<= 1;
3294                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
3295                 log.u_bbr.cwnd_gain <<= 1;
3296                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
3297                 log.u_bbr.lost = rack->r_ctl.rc_loss_count;
3298                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3299                     &rack->rc_inp->inp_socket->so_rcv,
3300                     &rack->rc_inp->inp_socket->so_snd,
3301                     TCP_TIMELY_WORK, 0,
3302                     0, &log, false, &tv);
3303         }
3304 }
3305
3306 static int
3307 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
3308 {
3309         /*
3310          * Before we increase we need to know if
3311          * the estimate just made was less than
3312          * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
3313          *
3314          * If we are already pacing at a fast enough
3315          * rate to push us faster there is no sense in
3316          * increasing.
3317          *
3318          * We first calculate our actual pacing rate (the ss or ca
3319          * multiplier times our cur_bw).
3320          *
3321          * Then we take the last measured rate and multiply by our
3322          * maximum pacing overage to give us a max allowable rate.
3323          *
3324          * If our act_rate is smaller than our max allowable rate
3325          * then we should increase. Else we should hold steady.
3326          *
3327          */
3328         uint64_t act_rate, max_allow_rate;
3329
3330         if (rack_timely_no_stopping)
3331                 return (1);
3332
3333         if ((cur_bw == 0) || (last_bw_est == 0)) {
3334                 /*
3335                  * Initial startup case or
3336                  * everything is acked case.
3337                  */
3338                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3339                                 __LINE__, 9);
3340                 return (1);
3341         }
3342         if (mult <= 100) {
3343                 /*
3344                  * We can always pace at or slightly above our rate.
3345                  */
3346                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3347                                 __LINE__, 9);
3348                 return (1);
3349         }
3350         act_rate = cur_bw * (uint64_t)mult;
3351         act_rate /= 100;
3352         max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
3353         max_allow_rate /= 100;
3354         if (act_rate < max_allow_rate) {
3355                 /*
3356                  * Here the rate we are actually pacing at
3357                  * is smaller than 10% above our last measurement.
3358                  * This means we are pacing below what we would
3359                  * like to try to achieve (plus some wiggle room).
3360                  */
3361                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3362                                 __LINE__, 9);
3363                 return (1);
3364         } else {
3365                 /*
3366                  * Here we are already pacing at least rack_max_per_above (10%)
3367                  * above what we are getting back. This most likely indicates
3368                  * that we are being limited (cwnd/rwnd/app) and can't
3369                  * get any more b/w. There is no sense in trying to
3370                  * raise the pacing rate; it's not speeding us up
3371                  * and we are already pacing faster than we are getting.
3372                  */
3373                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3374                                 __LINE__, 8);
3375                 return (0);
3376         }
3377 }
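/*
 * Worked example (hypothetical numbers): cur_bw = 1,000,000 bytes/sec and
 * mult = 150 give act_rate = 1,500,000. With last_bw_est = 1,200,000 and
 * rack_max_per_above = 10, max_allow_rate = 1,320,000, so we hold steady
 * (return 0); had last_bw_est been 1,400,000 the cap would be 1,540,000
 * and we would allow the raise (return 1).
 */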
3378
3379 static void
3380 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
3381 {
3382         /*
3383          * When we drag bottom, we want to ensure
3384          * that no multiplier is below 1.0; if one is,
3385          * we want to restore it to at least that.
3386          */
3387         if (rack->r_ctl.rack_per_of_gp_rec  < 100) {
3388                 /* This is unlikely; we usually do not touch recovery */
3389                 rack->r_ctl.rack_per_of_gp_rec = 100;
3390         }
3391         if (rack->r_ctl.rack_per_of_gp_ca < 100) {
3392                 rack->r_ctl.rack_per_of_gp_ca = 100;
3393         }
3394         if (rack->r_ctl.rack_per_of_gp_ss < 100) {
3395                 rack->r_ctl.rack_per_of_gp_ss = 100;
3396         }
3397 }
3398
3399 static void
3400 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
3401 {
3402         if (rack->r_ctl.rack_per_of_gp_ca > 100) {
3403                 rack->r_ctl.rack_per_of_gp_ca = 100;
3404         }
3405         if (rack->r_ctl.rack_per_of_gp_ss > 100) {
3406                 rack->r_ctl.rack_per_of_gp_ss = 100;
3407         }
3408 }
3409
3410 static void
3411 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
3412 {
3413         int32_t  calc, logged, plus;
3414
3415         logged = 0;
3416
3417         if (override) {
3418                 /*
3419                  * override is passed when we are
3420                  * losing b/w and making one last
3421                  * gasp at trying to not lose out
3422                  * to a new-reno flow.
3423                  */
3424                 goto extra_boost;
3425         }
3426         /* In classic timely we boost by 5x if we have 5 increases in a row; let's not */
3427         if (rack->rc_gp_incr &&
3428             ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
3429                 /*
3430                  * Reset and get 5 strokes more before the boost. Note
3431                  * that the count is 0 based so we have to add one.
3432                  */
3433 extra_boost:
3434                 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
3435                 rack->rc_gp_timely_inc_cnt = 0;
3436         } else
3437                 plus = (uint32_t)rack_gp_increase_per;
3438         /* Must be at least 1% increase for true timely increases */
3439         if ((plus < 1) &&
3440             ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
3441                 plus = 1;
3442         if (rack->rc_gp_saw_rec &&
3443             (rack->rc_gp_no_rec_chg == 0) &&
3444             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3445                                   rack->r_ctl.rack_per_of_gp_rec)) {
3446                 /* We have been in recovery; ding it too */
3447                 calc = rack->r_ctl.rack_per_of_gp_rec + plus;
3448                 if (calc > 0xffff)
3449                         calc = 0xffff;
3450                 logged |= 1;
3451                 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
3452                 if (rack_per_upper_bound_ss &&
3453                     (rack->rc_dragged_bottom == 0) &&
3454                     (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
3455                         rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
3456         }
3457         if (rack->rc_gp_saw_ca &&
3458             (rack->rc_gp_saw_ss == 0) &&
3459             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3460                                   rack->r_ctl.rack_per_of_gp_ca)) {
3461                 /* In CA */
3462                 calc = rack->r_ctl.rack_per_of_gp_ca + plus;
3463                 if (calc > 0xffff)
3464                         calc = 0xffff;
3465                 logged |= 2;
3466                 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
3467                 if (rack_per_upper_bound_ca &&
3468                     (rack->rc_dragged_bottom == 0) &&
3469                     (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
3470                         rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
3471         }
3472         if (rack->rc_gp_saw_ss &&
3473             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3474                                   rack->r_ctl.rack_per_of_gp_ss)) {
3475                 /* In SS */
3476                 calc = rack->r_ctl.rack_per_of_gp_ss + plus;
3477                 if (calc > 0xffff)
3478                         calc = 0xffff;
3479                 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
3480                 if (rack_per_upper_bound_ss &&
3481                     (rack->rc_dragged_bottom == 0) &&
3482                     (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
3483                         rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
3484                 logged |= 4;
3485         }
3486         if (logged &&
3487             (rack->rc_gp_incr == 0)){
3488                 /* Go into increment mode */
3489                 rack->rc_gp_incr = 1;
3490                 rack->rc_gp_timely_inc_cnt = 0;
3491         }
3492         if (rack->rc_gp_incr &&
3493             logged &&
3494             (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
3495                 rack->rc_gp_timely_inc_cnt++;
3496         }
3497         rack_log_timely(rack,  logged, plus, 0, 0,
3498                         __LINE__, 1);
3499 }
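
/*
 * Editor's worked example of the raise step above (assuming, for
 * illustration, the default rack_gp_increase_per of 2 and
 * RACK_TIMELY_CNT_BOOST of 5): a normal raise adds 2 percentage
 * points to each active multiplier, while the every-5th "boost"
 * path (or an override raise) adds 2 * 5 = 10 points in one step,
 * capped at 0xffff and at the per-state upper bound sysctls.
 */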
3500
3501 static uint32_t
3502 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
3503 {
3504         /*
3505          * norm_grad = rtt_diff / minrtt;
3506          * new_per = curper * (1 - B * norm_grad)
3507          *
3508          * B = rack_gp_decrease_per (default 10%)
3509          * rtt_diff = input var current rtt-diff
3510          * curper = input var current percentage
3511          * minrtt = from rack filter
3512          *
3513          */
3514         uint64_t perf;
3515
3516         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3517                     ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
3518                      (((uint64_t)rtt_diff * (uint64_t)1000000)/
3519                       (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
3520                      (uint64_t)1000000)) /
3521                 (uint64_t)1000000);
3522         if (perf > curper) {
3523                 /* TSNH */
3524                 perf = curper - 1;
3525         }
3526         return ((uint32_t)perf);
3527 }
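
/*
 * Editor's sketch of the fixed-point math above (illustrative only,
 * never compiled into the stack): everything is scaled to
 * parts-per-million to avoid floating point. The helper name and
 * the hard-coded B = 10 (rack_gp_decrease_per's default) are
 * assumptions for the example.
 */
#if 0
static uint32_t
example_timely_decrease(uint32_t curper, int32_t rtt_diff, uint32_t minrtt)
{
	uint64_t norm_grad_ppm, reduce_ppm;

	if (minrtt == 0)
		minrtt = 1;
	/* norm_grad = rtt_diff / minrtt, in ppm */
	norm_grad_ppm = ((uint64_t)rtt_diff * 1000000) / minrtt;
	/* B * norm_grad, still in ppm (10% == 100000 ppm) */
	reduce_ppm = ((uint64_t)10 * 10000 * norm_grad_ppm) / 1000000;
	/* new_per = curper * (1 - B * norm_grad) */
	return ((uint32_t)(((uint64_t)curper * (1000000 - reduce_ppm)) / 1000000));
}
/*
 * e.g. curper = 200, rtt_diff = 5000us, minrtt = 20000us:
 * norm_grad = 0.25, so the cut is 10% * 0.25 = 2.5% and the
 * result is 195.
 */
#endif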
3528
3529 static uint32_t
3530 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
3531 {
3532         /*
3533          *                                    highrttthresh
3534          * result = curper * (1 - (B * (1 -  ---------------)))
3535          *                                        gp_srtt
3536          *
3537          * B = rack_gp_decrease_per (default 10%)
3538          * highrttthresh = filter_min * rack_gp_rtt_maxmul
3539          */
3540         uint64_t perf;
3541         uint32_t highrttthresh;
3542
3543         highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3544
3545         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3546                                      ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
3547                                         ((uint64_t)highrttthresh * (uint64_t)1000000) /
3548                                                     (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
3549         return (perf);
3550 }
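
/*
 * Editor's worked example of the high-rtt reduction above (assuming
 * the defaults rack_gp_decrease_per = 10 and rack_gp_rtt_maxmul = 3):
 * with a 10ms filtered min rtt the threshold is 30ms. At rtt = 60ms
 * the term (1 - 30/60) = 0.5, so result = curper * (1 - 0.10 * 0.5),
 * a 5% reduction; the penalty grows toward the full 10% as rtt climbs
 * further above the threshold.
 */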
3551
3552 static void
3553 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
3554 {
3555         uint64_t logvar, logvar2, logvar3;
3556         uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
3557
3558         if (rack->rc_gp_incr) {
3559                 /* Turn off increment counting */
3560                 rack->rc_gp_incr = 0;
3561                 rack->rc_gp_timely_inc_cnt = 0;
3562         }
3563         ss_red = ca_red = rec_red = 0;
3564         logged = 0;
3565         /* Calculate the reduction value */
3566         if (rtt_diff < 0) {
3567                 rtt_diff *= -1;
3568         }
3569         /* Must be at least 1% reduction */
3570         if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
3571                 /* We have been in recovery ding it too */
3572                 if (timely_says == 2) {
3573                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
3574                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3575                         if (alt < new_per)
3576                                 val = alt;
3577                         else
3578                                 val = new_per;
3579                 } else
3580                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3581                 if (rack->r_ctl.rack_per_of_gp_rec > val) {
3582                         rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
3583                         rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
3584                 } else {
3585                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3586                         rec_red = 0;
3587                 }
3588                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
3589                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3590                 logged |= 1;
3591         }
3592         if (rack->rc_gp_saw_ss) {
3593                 /* Sent in SS */
3594                 if (timely_says == 2) {
3595                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
3596                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3597                         if (alt < new_per)
3598                                 val = alt;
3599                         else
3600                                 val = new_per;
3601                 } else
3602                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3603                 if (rack->r_ctl.rack_per_of_gp_ss > val) {
3604                         ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
3605                         rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
3606                 } else {
3607                         ss_red = 0;
3608                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3609                         logvar = new_per;
3610                         logvar <<= 32;
3611                         logvar |= alt;
3612                         logvar2 = (uint32_t)rtt;
3613                         logvar2 <<= 32;
3614                         logvar2 |= (uint32_t)rtt_diff;
3615                         logvar3 = rack_gp_rtt_maxmul;
3616                         logvar3 <<= 32;
3617                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3618                         rack_log_timely(rack, timely_says,
3619                                         logvar2, logvar3,
3620                                         logvar, __LINE__, 10);
3621                 }
3622                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
3623                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3624                 logged |= 4;
3625         } else if (rack->rc_gp_saw_ca) {
3626                 /* Sent in CA */
3627                 if (timely_says == 2) {
3628                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
3629                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3630                         if (alt < new_per)
3631                                 val = alt;
3632                         else
3633                                 val = new_per;
3634                 } else
3635                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3636                 if (rack->r_ctl.rack_per_of_gp_ca > val) {
3637                         ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
3638                         rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
3639                 } else {
3640                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3641                         ca_red = 0;
3642                         logvar = new_per;
3643                         logvar <<= 32;
3644                         logvar |= alt;
3645                         logvar2 = (uint32_t)rtt;
3646                         logvar2 <<= 32;
3647                         logvar2 |= (uint32_t)rtt_diff;
3648                         logvar3 = rack_gp_rtt_maxmul;
3649                         logvar3 <<= 32;
3650                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3651                         rack_log_timely(rack, timely_says,
3652                                         logvar2, logvar3,
3653                                         logvar, __LINE__, 10);
3654                 }
3655                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
3656                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3657                 logged |= 2;
3658         }
3659         if (rack->rc_gp_timely_dec_cnt < 0x7) {
3660                 rack->rc_gp_timely_dec_cnt++;
3661                 if (rack_timely_dec_clear &&
3662                     (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
3663                         rack->rc_gp_timely_dec_cnt = 0;
3664         }
3665         logvar = ss_red;
3666         logvar <<= 32;
3667         logvar |= ca_red;
3668         rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
3669                         __LINE__, 2);
3670 }
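
/*
 * Editor's note on the two reduction paths above: when timely_says == 2
 * (gp_srtt is above the high-RTT threshold) both formulas are run and
 * the smaller result, i.e. the larger reduction, wins. For example
 * with curper = 200, if rack_decrease_highrtt() yields 190 and
 * rack_get_decrease() yields 185, then val = 185.
 */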
3671
3672 static void
3673 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
3674                      uint32_t rtt, uint32_t line, uint8_t reas)
3675 {
3676         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3677                 union tcp_log_stackspecific log;
3678                 struct timeval tv;
3679
3680                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3681                 log.u_bbr.flex1 = line;
3682                 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
3683                 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
3684                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3685                 log.u_bbr.flex5 = rtt;
3686                 log.u_bbr.flex6 = rack->rc_highly_buffered;
3687                 log.u_bbr.flex6 <<= 1;
3688                 log.u_bbr.flex6 |= rack->forced_ack;
3689                 log.u_bbr.flex6 <<= 1;
3690                 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
3691                 log.u_bbr.flex6 <<= 1;
3692                 log.u_bbr.flex6 |= rack->in_probe_rtt;
3693                 log.u_bbr.flex6 <<= 1;
3694                 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
3695                 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
3696                 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
3697                 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
3698                 log.u_bbr.flex8 = reas;
3699                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3700                 log.u_bbr.delRate = rack_get_bw(rack);
3701                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
3702                 log.u_bbr.cur_del_rate <<= 32;
3703                 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
3704                 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
3705                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3706                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3707                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3708                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3709                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
3710                 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
3711                 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3712                 log.u_bbr.rttProp = us_cts;
3713                 log.u_bbr.rttProp <<= 32;
3714                 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
3715                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3716                     &rack->rc_inp->inp_socket->so_rcv,
3717                     &rack->rc_inp->inp_socket->so_snd,
3718                     BBR_LOG_RTT_SHRINKS, 0,
3719                     0, &log, false, &rack->r_ctl.act_rcv_time);
3720         }
3721 }
3722
3723 static void
3724 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
3725 {
3726         uint64_t bwdp;
3727
3728         bwdp = rack_get_bw(rack);
3729         bwdp *= (uint64_t)rtt;
3730         bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
3731         rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
3732         if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
3733                 /*
3734                  * A window protocol must be able to have 4 packets
3735                  * outstanding as the floor in order to function
3736                  * (especially considering delayed ack :D).
3737                  */
3738                 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
3739         }
3740 }
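
/*
 * Editor's sketch (illustrative only, never compiled): the probe-rtt
 * target above is simply BDP = bw * rtt rounded up to a whole
 * segment, floored at a handful of segments. The function name and
 * the floor of 4 segments (rack_timely_min_segs' assumed default)
 * are not part of the stack.
 */
#if 0
static uint32_t
example_prtt_target(uint64_t bw_bps, uint32_t rtt_us, uint32_t segsiz)
{
	uint64_t bwdp;

	/* bytes in flight needed to sustain bw_bps for one rtt */
	bwdp = (bw_bps * (uint64_t)rtt_us) / (uint64_t)HPTS_USEC_IN_SEC;
	bwdp = roundup(bwdp, segsiz);
	if (bwdp < ((uint64_t)segsiz * 4))
		bwdp = (uint64_t)segsiz * 4;
	return ((uint32_t)bwdp);
}
/* e.g. 12.5e6 B/s (100Mbps) * 20000us / 1e6 gives a 250000 byte target. */
#endif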
3741
3742 static void
3743 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
3744 {
3745         /**
3746          * ProbeRTT is a bit different in rack_pacing than in
3747          * BBR. It is like BBR in that it uses the lowering of
3748          * the RTT as a signal that we saw something new and
3749          * counts from there to decide how long to wait between
3750          * probes. But it is different in that it's quite simple. It
3751          * does not play with the cwnd and wait until we get down
3752          * to N segments outstanding, holding that for
3753          * 200ms. Instead it just sets the pacing reduction
3754          * rate to a set percentage (70 by default) and holds
3755          * that for a number of recent GP srtts.
3756          */
3757         uint32_t segsiz;
3758
3759         if (rack->rc_gp_dyn_mul == 0)
3760                 return;
3761
3762         if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
3763                 /* We are idle */
3764                 return;
3765         }
3766         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3767             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3768                 /*
3769                  * Stop the goodput measurement now; the idea is
3770                  * that future measurements taken while in_probe_rtt
3771                  * is set won't register unless they are greater, so
3772                  * we want to capture whatever info (if any) is
3773                  * available now.
3774                  */
3775                 rack_do_goodput_measurement(rack->rc_tp, rack,
3776                                             rack->rc_tp->snd_una, __LINE__,
3777                                             RACK_QUALITY_PROBERTT);
3778         }
3779         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3780         rack->r_ctl.rc_time_probertt_entered = us_cts;
3781         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3782                      rack->r_ctl.rc_pace_min_segs);
3783         rack->in_probe_rtt = 1;
3784         rack->measure_saw_probe_rtt = 1;
3785         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3786         rack->r_ctl.rc_time_probertt_starts = 0;
3787         rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
3788         if (rack_probertt_use_min_rtt_entry)
3789                 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3790         else
3791                 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
3792         rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3793                              __LINE__, RACK_RTTS_ENTERPROBE);
3794 }
3795
3796 static void
3797 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
3798 {
3799         struct rack_sendmap *rsm;
3800         uint32_t segsiz;
3801
3802         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3803                      rack->r_ctl.rc_pace_min_segs);
3804         rack->in_probe_rtt = 0;
3805         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3806             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3807                 /*
3808                  * Stop the goodput measurement now; the idea is
3809                  * that future measurements taken while in_probe_rtt
3810                  * is set won't register unless they are greater, so
3811                  * we want to capture whatever info (if any) is
3812                  * available now.
3813                  */
3814                 rack_do_goodput_measurement(rack->rc_tp, rack,
3815                                             rack->rc_tp->snd_una, __LINE__,
3816                                             RACK_QUALITY_PROBERTT);
3817         } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
3818                 /*
3819                  * We don't have enough data to make a measurement.
3820                  * So lets just stop and start here after exiting
3821                  * probe-rtt. We probably are not interested in
3822                  * the results anyway.
3823                  */
3824                 rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
3825         }
3826         /*
3827          * Measurements through the current snd_max are going
3828          * to be limited by the slower pacing rate.
3829          *
3830          * We need to mark these as app-limited so we
3831          * don't collapse the b/w.
3832          */
3833         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3834         if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
3835                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3836                         rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
3837                 else {
3838                         /*
3839                          * Go out to the end app limited and mark
3840                          * this new one as next and move the end_appl up
3841                          * to this guy.
3842                          */
3843                         if (rack->r_ctl.rc_end_appl)
3844                                 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
3845                         rack->r_ctl.rc_end_appl = rsm;
3846                 }
3847                 rsm->r_flags |= RACK_APP_LIMITED;
3848                 rack->r_ctl.rc_app_limited_cnt++;
3849         }
3850         /*
3851          * Now, we need to examine our pacing rate multipliers.
3852          * If one is under 100%, we need to kick it back up to
3853          * 100%. We also don't let it go over our "max" above
3854          * the actual rate, i.e. 100% + rack_clamp_atexit_prtt.
3855          * Note setting clamp_atexit_prtt to 0 has the effect
3856          * of setting CA/SS to 100% always at exit (which is
3857          * the default behavior).
3858          */
3859         if (rack_probertt_clear_is) {
3860                 rack->rc_gp_incr = 0;
3861                 rack->rc_gp_bwred = 0;
3862                 rack->rc_gp_timely_inc_cnt = 0;
3863                 rack->rc_gp_timely_dec_cnt = 0;
3864         }
3865         /* Do we do any clamping at exit? */
3866         if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
3867                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
3868                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
3869         }
3870         if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
3871                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
3872                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
3873         }
3874         /*
3875          * Lets set rtt_diff to 0, so that we will get a "boost"
3876          * after exiting.
3877          */
3878         rack->r_ctl.rc_rtt_diff = 0;
3879
3880         /* Clear all flags so we start fresh */
3881         rack->rc_tp->t_bytes_acked = 0;
3882         rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND;
3883         /*
3884          * If configured to, set the cwnd and ssthresh to
3885          * our targets.
3886          */
3887         if (rack_probe_rtt_sets_cwnd) {
3888                 uint64_t ebdp;
3889                 uint32_t setto;
3890
3891                 /* Set ssthresh so we get into CA once we hit our target */
3892                 if (rack_probertt_use_min_rtt_exit == 1) {
3893                         /* Set to min rtt */
3894                         rack_set_prtt_target(rack, segsiz,
3895                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3896                 } else if (rack_probertt_use_min_rtt_exit == 2) {
3897                         /* Set to current gp rtt */
3898                         rack_set_prtt_target(rack, segsiz,
3899                                              rack->r_ctl.rc_gp_srtt);
3900                 } else if (rack_probertt_use_min_rtt_exit == 3) {
3901                         /* Set to entry gp rtt */
3902                         rack_set_prtt_target(rack, segsiz,
3903                                              rack->r_ctl.rc_entry_gp_rtt);
3904                 } else {
3905                         uint64_t sum;
3906                         uint32_t setval;
3907
3908                         sum = rack->r_ctl.rc_entry_gp_rtt;
3909                         sum *= 10;
3910                         sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
3911                         if (sum >= 20) {
3912                                 /*
3913                                  * A highly buffered path needs
3914                                  * cwnd space for timely to work.
3915                                  * Lets set things up as if
3916                                  * we are heading back here again.
3917                                  */
3918                                 setval = rack->r_ctl.rc_entry_gp_rtt;
3919                         } else if (sum >= 15) {
3920                                 /*
3921                                  * Lets take the smaller of the
3922                                  * two since we are just somewhat
3923                                  * buffered.
3924                                  */
3925                                 setval = rack->r_ctl.rc_gp_srtt;
3926                                 if (setval > rack->r_ctl.rc_entry_gp_rtt)
3927                                         setval = rack->r_ctl.rc_entry_gp_rtt;
3928                         } else {
3929                                 /*
3930                                  * Here we are not highly buffered
3931                                  * and should pick the min we can to
3932                                  * keep from causing loss.
3933                                  */
3934                                 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3935                         }
3936                         rack_set_prtt_target(rack, segsiz,
3937                                              setval);
3938                 }
3939                 if (rack_probe_rtt_sets_cwnd > 1) {
3940                         /* There is a percentage here to boost */
3941                         ebdp = rack->r_ctl.rc_target_probertt_flight;
3942                         ebdp *= rack_probe_rtt_sets_cwnd;
3943                         ebdp /= 100;
3944                         setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
3945                 } else
3946                         setto = rack->r_ctl.rc_target_probertt_flight;
3947                 rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
3948                 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
3949                         /* Enforce a min */
3950                         rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
3951                 }
3952                 /* If we set in the cwnd also set the ssthresh point so we are in CA */
3953                 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
3954         }
3955         rack_log_rtt_shrinks(rack,  us_cts,
3956                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3957                              __LINE__, RACK_RTTS_EXITPROBE);
3958         /* Clear times last so log has all the info */
3959         rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
3960         rack->r_ctl.rc_time_probertt_entered = us_cts;
3961         rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3962         rack->r_ctl.rc_time_of_last_probertt = us_cts;
3963 }
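
/*
 * Editor's worked example of the buffering heuristic above:
 * sum = entry_gp_rtt * 10 / gp_srtt compares the srtt at probe-rtt
 * entry to the srtt now. Entry 100ms vs now 40ms gives sum = 25
 * (>= 20, highly buffered: size for the entry rtt). Entry 60ms vs
 * 40ms gives 15 (somewhat buffered: take the smaller of the two).
 * Entry 44ms vs 40ms gives 11 (not buffered: use the filtered min
 * rtt to keep from causing loss).
 */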
3964
3965 static void
3966 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
3967 {
3968         /* Check in on probe-rtt */
3969         if (rack->rc_gp_filled == 0) {
3970                 /* We do not do p-rtt unless we have gp measurements */
3971                 return;
3972         }
3973         if (rack->in_probe_rtt) {
3974                 uint64_t no_overflow;
3975                 uint32_t endtime, must_stay;
3976
3977                 if (rack->r_ctl.rc_went_idle_time &&
3978                     ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
3979                         /*
3980                          * We went idle during prtt, just exit now.
3981                          */
3982                         rack_exit_probertt(rack, us_cts);
3983                 } else if (rack_probe_rtt_safety_val &&
3984                     TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
3985                     ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
3986                         /*
3987                          * Probe RTT safety value triggered!
3988                          */
3989                         rack_log_rtt_shrinks(rack,  us_cts,
3990                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3991                                              __LINE__, RACK_RTTS_SAFETY);
3992                         rack_exit_probertt(rack, us_cts);
3993                 }
3994                 /* Calculate the max we will wait */
3995                 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
3996                 if (rack->rc_highly_buffered)
3997                         endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
3998                 /* Calculate the min we must wait */
3999                 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
4000                 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
4001                     TSTMP_LT(us_cts, endtime)) {
4002                         uint32_t calc;
4003                         /* Do we lower more? */
4004 no_exit:
4005                         if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
4006                                 calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
4007                         else
4008                                 calc = 0;
4009                         calc /= max(rack->r_ctl.rc_gp_srtt, 1);
4010                         if (calc) {
4011                                 /* Maybe */
4012                                 calc *= rack_per_of_gp_probertt_reduce;
4013                                 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
4014                                 /* Limit it too */
4015                                 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
4016                                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
4017                         }
4018                         /* We must reach target or the time set */
4019                         return;
4020                 }
4021                 if (rack->r_ctl.rc_time_probertt_starts == 0) {
4022                         if ((TSTMP_LT(us_cts, must_stay) &&
4023                              rack->rc_highly_buffered) ||
4024                              (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
4025                               rack->r_ctl.rc_target_probertt_flight)) {
4026                                 /* We are not past the must_stay time */
4027                                 goto no_exit;
4028                         }
4029                         rack_log_rtt_shrinks(rack,  us_cts,
4030                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4031                                              __LINE__, RACK_RTTS_REACHTARGET);
4032                         rack->r_ctl.rc_time_probertt_starts = us_cts;
4033                         if (rack->r_ctl.rc_time_probertt_starts == 0)
4034                                 rack->r_ctl.rc_time_probertt_starts = 1;
4035                         /* Restore back to the rate we want to pace at in prtt */
4036                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
4037                 }
4038                 /*
4039                  * Setup our end time, some number of gp_srtts plus 200ms.
4040                  */
4041                 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
4042                                (uint64_t)rack_probertt_gpsrtt_cnt_mul);
4043                 if (rack_probertt_gpsrtt_cnt_div)
4044                         endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
4045                 else
4046                         endtime = 0;
4047                 endtime += rack_min_probertt_hold;
4048                 endtime += rack->r_ctl.rc_time_probertt_starts;
4049                 if (TSTMP_GEQ(us_cts,  endtime)) {
4050                         /* yes, exit probertt */
4051                         rack_exit_probertt(rack, us_cts);
4052                 }
4053
4054         } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
4055                 /* Go into probertt, its been too long since we went lower */
4056                 rack_enter_probertt(rack, us_cts);
4057         }
4058 }
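
/*
 * Editor's worked example of the exit timing above, using purely
 * illustrative values (gp_srtt = 40ms, rack_probertt_gpsrtt_cnt_mul
 * = 3, rack_probertt_gpsrtt_cnt_div = 6, and the 200ms hold the
 * comment mentions): endtime = 40ms * 3 / 6 + 200ms = 220ms after
 * rc_time_probertt_starts, i.e. a fraction of a gp_srtt plus the
 * fixed hold.
 */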
4059
4060 static void
4061 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
4062                        uint32_t rtt, int32_t rtt_diff)
4063 {
4064         uint64_t cur_bw, up_bnd, low_bnd, subfr;
4065         uint32_t losses;
4066
4067         if ((rack->rc_gp_dyn_mul == 0) ||
4068             (rack->use_fixed_rate) ||
4069             (rack->in_probe_rtt) ||
4070             (rack->rc_always_pace == 0)) {
4071                 /* No dynamic GP multiplier in play */
4072                 return;
4073         }
4074         losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
4075         cur_bw = rack_get_bw(rack);
4076         /* Calculate our up and down range */
4077         up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
4078         up_bnd /= 100;
4079         up_bnd += rack->r_ctl.last_gp_comp_bw;
4080
4081         subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
4082         subfr /= 100;
4083         low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
4084         if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
4085                 /*
4086                  * This is the case where our RTT is above
4087                  * the max target and we have been configured
4088                  * to do only plain timely (no bonus up) in that case.
4089                  *
4090                  * There are two configurations: when set to 1, we
4091                  * just do timely if we are over our max. If it's
4092                  * set above 1, we slam the multipliers down
4093                  * to 100 and then decrement per timely.
4094                  */
4095                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4096                                 __LINE__, 3);
4097                 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
4098                         rack_validate_multipliers_at_or_below_100(rack);
4099                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4100         } else if ((last_bw_est < low_bnd) && !losses) {
4101                 /*
4102                  * We are decreasing. This is a bit complicated: it
4103                  * means we are losing ground. This could be
4104                  * because another flow entered and we are competing
4105                  * for b/w with it. This will push the RTT up which
4106                  * makes timely unusable unless we want to get shoved
4107                  * into a corner and just be backed off (the age
4108                  * old problem with delay based CC).
4109                  *
4110                  * On the other hand if it was a route change we
4111                  * would like to stay somewhat contained and not
4112                  * blow out the buffers.
4113                  */
4114                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4115                                 __LINE__, 3);
4116                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4117                 if (rack->rc_gp_bwred == 0) {
4118                         /* Go into reduction counting */
4119                         rack->rc_gp_bwred = 1;
4120                         rack->rc_gp_timely_dec_cnt = 0;
4121                 }
4122                 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
4123                     (timely_says == 0)) {
4124                         /*
4125                          * Push another time with a faster pacing
4126                          * to try to gain back (we include override to
4127                          * get a full raise factor).
4128                          */
4129                         if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
4130                             (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
4131                             (timely_says == 0) ||
4132                             (rack_down_raise_thresh == 0)) {
4133                                 /*
4134                                  * Do an override up in b/w if we were
4135                                  * below the threshold or if the threshold
4136                                  * is zero we always do the raise.
4137                                  */
4138                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
4139                         } else {
4140                                 /* Log that it stays the same */
4141                                 rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
4142                                                 __LINE__, 11);
4143                         }
4144                         rack->rc_gp_timely_dec_cnt++;
4145                         /* We are not really incrementing, don't count it */
4146                         rack->rc_gp_incr = 0;
4147                         rack->rc_gp_timely_inc_cnt = 0;
4148                 } else {
4149                         /*
4150                          * Let's just use the RTT
4151                          * information and give up
4152                          * pushing.
4153                          */
4154                         goto use_timely;
4155                 }
4156         } else if ((timely_says != 2) &&
4157                     !losses &&
4158                     (last_bw_est > up_bnd)) {
4159                 /*
4160                  * We are increasing b/w, let's keep going, updating
4161                  * our b/w and ignoring any timely input, unless
4162                  * of course we are at our max raise (if there is one).
4163                  */
4164
4165                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4166                                 __LINE__, 3);
4167                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4168                 if (rack->rc_gp_saw_ss &&
4169                     rack_per_upper_bound_ss &&
4170                      (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
4171                             /*
4172                              * In cases where we can't go higher
4173                              * we should just use timely.
4174                              */
4175                             goto use_timely;
4176                 }
4177                 if (rack->rc_gp_saw_ca &&
4178                     rack_per_upper_bound_ca &&
4179                     (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
4180                             /*
4181                              * In cases where we can't go higher
4182                              * we should just use timely.
4183                              */
4184                             goto use_timely;
4185                 }
4186                 rack->rc_gp_bwred = 0;
4187                 rack->rc_gp_timely_dec_cnt = 0;
4188                 /* You get a set number of pushes if timely is trying to reduce */
4189                 if ((rack->rc_gp_timely_inc_cnt < rack_timely_max_push_rise) || (timely_says == 0)) {
4190                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4191                 } else {
4192                         /* Log that it stays the same */
4193                         rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
4194                             __LINE__, 12);
4195                 }
4196                 return;
4197         } else {
4198                 /*
4199                  * We are staying between the lower and upper range bounds
4200                  * so use timely to decide.
4201                  */
4202                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4203                                 __LINE__, 3);
4204 use_timely:
4205                 if (timely_says) {
4206                         rack->rc_gp_incr = 0;
4207                         rack->rc_gp_timely_inc_cnt = 0;
4208                         if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
4209                             !losses &&
4210                             (last_bw_est < low_bnd)) {
4211                                 /* We are losing ground */
4212                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4213                                 rack->rc_gp_timely_dec_cnt++;
4214                                 /* We are not really incrementing, don't count it */
4215                                 rack->rc_gp_incr = 0;
4216                                 rack->rc_gp_timely_inc_cnt = 0;
4217                         } else
4218                                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4219                 } else {
4220                         rack->rc_gp_bwred = 0;
4221                         rack->rc_gp_timely_dec_cnt = 0;
4222                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4223                 }
4224         }
4225 }
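
/*
 * Editor's sketch of the banding logic above (illustrative only,
 * never compiled): the latest goodput estimate is compared against
 * a band around the last comparison b/w. The 2%/4% up/down margins
 * stand in for rack_gp_per_bw_mul_up/_down and are illustrative
 * values, as is the function name.
 */
#if 0
static int
example_bw_band(uint64_t last_comp_bw, uint64_t last_bw_est)
{
	uint64_t up_bnd, low_bnd;

	up_bnd = last_comp_bw + (last_comp_bw * 2) / 100;
	low_bnd = last_comp_bw - (last_comp_bw * 4) / 100;
	if (last_bw_est < low_bnd)
		return (-1);	/* losing ground: push back or fall to timely */
	if (last_bw_est > up_bnd)
		return (1);	/* gaining: keep raising, ignore timely */
	return (0);		/* inside the band: let timely decide */
}
#endif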
4226
4227 static int32_t
4228 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4229 {
4230         int32_t timely_says;
4231         uint64_t log_mult, log_rtt_a_diff;
4232
4233         log_rtt_a_diff = rtt;
4234         log_rtt_a_diff <<= 32;
4235         log_rtt_a_diff |= (uint32_t)rtt_diff;
4236         if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4237                     rack_gp_rtt_maxmul)) {
4238                 /* Reduce the b/w multiplier */
4239                 timely_says = 2;
4240                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4241                 log_mult <<= 32;
4242                 log_mult |= prev_rtt;
4243                 rack_log_timely(rack,  timely_says, log_mult,
4244                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4245                                 log_rtt_a_diff, __LINE__, 4);
4246         } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4247                            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4248                             max(rack_gp_rtt_mindiv , 1)))) {
4249                 /* Increase the b/w multiplier */
4250                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4251                         ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4252                          max(rack_gp_rtt_mindiv , 1));
4253                 log_mult <<= 32;
4254                 log_mult |= prev_rtt;
4255                 timely_says = 0;
4256                 rack_log_timely(rack,  timely_says, log_mult ,
4257                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4258                                 log_rtt_a_diff, __LINE__, 5);
4259         } else {
4260                 /*
4261                  * Use a gradient to find it. The timely gradient
4262                  * is:
4263                  * grad = rc_rtt_diff / min_rtt;
4264                  *
4265                  * anything below or equal to 0 will be
4266                  * an increase indication. Anything above
4267                  * zero is a decrease. Note we take care
4268                  * of the actual gradient calculation
4269                  * in the reduction (its not needed for
4270                  * increase).
4271                  */
4272                 log_mult = prev_rtt;
4273                 if (rtt_diff <= 0) {
4274                         /*
4275                          * Rttdiff is less than zero, increase the
4276                          * b/w multiplier (it's 0 or negative)
4277                          */
4278                         timely_says = 0;
4279                         rack_log_timely(rack,  timely_says, log_mult,
4280                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4281                 } else {
4282                         /* Reduce the b/w multiplier */
4283                         timely_says = 1;
4284                         rack_log_timely(rack,  timely_says, log_mult,
4285                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4286                 }
4287         }
4288         return (timely_says);
4289 }
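
/*
 * Editor's worked example of the judgement above (taking, for
 * illustration, rack_gp_rtt_maxmul = 3, rack_gp_rtt_minmul = 1 and
 * rack_gp_rtt_mindiv = 4): with a filtered min rtt of 10ms, a
 * gp_srtt at or above 30ms returns 2 (hard decrease), one at or
 * below 10 + 10*1/4 = 12.5ms returns 0 (increase), and anything in
 * between falls through to the sign of rc_rtt_diff: <= 0 returns 0,
 * > 0 returns 1 (gradient decrease).
 */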
4290
4291 static void
4292 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4293                             tcp_seq th_ack, int line, uint8_t quality)
4294 {
4295         uint64_t tim, bytes_ps, ltim, stim, utim;
4296         uint32_t segsiz, bytes, reqbytes, us_cts;
4297         int32_t gput, new_rtt_diff, timely_says;
4298         uint64_t  resid_bw, subpart = 0, addpart = 0, srtt;
4299         int did_add = 0;
4300
4301         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4302         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4303         if (TSTMP_GEQ(us_cts, tp->gput_ts))
4304                 tim = us_cts - tp->gput_ts;
4305         else
4306                 tim = 0;
4307         if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4308                 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4309         else
4310                 stim = 0;
4311         /*
4312          * Use the larger of the send time or ack time. This prevents us
4313          * from being influenced by ack artifacts and coming up with too
4314          * high a measurement. Note that since we span many more
4315          * bytes in most of our measurements, that is hopefully less
4316          * likely to occur.
4317          */
4318         if (tim > stim)
4319                 utim = max(tim, 1);
4320         else
4321                 utim = max(stim, 1);
4322         /* Lets get a msec time ltim too for the old stuff */
4323         ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
4324         gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
4325         reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4326         if ((tim == 0) && (stim == 0)) {
4327                 /*
4328                  * Invalid measurement time, maybe
4329                  * all on one ack/one send?
4330                  */
4331                 bytes = 0;
4332                 bytes_ps = 0;
4333                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4334                                            0, 0, 0, 10, __LINE__, NULL, quality);
4335                 goto skip_measurement;
4336         }
4337         if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4338                 /* We never made a us_rtt measurement? */
4339                 bytes = 0;
4340                 bytes_ps = 0;
4341                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4342                                            0, 0, 0, 10, __LINE__, NULL, quality);
4343                 goto skip_measurement;
4344         }
4345         /*
4346          * Calculate the maximum possible b/w this connection
4347          * could have. We base our calculation on the lowest
4348          * rtt we have seen during the measurement and the
4349          * largest rwnd the client has given us in that time. This
4350          * forms a BDP that is the maximum that we could ever
4351          * get to the client. Anything larger is not valid.
4352          *
4353          * I originally had code here that rejected measurements
4354          * where the time was less than 1/2 the latest us_rtt.
4355          * But after thinking on that I realized it's wrong since
4356          * say you had a 150Mbps or even 1Gbps link, and you
4357          * were a long way away.. example I am in Europe (100ms rtt)
4358          * talking to my 1Gbps link in S.C. Now measuring say 150,000
4359          * bytes my time would be 1.2ms, and yet my rtt would say
4360          * the measurement was invalid since the time was < 50ms. The
4361          * same thing is true for 150Mb (8ms of time).
4362          *
4363          * A better way I realized is to look at what the maximum
4364          * the connection could possibly do. This is gated on
4365          * the lowest RTT we have seen and the highest rwnd.
4366          * We should in theory never exceed that, if we are
4367          * then something on the path is storing up packets
4368          * and then feeding them all at once to our endpoint
4369          * messing up our measurement.
4370          */
4371         rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4372         rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4373         rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
4374         if (SEQ_LT(th_ack, tp->gput_seq)) {
4375                 /* No measurement can be made */
4376                 bytes = 0;
4377                 bytes_ps = 0;
4378                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4379                                            0, 0, 0, 10, __LINE__, NULL, quality);
4380                 goto skip_measurement;
4381         } else
4382                 bytes = (th_ack - tp->gput_seq);
4383         bytes_ps = (uint64_t)bytes;
4384         /*
4385          * Don't measure a b/w for pacing unless we have gotten at least
4386          * an initial windows worth of data in this measurement interval.
4387          *
4388          * Small numbers of bytes get badly influenced by delayed ack and
4389          * other artifacts. Note we take the initial window or our
4390          * defined minimum GP (defaulting to 10 which hopefully is the
4391          * IW).
4392          */
4393         if (rack->rc_gp_filled == 0) {
4394                 /*
4395                  * The initial estimate is special. We
4396                  * have blasted out an IW worth of packets
4397                  * without a real valid ack ts results. We
4398                  * without real valid ack timestamp results. We
4399                  * this should get the first ack in (probably 2
4400                  * MSS worth) to be recorded as the timestamp.
4401                  * We thus allow a smaller number of bytes i.e.
4402                  * IW - 2MSS.
4403                  */
4404                 reqbytes -= (2 * segsiz);
4405                 /* Also lets fill previous for our first measurement to be neutral */
4406                 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4407         }
4408         if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4409                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4410                                            rack->r_ctl.rc_app_limited_cnt,
4411                                            0, 0, 10, __LINE__, NULL, quality);
4412                 goto skip_measurement;
4413         }
4414         /*
4415          * We now need to calculate the Timely like status so
4416          * we can update (possibly) the b/w multipliers.
4417          */
4418         new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4419         if (rack->rc_gp_filled == 0) {
4420                 /* No previous reading */
4421                 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4422         } else {
4423                 if (rack->measure_saw_probe_rtt == 0) {
4424                         /*
4425                          * We don't want a probertt to be counted
4426                          * since it would be incorrectly negative. We
4427                          * expect to be reducing the RTT when we
4428                          * pace at a slower rate.
4429                          */
4430                         rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4431                         rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4432                 }
4433         }
4434         timely_says = rack_make_timely_judgement(rack,
4435                 rack->r_ctl.rc_gp_srtt,
4436                 rack->r_ctl.rc_rtt_diff,
4437                 rack->r_ctl.rc_prev_gp_srtt
4438                 );
4439         bytes_ps *= HPTS_USEC_IN_SEC;
4440         bytes_ps /= utim;
4441         if (bytes_ps > rack->r_ctl.last_max_bw) {
4442                  * Something on the path is playing games,
4443                  * Something is on path playing
4444                  * since this b/w is not possible based
4445                  * on our BDP (highest rwnd and lowest rtt
4446                  * we saw in the measurement window).
4447                  *
4448                  * Another option here would be to
4449                  * instead skip the measurement.
4450                  */
4451                 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4452                                            bytes_ps, rack->r_ctl.last_max_bw, 0,
4453                                            11, __LINE__, NULL, quality);
4454                 bytes_ps = rack->r_ctl.last_max_bw;
4455         }
4456         /* We store gp for b/w in bytes per second */
4457         if (rack->rc_gp_filled == 0) {
4458                 /* Initial measurement */
4459                 if (bytes_ps) {
4460                         rack->r_ctl.gp_bw = bytes_ps;
4461                         rack->rc_gp_filled = 1;
4462                         rack->r_ctl.num_measurements = 1;
4463                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4464                 } else {
4465                         rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4466                                                    rack->r_ctl.rc_app_limited_cnt,
4467                                                    0, 0, 10, __LINE__, NULL, quality);
4468                 }
4469                 if (rack->rc_inp->inp_in_hpts &&
4470                     (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4471                         /*
4472                          * Ok we can't trust the pacer in this case
4473                          * where we transition from un-paced to paced.
4474                          * Or for that matter when the burst mitigation
4475                          * was making a wild guess and got it wrong.
4476                          * Stop the pacer and clear up all the aggregate
4477                          * delays etc.
4478                          */
4479                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
4480                         rack->r_ctl.rc_hpts_flags = 0;
4481                         rack->r_ctl.rc_last_output_to = 0;
4482                 }
4483                 did_add = 2;
4484         } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4485                 /* Still a small number run an average */
4486                 rack->r_ctl.gp_bw += bytes_ps;
4487                 addpart = rack->r_ctl.num_measurements;
4488                 rack->r_ctl.num_measurements++;
4489                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4490                         /* We have collected enough to move forward */
4491                         rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4492                 }
4493                 did_add = 3;
4494         } else {
4495                 /*
4496                  * We want to take 1/wma of the goodput and add it in to 7/8ths
4497                  * of the old value, weighted by the srtt. So if your measurement
4498                  * period is say 2 SRTTs long, you would get 1/4 as the
4499                  * weight; if it was like 1/2 an SRTT, you would get 1/16th.
4500                  *
4501                  * But we must be careful not to take too much i.e. if the
4502                  * srtt is say 20ms and the measurement is taken over
4503                  * 400ms our weight would be 400/20 i.e. 20. On the
4504                  * other hand if we get a measurement over 1ms with a
4505                  * 10ms rtt we only want to take a much smaller portion.
4506                  */
4507                 if (rack->r_ctl.num_measurements < 0xff) {
4508                         rack->r_ctl.num_measurements++;
4509                 }
4510                 srtt = (uint64_t)tp->t_srtt;
4511                 if (srtt == 0) {
4512                         /*
4513                          * Strange, why did t_srtt go back to zero?
4514                          */
4515                         if (rack->r_ctl.rc_rack_min_rtt)
4516                                 srtt = rack->r_ctl.rc_rack_min_rtt;
4517                         else
4518                                 srtt = HPTS_USEC_IN_MSEC;
4519                 }
4520                 /*
4521                  * XXXrrs: Note for reviewers, in playing with
4522                  * dynamic pacing I discovered that this GP calculation,
4523                  * as done originally, leads to some undesired results.
4524                  * Basically you can get longer measurements contributing
4525                  * too much to the WMA. Thus I changed it so that if you are
4526                  * doing dynamic adjustments we only do the apportioned
4527                  * adjustment if we have a very small (time wise) measurement.
4528                  * Longer measurements just get their weight (defaulting to 1/8)
4529                  * added to the WMA. We may want to think about changing
4530                  * this to always do that for both sides i.e. dynamic
4531                  * and non-dynamic... but considering lots of folks
4532                  * were playing with this I did not want to change the
4533                  * calculation per se without your thoughts.. Lawerence?
4534                  * Peter??
4535                  */
4536                 if (rack->rc_gp_dyn_mul == 0) {
4537                         subpart = rack->r_ctl.gp_bw * utim;
4538                         subpart /= (srtt * 8);
4539                         if (subpart < (rack->r_ctl.gp_bw / 2)) {
4540                                 /*
4541                                  * The b/w update takes no more
4542                  * away than 1/2 our running total
4543                                  * so factor it in.
4544                                  */
4545                                 addpart = bytes_ps * utim;
4546                                 addpart /= (srtt * 8);
4547                         } else {
4548                                 /*
4549                                  * Don't allow a single measurement
4550                                  * to account for more than 1/2 of the
4551                                  * WMA. This could happen on a retransmission
4552                                  * where utim becomes huge compared to
4553                                  * srtt (multiple retransmissions when using
4554                                  * the sending rate which factors in all the
4555                                  * transmissions from the first one).
4556                                  */
4557                                 subpart = rack->r_ctl.gp_bw / 2;
4558                                 addpart = bytes_ps / 2;
4559                         }
4560                         resid_bw = rack->r_ctl.gp_bw - subpart;
4561                         rack->r_ctl.gp_bw = resid_bw + addpart;
4562                         did_add = 1;
4563                 } else {
4564                         if ((utim / srtt) <= 1) {
4565                                 /*
4566                                  * The b/w update was over a small period
4567                                  * of time. The idea here is to prevent a small
4568                                  * measurement time period from counting
4569                                  * too much. So we scale it based on the
4570                                  * time so it attributes less than 1/rack_wma_divisor
4571                                  * of its measurement.
4572                                  */
4573                                 subpart = rack->r_ctl.gp_bw * utim;
4574                                 subpart /= (srtt * rack_wma_divisor);
4575                                 addpart = bytes_ps * utim;
4576                                 addpart /= (srtt * rack_wma_divisor);
4577                         } else {
4578                                 /*
4579                                  * The scaled measurement was long
4580                                  * enough so let's just add in the
4581                                  * portion of the measurement, i.e. 1/rack_wma_divisor
4582                                  */
4583                                 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
4584                                 addpart = bytes_ps / rack_wma_divisor;
4585                         }
4586                         if ((rack->measure_saw_probe_rtt == 0) ||
4587                             (bytes_ps > rack->r_ctl.gp_bw)) {
4588                                 /*
4589                                  * For probe-rtt we only add it in
4590                          * if it's larger; all others we just
4591                                  * add in.
4592                                  */
4593                                 did_add = 1;
4594                                 resid_bw = rack->r_ctl.gp_bw - subpart;
4595                                 rack->r_ctl.gp_bw = resid_bw + addpart;
4596                         }
4597                 }
4598         }
4599         if ((rack->gp_ready == 0) &&
4600             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
4601                 /* We have enough measurements now */
4602                 rack->gp_ready = 1;
4603                 rack_set_cc_pacing(rack);
4604                 if (rack->defer_options)
4605                         rack_apply_deferred_options(rack);
4606         }
4607         rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
4608                                    rack_get_bw(rack), 22, did_add, NULL, quality);
4609         /* We do not update any multipliers if we are in or have seen a probe-rtt */
4610         if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
4611                 rack_update_multiplier(rack, timely_says, bytes_ps,
4612                                        rack->r_ctl.rc_gp_srtt,
4613                                        rack->r_ctl.rc_rtt_diff);
4614         rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
4615                                    rack_get_bw(rack), 3, line, NULL, quality);
4616         /* reset the gp srtt and setup the new prev */
4617         rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4618         /* Record the lost count for the next measurement */
4619         rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
4620         /*
4621          * We restart our diffs based on the gpsrtt in the
4622          * measurement window.
4623          */
4624         rack->rc_gp_rtt_set = 0;
4625         rack->rc_gp_saw_rec = 0;
4626         rack->rc_gp_saw_ca = 0;
4627         rack->rc_gp_saw_ss = 0;
4628         rack->rc_dragged_bottom = 0;
4629 skip_measurement:
4630
4631 #ifdef STATS
4632         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
4633                                  gput);
4634         /*
4635          * XXXLAS: This is a temporary hack, and should be
4636          * chained off VOI_TCP_GPUT when stats(9) grows an
4637          * API to deal with chained VOIs.
4638          */
4639         if (tp->t_stats_gput_prev > 0)
4640                 stats_voi_update_abs_s32(tp->t_stats,
4641                                          VOI_TCP_GPUT_ND,
4642                                          ((gput - tp->t_stats_gput_prev) * 100) /
4643                                          tp->t_stats_gput_prev);
4644 #endif
4645         tp->t_flags &= ~TF_GPUTINPROG;
4646         tp->t_stats_gput_prev = gput;
4647         /*
4648          * Now, are we app limited and is there space from where we
4649          * were to where we want to go?
4650          *
4651          * We don't do the other case i.e. non-applimited here since
4652          * the next send will trigger us picking up the missing data.
4653          */
4654         if (rack->r_ctl.rc_first_appl &&
4655             TCPS_HAVEESTABLISHED(tp->t_state) &&
4656             rack->r_ctl.rc_app_limited_cnt &&
4657             (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
4658             ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
4659              max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
4660                 /*
4661                  * Yep there is enough outstanding to make a measurement here.
4662                  */
4663                 struct rack_sendmap *rsm, fe;
4664
4665                 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
4666                 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
4667                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4668                 rack->app_limited_needs_set = 0;
4669                 tp->gput_seq = th_ack;
4670                 if (rack->in_probe_rtt)
4671                         rack->measure_saw_probe_rtt = 1;
4672                 else if ((rack->measure_saw_probe_rtt) &&
4673                          (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
4674                         rack->measure_saw_probe_rtt = 0;
4675                 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
4676                         /* There is a full window to gain info from */
4677                         tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
4678                 } else {
4679                         /* We can only measure up to the applimited point */
4680                         tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
4681                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
4682                                 /*
4683                                  * We don't have enough to make a measurement.
4684                                  */
4685                                 tp->t_flags &= ~TF_GPUTINPROG;
4686                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
4687                                                            0, 0, 0, 6, __LINE__, NULL, quality);
4688                                 return;
4689                         }
4690                 }
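                     /*
                      * Example (illustrative values): assuming MIN_GP_WIN
                      * were 6 and segsiz 1448 bytes, anything under
                      * 6 * 1448 = 8688 bytes of remaining app-limited data
                      * would be judged too small to measure and abandoned
                      * above.
                      */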
4691                 if (tp->t_state >= TCPS_FIN_WAIT_1) {
4692                         /*
4693                          * We will get no more data into the SB;
4694                          * this means we need to have the data available
4695                          * before we start a measurement.
4696                          */
4697                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) {
4698                                 /* Nope not enough data. */
4699                                 return;
4700                         }
4701                 }
4702                 tp->t_flags |= TF_GPUTINPROG;
4703                 /*
4704                  * Now we need to find the timestamp of the send at tp->gput_seq
4705                  * for the send based measurement.
4706                  */
4707                 fe.r_start = tp->gput_seq;
4708                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4709                 if (rsm) {
4710                         /* Ok send-based limit is set */
4711                         if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
4712                                 /*
4713                                  * Move back to include the earlier part
4714                                  * so our ack time lines up right (this may
4715                                  * make an overlapping measurement but that's
4716                                  * ok).
4717                                  */
4718                                 tp->gput_seq = rsm->r_start;
4719                         }
4720                         if (rsm->r_flags & RACK_ACKED)
4721                                 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
4722                         else
4723                                 rack->app_limited_needs_set = 1;
4724                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
4725                 } else {
4726                         /*
4727                          * If we don't find the rsm due to some
4728                          * send-limit, set the current time, which
4729                          * basically disables the send-limit.
4730                          */
4731                         struct timeval tv;
4732
4733                         microuptime(&tv);
4734                         rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
4735                 }
4736                 rack_log_pacing_delay_calc(rack,
4737                                            tp->gput_seq,
4738                                            tp->gput_ack,
4739                                            (uint64_t)rsm,
4740                                            tp->gput_ts,
4741                                            rack->r_ctl.rc_app_limited_cnt,
4742                                            9,
4743                                            __LINE__, NULL, quality);
4744         }
4745 }
4746
4747 /*
4748  * CC wrapper hook functions
4749  */
4750 static void
4751 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
4752     uint16_t type, int32_t recovery)
4753 {
4754         uint32_t prior_cwnd, acked;
4755         struct tcp_log_buffer *lgb = NULL;
4756         uint8_t labc_to_use, quality;
4757
4758         INP_WLOCK_ASSERT(tp->t_inpcb);
4759         tp->ccv->nsegs = nsegs;
4760         acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una);
4761         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
4762                 uint32_t max;
4763
4764                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
4765                 if (tp->ccv->bytes_this_ack > max) {
4766                         tp->ccv->bytes_this_ack = max;
4767                 }
4768         }
4769 #ifdef STATS
4770         stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
4771             ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
4772 #endif
4773         quality = RACK_QUALITY_NONE;
4774         if ((tp->t_flags & TF_GPUTINPROG) &&
4775             rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
4776                 /* Measure the Goodput */
4777                 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
4778 #ifdef NETFLIX_PEAKRATE
4779                 if ((type == CC_ACK) &&
4780                     (tp->t_maxpeakrate)) {
4781                         /*
4782                          * We update t_peakrate_thr. This gives us roughly
4783                          * one update per round trip time. Note
4784                          * it will only be used if pace_always is off, i.e.
4785                          * we don't do this for paced flows.
4786                          */
4787                         rack_update_peakrate_thr(tp);
4788                 }
4789 #endif
4790         }
4791         /* Which way are we limited? If not cwnd limited, no advance in CA */
4792         if (tp->snd_cwnd <= tp->snd_wnd)
4793                 tp->ccv->flags |= CCF_CWND_LIMITED;
4794         else
4795                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
4796         if (tp->snd_cwnd > tp->snd_ssthresh) {
4797                 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
4798                          nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
4799                 /* For the setting of a window past use the actual scwnd we are using */
4800                 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
4801                         tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
4802                         tp->ccv->flags |= CCF_ABC_SENTAWND;
4803                 }
4804         } else {
4805                 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
4806                 tp->t_bytes_acked = 0;
4807         }
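             /*
              * The t_bytes_acked logic above follows Appropriate Byte
              * Counting (RFC 3465): each ack credits at most
              * nsegs * V_tcp_abc_l_var * SMSS bytes, and once a full
              * cwnd's worth has been acked, CCF_ABC_SENTAWND lets the
              * CC module advance cwnd.
              */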
4808         prior_cwnd = tp->snd_cwnd;
4809         if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
4810             (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
4811                 labc_to_use = rack->rc_labc;
4812         else
4813                 labc_to_use = rack_max_abc_post_recovery;
4814         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4815                 union tcp_log_stackspecific log;
4816                 struct timeval tv;
4817
4818                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4819                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4820                 log.u_bbr.flex1 = th_ack;
4821                 log.u_bbr.flex2 = tp->ccv->flags;
4822                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4823                 log.u_bbr.flex4 = tp->ccv->nsegs;
4824                 log.u_bbr.flex5 = labc_to_use;
4825                 log.u_bbr.flex6 = prior_cwnd;
4826                 log.u_bbr.flex7 = V_tcp_do_newsack;
4827                 log.u_bbr.flex8 = 1;
4828                 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4829                                      0, &log, false, NULL, NULL, 0, &tv);
4830         }
4831         if (CC_ALGO(tp)->ack_received != NULL) {
4832                 /* XXXLAS: Find a way to live without this */
4833                 tp->ccv->curack = th_ack;
4834                 tp->ccv->labc = labc_to_use;
4835                 tp->ccv->flags |= CCF_USE_LOCAL_ABC;
4836                 CC_ALGO(tp)->ack_received(tp->ccv, type);
4837         }
4838         if (lgb) {
4839                 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
4840         }
4841         if (rack->r_must_retran) {
4842                 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
4843                         /*
4844                          * We now are beyond the rxt point so lets disable
4845                          * the flag.
4846                          */
4847                         rack->r_ctl.rc_out_at_rto = 0;
4848                         rack->r_must_retran = 0;
4849                 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
4850                         /*
4851                          * Only decrement the rc_out_at_rto if the cwnd advances
4852                          * at least a whole segment. Otherwise next time the peer
4853                          * acks, we won't be able to send. This generally happens
4854                          * when we are in Congestion Avoidance.
4855                          */
4856                         if (acked <= rack->r_ctl.rc_out_at_rto){
4857                                 rack->r_ctl.rc_out_at_rto -= acked;
4858                         } else {
4859                                 rack->r_ctl.rc_out_at_rto = 0;
4860                         }
4861                 }
4862         }
4863 #ifdef STATS
4864         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
4865 #endif
4866         if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
4867                 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
4868         }
4869 #ifdef NETFLIX_PEAKRATE
4870         /* we enforce max peak rate if it is set and we are not pacing */
4871         if ((rack->rc_always_pace == 0) &&
4872             tp->t_peakrate_thr &&
4873             (tp->snd_cwnd > tp->t_peakrate_thr)) {
4874                 tp->snd_cwnd = tp->t_peakrate_thr;
4875         }
4876 #endif
4877 }
4878
4879 static void
4880 tcp_rack_partialack(struct tcpcb *tp)
4881 {
4882         struct tcp_rack *rack;
4883
4884         rack = (struct tcp_rack *)tp->t_fb_ptr;
4885         INP_WLOCK_ASSERT(tp->t_inpcb);
4886         /*
4887          * If we are doing PRR and have enough
4888          * room to send <or> we are pacing and prr
4889          * is disabled, we will want to see if we
4890          * can send data (by setting r_wanted_output to
4891          * true).
4892          */
4893         if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
4894             rack->rack_no_prr)
4895                 rack->r_wanted_output = 1;
4896 }
4897
4898 static void
4899 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
4900 {
4901         struct tcp_rack *rack;
4902         uint32_t orig_cwnd;
4903
4904         orig_cwnd = tp->snd_cwnd;
4905         INP_WLOCK_ASSERT(tp->t_inpcb);
4906         rack = (struct tcp_rack *)tp->t_fb_ptr;
4907         /* only alert CC if we alerted when we entered */
4908         if (CC_ALGO(tp)->post_recovery != NULL) {
4909                 tp->ccv->curack = th_ack;
4910                 CC_ALGO(tp)->post_recovery(tp->ccv);
4911                 if (tp->snd_cwnd < tp->snd_ssthresh) {
4912                         /*
4913                          * Rack has burst control and pacing
4914                          * so let's not set this any lower than
4915                          * snd_ssthresh per RFC-6582 (option 2).
4916                          */
4917                         tp->snd_cwnd = tp->snd_ssthresh;
4918                 }
4919         }
4920         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4921                 union tcp_log_stackspecific log;
4922                 struct timeval tv;
4923
4924                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4925                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4926                 log.u_bbr.flex1 = th_ack;
4927                 log.u_bbr.flex2 = tp->ccv->flags;
4928                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4929                 log.u_bbr.flex4 = tp->ccv->nsegs;
4930                 log.u_bbr.flex5 = V_tcp_abc_l_var;
4931                 log.u_bbr.flex6 = orig_cwnd;
4932                 log.u_bbr.flex7 = V_tcp_do_newsack;
4933                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
4934                 log.u_bbr.flex8 = 2;
4935                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4936                                0, &log, false, NULL, NULL, 0, &tv);
4937         }
4938         if ((rack->rack_no_prr == 0) &&
4939             (rack->no_prr_addback == 0) &&
4940             (rack->r_ctl.rc_prr_sndcnt > 0)) {
4941                 /*
4942                  * Suck the next prr cnt back into cwnd, but
4943                  * only do that if we are not application limited.
4944                  */
4945                 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
4946                         /*
4947                          * We are allowed to add back to the cwnd the amount we did
4948                          * not get out if:
4949                          * a) no_prr_addback is off.
4950                          * b) we are not app limited
4951                          * c) we are doing prr
4952                          * <and>
4953                          * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
4954                          */
4955                         tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
4956                                             rack->r_ctl.rc_prr_sndcnt);
4957                 }
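                     /*
                      * Example (illustrative values): with a 1448-byte
                      * maxseg, rack_prr_addbackmax of 4 and 10000 bytes of
                      * leftover rc_prr_sndcnt, only min(4 * 1448, 10000) =
                      * 5792 bytes would be added back to cwnd above.
                      */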
4958                 rack->r_ctl.rc_prr_sndcnt = 0;
4959                 rack_log_to_prr(rack, 1, 0);
4960         }
4961         rack_log_to_prr(rack, 14, orig_cwnd);
4962         tp->snd_recover = tp->snd_una;
4963         if (rack->r_ctl.dsack_persist) {
4964                 rack->r_ctl.dsack_persist--;
4965                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
4966                         rack->r_ctl.num_dsack = 0;
4967                 }
4968                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
4969         }
4970         EXIT_RECOVERY(tp->t_flags);
4971 }
4972
4973 static void
4974 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack)
4975 {
4976         struct tcp_rack *rack;
4977         uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
4978
4979         INP_WLOCK_ASSERT(tp->t_inpcb);
4980 #ifdef STATS
4981         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
4982 #endif
4983         if (IN_RECOVERY(tp->t_flags) == 0) {
4984                 in_rec_at_entry = 0;
4985                 ssthresh_enter = tp->snd_ssthresh;
4986                 cwnd_enter = tp->snd_cwnd;
4987         } else
4988                 in_rec_at_entry = 1;
4989         rack = (struct tcp_rack *)tp->t_fb_ptr;
4990         switch (type) {
4991         case CC_NDUPACK:
4992                 tp->t_flags &= ~TF_WASFRECOVERY;
4993                 tp->t_flags &= ~TF_WASCRECOVERY;
4994                 if (!IN_FASTRECOVERY(tp->t_flags)) {
4995                         rack->r_ctl.rc_prr_delivered = 0;
4996                         rack->r_ctl.rc_prr_out = 0;
4997                         if (rack->rack_no_prr == 0) {
4998                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4999                                 rack_log_to_prr(rack, 2, in_rec_at_entry);
5000                         }
5001                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
5002                         tp->snd_recover = tp->snd_max;
5003                         if (tp->t_flags2 & TF2_ECN_PERMIT)
5004                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
5005                 }
5006                 break;
5007         case CC_ECN:
5008                 if (!IN_CONGRECOVERY(tp->t_flags) ||
5009                     /*
5010                      * Allow ECN reaction on ACK to CWR, if
5011                      * that data segment was also CE marked.
5012                      */
5013                     SEQ_GEQ(ack, tp->snd_recover)) {
5014                         EXIT_CONGRECOVERY(tp->t_flags);
5015                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
5016                         tp->snd_recover = tp->snd_max + 1;
5017                         if (tp->t_flags2 & TF2_ECN_PERMIT)
5018                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
5019                 }
5020                 break;
5021         case CC_RTO:
5022                 tp->t_dupacks = 0;
5023                 tp->t_bytes_acked = 0;
5024                 EXIT_RECOVERY(tp->t_flags);
5025                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
5026                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
5027                 orig_cwnd = tp->snd_cwnd;
5028                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
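                     /*
                      * Example (illustrative values): with snd_wnd = 65535,
                      * cwnd_to_use = 20480 and a 1448-byte maxseg, ssthresh
                      * becomes max(2, 20480 / 2 / 1448) * 1448 = 7 * 1448 =
                      * 10136 bytes, while cwnd restarts at one segment.
                      */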
5029                 rack_log_to_prr(rack, 16, orig_cwnd);
5030                 if (tp->t_flags2 & TF2_ECN_PERMIT)
5031                         tp->t_flags2 |= TF2_ECN_SND_CWR;
5032                 break;
5033         case CC_RTO_ERR:
5034                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
5035                 /* RTO was unnecessary, so reset everything. */
5036                 tp->snd_cwnd = tp->snd_cwnd_prev;
5037                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
5038                 tp->snd_recover = tp->snd_recover_prev;
5039                 if (tp->t_flags & TF_WASFRECOVERY) {
5040                         ENTER_FASTRECOVERY(tp->t_flags);
5041                         tp->t_flags &= ~TF_WASFRECOVERY;
5042                 }
5043                 if (tp->t_flags & TF_WASCRECOVERY) {
5044                         ENTER_CONGRECOVERY(tp->t_flags);
5045                         tp->t_flags &= ~TF_WASCRECOVERY;
5046                 }
5047                 tp->snd_nxt = tp->snd_max;
5048                 tp->t_badrxtwin = 0;
5049                 break;
5050         }
5051         if ((CC_ALGO(tp)->cong_signal != NULL)  &&
5052             (type != CC_RTO)){
5053                 tp->ccv->curack = ack;
5054                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
5055         }
5056         if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
5057                 rack_log_to_prr(rack, 15, cwnd_enter);
5058                 rack->r_ctl.dsack_byte_cnt = 0;
5059                 rack->r_ctl.retran_during_recovery = 0;
5060                 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
5061                 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
5062                 rack->r_ent_rec_ns = 1;
5063         }
5064 }
5065
5066 static inline void
5067 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
5068 {
5069         uint32_t i_cwnd;
5070
5071         INP_WLOCK_ASSERT(tp->t_inpcb);
5072
5073 #ifdef NETFLIX_STATS
5074         KMOD_TCPSTAT_INC(tcps_idle_restarts);
5075         if (tp->t_state == TCPS_ESTABLISHED)
5076                 KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
5077 #endif
5078         if (CC_ALGO(tp)->after_idle != NULL)
5079                 CC_ALGO(tp)->after_idle(tp->ccv);
5080
5081         if (tp->snd_cwnd == 1)
5082                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
5083         else
5084                 i_cwnd = rc_init_window(rack);
5085
5086         /*
5087          * Being idle is no different than the initial window. If the cc
5088          * clamps it down below the initial window, raise it to the initial
5089          * window.
5090          */
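             /*
              * Example (illustrative values, assuming a 10 * MSS initial
              * window per RFC 6928): if cwnd was clamped to 4344 bytes
              * while idle but rc_init_window() yields 14480, cwnd is
              * raised back to 14480 before sending resumes.
              */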
5091         if (tp->snd_cwnd < i_cwnd) {
5092                 tp->snd_cwnd = i_cwnd;
5093         }
5094 }
5095
5096 /*
5097  * Indicate whether this ack should be delayed.  We can delay the ack if
5098  * following conditions are met:
5099  *      - There is no delayed ack timer in progress.
5100  *      - Our last ack wasn't a 0-sized window. We never want to delay
5101  *        the ack that opens up a 0-sized window.
5102  *      - LRO wasn't used for this segment. We make sure by checking that the
5103  *        segment size is not larger than the MSS.
5104  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
5105  *        connection.
5106  */
5107 #define DELAY_ACK(tp, tlen)                      \
5108         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
5109         ((tp->t_flags & TF_DELACK) == 0) &&      \
5110         (tlen <= tp->t_maxseg) &&                \
5111         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
5112
5113 static struct rack_sendmap *
5114 rack_find_lowest_rsm(struct tcp_rack *rack)
5115 {
5116         struct rack_sendmap *rsm;
5117
5118         /*
5119          * Walk the time-ordered transmitted list looking for an rsm that is
5120          * not acked. This will be the one that was sent the longest time
5121          * ago that is still outstanding.
5122          */
5123         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
5124                 if (rsm->r_flags & RACK_ACKED) {
5125                         continue;
5126                 }
5127                 break;
5128         }
5130         return (rsm);
5131 }
5132
5133 static struct rack_sendmap *
5134 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
5135 {
5136         struct rack_sendmap *prsm;
5137
5138         /*
5139          * Walk the sequence-ordered list backward until we arrive at
5140          * the highest seq not acked. In theory, when this is called, it
5141          * should be the last segment (which it was not).
5142          */
5143         counter_u64_add(rack_find_high, 1);
5144         prsm = rsm;
5145         RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
5146                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
5147                         continue;
5148                 }
5149                 return (prsm);
5150         }
5151         return (NULL);
5152 }
5153
5154 static uint32_t
5155 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
5156 {
5157         int32_t lro;
5158         uint32_t thresh;
5159
5160         /*
5161          * lro is the flag we use to determine if we have seen reordering.
5162          * If it gets set we have seen reordering. The reorder logic
5163          * works in one of two ways:
5164          *
5165          * If reorder-fade is configured, then we track the last time we saw
5166          * re-ordering occur. If we reach the point where enough time has
5167          * passed we no longer consider reordering to be occurring.
5168          *
5169          * Or if reorder-fade is 0, then once we see reordering we consider
5170          * the connection to always be subject to reordering and just set lro
5171          * to 1.
5172          *
5173          * In the end if lro is non-zero we add the extra time for
5174          * reordering in.
5175          */
5176         if (srtt == 0)
5177                 srtt = 1;
5178         if (rack->r_ctl.rc_reorder_ts) {
5179                 if (rack->r_ctl.rc_reorder_fade) {
5180                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
5181                                 lro = cts - rack->r_ctl.rc_reorder_ts;
5182                                 if (lro == 0) {
5183                                         /*
5184                                          * No time has passed since the last
5185                                          * reorder; mark it as reordering.
5186                                          */
5187                                         lro = 1;
5188                                 }
5189                         } else {
5190                                 /* Negative time? */
5191                                 lro = 0;
5192                         }
5193                         if (lro > rack->r_ctl.rc_reorder_fade) {
5194                                 /* Turn off reordering seen too */
5195                                 rack->r_ctl.rc_reorder_ts = 0;
5196                                 lro = 0;
5197                         }
5198                 } else {
5199                         /* Reordering does not fade */
5200                         lro = 1;
5201                 }
5202         } else {
5203                 lro = 0;
5204         }
5205         if (rack->rc_rack_tmr_std_based == 0) {
5206                 thresh = srtt + rack->r_ctl.rc_pkt_delay;
5207         } else {
5208                 /* Standards based pkt-delay is 1/4 srtt */
5209                 thresh = srtt +  (srtt >> 2);
5210         }
5211         if (lro && (rack->rc_rack_tmr_std_based == 0)) {
5212                 /* It must be set; if not you get 1/4 rtt */
5213                 if (rack->r_ctl.rc_reorder_shift)
5214                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
5215                 else
5216                         thresh += (srtt >> 2);
5217         }
5218         if (rack->rc_rack_use_dsack &&
5219             lro &&
5220             (rack->r_ctl.num_dsack > 0)) {
5221                 /*
5222                  * We only increase the reordering window if we
5223                  * have seen reordering <and> we have a DSACK count.
5224                  */
5225                 thresh += rack->r_ctl.num_dsack * (srtt >> 2);
5226                 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
5227         }
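             /*
              * Example (illustrative values): with the standards-based
              * timer, srtt = 40ms, reordering seen, rc_rack_use_dsack on
              * and num_dsack = 2, thresh = 40 + 10 + 2 * 10 = 70ms, still
              * under the 2 * srtt = 80ms ceiling applied below.
              */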
5228         /* SRTT * 2 is the ceiling */
5229         if (thresh > (srtt * 2)) {
5230                 thresh = srtt * 2;
5231         }
5232         /* And we don't want it above the RTO max either */
5233         if (thresh > rack_rto_max) {
5234                 thresh = rack_rto_max;
5235         }
5236         rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
5237         return (thresh);
5238 }
5239
5240 static uint32_t
5241 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
5242                      struct rack_sendmap *rsm, uint32_t srtt)
5243 {
5244         struct rack_sendmap *prsm;
5245         uint32_t thresh, len;
5246         int segsiz;
5247
5248         if (srtt == 0)
5249                 srtt = 1;
5250         if (rack->r_ctl.rc_tlp_threshold)
5251                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
5252         else
5253                 thresh = (srtt * 2);
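             /*
              * Example (illustrative values): with srtt = 30ms and
              * rc_tlp_threshold = 2 the base threshold is 30 + 15 = 45ms;
              * with the threshold unset it is 2 * srtt = 60ms, before the
              * delayed-ack compensation and caps below.
              */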
5254
5255         /* Get the previous sent packet, if any */
5256         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
5257         counter_u64_add(rack_enter_tlp_calc, 1);
5258         len = rsm->r_end - rsm->r_start;
5259         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
5260                 /* Exactly like the ID */
5261                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
5262                         uint32_t alt_thresh;
5263                         /*
5264                          * Compensate for delayed-ack with the d-ack time.
5265                          */
5266                         counter_u64_add(rack_used_tlpmethod, 1);
5267                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5268                         if (alt_thresh > thresh)
5269                                 thresh = alt_thresh;
5270                 }
5271         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
5272                 /* 2.1 behavior */
5273                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
5274                 if (prsm && (len <= segsiz)) {
5275                         /*
5276                          * Two packets outstanding, thresh should be (2*srtt) +
5277                          * possible inter-packet delay (if any).
5278                          */
5279                         uint32_t inter_gap = 0;
5280                         int idx, nidx;
5281
5282                         counter_u64_add(rack_used_tlpmethod, 1);
5283                         idx = rsm->r_rtr_cnt - 1;
5284                         nidx = prsm->r_rtr_cnt - 1;
5285                         if (rsm->r_tim_lastsent[idx] >= prsm->r_tim_lastsent[nidx]) {
5286                                 /* Yes it was sent later (or at the same time) */
5287                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
5288                         }
5289                         thresh += inter_gap;
5290                 } else if (len <= segsiz) {
5291                         /*
5292                          * Possibly compensate for delayed-ack.
5293                          */
5294                         uint32_t alt_thresh;
5295
5296                         counter_u64_add(rack_used_tlpmethod2, 1);
5297                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5298                         if (alt_thresh > thresh)
5299                                 thresh = alt_thresh;
5300                 }
5301         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
5302                 /* 2.2 behavior */
5303                 if (len <= segsiz) {
5304                         uint32_t alt_thresh;
5305                         /*
5306                          * Compensate for delayed-ack with the d-ack time.
5307                          */
5308                         counter_u64_add(rack_used_tlpmethod, 1);
5309                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5310                         if (alt_thresh > thresh)
5311                                 thresh = alt_thresh;
5312                 }
5313         }
5314         /* Not above an RTO */
5315         if (thresh > tp->t_rxtcur) {
5316                 thresh = tp->t_rxtcur;
5317         }
5318         /* Not above a RTO max */
5319         if (thresh > rack_rto_max) {
5320                 thresh = rack_rto_max;
5321         }
5322         /* Apply user supplied min TLP */
5323         if (thresh < rack_tlp_min) {
5324                 thresh = rack_tlp_min;
5325         }
5326         return (thresh);
5327 }
5328
5329 static uint32_t
5330 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
5331 {
5332         /*
5333          * We want the rack_rtt which is the
5334          * last rtt we measured. However if that
5335          * does not exist we fall back to the srtt (which
5336          * we probably will never do) and then as a last
5337          * resort we use RACK_INITIAL_RTO if no srtt is
5338          * yet set.
5339          */
5340         if (rack->rc_rack_rtt)
5341                 return (rack->rc_rack_rtt);
5342         else if (tp->t_srtt == 0)
5343                 return (RACK_INITIAL_RTO);
5344         return (tp->t_srtt);
5345 }
5346
5347 static struct rack_sendmap *
5348 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
5349 {
5350         /*
5351          * Check to see that we don't need to fall into recovery. We will
5352          * need to do so if our oldest transmit is past the time we should
5353          * have had an ack.
5354          */
5355         struct tcp_rack *rack;
5356         struct rack_sendmap *rsm;
5357         int32_t idx;
5358         uint32_t srtt, thresh;
5359
5360         rack = (struct tcp_rack *)tp->t_fb_ptr;
5361         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
5362                 return (NULL);
5363         }
5364         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5365         if (rsm == NULL)
5366                 return (NULL);
5367
5368         if (rsm->r_flags & RACK_ACKED) {
5369                 rsm = rack_find_lowest_rsm(rack);
5370                 if (rsm == NULL)
5371                         return (NULL);
5372         }
5373         idx = rsm->r_rtr_cnt - 1;
5374         srtt = rack_grab_rtt(tp, rack);
5375         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
5376         if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
5377                 return (NULL);
5378         }
5379         if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
5380                 return (NULL);
5381         }
5382         /* Ok, if we reach here we are overdue and this guy can be sent */
5383         if (IN_RECOVERY(tp->t_flags) == 0) {
5384                 /*
5385                  * For the one that enters us into recovery record undo
5386                  * info.
5387                  */
5388                 rack->r_ctl.rc_rsm_start = rsm->r_start;
5389                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
5390                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
5391         }
5392         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
5393         return (rsm);
5394 }
5395
5396 static uint32_t
5397 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
5398 {
5399         int32_t t;
5400         int32_t tt;
5401         uint32_t ret_val;
5402
5403         t = (tp->t_srtt + (tp->t_rttvar << 2));
5404         RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
5405             rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
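             /*
              * Example (illustrative values): if srtt + 4 * rttvar works
              * out to 250ms and t_rxtshift is 3, tcp_backoff[3] = 8 scales
              * that to 2 seconds, which RACK_TCPT_RANGESET above clamps
              * into [rack_persist_min, rack_persist_max].
              */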
5406         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
5407         ret_val = (uint32_t)tt;
5408         return (ret_val);
5409 }
5410
5411 static uint32_t
5412 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
5413 {
5414         /*
5415          * Start the FR timer; we do this based on getting the first one in
5416          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
5417          * events we need to stop the running timer (if it's running) before
5418          * starting the new one.
5419          */
5420         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
5421         uint32_t srtt_cur;
5422         int32_t idx;
5423         int32_t is_tlp_timer = 0;
5424         struct rack_sendmap *rsm;
5425
5426         if (rack->t_timers_stopped) {
5427                 /* All timers have been stopped; none are to run */
5428                 return (0);
5429         }
5430         if (rack->rc_in_persist) {
5431                 /* We can't start any timer in persists */
5432                 return (rack_get_persists_timer_val(tp, rack));
5433         }
5434         rack->rc_on_min_to = 0;
5435         if ((tp->t_state < TCPS_ESTABLISHED) ||
5436             ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
5437                 goto activate_rxt;
5438         }
5439         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5440         if ((rsm == NULL) || sup_rack) {
5441                 /* Nothing on the send map or no rack */
5442 activate_rxt:
5443                 time_since_sent = 0;
5444                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5445                 if (rsm) {
5446                         /*
5447                          * Should we discount the RTX timer any?
5448                          *
5449                          * We want to discount it the smallest amount.
5450                          * If a timer (Rack/TLP or RXT) has gone off more
5451                          * recently, that's the discount we want to use (now - timer time).
5452                          * If the retransmit of the oldest packet was more recent, then
5453                          * we want to use that (now - oldest-packet-last_transmit_time).
5454                          *
5455                          */
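                             /*
                              * Example (illustrative values): if an RXT
                              * fired 40ms ago but the oldest packet was
                              * (re)sent 100ms ago, the smaller 40ms is
                              * discounted, so a 400ms t_rxtcur yields a
                              * 360ms timer below.
                              */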
5456                         idx = rsm->r_rtr_cnt - 1;
5457                         if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
5458                                 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5459                         else
5460                                 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5461                         if (TSTMP_GT(cts, tstmp_touse))
5462                             time_since_sent = cts - tstmp_touse;
5463                 }
5464                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
5465                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
5466                         to = tp->t_rxtcur;
5467                         if (to > time_since_sent)
5468                                 to -= time_since_sent;
5469                         else
5470                                 to = rack->r_ctl.rc_min_to;
5471                         if (to == 0)
5472                                 to = 1;
5473                         /* Special case for KEEPINIT */
5474                         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
5475                             (TP_KEEPINIT(tp) != 0) &&
5476                             rsm) {
5477                                 /*
5478                                  * We have to put a ceiling on the rxt timer
5479                                  * of the keep-init timeout.
5480                                  */
5481                                 uint32_t max_time, red;
5482
5483                                 max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
5484                                 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
5485                                         red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
5486                                         if (red < max_time)
5487                                                 max_time -= red;
5488                                         else
5489                                                 max_time = 1;
5490                                 }
5491                                 /* Reduce timeout to the keep value if needed */
5492                                 if (max_time < to)
5493                                         to = max_time;
5494                         }
5495                         return (to);
5496                 }
5497                 return (0);
5498         }
5499         if (rsm->r_flags & RACK_ACKED) {
5500                 rsm = rack_find_lowest_rsm(rack);
5501                 if (rsm == NULL) {
5502                         /* No lowest? */
5503                         goto activate_rxt;
5504                 }
5505         }
5506         if (rack->sack_attack_disable) {
5507                 /*
5508                  * We don't want to do
5509                  * any TLP's if you are an attacker.
5510                  * Though if you are doing what
5511                  * is expected you may still have
5512                  * SACK-PASSED marks.
5513                  */
5514                 goto activate_rxt;
5515         }
5516         /* Convert from ms to usecs */
5517         if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
5518                 if ((tp->t_flags & TF_SENTFIN) &&
5519                     ((tp->snd_max - tp->snd_una) == 1) &&
5520                     (rsm->r_flags & RACK_HAS_FIN)) {
5521                         /*
5522                          * We don't start a rack timer if all we have is a
5523                          * FIN outstanding.
5524                          */
5525                         goto activate_rxt;
5526                 }
5527                 if ((rack->use_rack_rr == 0) &&
5528                     (IN_FASTRECOVERY(tp->t_flags)) &&
5529                     (rack->rack_no_prr == 0) &&
5530                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
5531                         /*
5532                          * We are not cheating, we are in recovery and
5533                          * don't yet have enough acks to get our next
5534                          * retransmission out.
5535                          *
5536                          * Note that classified attackers do not
5537                          * get to use the rack-cheat.
5538                          */
5539                         goto activate_tlp;
5540                 }
5541                 srtt = rack_grab_rtt(tp, rack);
5542                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
5543                 idx = rsm->r_rtr_cnt - 1;
5544                 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
5545                 if (SEQ_GEQ(exp, cts)) {
5546                         to = exp - cts;
5547                         if (to < rack->r_ctl.rc_min_to) {
5548                                 to = rack->r_ctl.rc_min_to;
5549                                 if (rack->r_rr_config == 3)
5550                                         rack->rc_on_min_to = 1;
5551                         }
5552                 } else {
5553                         to = rack->r_ctl.rc_min_to;
5554                         if (rack->r_rr_config == 3)
5555                                 rack->rc_on_min_to = 1;
5556                 }
5557         } else {
5558                 /* Ok we need to do a TLP not RACK */
5559 activate_tlp:
5560                 if ((rack->rc_tlp_in_progress != 0) &&
5561                     (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
5562                         /*
5563                          * The previous send was a TLP and we have sent
5564                          * N TLP's without sending new data.
5565                          */
5566                         goto activate_rxt;
5567                 }
5568                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
5569                 if (rsm == NULL) {
5570                         /* We found no rsm to TLP with. */
5571                         goto activate_rxt;
5572                 }
5573                 if (rsm->r_flags & RACK_HAS_FIN) {
5574                         /* If it's a FIN we don't do TLP */
5575                         rsm = NULL;
5576                         goto activate_rxt;
5577                 }
5578                 idx = rsm->r_rtr_cnt - 1;
5579                 time_since_sent = 0;
5580                 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
5581                         tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5582                 else
5583                         tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5584                 if (TSTMP_GT(cts, tstmp_touse))
5585                     time_since_sent = cts - tstmp_touse;
5586                 is_tlp_timer = 1;
5587                 if (tp->t_srtt) {
5588                         if ((rack->rc_srtt_measure_made == 0) &&
5589                             (tp->t_srtt == 1)) {
5590                                 /*
5591                                  * If another stack has run and set srtt to 1,
5592                                  * then the srtt was 0, so let's use the initial.
5593                                  */
5594                                 srtt = RACK_INITIAL_RTO;
5595                         } else {
5596                                 srtt_cur = tp->t_srtt;
5597                                 srtt = srtt_cur;
5598                         }
5599                 } else
5600                         srtt = RACK_INITIAL_RTO;
5601                 /*
5602                  * If the SRTT is not keeping up and the
5603                  * rack RTT has spiked, we want to use
5604                  * the last RTT, not the smoothed one.
5605                  */
5606                 if (rack_tlp_use_greater &&
5607                     tp->t_srtt &&
5608                     (srtt < rack_grab_rtt(tp, rack))) {
5609                         srtt = rack_grab_rtt(tp, rack);
5610                 }
5611                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
5612                 if (thresh > time_since_sent) {
5613                         to = thresh - time_since_sent;
5614                 } else {
5615                         to = rack->r_ctl.rc_min_to;
5616                         rack_log_alt_to_to_cancel(rack,
5617                                                   thresh,               /* flex1 */
5618                                                   time_since_sent,      /* flex2 */
5619                                                   tstmp_touse,          /* flex3 */
5620                                                   rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
5621                                                   (uint32_t)rsm->r_tim_lastsent[idx],
5622                                                   srtt,
5623                                                   idx, 99);
5624                 }
5625                 if (to < rack_tlp_min) {
5626                         to = rack_tlp_min;
5627                 }
5628                 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
5629                         /*
5630                          * If the TLP time works out to larger than the max
5631                          * RTO, let's not do TLP... just RTO.
5632                          */
5633                         goto activate_rxt;
5634                 }
5635         }
5636         if (is_tlp_timer == 0) {
5637                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
5638         } else {
5639                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
5640         }
5641         if (to == 0)
5642                 to = 1;
5643         return (to);
5644 }
5645
5646 static void
5647 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5648 {
5649         if (rack->rc_in_persist == 0) {
5650                 if (tp->t_flags & TF_GPUTINPROG) {
5651                         /*
5652                          * Stop the goodput now; the calling of the
5653                          * measurement function clears the flag.
5654                          */
5655                         rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
5656                                                     RACK_QUALITY_PERSIST);
5657                 }
5658 #ifdef NETFLIX_SHARED_CWND
5659                 if (rack->r_ctl.rc_scw) {
5660                         tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5661                         rack->rack_scwnd_is_idle = 1;
5662                 }
5663 #endif
5664                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
5665                 if (rack->r_ctl.rc_went_idle_time == 0)
5666                         rack->r_ctl.rc_went_idle_time = 1;
5667                 rack_timer_cancel(tp, rack, cts, __LINE__);
5668                 rack->r_ctl.persist_lost_ends = 0;
5669                 rack->probe_not_answered = 0;
5670                 rack->forced_ack = 0;
5671                 tp->t_rxtshift = 0;
5672                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5673                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5674                 rack->rc_in_persist = 1;
5675         }
5676 }
5677
5678 static void
5679 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5680 {
5681         if (rack->rc_inp->inp_in_hpts) {
5682                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
5683                 rack->r_ctl.rc_hpts_flags = 0;
5684         }
5685 #ifdef NETFLIX_SHARED_CWND
5686         if (rack->r_ctl.rc_scw) {
5687                 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5688                 rack->rack_scwnd_is_idle = 0;
5689         }
5690 #endif
5691         if (rack->rc_gp_dyn_mul &&
5692             (rack->use_fixed_rate == 0) &&
5693             (rack->rc_always_pace)) {
5694                 /*
5695                  * Do we count this as if a probe-rtt just
5696                  * finished?
5697                  */
5698                 uint32_t time_idle, idle_min;
5699
5700                 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
5701                 idle_min = rack_min_probertt_hold;
5702                 if (rack_probertt_gpsrtt_cnt_div) {
5703                         uint64_t extra;
5704                         extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
5705                                 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
5706                         extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
5707                         idle_min += (uint32_t)extra;
5708                 }
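                     /*
                      * Example (illustrative values): with rc_gp_srtt =
                      * 50ms, rack_probertt_gpsrtt_cnt_mul = 4 and
                      * rack_probertt_gpsrtt_cnt_div = 1, idle_min grows by
                      * 200ms, so only an idle period at least that much
                      * beyond rack_min_probertt_hold counts as a probe-rtt.
                      */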
5709                 if (time_idle >= idle_min) {
5710                         /* Yes, we count it as a probe-rtt. */
5711                         uint32_t us_cts;
5712
5713                         us_cts = tcp_get_usecs(NULL);
5714                         if (rack->in_probe_rtt == 0) {
5715                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
5716                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
5717                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
5718                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
5719                         } else {
5720                                 rack_exit_probertt(rack, us_cts);
5721                         }
5722                 }
5723         }
5724         rack->rc_in_persist = 0;
5725         rack->r_ctl.rc_went_idle_time = 0;
5726         tp->t_rxtshift = 0;
5727         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5728            rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5729         rack->r_ctl.rc_agg_delayed = 0;
5730         rack->r_early = 0;
5731         rack->r_late = 0;
5732         rack->r_ctl.rc_agg_early = 0;
5733 }
5734
5735 static void
5736 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
5737                    struct hpts_diag *diag, struct timeval *tv)
5738 {
5739         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5740                 union tcp_log_stackspecific log;
5741
5742                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5743                 log.u_bbr.flex1 = diag->p_nxt_slot;
5744                 log.u_bbr.flex2 = diag->p_cur_slot;
5745                 log.u_bbr.flex3 = diag->slot_req;
5746                 log.u_bbr.flex4 = diag->inp_hptsslot;
5747                 log.u_bbr.flex5 = diag->slot_remaining;
5748                 log.u_bbr.flex6 = diag->need_new_to;
5749                 log.u_bbr.flex7 = diag->p_hpts_active;
5750                 log.u_bbr.flex8 = diag->p_on_min_sleep;
5751                 /* Hijack other fields as needed */
5752                 log.u_bbr.epoch = diag->have_slept;
5753                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
5754                 log.u_bbr.pkts_out = diag->co_ret;
5755                 log.u_bbr.applimited = diag->hpts_sleep_time;
5756                 log.u_bbr.delivered = diag->p_prev_slot;
5757                 log.u_bbr.inflight = diag->p_runningslot;
5758                 log.u_bbr.bw_inuse = diag->wheel_slot;
5759                 log.u_bbr.rttProp = diag->wheel_cts;
5760                 log.u_bbr.timeStamp = cts;
5761                 log.u_bbr.delRate = diag->maxslots;
5762                 log.u_bbr.cur_del_rate = diag->p_curtick;
5763                 log.u_bbr.cur_del_rate <<= 32;
5764                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
5765                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5766                     &rack->rc_inp->inp_socket->so_rcv,
5767                     &rack->rc_inp->inp_socket->so_snd,
5768                     BBR_LOG_HPTSDIAG, 0,
5769                     0, &log, false, tv);
5770         }
5771
5772 }
5773
5774 static void
5775 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
5776 {
5777         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5778                 union tcp_log_stackspecific log;
5779                 struct timeval tv;
5780
5781                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5782                 log.u_bbr.flex1 = sb->sb_flags;
5783                 log.u_bbr.flex2 = len;
5784                 log.u_bbr.flex3 = sb->sb_state;
5785                 log.u_bbr.flex8 = type;
5786                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5787                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5788                     &rack->rc_inp->inp_socket->so_rcv,
5789                     &rack->rc_inp->inp_socket->so_snd,
5790                     TCP_LOG_SB_WAKE, 0,
5791                     len, &log, false, &tv);
5792         }
5793 }
5794
5795 static void
5796 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
5797       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
5798 {
5799         struct hpts_diag diag;
5800         struct inpcb *inp;
5801         struct timeval tv;
5802         uint32_t delayed_ack = 0;
5803         uint32_t hpts_timeout;
5804         uint32_t entry_slot = slot;
5805         uint8_t stopped;
5806         uint32_t left = 0;
5807         uint32_t us_cts;
5808
5809         inp = tp->t_inpcb;
5810         if ((tp->t_state == TCPS_CLOSED) ||
5811             (tp->t_state == TCPS_LISTEN)) {
5812                 return;
5813         }
5814         if (inp->inp_in_hpts) {
5815                 /* Already on the pacer */
5816                 return;
5817         }
5818         stopped = rack->rc_tmr_stopped;
5819         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
5820                 left = rack->r_ctl.rc_timer_exp - cts;
5821         }
5822         rack->r_ctl.rc_timer_exp = 0;
5823         rack->r_ctl.rc_hpts_flags = 0;
5824         us_cts = tcp_get_usecs(&tv);
5825         /* Now early/late accounting */
5826         rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
5827         if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
5828                 /*
5829                  * We have an early carry over set; we can
5830                  * always add more time, so we can always
5831                  * make this compensation.
5832                  *
5833                  * Note that if acks are allowed to wake us, we do not
5834                  * penalize the next timer for being awakened
5835                  * by an ack, aka the rc_agg_early (non-paced mode).
5836                  */
5837                 slot += rack->r_ctl.rc_agg_early;
5838                 rack->r_early = 0;
5839                 rack->r_ctl.rc_agg_early = 0;
5840         }
5841         if (rack->r_late) {
5842                 /*
5843                  * This is harder, we can
5844                  * compensate some but it
5845                  * really depends on what
5846                  * the current pacing time is.
5847                  */
5848                 if (rack->r_ctl.rc_agg_delayed >= slot) {
5849                         /*
5850                          * We can't compensate for it all.
5851                          * And we have to have some time
5852                          * on the clock. We always have a minimum
5853                          * of 10 slots (10 x 10, i.e. 100 usecs).
5854                          */
5855                         if (slot <= HPTS_TICKS_PER_SLOT) {
5856                                 /* We gain delay */
5857                                 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
5858                                 slot = HPTS_TICKS_PER_SLOT;
5859                         } else {
5860                                 /* We take off some */
5861                                 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
5862                                 slot = HPTS_TICKS_PER_SLOT;
5863                         }
5864                 } else {
5865                         slot -= rack->r_ctl.rc_agg_delayed;
5866                         rack->r_ctl.rc_agg_delayed = 0;
5867                         /* Make sure we have 100 useconds at minimum */
5868                         if (slot < HPTS_TICKS_PER_SLOT) {
5869                                 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
5870                                 slot = HPTS_TICKS_PER_SLOT;
5871                         }
5872                         if (rack->r_ctl.rc_agg_delayed == 0)
5873                                 rack->r_late = 0;
5874                 }
5875         }
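        /*
         * Editorial example of the compensation above (hypothetical
         * numbers): if the pacer ran 300 usecs late (rc_agg_delayed ==
         * 300) and the next slot is 1000 usecs, we pace 1000 - 300 =
         * 700 usecs from now and the debt is cleared. If the debt
         * meets or exceeds the slot, the slot is pinned at
         * HPTS_TICKS_PER_SLOT and the remaining debt stays in
         * rc_agg_delayed for the next pass.
         */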
5876         if (slot) {
5877                 /* We are pacing too */
5878                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
5879         }
5880         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
5881 #ifdef NETFLIX_EXP_DETECTION
5882         if (rack->sack_attack_disable &&
5883             (slot < tcp_sad_pacing_interval)) {
5884                 /*
5885                  * We have a potential attacker on
5886                  * the line. We have possibly some
5887                  * (or no) pacing time set. We want to
5888                  * slow down the processing of sacks by some
5889                  * amount (if it is an attacker). Set the default
5890                  * slot for attackers in place (unless the original
5891                  * interval is longer). It's stored in
5892                  * microseconds.
5893                  */
5894                 slot = tcp_sad_pacing_interval;
5895         }
5896 #endif
5897         if (tp->t_flags & TF_DELACK) {
5898                 delayed_ack = TICKS_2_USEC(tcp_delacktime);
5899                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
5900         }
5901         if (delayed_ack && ((hpts_timeout == 0) ||
5902                             (delayed_ack < hpts_timeout)))
5903                 hpts_timeout = delayed_ack;
5904         else
5905                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
5906         /*
5907          * If no timers are going to run and we will fall off the hptsi
5908          * wheel, we resort to a keep-alive timer if it's configured.
5909          */
5910         if ((hpts_timeout == 0) &&
5911             (slot == 0)) {
5912                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
5913                     (tp->t_state <= TCPS_CLOSING)) {
5914                         /*
5915                          * Ok, we have no timer (persists, rack, tlp, rxt or
5916                          * del-ack) and we don't have segments being paced. So
5917                          * all that is left is the keepalive timer.
5918                          */
5919                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
5920                                 /* Get the established keep-alive time */
5921                                 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
5922                         } else {
5923                                 /*
5924                                  * Get the initial setup keep-alive time,
5925                                  * note that this is probably not going to
5926                                  * happen, since rack will be running a rxt timer
5927                                  * if a SYN of some sort is outstanding. It is
5928                                  * actually handled in rack_timeout_rxt().
5929                                  */
5930                                 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
5931                         }
5932                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
5933                         if (rack->in_probe_rtt) {
5934                                 /*
5935                                  * We want to instead not wake up a long time from
5936                                  * now but to wake up about the time we would
5937                                  * exit probe-rtt and initiate a keep-alive ack.
5938                                  * This will get us out of probe-rtt and update
5939                                  * our min-rtt.
5940                                  */
5941                                 hpts_timeout = rack_min_probertt_hold;
5942                         }
5943                 }
5944         }
5945         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
5946             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
5947                 /*
5948                  * RACK, TLP, persists and RXT timers all are restartable
5949                  * based on actions input, i.e. we received a packet (ack
5950                  * or sack) and that changes things (rwnd, snd_una, etc.).
5951                  * Thus we can restart them with a new value. For
5952                  * keep-alive and delayed_ack we keep track of what was left
5953                  * and restart the timer with a smaller value.
5954                  */
5955                 if (left < hpts_timeout)
5956                         hpts_timeout = left;
5957         }
5958         if (hpts_timeout) {
5959                 /*
5960                  * Hack alert: for now we can't time-out over 2,147,483
5961                  * seconds (a bit more than 596 hours), which is probably ok
5962                  * :).
5963                  */
5964                 if (hpts_timeout > 0x7ffffffe)
5965                         hpts_timeout = 0x7ffffffe;
5966                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
5967         }
5968         rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
5969         if ((rack->gp_ready == 0) &&
5970             (rack->use_fixed_rate == 0) &&
5971             (hpts_timeout < slot) &&
5972             (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
5973                 /*
5974                  * We have no good estimate yet for the
5975                  * old clunky burst mitigation or the
5976                  * real pacing. And the tlp or rxt is smaller
5977                  * than the pacing calculation. Lets not
5978                  * pace that long since we know the calculation
5979                  * so far is not accurate.
5980                  */
5981                 slot = hpts_timeout;
5982         }
5983         rack->r_ctl.last_pacing_time = slot;
5984         /**
5985          * Turn off all the flags for queuing by default. The
5986          * flags have important meanings to what happens when
5987          * LRO interacts with the transport. Most likely (by default now)
5988          * mbuf_queueing and ack compression are on. So the transport
5989          * has a couple of flags that control what happens (if those
5990          * are not on then these flags won't have any effect since it
5991          * won't go through the queuing LRO path).
5992          *
5993          * INP_MBUF_QUEUE_READY - This flag says that I am busy
5994          *                        pacing output, so don't disturb. But
5995          *                        it also means LRO can wake me if there
5996          *                        is a SACK arrival.
5997          *
5998          * INP_DONT_SACK_QUEUE - This flag is used in conjunction
5999          *                       with the above flag (QUEUE_READY) and
6000          *                       when present it says don't even wake me
6001          *                       if a SACK arrives.
6002          *
6003          * The idea behind these flags is that if we are pacing we
6004          * set the MBUF_QUEUE_READY and only get woken up if
6005          * a SACK arrives (which could change things) or if
6006          * our pacing timer expires. If, however, we have a rack
6007          * timer running, then we don't even want a sack to wake
6008          * us since the rack timer has to expire before we can send.
6009          *
6010          * Other cases should usually have none of the flags set
6011          * so LRO can call into us.
6012          */
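        /*
         * Editorial summary of the cases handled below:
         *   pacing timer only        -> MBUF_QUEUE_READY (a SACK may wake us)
         *   pacing + rack timer      -> MBUF_QUEUE_READY | DONT_SACK_QUEUE
         *                               (unless r_rr_config == 3)
         *   rc_ack_can_sendout_data  -> both flags cleared again
         *   no pacing timer          -> neither flag, LRO may call in
         */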
6013         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
6014         if (slot) {
6015                 rack->r_ctl.rc_last_output_to = us_cts + slot;
6016                 /*
6017                  * A pacing timer (slot) is being set, in
6018                  * such a case we cannot send (we are blocked by
6019                  * the timer). So let's tell LRO that it should not
6020                  * wake us unless there is a SACK. Note this only
6021                  * will be effective if mbuf queueing is on or
6022                  * compressed acks are being processed.
6023                  */
6024                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
6025                 /*
6026                  * But wait: if we have a Rack timer running,
6027                  * even a SACK should not disturb us (with
6028                  * the exception of r_rr_config 3).
6029                  */
6030                 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
6031                     (rack->r_rr_config != 3))
6032                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
6033                 if (rack->rc_ack_can_sendout_data) {
6034                         /*
6035                          * Ahh but wait, this is that special case
6036                          * where the pacing timer can be disturbed;
6037                          * back out the changes (used for non-paced
6038                          * burst limiting).
6039                          */
6040                         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
6041                 }
6042                 if ((rack->use_rack_rr) &&
6043                     (rack->r_rr_config < 2) &&
6044                     ((hpts_timeout) && (hpts_timeout < slot))) {
6045                         /*
6046                          * Arrange for the hpts to kick back in after the
6047                          * t-o if the t-o does not cause a send.
6048                          */
6049                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
6050                                                    __LINE__, &diag);
6051                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6052                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6053                 } else {
6054                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
6055                                                    __LINE__, &diag);
6056                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6057                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
6058                 }
6059         } else if (hpts_timeout) {
6060                 /*
6061                  * With respect to inp_flags2 here, let's let any new acks wake
6062                  * us up. Since we are not pacing (no pacing timer), output
6063                  * can happen so we should let it. If it's a Rack timer, then any inbound
6064                  * packet probably won't change the sending (we will be blocked)
6065                  * but it may change the prr stats, so letting it in (the set defaults
6066                  * at the start of this block) is good enough.
6067                  */
6068                 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
6069                                            __LINE__, &diag);
6070                 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6071                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6072         } else {
6073                 /* No timer starting */
6074 #ifdef INVARIANTS
6075                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
6076                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
6077                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
6078                 }
6079 #endif
6080         }
6081         rack->rc_tmr_stopped = 0;
6082         if (slot)
6083                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
6084 }
6085
6086 /*
6087  * RACK Timer, here we simply do logging and housekeeping.
6088  * The normal rack_output() function will call the
6089  * appropriate thing to check if we need to do a RACK retransmit.
6090  * We return 1, saying don't proceed with rack_output only
6091  * when all timers have been stopped (destroyed PCB?).
6092  */
6093 static int
6094 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6095 {
6096         /*
6097          * This timer simply provides an internal trigger to send out data.
6098          * The check_recovery_mode call will see if there are needed
6099          * retransmissions, if so we will enter fast-recovery. The output
6100          * call may or may not do the same thing depending on sysctl
6101          * settings.
6102          */
6103         struct rack_sendmap *rsm;
6104
6105         if (tp->t_timers->tt_flags & TT_STOPPED) {
6106                 return (1);
6107         }
6108         counter_u64_add(rack_to_tot, 1);
6109         if (rack->r_state && (rack->r_state != tp->t_state))
6110                 rack_set_state(tp, rack);
6111         rack->rc_on_min_to = 0;
6112         rsm = rack_check_recovery_mode(tp, cts);
6113         rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
6114         if (rsm) {
6115                 rack->r_ctl.rc_resend = rsm;
6116                 rack->r_timer_override = 1;
6117                 if (rack->use_rack_rr) {
6118                         /*
6119                          * Don't accumulate extra pacing delay;
6120                          * we are allowing the rack timer to
6121                          * override pacing, i.e. rrr takes precedence
6122                          * if the pacing interval is longer than the rrr
6123                          * time (in other words we get the min pacing
6124                          * time versus rrr pacing time).
6125                          */
6126                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6127                 }
6128         }
6129         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
6130         if (rsm == NULL) {
6131                 /* restart a timer and return 1 */
6132                 rack_start_hpts_timer(rack, tp, cts,
6133                                       0, 0, 0);
6134                 return (1);
6135         }
6136         return (0);
6137 }
6138
6139 static void
6140 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
6141 {
6142         if (rsm->m->m_len > rsm->orig_m_len) {
6143                 /*
6144                  * Mbuf grew, caused by sbcompress, our offset does
6145                  * not change.
6146                  */
6147                 rsm->orig_m_len = rsm->m->m_len;
6148         } else if (rsm->m->m_len < rsm->orig_m_len) {
6149                 /*
6150                  * Mbuf shrank, trimmed off the top by an ack, our
6151                  * offset changes.
6152                  */
6153                 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
6154                 rsm->orig_m_len = rsm->m->m_len;
6155         }
6156 }
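
/*
 * Editorial example for rack_adjust_orig_mlen() above (hypothetical
 * numbers): an rsm recorded orig_m_len = 1448 with soff = 600. If an
 * ack trims the mbuf from the front down to m_len = 1048, those 400
 * bytes shift the rsm's data to soff = 600 - 400 = 200. If instead
 * sbcompress() grew the mbuf, the offset from the front is unchanged
 * and only orig_m_len needs refreshing.
 */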
6157
6158 static void
6159 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
6160 {
6161         struct mbuf *m;
6162         uint32_t soff;
6163
6164         if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) {
6165                 /* Fix up the orig_m_len and possibly the mbuf offset */
6166                 rack_adjust_orig_mlen(src_rsm);
6167         }
6168         m = src_rsm->m;
6169         soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
6170         while (soff >= m->m_len) {
6171                 /* Move out past this mbuf */
6172                 soff -= m->m_len;
6173                 m = m->m_next;
6174                 KASSERT((m != NULL),
6175                         ("rsm:%p nrsm:%p hit at soff:%u null m",
6176                          src_rsm, rsm, soff));
6177         }
6178         rsm->m = m;
6179         rsm->soff = soff;
6180         rsm->orig_m_len = m->m_len;
6181 }
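
/*
 * Editorial sketch of the walk in rack_setup_offset_for_rsm() above
 * (hypothetical numbers): with src_rsm->soff = 100 and a 1000-byte
 * src_rsm, the new piece starts at chain offset 1100. If each mbuf
 * holds m_len = 512 we step out twice (1100 - 512 = 588, then
 * 588 - 512 = 76), landing two mbufs later with soff = 76.
 */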
6182
6183 static __inline void
6184 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
6185                struct rack_sendmap *rsm, uint32_t start)
6186 {
6187         int idx;
6188
6189         nrsm->r_start = start;
6190         nrsm->r_end = rsm->r_end;
6191         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
6192         nrsm->r_flags = rsm->r_flags;
6193         nrsm->r_dupack = rsm->r_dupack;
6194         nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
6195         nrsm->r_rtr_bytes = 0;
6196         nrsm->r_fas = rsm->r_fas;
6197         rsm->r_end = nrsm->r_start;
6198         nrsm->r_just_ret = rsm->r_just_ret;
6199         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
6200                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
6201         }
6202         /* Now if we have SYN flag we keep it on the left edge */
6203         if (nrsm->r_flags & RACK_HAS_SYN)
6204                 nrsm->r_flags &= ~RACK_HAS_SYN;
6205         /* Now if we have a FIN flag we keep it on the right edge */
6206         if (rsm->r_flags & RACK_HAS_FIN)
6207                 rsm->r_flags &= ~RACK_HAS_FIN;
6208         /* Push bit must go to the right edge as well */
6209         if (rsm->r_flags & RACK_HAD_PUSH)
6210                 rsm->r_flags &= ~RACK_HAD_PUSH;
6211         /* Clone over the state of the hw_tls flag */
6212         nrsm->r_hw_tls = rsm->r_hw_tls;
6213         /*
6214          * Now we need to find nrsm's new location in the mbuf chain.
6215          * We basically calculate a new offset, which is soff +
6216          * how much is left in the original rsm. Then we walk out the mbuf
6217          * chain to find the right position; it may be the same mbuf
6218          * or maybe not.
6219          */
6220         KASSERT(((rsm->m != NULL) ||
6221                  (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
6222                 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
6223         if (rsm->m)
6224                 rack_setup_offset_for_rsm(rsm, nrsm);
6225 }
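
/*
 * Editorial note on the split semantics of rack_clone_rsm() above
 * (illustrative sequence space): cloning rsm [1000, 3000) at start =
 * 2000 leaves rsm covering [1000, 2000) and nrsm covering [2000,
 * 3000); a SYN stays with the left piece while a FIN or PUSH stays
 * with the right piece, matching the flag fixups above.
 */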
6226
6227 static struct rack_sendmap *
6228 rack_merge_rsm(struct tcp_rack *rack,
6229                struct rack_sendmap *l_rsm,
6230                struct rack_sendmap *r_rsm)
6231 {
6232         /*
6233          * We are merging two ack'd RSM's,
6234          * the l_rsm is on the left (lower seq
6235          * values) and the r_rsm is on the right
6236          * (higher seq value). The simplest way
6237          * to merge these is to move the right
6238          * one into the left. I don't think there
6239          * is any reason we need to try to find
6240          * the oldest (or last oldest retransmitted).
6241          */
6242         struct rack_sendmap *rm;
6243
6244         rack_log_map_chg(rack->rc_tp, rack, NULL,
6245                          l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
6246         l_rsm->r_end = r_rsm->r_end;
6247         if (l_rsm->r_dupack < r_rsm->r_dupack)
6248                 l_rsm->r_dupack = r_rsm->r_dupack;
6249         if (r_rsm->r_rtr_bytes)
6250                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
6251         if (r_rsm->r_in_tmap) {
6252                 /* This really should not happen */
6253                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
6254                 r_rsm->r_in_tmap = 0;
6255         }
6256
6257         /* Now the flags */
6258         if (r_rsm->r_flags & RACK_HAS_FIN)
6259                 l_rsm->r_flags |= RACK_HAS_FIN;
6260         if (r_rsm->r_flags & RACK_TLP)
6261                 l_rsm->r_flags |= RACK_TLP;
6262         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
6263                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
6264         if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
6265             ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
6266                 /*
6267                  * If both are app-limited then let the
6268                  * free of r_rsm lower the count. If right is app
6269                  * limited and left is not, transfer.
6270                  */
6271                 l_rsm->r_flags |= RACK_APP_LIMITED;
6272                 r_rsm->r_flags &= ~RACK_APP_LIMITED;
6273                 if (r_rsm == rack->r_ctl.rc_first_appl)
6274                         rack->r_ctl.rc_first_appl = l_rsm;
6275         }
6276         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
6277 #ifdef INVARIANTS
6278         if (rm != r_rsm) {
6279                 panic("removing head in rack:%p rsm:%p rm:%p",
6280                       rack, r_rsm, rm);
6281         }
6282 #endif
6283         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
6284                 /* Transfer the split limit to the map we free */
6285                 r_rsm->r_limit_type = l_rsm->r_limit_type;
6286                 l_rsm->r_limit_type = 0;
6287         }
6288         rack_free(rack, r_rsm);
6289         return (l_rsm);
6290 }
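
/*
 * Editorial note on rack_merge_rsm() above (illustrative sequence
 * space): merging l_rsm [1000, 2000) with r_rsm [2000, 3000) extends
 * l_rsm to cover [1000, 3000); FIN, TLP and collapsed-window markings
 * carry over, and r_rsm is removed from the tree and freed.
 */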
6291
6292 /*
6293  * TLP Timer, here we simply set up what segment we want to
6294  * have the TLP expire on, the normal rack_output() will then
6295  * send it out.
6296  *
6297  * We return 1, saying don't proceed with rack_output only
6298  * when all timers have been stopped (destroyed PCB?).
6299  */
6300 static int
6301 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
6302 {
6303         /*
6304          * Tail Loss Probe.
6305          */
6306         struct rack_sendmap *rsm = NULL;
6307         struct rack_sendmap *insret;
6308         struct socket *so;
6309         uint32_t amm;
6310         uint32_t out, avail;
6311         int collapsed_win = 0;
6312
6313         if (tp->t_timers->tt_flags & TT_STOPPED) {
6314                 return (1);
6315         }
6316         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
6317                 /* It's not time yet */
6318                 return (0);
6319         }
6320         if (ctf_progress_timeout_check(tp, true)) {
6321                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6322                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6323                 return (1);
6324         }
6325         /*
6326          * A TLP timer has expired. We have been idle for 2 rtts. So we now
6327          * need to figure out how to force a full MSS segment out.
6328          */
6329         rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
6330         rack->r_ctl.retran_during_recovery = 0;
6331         rack->r_ctl.dsack_byte_cnt = 0;
6332         counter_u64_add(rack_tlp_tot, 1);
6333         if (rack->r_state && (rack->r_state != tp->t_state))
6334                 rack_set_state(tp, rack);
6335         so = tp->t_inpcb->inp_socket;
6336         avail = sbavail(&so->so_snd);
6337         out = tp->snd_max - tp->snd_una;
6338         if (out > tp->snd_wnd) {
6339                 /* special case, we need a retransmission */
6340                 collapsed_win = 1;
6341                 goto need_retran;
6342         }
6343         if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
6344                 rack->r_ctl.dsack_persist--;
6345                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6346                         rack->r_ctl.num_dsack = 0;
6347                 }
6348                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6349         }
6350         if ((tp->t_flags & TF_GPUTINPROG) &&
6351             (rack->r_ctl.rc_tlp_cnt_out == 1)) {
6352                 /*
6353                  * If this is the second-in-a-row
6354                  * TLP and we are doing a measurement,
6355                  * it's time to abandon the measurement.
6356                  * Something is likely broken on
6357                  * the client's network, and measuring a
6358                  * broken network does us no good.
6359                  */
6360                 tp->t_flags &= ~TF_GPUTINPROG;
6361                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6362                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6363                                            tp->gput_seq,
6364                                            0, 0, 18, __LINE__, NULL, 0);
6365         }
6366         /*
6367          * Check our send-oldest-always setting, and if
6368          * there is an oldest to send, jump to need_retran.
6369          */
6370         if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
6371                 goto need_retran;
6372
6373         if (avail > out) {
6374                 /* New data is available */
6375                 amm = avail - out;
6376                 if (amm > ctf_fixed_maxseg(tp)) {
6377                         amm = ctf_fixed_maxseg(tp);
6378                         if ((amm + out) > tp->snd_wnd) {
6379                                 /* We are rwnd limited */
6380                                 goto need_retran;
6381                         }
6382                 } else if (amm < ctf_fixed_maxseg(tp)) {
6383                         /* not enough to fill an MTU */
6384                         goto need_retran;
6385                 }
6386                 if (IN_FASTRECOVERY(tp->t_flags)) {
6387                         /* Unlikely */
6388                         if (rack->rack_no_prr == 0) {
6389                                 if (out + amm <= tp->snd_wnd) {
6390                                         rack->r_ctl.rc_prr_sndcnt = amm;
6391                                         rack->r_ctl.rc_tlp_new_data = amm;
6392                                         rack_log_to_prr(rack, 4, 0);
6393                                 }
6394                         } else
6395                                 goto need_retran;
6396                 } else {
6397                         /* Set the send-new override */
6398                         if (out + amm <= tp->snd_wnd)
6399                                 rack->r_ctl.rc_tlp_new_data = amm;
6400                         else
6401                                 goto need_retran;
6402                 }
6403                 rack->r_ctl.rc_tlpsend = NULL;
6404                 counter_u64_add(rack_tlp_newdata, 1);
6405                 goto send;
6406         }
6407 need_retran:
6408         /*
6409          * Ok we need to arrange the last un-acked segment to be re-sent, or
6410          * optionally the first un-acked segment.
6411          */
6412         if (collapsed_win == 0) {
6413                 if (rack_always_send_oldest)
6414                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6415                 else {
6416                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6417                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
6418                                 rsm = rack_find_high_nonack(rack, rsm);
6419                         }
6420                 }
6421                 if (rsm == NULL) {
6422                         counter_u64_add(rack_tlp_does_nada, 1);
6423 #ifdef TCP_BLACKBOX
6424                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6425 #endif
6426                         goto out;
6427                 }
6428         } else {
6429                 /*
6430                  * We must find the last segment
6431                  * that was acceptable to the client.
6432                  */
6433                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6434                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
6435                                 /* Found one */
6436                                 break;
6437                         }
6438                 }
6439                 if (rsm == NULL) {
6440                         /* None? If so, send the first. */
6441                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6442                         if (rsm == NULL) {
6443                                 counter_u64_add(rack_tlp_does_nada, 1);
6444 #ifdef TCP_BLACKBOX
6445                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6446 #endif
6447                                 goto out;
6448                         }
6449                 }
6450         }
6451         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
6452                 /*
6453                  * We need to split this, the last segment, in two.
6454                  */
6455                 struct rack_sendmap *nrsm;
6456
6457                 nrsm = rack_alloc_full_limit(rack);
6458                 if (nrsm == NULL) {
6459                         /*
6460                          * No memory to split, we will just exit and punt
6461                          * off to the RXT timer.
6462                          */
6463                         counter_u64_add(rack_tlp_does_nada, 1);
6464                         goto out;
6465                 }
6466                 rack_clone_rsm(rack, nrsm, rsm,
6467                                (rsm->r_end - ctf_fixed_maxseg(tp)));
6468                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
6469                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6470 #ifdef INVARIANTS
6471                 if (insret != NULL) {
6472                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6473                               nrsm, insret, rack, rsm);
6474                 }
6475 #endif
6476                 if (rsm->r_in_tmap) {
6477                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6478                         nrsm->r_in_tmap = 1;
6479                 }
6480                 rsm = nrsm;
6481         }
6482         rack->r_ctl.rc_tlpsend = rsm;
6483 send:
6484         /* Make sure output path knows we are doing a TLP */
6485         *doing_tlp = 1;
6486         rack->r_timer_override = 1;
6487         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6488         return (0);
6489 out:
6490         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6491         return (0);
6492 }
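
/*
 * Editorial recap of the TLP selection above: with new data available
 * and room in the receive window, the probe sends one new MSS;
 * otherwise the highest outstanding segment is chosen for
 * retransmission, splitting off the trailing MSS of an oversized rsm
 * so the probe stays a single segment.
 */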
6493
6494 /*
6495  * Delayed ack Timer, here we simply need to set the
6496  * ACK_NOW flag and remove the DELACK flag. From there
6497  * the output routine will send the ack out.
6498  *
6499  * We only return 1, saying don't proceed, if all timers
6500  * are stopped (destroyed PCB?).
6501  */
6502 static int
6503 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6504 {
6505         if (tp->t_timers->tt_flags & TT_STOPPED) {
6506                 return (1);
6507         }
6508         rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
6509         tp->t_flags &= ~TF_DELACK;
6510         tp->t_flags |= TF_ACKNOW;
6511         KMOD_TCPSTAT_INC(tcps_delack);
6512         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
6513         return (0);
6514 }
6515
6516 /*
6517  * Persists timer, here we simply send the
6518  * same thing as a keepalive would:
6519  * the one-byte send.
6520  *
6521  * We only return 1, saying don't proceed, if all timers
6522  * are stopped (destroyed PCB?).
6523  */
6524 static int
6525 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6526 {
6527         struct tcptemp *t_template;
6528         struct inpcb *inp;
6529         int32_t retval = 1;
6530
6531         inp = tp->t_inpcb;
6532
6533         if (tp->t_timers->tt_flags & TT_STOPPED) {
6534                 return (1);
6535         }
6536         if (rack->rc_in_persist == 0)
6537                 return (0);
6538         if (ctf_progress_timeout_check(tp, false)) {
6539                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6540                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6541                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
6542                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6543                 return (1);
6544         }
6545         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
6546         /*
6547          * Persistence timer into zero window. Force a byte to be output, if
6548          * possible.
6549          */
6550         KMOD_TCPSTAT_INC(tcps_persisttimeo);
6551         /*
6552          * Hack: if the peer is dead/unreachable, we do not time out if the
6553          * window is closed.  After a full backoff, drop the connection if
6554          * the idle time (no responses to probes) reaches the maximum
6555          * backoff that we would use if retransmitting.
6556          */
6557         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
6558             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
6559              TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
6560                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6561                 retval = 1;
6562                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6563                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6564                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6565                 goto out;
6566         }
6567         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
6568             tp->snd_una == tp->snd_max)
6569                 rack_exit_persist(tp, rack, cts);
6570         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
6571         /*
6572          * If the user has closed the socket then drop a persisting
6573          * connection after a much reduced timeout.
6574          */
6575         if (tp->t_state > TCPS_CLOSE_WAIT &&
6576             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
6577                 retval = 1;
6578                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6579                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6580                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6581                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6582                 goto out;
6583         }
6584         t_template = tcpip_maketemplate(rack->rc_inp);
6585         if (t_template) {
6586                 /* Only set the forced-ack timestamp if the last probe was answered. */
6587                 if (rack->forced_ack == 0) {
6588                         rack->forced_ack = 1;
6589                         rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6590                 } else {
6591                         rack->probe_not_answered = 1;
6592                         counter_u64_add(rack_persists_loss, 1);
6593                         rack->r_ctl.persist_lost_ends++;
6594                 }
6595                 counter_u64_add(rack_persists_sends, 1);
6596                 tcp_respond(tp, t_template->tt_ipgen,
6597                             &t_template->tt_t, (struct mbuf *)NULL,
6598                             tp->rcv_nxt, tp->snd_una - 1, 0);
6599                 /* This sends an ack */
6600                 if (tp->t_flags & TF_DELACK)
6601                         tp->t_flags &= ~TF_DELACK;
6602                 free(t_template, M_TEMP);
6603         }
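        /*
         * Editorial note: the zero-length probe above is sent with
         * sequence number snd_una - 1 (outside the receive window),
         * which obliges the peer to ack and re-advertise its current
         * window; the keepalive path below documents the same trick.
         */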
6604         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
6605                 tp->t_rxtshift++;
6606 out:
6607         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
6608         rack_start_hpts_timer(rack, tp, cts,
6609                               0, 0, 0);
6610         return (retval);
6611 }
6612
6613 /*
6614  * If a keepalive goes off, we had no other timers
6615  * happening. We always return 1 here since this
6616  * routine either drops the connection or sends
6617  * out a segment via tcp_respond().
6618  */
6619 static int
6620 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6621 {
6622         struct tcptemp *t_template;
6623         struct inpcb *inp;
6624
6625         if (tp->t_timers->tt_flags & TT_STOPPED) {
6626                 return (1);
6627         }
6628         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
6629         inp = tp->t_inpcb;
6630         rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
6631         /*
6632          * Keep-alive timer went off; send something or drop connection if
6633          * idle for too long.
6634          */
6635         KMOD_TCPSTAT_INC(tcps_keeptimeo);
6636         if (tp->t_state < TCPS_ESTABLISHED)
6637                 goto dropit;
6638         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
6639             tp->t_state <= TCPS_CLOSING) {
6640                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
6641                         goto dropit;
6642                 /*
6643                  * Send a packet designed to force a response if the peer is
6644                  * up and reachable: either an ACK if the connection is
6645                  * still alive, or an RST if the peer has closed the
6646                  * connection due to timeout or reboot. Using sequence
6647                  * number tp->snd_una-1 causes the transmitted zero-length
6648                  * segment to lie outside the receive window; by the
6649                  * protocol spec, this requires the correspondent TCP to
6650                  * respond.
6651                  */
6652                 KMOD_TCPSTAT_INC(tcps_keepprobe);
6653                 t_template = tcpip_maketemplate(inp);
6654                 if (t_template) {
6655                         if (rack->forced_ack == 0) {
6656                                 rack->forced_ack = 1;
6657                                 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6658                         } else {
6659                                 rack->probe_not_answered = 1;
6660                         }
6661                         tcp_respond(tp, t_template->tt_ipgen,
6662                             &t_template->tt_t, (struct mbuf *)NULL,
6663                             tp->rcv_nxt, tp->snd_una - 1, 0);
6664                         free(t_template, M_TEMP);
6665                 }
6666         }
6667         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
6668         return (1);
6669 dropit:
6670         KMOD_TCPSTAT_INC(tcps_keepdrops);
6671         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6672         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6673         return (1);
6674 }
6675
6676 /*
6677  * Retransmit helper function: clear up all the ack
6678  * flags and take care of important bookkeeping.
6679  */
6680 static void
6681 rack_remxt_tmr(struct tcpcb *tp)
6682 {
6683         /*
6684          * The retransmit timer went off; all sack'd blocks must be
6685          * un-acked.
6686          */
6687         struct rack_sendmap *rsm, *trsm = NULL;
6688         struct tcp_rack *rack;
6689
6690         rack = (struct tcp_rack *)tp->t_fb_ptr;
6691         rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
6692         rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
6693         if (rack->r_state && (rack->r_state != tp->t_state))
6694                 rack_set_state(tp, rack);
6695         /*
6696          * Ideally we would like to be able to
6697          * mark SACK-PASS on anything not acked here.
6698          *
6699          * However, if we do that we would burst out
6700          * all that data 1ms apart. This would be unwise,
6701          * so for now we will just let the normal rxt timer
6702          * and tlp timer take care of it.
6703          *
6704          * Also we really need to stick them back in sequence
6705          * order. This way we send in the proper order and any
6706          * sacks that come floating in will "re-ack" the data.
6707          * To do this we zap the tmap with an INIT and then
6708          * walk through and place every rsm in the RB tree
6709          * back in its seq ordered place.
6710          */
6711         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6712         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6713                 rsm->r_dupack = 0;
6714                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
6715                 /* We must re-add it back to the tlist */
6716                 if (trsm == NULL) {
6717                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6718                 } else {
6719                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
6720                 }
6721                 rsm->r_in_tmap = 1;
6722                 trsm = rsm;
6723                 if (rsm->r_flags & RACK_ACKED)
6724                         rsm->r_flags |= RACK_WAS_ACKED;
6725                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
6726                 rsm->r_flags |= RACK_MUST_RXT;
6727         }
6728         /* Clear the count (we just un-acked them) */
6729         rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
6730         rack->r_ctl.rc_sacked = 0;
6731         rack->r_ctl.rc_sacklast = NULL;
6732         rack->r_ctl.rc_agg_delayed = 0;
6733         rack->r_early = 0;
6734         rack->r_ctl.rc_agg_early = 0;
6735         rack->r_late = 0;
6736         /* Clear the tlp rtx mark */
6737         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6738         if (rack->r_ctl.rc_resend != NULL)
6739                 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
6740         rack->r_ctl.rc_prr_sndcnt = 0;
6741         rack_log_to_prr(rack, 6, 0);
6742         rack->r_timer_override = 1;
6743         if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
6744 #ifdef NETFLIX_EXP_DETECTION
6745             || (rack->sack_attack_disable != 0)
6746 #endif
6747                     ) && ((tp->t_flags & TF_SENTFIN) == 0)) {
6748                 /*
6749                  * For non-sack customers new data
6750                  * needs to go out as retransmits until
6751                  * we retransmit up to snd_max.
6752                  */
6753                 rack->r_must_retran = 1;
6754                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
6755                                                 rack->r_ctl.rc_sacked);
6756         }
6757         rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
6758 }
6759
6760 static void
6761 rack_convert_rtts(struct tcpcb *tp)
6762 {
6763         if (tp->t_srtt > 1) {
6764                 uint32_t val, frac;
6765
6766                 val = tp->t_srtt >> TCP_RTT_SHIFT;
6767                 frac = tp->t_srtt & 0x1f;
6768                 tp->t_srtt = TICKS_2_USEC(val);
6769                 /*
6770                  * frac is the fractional part of the srtt (if any),
6771                  * but it's in ticks and every bit represents
6772                  * 1/32nd of a tick.
6773                  */
6774                 if (frac) {
6775                         if (hz == 1000) {
6776                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
6777                         } else {
6778                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
6779                         }
6780                         tp->t_srtt += frac;
6781                 }
6782         }
6783         if (tp->t_rttvar) {
6784                 uint32_t val, frac;
6785
6786                 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
6787                 frac = tp->t_rttvar & 0x1f;
6788                 tp->t_rttvar = TICKS_2_USEC(val);
6789                 /*
6790                  * frac is the fractional part of the rttvar (if any),
6791                  * but it's in ticks and every bit represents
6792                  * 1/32nd of a tick.
6793                  */
6794                 if (frac) {
6795                         if (hz == 1000) {
6796                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
6797                         } else {
6798                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
6799                         }
6800                         tp->t_rttvar += frac;
6801                 }
6802         }
6803         tp->t_rxtcur = RACK_REXMTVAL(tp);
6804         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
6805                 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
6806         }
6807         if (tp->t_rxtcur > rack_rto_max) {
6808                 tp->t_rxtcur = rack_rto_max;
6809         }
6810 }
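
/*
 * Editorial worked conversion for rack_convert_rtts() above
 * (hypothetical, hz = 1000 so one tick is 1 msec): a t_srtt of 0x145
 * (ticks << TCP_RTT_SHIFT) is 10 ticks plus 5/32nds of a tick; the
 * whole part becomes 10000 usecs and the fraction (5 * 1000) / 32 =
 * 156 usecs, giving t_srtt = 10156 usecs.
 */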
6811
6812 static void
6813 rack_cc_conn_init(struct tcpcb *tp)
6814 {
6815         struct tcp_rack *rack;
6816         uint32_t srtt;
6817
6818         rack = (struct tcp_rack *)tp->t_fb_ptr;
6819         srtt = tp->t_srtt;
6820         cc_conn_init(tp);
6821         /*
6822          * Now convert to rack's internal format,
6823          * if required.
6824          */
6825         if ((srtt == 0) && (tp->t_srtt != 0))
6826                 rack_convert_rtts(tp);
6827         /*
6828          * We want a chance to stay in slowstart as
6829          * we create a connection. TCP spec says that
6830          * initially ssthresh is infinite. For our
6831          * purposes that is the snd_wnd.
6832          */
6833         if (tp->snd_ssthresh < tp->snd_wnd) {
6834                 tp->snd_ssthresh = tp->snd_wnd;
6835         }
6836         /*
6837          * We also want to assure an IW worth of
6838          * data can get in flight.
6839          */
6840         if (rc_init_window(rack) < tp->snd_cwnd)
6841                 tp->snd_cwnd = rc_init_window(rack);
6842 }
6843
6844 /*
6845  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
6846  * we will set up to retransmit the lowest seq number outstanding.
6847  */
6848 static int
6849 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6850 {
6851         int32_t rexmt;
6852         struct inpcb *inp;
6853         int32_t retval = 0;
6854         bool isipv6;
6855
6856         inp = tp->t_inpcb;
6857         if (tp->t_timers->tt_flags & TT_STOPPED) {
6858                 return (1);
6859         }
6860         if ((tp->t_flags & TF_GPUTINPROG) &&
6861             (tp->t_rxtshift)) {
6862                 /*
6863                  * We have had a second timeout;
6864                  * measurements on successive rxt's are not profitable.
6865                  * It is unlikely to be of any use (the network is
6866                  * broken or the client went away).
6867                  */
6868                 tp->t_flags &= ~TF_GPUTINPROG;
6869                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6870                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6871                                            tp->gput_seq,
6872                                            0, 0, 18, __LINE__, NULL, 0);
6873         }
6874         if (ctf_progress_timeout_check(tp, false)) {
6875                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
6876                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6877                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
6878                 return (1);
6879         }
6880         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
6881         rack->r_ctl.retran_during_recovery = 0;
6882         rack->r_ctl.dsack_byte_cnt = 0;
6883         if (IN_FASTRECOVERY(tp->t_flags))
6884                 tp->t_flags |= TF_WASFRECOVERY;
6885         else
6886                 tp->t_flags &= ~TF_WASFRECOVERY;
6887         if (IN_CONGRECOVERY(tp->t_flags))
6888                 tp->t_flags |= TF_WASCRECOVERY;
6889         else
6890                 tp->t_flags &= ~TF_WASCRECOVERY;
6891         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
6892             (tp->snd_una == tp->snd_max)) {
6893                 /* Nothing outstanding .. nothing to do */
6894                 return (0);
6895         }
6896         if (rack->r_ctl.dsack_persist) {
6897                 rack->r_ctl.dsack_persist--;
6898                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6899                         rack->r_ctl.num_dsack = 0;
6900                 }
6901                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6902         }
6903         /*
6904          * Rack can only run one timer at a time, so we cannot
6905          * run a KEEPINIT (gating SYN sending) and a retransmit
6906          * timer for the SYN. So if we are in a front state and
6907          * have a KEEPINIT timer we need to check the first transmit
6908          * against now to see if we have exceeded the KEEPINIT time
6909          * (if one is set).
6910          */
6911         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
6912             (TP_KEEPINIT(tp) != 0)) {
6913                 struct rack_sendmap *rsm;
6914
6915                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6916                 if (rsm) {
6917                         /* Ok we have something outstanding to test keepinit with */
6918                         if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
6919                             ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
6920                                 /* We have exceeded the KEEPINIT time */
6921                                 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6922                                 goto drop_it;
6923                         }
6924                 }
6925         }
6926         /*
6927          * Retransmission timer went off.  Message has not been acked within
6928          * retransmit interval.  Back off to a longer retransmit interval
6929          * and retransmit one segment.
6930          */
6931         rack_remxt_tmr(tp);
6932         if ((rack->r_ctl.rc_resend == NULL) ||
6933             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
6934                 /*
6935                  * If the rwnd collapsed on
6936                  * the one we are retransmitting
6937                  * it does not count against the
6938                  * rxt count.
6939                  */
6940                 tp->t_rxtshift++;
6941         }
6942         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
6943                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
6944 drop_it:
6945                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
6946                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
6947                 retval = 1;
6948                 tcp_set_inp_to_drop(rack->rc_inp,
6949                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
6950                 goto out;
6951         }
6952         if (tp->t_state == TCPS_SYN_SENT) {
6953                 /*
6954                  * If the SYN was retransmitted, indicate CWND to be limited
6955                  * to 1 segment in cc_conn_init().
6956                  */
6957                 tp->snd_cwnd = 1;
6958         } else if (tp->t_rxtshift == 1) {
6959                 /*
6960                  * first retransmit; record ssthresh and cwnd so they can be
6961                  * recovered if this turns out to be a "bad" retransmit. A
6962                  * retransmit is considered "bad" if an ACK for this segment
6963                  * is received within RTT/2 interval; the assumption here is
6964                  * that the ACK was already in flight.  See "On Estimating
6965                  * End-to-End Network Path Properties" by Allman and Paxson
6966                  * for more details.
6967                  */
6968                 tp->snd_cwnd_prev = tp->snd_cwnd;
6969                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
6970                 tp->snd_recover_prev = tp->snd_recover;
6971                 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
6972                 tp->t_flags |= TF_PREVVALID;
6973         } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
6974                 tp->t_flags &= ~TF_PREVVALID;
6975         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
6976         if ((tp->t_state == TCPS_SYN_SENT) ||
6977             (tp->t_state == TCPS_SYN_RECEIVED))
6978                 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
6979         else
6980                 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
6981
6982         RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
6983            max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
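        /*
         * A worked sketch of the backoff above (illustrative values,
         * not from a trace): with t_srtt = 40000 usec, t_rttvar =
         * 10000 usec and t_rxtshift = 3, the non-SYN branch gives
         *   rexmt = max(rack_rto_min, 40000 + (10000 << 2)) * tcp_backoff[3]
         *         = 80000 * 8 = 640000 usec
         * (assuming rack_rto_min is below 80000 usec and the stock
         * backoff table, where tcp_backoff[3] == 8). The RANGESET then
         * clamps that into [max(rack_rto_min, rexmt), rack_rto_max],
         * with the configured timer slop applied.
         */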
6984         /*
6985          * We enter the path for PLMTUD if the connection is established
6986          * or in FIN_WAIT_1 state. The reason for the latter is that if
6987          * the amount of data we send is very small, we could send it in
6988          * a couple of packets and proceed straight to FIN; in that case
6989          * we would never catch the ESTABLISHED state.
6990          */
6991 #ifdef INET6
6992         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
6993 #else
6994         isipv6 = false;
6995 #endif
6996         if (((V_tcp_pmtud_blackhole_detect == 1) ||
6997             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
6998             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
6999             ((tp->t_state == TCPS_ESTABLISHED) ||
7000             (tp->t_state == TCPS_FIN_WAIT_1))) {
7001                 /*
7002                  * The idea here is that each stage of the mtu probe (usually
7003                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
7004                  * before clamping down further; the 'tp->t_rxtshift % 2 == 0'
7005                  * check takes care of that.
7006                  */
7007                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
7008                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
7009                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
7010                     tp->t_rxtshift % 2 == 0)) {
7011                         /*
7012                          * Enter Path MTU Black-hole Detection mechanism:
7013                          * - Disable Path MTU Discovery (IP "DF" bit).
7014                          * - Reduce MTU to a lower value than what we
7015                          *   negotiated with the peer.
7016                          */
7017                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
7018                                 /* Record that we may have found a black hole. */
7019                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
7020                                 /* Keep track of previous MSS. */
7021                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
7022                         }
7023
7024                         /*
7025                          * Reduce the MSS to blackhole value or to the
7026                          * default in an attempt to retransmit.
7027                          */
7028 #ifdef INET6
7029                         if (isipv6 &&
7030                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
7031                                 /* Use the sysctl tuneable blackhole MSS. */
7032                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
7033                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7034                         } else if (isipv6) {
7035                                 /* Use the default MSS. */
7036                                 tp->t_maxseg = V_tcp_v6mssdflt;
7037                                 /*
7038                                  * Disable Path MTU Discovery when we switch
7039                                  * to minmss.
7040                                  */
7041                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7042                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7043                         }
7044 #endif
7045 #if defined(INET6) && defined(INET)
7046                         else
7047 #endif
7048 #ifdef INET
7049                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
7050                                 /* Use the sysctl tuneable blackhole MSS. */
7051                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
7052                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7053                         } else {
7054                                 /* Use the default MSS. */
7055                                 tp->t_maxseg = V_tcp_mssdflt;
7056                                 /*
7057                                  * Disable Path MTU Discovery when we switch
7058                                  * to minmss.
7059                                  */
7060                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7061                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7062                         }
7063 #endif
7064                 } else {
7065                         /*
7066                          * If further retransmissions are still unsuccessful
7067                          * with a lowered MTU, maybe this isn't a blackhole
7068                          * and we restore the previous MSS and blackhole
7069                          * detection flags. The limit '6' is determined by
7070                          * giving each probe stage (1448, 1188, 524) 2
7071                          * chances to recover.
7072                          */
7073                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
7074                             (tp->t_rxtshift >= 6)) {
7075                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
7076                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
7077                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
7078                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
7079                         }
7080                 }
7081         }
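        /*
         * Putting the staging above together (IPv4 names shown; the v6
         * counterparts behave the same way, and this assumes t_maxseg
         * starts above the blackhole MSS): the first clamp fires at
         * t_rxtshift == 2 (t_maxseg -> V_tcp_pmtud_blackhole_mss), the
         * second at t_rxtshift == 4 (t_maxseg -> V_tcp_mssdflt, with
         * PMTUD disabled), and if t_rxtshift reaches 6 without
         * recovery the saved t_pmtud_saved_maxseg and the PMTUD flags
         * are restored.
         */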
7082         /*
7083          * Disable RFC1323 and SACK if we haven't got any response to
7084          * our third SYN, to work around some broken terminal servers
7085          * (most of which have hopefully been retired) that have bad VJ
7086          * header compression code which trashes TCP segments containing
7087          * unknown-to-them TCP options.
7088          */
7089         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
7090             (tp->t_rxtshift == 3))
7091                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
7092         /*
7093          * If we backed off this far, our srtt estimate is probably bogus.
7094          * Clobber it so we'll take the next rtt measurement as our srtt;
7095          * move the current srtt into rttvar to keep the current retransmit
7096          * times until then.
7097          */
7098         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
7099 #ifdef INET6
7100                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
7101                         in6_losing(tp->t_inpcb);
7102                 else
7103 #endif
7104                         in_losing(tp->t_inpcb);
7105                 tp->t_rttvar += tp->t_srtt;
7106                 tp->t_srtt = 0;
7107         }
7108         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
7109         tp->snd_recover = tp->snd_max;
7110         tp->t_flags |= TF_ACKNOW;
7111         tp->t_rtttime = 0;
7112         rack_cong_signal(tp, CC_RTO, tp->snd_una);
7113 out:
7114         return (retval);
7115 }
7116
7117 static int
7118 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
7119 {
7120         int32_t ret = 0;
7121         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
7122
7123         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
7124             (tp->t_flags & TF_GPUTINPROG)) {
7125                 /*
7126                  * We have a goodput in progress
7127                  * and we have entered a late state.
7128                  * Do we have enough data in the sb
7129                  * to handle the GPUT request?
7130                  */
7131                 uint32_t bytes;
7132
7133                 bytes = tp->gput_ack - tp->gput_seq;
7134                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
7135                         bytes += tp->gput_seq - tp->snd_una;
7136                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
7137                         /*
7138                          * There are not enough bytes in the socket
7139                          * buffer that have been sent to cover this
7140                          * measurement. Cancel it.
7141                          */
7142                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7143                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
7144                                                    tp->gput_seq,
7145                                                    0, 0, 18, __LINE__, NULL, 0);
7146                         tp->t_flags &= ~TF_GPUTINPROG;
7147                 }
7148         }
7149         if (timers == 0) {
7150                 return (0);
7151         }
7152         if (tp->t_state == TCPS_LISTEN) {
7153                 /* no timers on listen sockets */
7154                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
7155                         return (0);
7156                 return (1);
7157         }
7158         if ((timers & PACE_TMR_RACK) &&
7159             rack->rc_on_min_to) {
7160                 /*
7161                  * For the rack timer, when we
7162                  * are on a min-timeout (which means rrr_conf = 3)
7163                  * we don't want to check the timer. It may
7164                  * be going off for a pace and that's ok; we
7165                  * want to send the retransmit (if it's ready).
7166                  *
7167                  * If it's on a normal rack timer (non-min) then
7168                  * we will check if it has expired.
7169                  */
7170                 goto skip_time_check;
7171         }
7172         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
7173                 uint32_t left;
7174
7175                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
7176                         ret = -1;
7177                         rack_log_to_processing(rack, cts, ret, 0);
7178                         return (0);
7179                 }
7180                 if (hpts_calling == 0) {
7181                         /*
7182                          * A user send or a queued mbuf (sack) has called us? We
7183                          * return 0 and let the pacing guards
7184                          * decide whether or not this
7185                          * should cause a send.
7186                          */
7187                         ret = -2;
7188                         rack_log_to_processing(rack, cts, ret, 0);
7189                         return (0);
7190                 }
7191                 /*
7192                  * Ok, our timer went off early and we are not paced; a false
7193                  * alarm, go back to sleep.
7194                  */
7195                 ret = -3;
7196                 left = rack->r_ctl.rc_timer_exp - cts;
7197                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
7198                 rack_log_to_processing(rack, cts, ret, left);
7199                 return (1);
7200         }
7201 skip_time_check:
7202         rack->rc_tmr_stopped = 0;
7203         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
7204         if (timers & PACE_TMR_DELACK) {
7205                 ret = rack_timeout_delack(tp, rack, cts);
7206         } else if (timers & PACE_TMR_RACK) {
7207                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7208                 rack->r_fast_output = 0;
7209                 ret = rack_timeout_rack(tp, rack, cts);
7210         } else if (timers & PACE_TMR_TLP) {
7211                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7212                 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
7213         } else if (timers & PACE_TMR_RXT) {
7214                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7215                 rack->r_fast_output = 0;
7216                 ret = rack_timeout_rxt(tp, rack, cts);
7217         } else if (timers & PACE_TMR_PERSIT) {
7218                 ret = rack_timeout_persist(tp, rack, cts);
7219         } else if (timers & PACE_TMR_KEEP) {
7220                 ret = rack_timeout_keepalive(tp, rack, cts);
7221         }
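        /*
         * Note the dispatch precedence encoded by the chain above:
         * delayed-ack first, then rack, TLP, retransmit, persist and
         * finally keepalive. Only one timer type is serviced per call,
         * since rack runs a single hpts timer at a time.
         */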
7222         rack_log_to_processing(rack, cts, ret, timers);
7223         return (ret);
7224 }
7225
7226 static void
7227 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
7228 {
7229         struct timeval tv;
7230         uint32_t us_cts, flags_on_entry;
7231         uint8_t hpts_removed = 0;
7232
7233         flags_on_entry = rack->r_ctl.rc_hpts_flags;
7234         us_cts = tcp_get_usecs(&tv);
7235         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
7236             ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
7237              ((tp->snd_max - tp->snd_una) == 0))) {
7238                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7239                 hpts_removed = 1;
7240                 /* If we were not delayed, cancel out the flag. */
7241                 if ((tp->snd_max - tp->snd_una) == 0)
7242                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
7243                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7244         }
7245         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7246                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7247                 if (rack->rc_inp->inp_in_hpts &&
7248                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
7249                         /*
7250                          * Canceling timers when we have no output being
7251                          * paced. We also must remove ourselves from the
7252                          * hpts.
7253                          */
7254                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7255                         hpts_removed = 1;
7256                 }
7257                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
7258         }
7259         if (hpts_removed == 0)
7260                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7261 }
7262
7263 static void
7264 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
7265 {
7266         return;
7267 }
7268
7269 static int
7270 rack_stopall(struct tcpcb *tp)
7271 {
7272         struct tcp_rack *rack;
7273         rack = (struct tcp_rack *)tp->t_fb_ptr;
7274         rack->t_timers_stopped = 1;
7275         return (0);
7276 }
7277
7278 static void
7279 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
7280 {
7281         return;
7282 }
7283
7284 static int
7285 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
7286 {
7287         return (0);
7288 }
7289
7290 static void
7291 rack_stop_all_timers(struct tcpcb *tp)
7292 {
7293         struct tcp_rack *rack;
7294
7295         /*
7296          * Ensure no timers are running.
7297          */
7298         if (tcp_timer_active(tp, TT_PERSIST)) {
7299                 /* We are entering while in persist; set the flag appropriately */
7300                 rack = (struct tcp_rack *)tp->t_fb_ptr;
7301                 rack->rc_in_persist = 1;
7302         }
7303         tcp_timer_suspend(tp, TT_PERSIST);
7304         tcp_timer_suspend(tp, TT_REXMT);
7305         tcp_timer_suspend(tp, TT_KEEP);
7306         tcp_timer_suspend(tp, TT_DELACK);
7307 }
7308
7309 static void
7310 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
7311     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
7312 {
7313         int32_t idx;
7314
7315         rsm->r_rtr_cnt++;
7316         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7317         rsm->r_dupack = 0;
7318         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
7319                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
7320                 rsm->r_flags |= RACK_OVERMAX;
7321         }
7322         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
7323                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
7324                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
7325         }
7326         idx = rsm->r_rtr_cnt - 1;
7327         rsm->r_tim_lastsent[idx] = ts;
7328         /*
7329          * Here we don't add in the len of the send, since it's already
7330          * in snd_una <-> snd_max.
7331          */
7332         rsm->r_fas = ctf_flight_size(rack->rc_tp,
7333                                      rack->r_ctl.rc_sacked);
7334         if (rsm->r_flags & RACK_ACKED) {
7335                 /* Probably MTU discovery messing with us */
7336                 rsm->r_flags &= ~RACK_ACKED;
7337                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7338         }
7339         if (rsm->r_in_tmap) {
7340                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7341                 rsm->r_in_tmap = 0;
7342         }
7343         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7344         rsm->r_in_tmap = 1;
7345         if (rsm->r_flags & RACK_SACK_PASSED) {
7346                 /* We have retransmitted due to the SACK pass */
7347                 rsm->r_flags &= ~RACK_SACK_PASSED;
7348                 rsm->r_flags |= RACK_WAS_SACKPASS;
7349         }
7350 }
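/*
 * A note on r_fas in the function above: for a retransmission the bytes
 * are already inside snd_una..snd_max, so the flight-at-send is taken
 * as-is; contrast rack_log_output() below, where a new send is not yet
 * reflected in that window and the segment length is added in.
 */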
7351
7352 static uint32_t
7353 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
7354     struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
7355 {
7356         /*
7357          * We (re-)transmitted starting at rsm->r_start for some length
7358          * (possibly ending before r_end).
7359          */
7360         struct rack_sendmap *nrsm, *insret;
7361         uint32_t c_end;
7362         int32_t len;
7363
7364         len = *lenp;
7365         c_end = rsm->r_start + len;
7366         if (SEQ_GEQ(c_end, rsm->r_end)) {
7367                 /*
7368                  * We retransmitted the whole piece, or more than the whole
7369                  * thing, slopping over into the next rsm.
7370                  */
7371                 rack_update_rsm(tp, rack, rsm, ts, add_flag);
7372                 if (c_end == rsm->r_end) {
7373                         *lenp = 0;
7374                         return (0);
7375                 } else {
7376                         int32_t act_len;
7377
7378                         /* Hangs over the end; return what's left */
7379                         act_len = rsm->r_end - rsm->r_start;
7380                         *lenp = (len - act_len);
7381                         return (rsm->r_end);
7382                 }
7383                 /* We don't get out of this block. */
7384         }
7385         /*
7386          * Here we retransmitted less than the whole thing which means we
7387          * have to split this into what was transmitted and what was not.
7388          */
7389         nrsm = rack_alloc_full_limit(rack);
7390         if (nrsm == NULL) {
7391                 /*
7392                  * We can't get memory, so let's not proceed.
7393                  */
7394                 *lenp = 0;
7395                 return (0);
7396         }
7397         /*
7398          * So here we are going to take the original rsm and make it what we
7399          * retransmitted. nrsm will be the tail portion we did not
7400          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
7401          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
7402          * 1, 6 and the new piece will be 6, 11.
7403          */
7404         rack_clone_rsm(rack, nrsm, rsm, c_end);
7405         nrsm->r_dupack = 0;
7406         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
7407         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7408 #ifdef INVARIANTS
7409         if (insret != NULL) {
7410                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7411                       nrsm, insret, rack, rsm);
7412         }
7413 #endif
7414         if (rsm->r_in_tmap) {
7415                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7416                 nrsm->r_in_tmap = 1;
7417         }
7418         rsm->r_flags &= (~RACK_HAS_FIN);
7419         rack_update_rsm(tp, rack, rsm, ts, add_flag);
7420         /* Log a split of rsm into rsm and nrsm */
7421         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7422         *lenp = 0;
7423         return (0);
7424 }
7425
7426 static void
7427 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
7428                 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts,
7429                 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
7430 {
7431         struct tcp_rack *rack;
7432         struct rack_sendmap *rsm, *nrsm, *insret, fe;
7433         register uint32_t snd_max, snd_una;
7434
7435         /*
7436          * Add to the RACK log of packets in flight or retransmitted. If
7437          * there is a TS option we will use the TS echoed, if not we will
7438          * grab a TS.
7439          *
7440          * Retransmissions will increment the count and move the ts to its
7441          * proper place. Note that if options do not include TS's then we
7442          * won't be able to effectively use the ACK for an RTT on a retran.
7443          *
7444          * Notes about r_start and r_end. Let's consider a send starting at
7445          * sequence 1 for 10 bytes. In such an example the r_start would be
7446          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
7447          * This means that r_end is actually the first sequence for the next
7448          * slot (11).
7449          *
7450          */
7451         /*
7452          * XXXrrs: if err is set, what do we do? Not add the thing
7453          * (i.e. return if err != 0), or pretend we sent it and
7454          * proceed with the add? For now we skip the add on error.
7455          */
7456         INP_WLOCK_ASSERT(tp->t_inpcb);
7457         if (err)
7458                 /*
7459                  * We don't log errors -- we could but snd_max does not
7460                  * advance in this case either.
7461                  */
7462                 return;
7463
7464         if (th_flags & TH_RST) {
7465                 /*
7466                  * We don't log resets and we return immediately from
7467                  * sending
7468                  */
7469                 return;
7470         }
7471         rack = (struct tcp_rack *)tp->t_fb_ptr;
7472         snd_una = tp->snd_una;
7473         snd_max = tp->snd_max;
7474         if (th_flags & (TH_SYN | TH_FIN)) {
7475                 /*
7476                  * The call to rack_log_output is made before bumping
7477                  * snd_max. This means we must account for the extra byte of
7478                  * sequence space a SYN or FIN consumes when one is present
7479                  * (and we are not resending).
7480                  */
7481                 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
7482                         len++;
7483                 if (th_flags & TH_FIN)
7484                         len++;
7485                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
7486                         /*
7487                          * The add/update has not been done for the FIN/SYN
7488                          * yet.
7489                          */
7490                         snd_max = tp->snd_nxt;
7491                 }
7492         }
7493         if (SEQ_LEQ((seq_out + len), snd_una)) {
7494                 /* Are we sending an old segment to induce an ack (keep-alive)? */
7495                 return;
7496         }
7497         if (SEQ_LT(seq_out, snd_una)) {
7498                 /* huh? should we panic? */
7499                 uint32_t end;
7500
7501                 end = seq_out + len;
7502                 seq_out = snd_una;
7503                 if (SEQ_GEQ(end, seq_out))
7504                         len = end - seq_out;
7505                 else
7506                         len = 0;
7507         }
7508         if (len == 0) {
7509                 /* We don't log zero window probes */
7510                 return;
7511         }
7512         rack->r_ctl.rc_time_last_sent = cts;
7513         if (IN_FASTRECOVERY(tp->t_flags)) {
7514                 rack->r_ctl.rc_prr_out += len;
7515         }
7516         /* First question is it a retransmission or new? */
7517         if (seq_out == snd_max) {
7518                 /* It's new */
7519 again:
7520                 rsm = rack_alloc(rack);
7521                 if (rsm == NULL) {
7522                         /*
7523                          * Hmm out of memory and the tcb got destroyed while
7524                          * we tried to wait.
7525                          */
7526                         return;
7527                 }
7528                 if (th_flags & TH_FIN) {
7529                         rsm->r_flags = RACK_HAS_FIN|add_flag;
7530                 } else {
7531                         rsm->r_flags = add_flag;
7532                 }
7533                 if (hw_tls)
7534                         rsm->r_hw_tls = 1;
7535                 rsm->r_tim_lastsent[0] = cts;
7536                 rsm->r_rtr_cnt = 1;
7537                 rsm->r_rtr_bytes = 0;
7538                 if (th_flags & TH_SYN) {
7539                         /* The data space is one beyond snd_una */
7540                         rsm->r_flags |= RACK_HAS_SYN;
7541                 }
7542                 rsm->r_start = seq_out;
7543                 rsm->r_end = rsm->r_start + len;
7544                 rsm->r_dupack = 0;
7545                 /*
7546                  * Save off the mbuf location that
7547                  * sndmbuf_noadv returned (which is
7548                  * where we started copying from).
7549                  */
7550                 rsm->m = s_mb;
7551                 rsm->soff = s_moff;
7552                 /*
7553                  * Here we do add in the len of the send, since it's not yet
7554                  * reflected in snd_una <-> snd_max.
7555                  */
7556                 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
7557                                               rack->r_ctl.rc_sacked) +
7558                               (rsm->r_end - rsm->r_start));
7559                 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
7560                 if (rsm->m) {
7561                         if (rsm->m->m_len <= rsm->soff) {
7562                                 /*
7563                                  * XXXrrs Question, will this happen?
7564                                  *
7565                                  * If sbsndptr is set at the correct place
7566                                  * then s_moff should always be somewhere
7567                                  * within rsm->m. But if the sbsndptr was
7568                                  * off then that won't be true. If it occurs
7569                                  * we need to walk out to the correct location.
7570                                  */
7571                                 struct mbuf *lm;
7572
7573                                 lm = rsm->m;
7574                                 while (lm->m_len <= rsm->soff) {
7575                                         rsm->soff -= lm->m_len;
7576                                         lm = lm->m_next;
7577                                         KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
7578                                                              __func__, rack, s_moff, s_mb, rsm->soff));
7579                                 }
7580                                 rsm->m = lm;
7581                                 counter_u64_add(rack_sbsndptr_wrong, 1);
7582                         } else
7583                                 counter_u64_add(rack_sbsndptr_right, 1);
7584                         rsm->orig_m_len = rsm->m->m_len;
7585                 } else
7586                         rsm->orig_m_len = 0;
7587                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7588                 /* Log a new rsm */
7589                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
7590                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7591 #ifdef INVARIANTS
7592                 if (insret != NULL) {
7593                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7594                               nrsm, insret, rack, rsm);
7595                 }
7596 #endif
7597                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7598                 rsm->r_in_tmap = 1;
7599                 /*
7600                  * Special case detection: is there just a single
7601                  * packet outstanding when we are not in recovery?
7602                  *
7603                  * If this is true, mark it so.
7604                  */
7605                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
7606                     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
7607                         struct rack_sendmap *prsm;
7608
7609                         prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7610                         if (prsm)
7611                                 prsm->r_one_out_nr = 1;
7612                 }
7613                 return;
7614         }
7615         /*
7616          * If we reach here it's a retransmission and we need to find it.
7617          */
7618         memset(&fe, 0, sizeof(fe));
7619 more:
7620         if (hintrsm && (hintrsm->r_start == seq_out)) {
7621                 rsm = hintrsm;
7622                 hintrsm = NULL;
7623         } else {
7624                 /* No hints sorry */
7625                 rsm = NULL;
7626         }
7627         if ((rsm) && (rsm->r_start == seq_out)) {
7628                 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7629                 if (len == 0) {
7630                         return;
7631                 } else {
7632                         goto more;
7633                 }
7634         }
7635         /* Ok, it was not the last pointer; go through it the hard way. */
7636 refind:
7637         fe.r_start = seq_out;
7638         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
7639         if (rsm) {
7640                 if (rsm->r_start == seq_out) {
7641                         seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7642                         if (len == 0) {
7643                                 return;
7644                         } else {
7645                                 goto refind;
7646                         }
7647                 }
7648                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
7649                         /* Transmitted within this piece */
7650                         /*
7651                          * Ok we must split off the front and then let the
7652                          * update do the rest
7653                          */
7654                         nrsm = rack_alloc_full_limit(rack);
7655                         if (nrsm == NULL) {
7656                                 rack_update_rsm(tp, rack, rsm, cts, add_flag);
7657                                 return;
7658                         }
7659                         /*
7660                          * copy rsm to nrsm and then trim the front of rsm
7661                          * to not include this part.
7662                          */
7663                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
7664                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7665                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7666 #ifdef INVARIANTS
7667                         if (insret != NULL) {
7668                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7669                                       nrsm, insret, rack, rsm);
7670                         }
7671 #endif
7672                         if (rsm->r_in_tmap) {
7673                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7674                                 nrsm->r_in_tmap = 1;
7675                         }
7676                         rsm->r_flags &= (~RACK_HAS_FIN);
7677                         seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
7678                         if (len == 0) {
7679                                 return;
7680                         } else if (len > 0)
7681                                 goto refind;
7682                 }
7683         }
7684         /*
7685          * Hmm, not found in the map; did they retransmit both old data
7686          * and on into the new?
7687          */
7688         if (seq_out == tp->snd_max) {
7689                 goto again;
7690         } else if (SEQ_LT(seq_out, tp->snd_max)) {
7691 #ifdef INVARIANTS
7692                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
7693                        seq_out, len, tp->snd_una, tp->snd_max);
7694                 printf("Starting Dump of all rack entries\n");
7695                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
7696                         printf("rsm:%p start:%u end:%u\n",
7697                                rsm, rsm->r_start, rsm->r_end);
7698                 }
7699                 printf("Dump complete\n");
7700                 panic("seq_out not found rack:%p tp:%p",
7701                       rack, tp);
7702 #endif
7703         } else {
7704 #ifdef INVARIANTS
7705                 /*
7706                  * Hmm, beyond snd_max? (only if we are using the new rtt-pack
7707                  * flag)
7708                  */
7709                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
7710                       seq_out, len, tp->snd_max, tp);
7711 #endif
7712         }
7713 }
7714
7715 /*
7716  * Record one of the RTT updates from an ack into
7717  * our sample structure.
7718  */
7719
7720 static void
7721 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
7722                     int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
7723 {
7724         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7725             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
7726                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
7727         }
7728         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7729             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
7730                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
7731         }
7732         if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
7733             if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
7734                 rack->r_ctl.rc_gp_lowrtt = us_rtt;
7735             if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
7736                     rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
7737         }
7738         if ((confidence == 1) &&
7739             ((rsm == NULL) ||
7740              (rsm->r_just_ret) ||
7741              (rsm->r_one_out_nr &&
7742               len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
7743                 /*
7744                  * If the rsm had a just-return
7745                  * hit on it then we can't trust the
7746                  * rtt measurement for buffer determination.
7747                  * Note that a confidence of 2 indicates
7748                  * SACK'd, which overrides the r_just_ret or
7749                  * the r_one_out_nr. If it was a CUM-ACK and
7750                  * we had only two outstanding but get an
7751                  * ack for only one, then that also lowers our
7752                  * confidence.
7753                  */
7754                 confidence = 0;
7755         }
7756         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7757             (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
7758                 if (rack->r_ctl.rack_rs.confidence == 0) {
7759                         /*
7760                          * We take anything with no current confidence
7761                          * saved.
7762                          */
7763                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7764                         rack->r_ctl.rack_rs.confidence = confidence;
7765                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7766                 } else if (confidence || rack->r_ctl.rack_rs.confidence) {
7767                         /*
7768                          * Once we have a confident number,
7769                          * we can update it with a smaller
7770                          * value since this confident number
7771                          * may include the DSACK time until
7772                          * the next segment (the second one) arrived.
7773                          */
7774                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7775                         rack->r_ctl.rack_rs.confidence = confidence;
7776                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7777                 }
7778         }
7779         rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
7780         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
7781         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
7782         rack->r_ctl.rack_rs.rs_rtt_cnt++;
7783 }
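/*
 * Confidence recap for the function above: 2 means the sample came from
 * a SACK'd segment (and so skips the just-return/one-out demotion), 1
 * is a normally trusted measurement, and 0 is a sample we hold only
 * until something more trustworthy arrives.
 */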
7784
7785 /*
7786  * Collect new round-trip time estimate
7787  * and update averages and current timeout.
7788  */
7789 static void
7790 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
7791 {
7792         int32_t delta;
7793         uint32_t o_srtt, o_var;
7794         int32_t hrtt_up = 0;
7795         int32_t rtt;
7796
7797         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
7798                 /* No valid sample */
7799                 return;
7800         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
7801                 /* We are to use the lowest RTT seen in a single ack */
7802                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
7803         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
7804                 /* We are to use the highest RTT seen in a single ack */
7805                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
7806         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
7807                 /* We are to use the average RTT seen in a single ack */
7808                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
7809                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
7810         } else {
7811 #ifdef INVARIANTS
7812                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
7813 #endif
7814                 return;
7815         }
7816         if (rtt == 0)
7817                 rtt = 1;
7818         if (rack->rc_gp_rtt_set == 0) {
7819                 /*
7820                  * With no RTT we have to accept
7821                  * even one we are not confident of.
7822                  */
7823                 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
7824                 rack->rc_gp_rtt_set = 1;
7825         } else if (rack->r_ctl.rack_rs.confidence) {
7826                 /* update the running gp srtt */
7827                 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
7828                 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
7829         }
7830         if (rack->r_ctl.rack_rs.confidence) {
7831                 /*
7832                  * Record the low and high for highly-buffered-path computation;
7833                  * we only do this if we are confident (not a retransmission).
7834                  */
7835                 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
7836                         rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7837                         hrtt_up = 1;
7838                 }
7839                 if (rack->rc_highly_buffered == 0) {
7840                         /*
7841                          * Currently once we declare a path as
7842                          * highly buffered there is no going
7843                          * back, which may be a problem...
7844                          */
7845                         if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
7846                                 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
7847                                                      rack->r_ctl.rc_highest_us_rtt,
7848                                                      rack->r_ctl.rc_lowest_us_rtt,
7849                                                      RACK_RTTS_SEEHBP);
7850                                 rack->rc_highly_buffered = 1;
7851                         }
7852                 }
7853         }
7854         if ((rack->r_ctl.rack_rs.confidence) ||
7855             (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
7856                 /*
7857                  * If we are highly confident of it <or> it was
7858                  * never retransmitted, we accept it as the last us_rtt.
7859                  */
7860                 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7861                 /* The lowest rtt can be set if it was not retransmitted */
7862                 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
7863                         rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7864                         if (rack->r_ctl.rc_lowest_us_rtt == 0)
7865                                 rack->r_ctl.rc_lowest_us_rtt = 1;
7866                 }
7867         }
7868         o_srtt = tp->t_srtt;
7869         o_var = tp->t_rttvar;
7870         rack = (struct tcp_rack *)tp->t_fb_ptr;
7871         if (tp->t_srtt != 0) {
7872                 /*
7873                  * We keep a simple srtt in microseconds, like our rtt
7874                  * measurement. We don't need to do any tricks with shifting
7875                  * etc. Instead we just add in 1/8th of the new measurement
7876                  * and subtract out 1/8 of the old srtt. We do the same with
7877                  * the variance after finding the absolute value of the
7878                  * difference between this sample and the current srtt.
7879                  */
7880                 delta = tp->t_srtt - rtt;
7881                 /* Take off 1/8th of the current sRTT */
7882                 tp->t_srtt -= (tp->t_srtt >> 3);
7883                 /* Add in 1/8th of the new RTT just measured */
7884                 tp->t_srtt += (rtt >> 3);
7885                 if (tp->t_srtt <= 0)
7886                         tp->t_srtt = 1;
7887                 /* Now let's take the absolute value of the difference */
7888                 if (delta < 0)
7889                         delta = -delta;
7890                 /* Subtract out 1/8th */
7891                 tp->t_rttvar -= (tp->t_rttvar >> 3);
7892                 /* Add in 1/8th of the new variance we just saw */
7893                 tp->t_rttvar += (delta >> 3);
7894                 if (tp->t_rttvar <= 0)
7895                         tp->t_rttvar = 1;
7896                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
7897                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7898         } else {
7899                 /*
7900                  * No rtt measurement yet - use the unsmoothed rtt. Set the
7901                  * variance to half the rtt (so our first retransmit happens
7902                  * at 3*rtt).
7903                  */
7904                 tp->t_srtt = rtt;
7905                 tp->t_rttvar = rtt >> 1;
7906                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7907         }
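        /*
         * A worked example of the smoothing above (numbers assumed):
         * with tp->t_srtt = 64000 usec and a new rtt = 40000 usec,
         *   delta  = 64000 - 40000 = 24000
         *   t_srtt = 64000 - (64000 >> 3) + (40000 >> 3)
         *          = 64000 - 8000 + 5000 = 61000 usec
         * and t_rttvar likewise moves 1/8th of the way toward the
         * 24000 usec |delta|.
         */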
7908         rack->rc_srtt_measure_made = 1;
7909         KMOD_TCPSTAT_INC(tcps_rttupdated);
7910         tp->t_rttupdated++;
7911 #ifdef STATS
7912         if (rack_stats_gets_ms_rtt == 0) {
7913                 /* Send in the microsecond rtt used for rxt timeout purposes */
7914                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
7915         } else if (rack_stats_gets_ms_rtt == 1) {
7916                 /* Send in the millisecond rtt used for rxt timeout purposes */
7917                 int32_t ms_rtt;
7918
7919                 /* Round up */
7920                 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7921                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7922         } else if (rack_stats_gets_ms_rtt == 2) {
7923                 /* Send in the millisecond rtt as close to the path RTT as we can get */
7924                 int32_t ms_rtt;
7925
7926                 /* Round up */
7927                 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7928                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7929         }  else {
7930                 /* Send in the microsecond rtt as close to the path RTT as we can get */
7931                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
7932         }
7933
7934 #endif
7935         /*
7936          * the retransmit should happen at rtt + 4 * rttvar. Because of the
7937          * way we do the smoothing, srtt and rttvar will each average +1/2
7938          * tick of bias.  When we compute the retransmit timer, we want 1/2
7939          * tick of rounding and 1 extra tick because of +-1/2 tick
7940          * uncertainty in the firing of the timer.  The bias will give us
7941          * exactly the 1.5 tick we need.  But, because the bias is
7942          * statistical, we have to test that we don't drop below the minimum
7943          * feasible timer (which is 2 ticks).
7944          */
7945         tp->t_rxtshift = 0;
7946         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
7947                       max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
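        /*
         * Continuing the example values from above: per the comment,
         * the rexmt value is srtt + 4 * rttvar, so with t_srtt = 61000
         * usec and t_rttvar = 20000 usec RACK_REXMTVAL() works out to
         * 61000 + 80000 = 141000 usec, which the RANGESET clamps into
         * [max(rack_rto_min, rtt + 2), rack_rto_max] before the timer
         * slop is applied.
         */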
7948         rack_log_rtt_sample(rack, rtt);
7949         tp->t_softerror = 0;
7950 }
7951
7952
7953 static void
7954 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
7955 {
7956         /*
7957          * Apply the inbound us-rtt at us_cts to the min filter.
7958          */
7959         uint32_t old_rtt;
7960
7961         old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
7962         apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
7963                                us_rtt, us_cts);
7964         if (rack->r_ctl.last_pacing_time &&
7965             rack->rc_gp_dyn_mul &&
7966             (rack->r_ctl.last_pacing_time > us_rtt))
7967                 rack->pacing_longer_than_rtt = 1;
7968         else
7969                 rack->pacing_longer_than_rtt = 0;
7970         if (old_rtt > us_rtt) {
7971                 /* We just hit a new lower rtt time */
7972                 rack_log_rtt_shrinks(rack,  us_cts,  old_rtt,
7973                                      __LINE__, RACK_RTTS_NEWRTT);
7974                 /*
7975                  * Only count it if it's lower than what we saw within our
7976                  * calculated range.
7977                  */
7978                 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
7979                         if (rack_probertt_lower_within &&
7980                             rack->rc_gp_dyn_mul &&
7981                             (rack->use_fixed_rate == 0) &&
7982                             (rack->rc_always_pace)) {
7983                                 /*
7984                                  * We are seeing a new lower rtt very close
7985                                  * to the time that we would have entered probe-rtt.
7986                                  * This is probably due to the fact that a peer flow
7987                                  * has entered probe-rtt. Let's go in now too.
7988                                  */
7989                                 uint32_t val;
7990
7991                                 val = rack_probertt_lower_within * rack_time_between_probertt;
7992                                 val /= 100;
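                                /*
                                 * e.g. (values assumed): with
                                 * rack_probertt_lower_within = 10 and
                                 * a 30 second rack_time_between_probertt,
                                 * val is 3 seconds, so a qualifying
                                 * lower rtt seen in the final 10% of
                                 * the interval pulls us into probe-rtt
                                 * early.
                                 */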
7993                                 if ((rack->in_probe_rtt == 0)  &&
7994                                     ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
7995                                         rack_enter_probertt(rack, us_cts);
7996                                 }
7997                         }
7998                         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
7999                 }
8000         }
8001 }
8002
8003 static int
8004 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
8005     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
8006 {
8007         uint32_t us_rtt;
8008         int32_t i, all;
8009         uint32_t t, len_acked;
8010
8011         if ((rsm->r_flags & RACK_ACKED) ||
8012             (rsm->r_flags & RACK_WAS_ACKED))
8013                 /* Already done */
8014                 return (0);
8015         if (rsm->r_no_rtt_allowed) {
8016                 /* Not allowed */
8017                 return (0);
8018         }
8019         if (ack_type == CUM_ACKED) {
8020                 if (SEQ_GT(th_ack, rsm->r_end)) {
8021                         len_acked = rsm->r_end - rsm->r_start;
8022                         all = 1;
8023                 } else {
8024                         len_acked = th_ack - rsm->r_start;
8025                         all = 0;
8026                 }
8027         } else {
8028                 len_acked = rsm->r_end - rsm->r_start;
8029                 all = 0;
8030         }
8031         if (rsm->r_rtr_cnt == 1) {
8032
8033                 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8034                 if ((int)t <= 0)
8035                         t = 1;
8036                 if (!tp->t_rttlow || tp->t_rttlow > t)
8037                         tp->t_rttlow = t;
8038                 if (!rack->r_ctl.rc_rack_min_rtt ||
8039                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8040                         rack->r_ctl.rc_rack_min_rtt = t;
8041                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
8042                                 rack->r_ctl.rc_rack_min_rtt = 1;
8043                         }
8044                 }
8045                 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
8046                         us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8047                 else
8048                         us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8049                 if (us_rtt == 0)
8050                         us_rtt = 1;
8051                 if (CC_ALGO(tp)->rttsample != NULL) {
8052                         /* Kick the RTT to the CC */
8053                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
8054                 }
8055                 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
8056                 if (ack_type == SACKED) {
8057                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
8058                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
8059                 } else {
8060                         /*
8061                          * We need to setup what our confidence
8062                          * is in this ack.
8063                          *
8064                          * If the rsm was app limited and it is
8065                          * less than a mss in length (the end
8066                          * of the send) then we have a gap. If we
8067                          * were app limited but say we were sending
8068                          * multiple MSS's then we are more confident
8069                          * int it.
8070                          *
8071                          * When we are not app-limited then we see if
8072                          * the rsm is being included in the current
8073                          * measurement, we tell this by the app_limited_needs_set
8074                          * flag.
8075                          *
8076                          * Note that being cwnd blocked is not applimited
8077                          * as well as the pacing delay between packets which
8078                          * are sending only 1 or 2 MSS's also will show up
8079                          * in the RTT. We probably need to examine this algorithm
8080                          * a bit more and enhance it to account for the delay
8081                          * between rsm's. We could do that by saving off the
8082                          * pacing delay of each rsm (in an rsm) and then
8083                          * factoring that in somehow though for now I am
8084                          * not sure how :)
8085                          */
8086                         int calc_conf = 0;
8087
8088                         if (rsm->r_flags & RACK_APP_LIMITED) {
8089                                 if (all && (len_acked <= ctf_fixed_maxseg(tp)))
8090                                         calc_conf = 0;
8091                                 else
8092                                         calc_conf = 1;
8093                         } else if (rack->app_limited_needs_set == 0) {
8094                                 calc_conf = 1;
8095                         } else {
8096                                 calc_conf = 0;
8097                         }
8098                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
8099                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
8100                                             calc_conf, rsm, rsm->r_rtr_cnt);
8101                 }
8102                 if ((rsm->r_flags & RACK_TLP) &&
8103                     (!IN_FASTRECOVERY(tp->t_flags))) {
8104                         /* Segment was a TLP and our retrans matched */
8105                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
8106                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
8107                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8108                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8109                                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
8110                         }
8111                 }
8112                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
8113                         /* New more recent rack_tmit_time */
8114                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8115                         rack->rc_rack_rtt = t;
8116                 }
8117                 return (1);
8118         }
8119         /*
8120          * We clear the soft/rxtshift since we got an ack.
8121          * There is no assurance we will call the commit() function
8122          * so we need to clear these to avoid incorrect handling.
8123          */
8124         tp->t_rxtshift = 0;
8125         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8126                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
8127         tp->t_softerror = 0;
8128         if (to && (to->to_flags & TOF_TS) &&
8129             (ack_type == CUM_ACKED) &&
8130             (to->to_tsecr) &&
8131             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
8132                 /*
8133                  * Now which timestamp does it match? In this block the ACK
8134                  * must be coming from a previous transmission.
8135                  */
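                     /*
                      * For example, if this segment was sent three times
                      * (r_rtr_cnt == 3) at times t0 < t1 < t2 and the
                      * echoed to->to_tsecr matches
                      * rack_ts_to_msec(r_tim_lastsent[1]), the peer is
                      * acking the second transmission; t is then measured
                      * from t1, and because a later retransmission exists
                      * the sample is treated as spurious below.
                      */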
8136                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
8137                         if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
8138                                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8139                                 if ((int)t <= 0)
8140                                         t = 1;
8141                                 if (CC_ALGO(tp)->rttsample != NULL) {
8142                                         /*
8143                                          * Kick the RTT to the CC, here
8144                                          * we lie a bit in that we know the
8145                                          * retransmission is correct even though
8146                                          * we retransmitted. This is because
8147                                          * we match the timestamps.
8148                                          */
8149                                         if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
8150                                                 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
8151                                         else
8152                                                 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
8153                                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
8154                                 }
8155                                 if ((i + 1) < rsm->r_rtr_cnt) {
8156                                         /*
8157                                          * The peer ack'd from our previous
8158                                          * transmission. We have a spurious
8159                                          * retransmission and thus we don't
8160                                          * want to update our rack_rtt.
8161                                          *
8162                                          * Hmm should there be a CC revert here?
8163                                          *
8164                                          */
8165                                         return (0);
8166                                 }
8167                                 if (!tp->t_rttlow || tp->t_rttlow > t)
8168                                         tp->t_rttlow = t;
8169                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8170                                         rack->r_ctl.rc_rack_min_rtt = t;
8171                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
8172                                                 rack->r_ctl.rc_rack_min_rtt = 1;
8173                                         }
8174                                 }
8175                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8176                                            (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
8177                                         /* New more recent rack_tmit_time */
8178                                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8179                                         rack->rc_rack_rtt = t;
8180                                 }
8181                                 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
8182                                 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
8183                                                     rsm->r_rtr_cnt);
8184                                 return (1);
8185                         }
8186                 }
8187                 goto ts_not_found;
8188         } else {
8189                 /*
8190                  * Ok, it's a SACK block that we retransmitted, or a Windows
8191                  * machine without timestamps. We can tell nothing from the
8192                  * timestamp: either it's not there, or it only reflects the time
8193                  * the peer last received a segment that moved its cum-ack point forward.
8194                  */
8195 ts_not_found:
8196                 i = rsm->r_rtr_cnt - 1;
8197                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8198                 if ((int)t <= 0)
8199                         t = 1;
8200                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8201                         /*
8202                          * We retransmitted and the ack came back in less
8203                          * than the smallest rtt we have observed. We most
8204                          * likely did an improper retransmit as outlined in
8205                          * 6.2 Step 2 point 2 of the rack draft, so we
8206                          * don't want to update our rack_rtt. In theory we
8207                          * might (in the future) want to think about reverting
8208                          * our cwnd state, but we won't for now.
8209                          */
8210                         return (0);
8211                 } else if (rack->r_ctl.rc_rack_min_rtt) {
8212                         /*
8213                          * We retransmitted it and the retransmit did the
8214                          * job.
8215                          */
8216                         if (!rack->r_ctl.rc_rack_min_rtt ||
8217                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8218                                 rack->r_ctl.rc_rack_min_rtt = t;
8219                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8220                                         rack->r_ctl.rc_rack_min_rtt = 1;
8221                                 }
8222                         }
8223                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
8224                                 /* New more recent rack_tmit_time */
8225                                 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
8226                                 rack->rc_rack_rtt = t;
8227                         }
8228                         return (1);
8229                 }
8230         }
8231         return (0);
8232 }
8233
8234 /*
8235  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
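      * For example, if the send-ordered tmap holds s1 -> s2 -> s3 -> s4
      * and s3 is the newly sacked rsm passed in, the reverse walk from
      * s3 marks s2 and then s1 with RACK_SACK_PASSED, stopping early if
      * it reaches an entry already so marked (everything before that
      * point must already have been marked).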
8236  */
8237 static void
8238 rack_log_sack_passed(struct tcpcb *tp,
8239     struct tcp_rack *rack, struct rack_sendmap *rsm)
8240 {
8241         struct rack_sendmap *nrsm;
8242
8243         nrsm = rsm;
8244         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
8245             rack_head, r_tnext) {
8246                 if (nrsm == rsm) {
8247                         /* Skip the original segment; it is acked */
8248                         continue;
8249                 }
8250                 if (nrsm->r_flags & RACK_ACKED) {
8251                         /*
8252                          * Skip ack'd segments, though we
8253                          * should not see these, since tmap
8254                          * should not have ack'd segments.
8255                          */
8256                         continue;
8257                 }
8258                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8259                         /*
8260                          * We found one that is already marked
8261                          * passed, we have been here before and
8262                          * so all others below this are marked.
8263                          */
8264                         break;
8265                 }
8266                 nrsm->r_flags |= RACK_SACK_PASSED;
8267                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
8268         }
8269 }
8270
8271 static void
8272 rack_need_set_test(struct tcpcb *tp,
8273                    struct tcp_rack *rack,
8274                    struct rack_sendmap *rsm,
8275                    tcp_seq th_ack,
8276                    int line,
8277                    int use_which)
8278 {
8279
8280         if ((tp->t_flags & TF_GPUTINPROG) &&
8281             SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8282                 /*
8283                  * We were app limited, and this ack
8284                  * butts up or goes beyond the point where we want
8285                  * to start our next measurement. We need
8286                  * to record the new gput_ts here and
8287                  * possibly update the start sequence.
8288                  */
8289                 uint32_t seq, ts;
8290
8291                 if (rsm->r_rtr_cnt > 1) {
8292                         /*
8293                          * This is a retransmit, can we
8294                          * really make any assessment at this
8295                          * point?  We are not really sure of
8296                          * the timestamp, is it this or the
8297                          * previous transmission?
8298                          *
8299                          * Let's wait for something better that
8300                          * is not retransmitted.
8301                          */
8302                         return;
8303                 }
8304                 seq = tp->gput_seq;
8305                 ts = tp->gput_ts;
8306                 rack->app_limited_needs_set = 0;
8307                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
8308                 /* Do we start at a new end? */
8309                 if ((use_which == RACK_USE_BEG) &&
8310                     SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
8311                         /*
8312                          * When we get an ACK that just eats
8313                          * up some of the rsm, we set RACK_USE_BEG
8314                          * since what's at r_start (i.e. th_ack)
8315                          * is left unacked and that's where the
8316                          * measurement now starts.
8317                          */
8318                         tp->gput_seq = rsm->r_start;
8319                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8320                 }
8321                 if ((use_which == RACK_USE_END) &&
8322                     SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8323                             /*
8324                              * We use the end when the cumack
8325                              * is moving forward and completely
8326                              * deleting the rsm passed in, so basically
8327                              * r_end holds th_ack.
8328                              *
8329                              * For SACK's we also want to use the end
8330                              * since this piece just got sacked and
8331                              * we want to target anything after that
8332                              * in our measurement.
8333                              */
8334                             tp->gput_seq = rsm->r_end;
8335                             rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8336                 }
8337                 if (use_which == RACK_USE_END_OR_THACK) {
8338                         /*
8339                          * Special case for the ack moving forward,
8340                          * not a sack; we need to move all the
8341                          * way up to where this cum-ack moves
8342                          * to.
8343                          */
8344                         if (SEQ_GT(th_ack, rsm->r_end))
8345                                 tp->gput_seq = th_ack;
8346                         else
8347                                 tp->gput_seq = rsm->r_end;
8348                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8349                 }
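                     /*
                      * To summarize the three cases: RACK_USE_BEG moves
                      * gput_seq up to rsm->r_start, RACK_USE_END moves it
                      * to rsm->r_end, and RACK_USE_END_OR_THACK moves it
                      * to whichever of th_ack and rsm->r_end is larger.
                      */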
8350                 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
8351                         /*
8352                          * We moved beyond this guy's range, re-calculate
8353                          * the new end point.
8354                          */
8355                         if (rack->rc_gp_filled == 0) {
8356                                 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
8357                         } else {
8358                                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
8359                         }
8360                 }
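                     /*
                      * For example, if the new gput_seq has moved past the
                      * old gput_ack the measurement window has collapsed;
                      * with no goodput estimate filled in yet we re-arm
                      * gput_ack to the larger of the initial window and
                      * MIN_GP_WIN segments, otherwise
                      * rack_get_measure_window() supplies a fresh target.
                      */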
8361                 /*
8362                  * We are moving the goal post, we may be able to clear the
8363                  * measure_saw_probe_rtt flag.
8364                  */
8365                 if ((rack->in_probe_rtt == 0) &&
8366                     (rack->measure_saw_probe_rtt) &&
8367                     (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
8368                         rack->measure_saw_probe_rtt = 0;
8369                 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
8370                                            seq, tp->gput_seq, 0, 5, line, NULL, 0);
8371                 if (rack->rc_gp_filled &&
8372                     ((tp->gput_ack - tp->gput_seq) <
8373                      max(rc_init_window(rack), (MIN_GP_WIN *
8374                                                 ctf_fixed_maxseg(tp))))) {
8375                         uint32_t ideal_amount;
8376
8377                         ideal_amount = rack_get_measure_window(tp, rack);
8378                         if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8379                                 /*
8380                                  * There is no sense in continuing this measurement
8381                                  * because it's too small to gain us anything we
8382                                  * trust. Skip it and that way we can start a new
8383                                  * measurement quicker.
8384                                  */
8385                                 tp->t_flags &= ~TF_GPUTINPROG;
8386                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
8387                                                            0, 0, 0, 6, __LINE__, NULL, 0);
8388                         } else {
8389                                 /*
8390                                  * Reset the window further out.
8391                                  */
8392                                 tp->gput_ack = tp->gput_seq + ideal_amount;
8393                         }
8394                 }
8395         }
8396 }
8397
8398 static inline int
8399 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
8400 {
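             /*
              * The rsm overlaps the declared TLP block when neither
              * check below fires, e.g.:
              *   last tlp acked   |----------|
              *   rsm                  |---|
              */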
8401         if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
8402                 /* Behind our TLP definition or right at */
8403                 return (0);
8404         }
8405         if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
8406                 /* The start is beyond or right at our end of TLP definition */
8407                 return (0);
8408         }
8409         /* It has to be a sub-part of the original TLP recorded */
8410         return (1);
8411 }
8412
8414 static uint32_t
8415 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
8416                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
8417 {
8418         uint32_t start, end, changed = 0;
8419         struct rack_sendmap stack_map;
8420         struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
8421         int32_t used_ref = 1;
8422         int moved = 0;
8423
8424         start = sack->start;
8425         end = sack->end;
8426         rsm = *prsm;
8427         memset(&fe, 0, sizeof(fe));
8428 do_rest_ofb:
8429         if ((rsm == NULL) ||
8430             (SEQ_LT(end, rsm->r_start)) ||
8431             (SEQ_GEQ(start, rsm->r_end)) ||
8432             (SEQ_LT(start, rsm->r_start))) {
8433                 /*
8434                  * We are not in the right spot,
8435                  * find the correct spot in the tree.
8436                  */
8437                 used_ref = 0;
8438                 fe.r_start = start;
8439                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
8440                 moved++;
8441         }
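             /*
              * For example, if the cached *prsm covered [1000, 2000) but
              * this sack block is [3000, 4000), SEQ_GEQ(start, rsm->r_end)
              * is true above and we seed fe.r_start with 3000 so that
              * RB_FIND() can locate the map entry holding that part of
              * the sequence space.
              */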
8442         if (rsm == NULL) {
8443                 /* TSNH */
8444                 goto out;
8445         }
8446         /* Ok we have an ACK for some piece of this rsm */
8447         if (rsm->r_start != start) {
8448                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8449                         /*
8450                          * Before any splitting or hookery is
8451                          * done, is it a TLP of interest, i.e. a rxt?
8452                          */
8453                         if ((rsm->r_flags & RACK_TLP) &&
8454                             (rsm->r_rtr_cnt > 1)) {
8455                                 /*
8456                                  * We are splitting a rxt TLP, check
8457                                  * if we need to save off the start/end
8458                                  */
8459                                 if (rack->rc_last_tlp_acked_set &&
8460                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8461                                         /*
8462                                          * We already turned this on, since we are inside
8463                                          * the previous one: it was partially sacked and
8464                                          * now we are getting another sack (maybe all of it).
8465                                          *
8466                                          */
8467                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8468                                         /*
8469                                          * Let's make sure we have all of it though.
8470                                          */
8471                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8472                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8473                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8474                                                                      rack->r_ctl.last_tlp_acked_end);
8475                                         }
8476                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8477                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8478                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8479                                                                      rack->r_ctl.last_tlp_acked_end);
8480                                         }
8481                                 } else {
8482                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8483                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8484                                         rack->rc_last_tlp_past_cumack = 0;
8485                                         rack->rc_last_tlp_acked_set = 1;
8486                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8487                                 }
8488                         }
8489                         /**
8490                          * Need to split this in two pieces the before and after,
8491                          * the before remains in the map, the after must be
8492                          * added. In other words we have:
8493                          * rsm        |--------------|
8494                          * sackblk        |------->
8495                          * rsm will become
8496                          *     rsm    |---|
8497                          * and nrsm will be  the sacked piece
8498                          *     nrsm       |----------|
8499                          *
8500                          * But before we start down that path let's
8501                          * see if the sack spans over on top of
8502                          * the next guy and it is already sacked.
8503                          *
8504                          */
8505                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8506                         if (next && (next->r_flags & RACK_ACKED) &&
8507                             SEQ_GEQ(end, next->r_start)) {
8508                                 /**
8509                                  * So the next one is already acked, and
8510                                  * we can thus by hookery use our stack_map
8511                                  * to reflect the piece being sacked and
8512                                  * then adjust the two tree entries moving
8513                                  * the start and ends around. So we start like:
8514                                  *  rsm     |------------|             (not-acked)
8515                                  *  next                 |-----------| (acked)
8516                                  *  sackblk        |-------->
8517                                  *  We want to end like so:
8518                                  *  rsm     |------|                   (not-acked)
8519                                  *  next           |-----------------| (acked)
8520                                  *  nrsm           |-----|
8521                                  * Where nrsm is a temporary stack piece we
8522                                  * use to update all the gizmos.
8523                                  */
8524                                 /* Copy up our fudge block */
8525                                 nrsm = &stack_map;
8526                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8527                                 /* Now adjust our tree blocks */
8528                                 rsm->r_end = start;
8529                                 next->r_start = start;
8530                                 /* Now we must adjust back where next->m is */
8531                                 rack_setup_offset_for_rsm(rsm, next);
8532
8533                                 /* We don't need to adjust rsm, it did not change */
8534                                 /* Clear out the dup ack count of the remainder */
8535                                 rsm->r_dupack = 0;
8536                                 rsm->r_just_ret = 0;
8537                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8538                                 /* Now lets make sure our fudge block is right */
8539                                 nrsm->r_start = start;
8540                                 /* Now lets update all the stats and such */
8541                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8542                                 if (rack->app_limited_needs_set)
8543                                         rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8544                                 changed += (nrsm->r_end - nrsm->r_start);
8545                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8546                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8547                                         counter_u64_add(rack_reorder_seen, 1);
8548                                         rack->r_ctl.rc_reorder_ts = cts;
8549                                 }
8550                                 /*
8551                                  * Now we want to go up from rsm (the
8552                                  * one left un-acked) to the next one
8553                                  * in the tmap. We do this so when
8554                                  * we walk backwards we include marking
8555                                  * sack-passed on rsm (The one passed in
8556                                  * is skipped since it is generally called
8557                                  * on something sacked before removing it
8558                                  * from the tmap).
8559                                  */
8560                                 if (rsm->r_in_tmap) {
8561                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
8562                                         /*
8563                                          * Now that we have the next
8564                                          * one walk backwards from there.
8565                                          */
8566                                         if (nrsm && nrsm->r_in_tmap)
8567                                                 rack_log_sack_passed(tp, rack, nrsm);
8568                                 }
8569                                 /* Now are we done? */
8570                                 if (SEQ_LT(end, next->r_end) ||
8571                                     (end == next->r_end)) {
8572                                         /* Done with block */
8573                                         goto out;
8574                                 }
8575                                 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
8576                                 counter_u64_add(rack_sack_used_next_merge, 1);
8577                                 /* Position for the next block */
8578                                 start = next->r_end;
8579                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
8580                                 if (rsm == NULL)
8581                                         goto out;
8582                         } else {
8583                                 /**
8584                                  * We can't use any hookery here, so we
8585                                  * need to split the map. We enter like
8586                                  * so:
8587                                  *  rsm      |--------|
8588                                  *  sackblk       |----->
8589                                  * We will add the new block nrsm and
8590                                  * that will be the new portion, and then
8591                                  * fall through after resetting rsm. So we
8592                                  * split and look like this:
8593                                  *  rsm      |----|
8594                                  *  sackblk       |----->
8595                                  *  nrsm          |---|
8596                                  * We then fall through resetting
8597                                  * rsm to nrsm, so the next block
8598                                  * picks it up.
8599                                  */
8600                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8601                                 if (nrsm == NULL) {
8602                                         /*
8603                                          * failed XXXrrs what can we do but lose the sack
8604                                          * info?
8605                                          */
8606                                         goto out;
8607                                 }
8608                                 counter_u64_add(rack_sack_splits, 1);
8609                                 rack_clone_rsm(rack, nrsm, rsm, start);
8610                                 rsm->r_just_ret = 0;
8611                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8612 #ifdef INVARIANTS
8613                                 if (insret != NULL) {
8614                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8615                                               nrsm, insret, rack, rsm);
8616                                 }
8617 #endif
8618                                 if (rsm->r_in_tmap) {
8619                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8620                                         nrsm->r_in_tmap = 1;
8621                                 }
8622                                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
8623                                 rsm->r_flags &= (~RACK_HAS_FIN);
8624                                 /* Position us to point to the new nrsm that starts the sack blk */
8625                                 rsm = nrsm;
8626                         }
8627                 } else {
8628                         /* Already sacked this piece */
8629                         counter_u64_add(rack_sack_skipped_acked, 1);
8630                         moved++;
8631                         if (end == rsm->r_end) {
8632                                 /* Done with block */
8633                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8634                                 goto out;
8635                         } else if (SEQ_LT(end, rsm->r_end)) {
8636                                 /* A partial sack to an already sacked block */
8637                                 moved++;
8638                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8639                                 goto out;
8640                         } else {
8641                                 /*
8642                                  * The end goes beyond this guy;
8643                                  * reposition the start to the
8644                                  * next block.
8645                                  */
8646                                 start = rsm->r_end;
8647                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8648                                 if (rsm == NULL)
8649                                         goto out;
8650                         }
8651                 }
8652         }
8653         if (SEQ_GEQ(end, rsm->r_end)) {
8654                 /**
8655                  * The end of this block is either beyond this guy or right
8656                  * at this guy. I.e.:
8657                  *  rsm ---                 |-----|
8658                  *  end                     |-----|
8659                  *  <or>
8660                  *  end                     |---------|
8661                  */
8662                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8663                         /*
8664                          * Is it a TLP of interest?
8665                          */
8666                         if ((rsm->r_flags & RACK_TLP) &&
8667                             (rsm->r_rtr_cnt > 1)) {
8668                                 /*
8669                                  * We are splitting a rxt TLP, check
8670                                  * if we need to save off the start/end
8671                                  */
8672                                 if (rack->rc_last_tlp_acked_set &&
8673                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8674                                         /*
8675                                          * We already turned this on, since we are inside
8676                                          * the previous one: it was partially sacked and
8677                                          * now we are getting another sack (maybe all of it).
8678                                          */
8679                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8680                                         /*
8681                                          * Let's make sure we have all of it though.
8682                                          */
8683                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8684                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8685                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8686                                                                      rack->r_ctl.last_tlp_acked_end);
8687                                         }
8688                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8689                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8690                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8691                                                                      rack->r_ctl.last_tlp_acked_end);
8692                                         }
8693                                 } else {
8694                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8695                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8696                                         rack->rc_last_tlp_past_cumack = 0;
8697                                         rack->rc_last_tlp_acked_set = 1;
8698                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8699                                 }
8700                         }
8701                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8702                         changed += (rsm->r_end - rsm->r_start);
8703                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8704                         if (rsm->r_in_tmap) /* should be true */
8705                                 rack_log_sack_passed(tp, rack, rsm);
8706                         /* Is reordering occurring? */
8707                         if (rsm->r_flags & RACK_SACK_PASSED) {
8708                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8709                                 counter_u64_add(rack_reorder_seen, 1);
8710                                 rack->r_ctl.rc_reorder_ts = cts;
8711                         }
8712                         if (rack->app_limited_needs_set)
8713                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8714                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8715                         rsm->r_flags |= RACK_ACKED;
8716                         if (rsm->r_in_tmap) {
8717                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8718                                 rsm->r_in_tmap = 0;
8719                         }
8720                         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
8721                 } else {
8722                         counter_u64_add(rack_sack_skipped_acked, 1);
8723                         moved++;
8724                 }
8725                 if (end == rsm->r_end) {
8726                         /* This block only - done, setup for next */
8727                         goto out;
8728                 }
8729                 /*
8730                  * There is more not covered by this rsm; move on
8731                  * to the next block in the RB tree.
8732                  */
8733                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8734                 start = rsm->r_end;
8735                 rsm = nrsm;
8736                 if (rsm == NULL)
8737                         goto out;
8738                 goto do_rest_ofb;
8739         }
8740         /**
8741          * The end of this sack block is smaller than
8742          * our rsm i.e.:
8743          *  rsm ---                 |-----|
8744          *  end                     |--|
8745          */
8746         if ((rsm->r_flags & RACK_ACKED) == 0) {
8747                 /*
8748                  * Is it a TLP of interest?
8749                  */
8750                 if ((rsm->r_flags & RACK_TLP) &&
8751                     (rsm->r_rtr_cnt > 1)) {
8752                         /*
8753                          * We are splitting a rxt TLP, check
8754                          * if we need to save off the start/end
8755                          */
8756                         if (rack->rc_last_tlp_acked_set &&
8757                             (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8758                                 /*
8759                                  * We already turned this on, since we are inside
8760                                  * the previous one: it was partially sacked and
8761                                  * now we are getting another sack (maybe all of it).
8762                                  */
8763                                 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8764                                 /*
8765                                  * Let's make sure we have all of it though.
8766                                  */
8767                                 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8768                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8769                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8770                                                              rack->r_ctl.last_tlp_acked_end);
8771                                 }
8772                                 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8773                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8774                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8775                                                              rack->r_ctl.last_tlp_acked_end);
8776                                 }
8777                         } else {
8778                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8779                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8780                                 rack->rc_last_tlp_past_cumack = 0;
8781                                 rack->rc_last_tlp_acked_set = 1;
8782                                 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8783                         }
8784                 }
8785                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8786                 if (prev &&
8787                     (prev->r_flags & RACK_ACKED)) {
8788                         /**
8789                          * Goal, we want the right remainder of rsm to shrink
8790                          * in place and span from (rsm->r_start = end) to rsm->r_end.
8791                          * We want to expand prev to go all the way
8792                          * to prev->r_end <- end.
8793                          * so in the tree we have before:
8794                          *   prev     |--------|         (acked)
8795                          *   rsm               |-------| (non-acked)
8796                          *   sackblk           |-|
8797                          * We churn it so we end up with
8798                          *   prev     |----------|       (acked)
8799                          *   rsm                 |-----| (non-acked)
8800                          *   nrsm              |-| (temporary)
8801                          *
8802                          * Note if either prev/rsm is a TLP we don't
8803                          * do this.
8804                          */
8805                         nrsm = &stack_map;
8806                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8807                         prev->r_end = end;
8808                         rsm->r_start = end;
8809                         /* Now adjust nrsm (the stack copy) to be
8810                          * the small piece that was
8811                          * "sacked".
8812                          */
8813                         nrsm->r_end = end;
8814                         rsm->r_dupack = 0;
8815                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8816                         /*
8817                          * Now that the rsm has had its start moved forward
8818                          * let's go ahead and get its new place in the world.
8819                          */
8820                         rack_setup_offset_for_rsm(prev, rsm);
8821                         /*
8822                          * Now nrsm is our new little piece
8823                          * that is acked (which was merged
8824                          * to prev). Update the rtt and changed
8825                          * based on that. Also check for reordering.
8826                          */
8827                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8828                         if (rack->app_limited_needs_set)
8829                                 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8830                         changed += (nrsm->r_end - nrsm->r_start);
8831                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8832                         if (nrsm->r_flags & RACK_SACK_PASSED) {
8833                                 counter_u64_add(rack_reorder_seen, 1);
8834                                 rack->r_ctl.rc_reorder_ts = cts;
8835                         }
8836                         rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
8837                         rsm = prev;
8838                         counter_u64_add(rack_sack_used_prev_merge, 1);
8839                 } else {
8840                         /**
8841                          * This is the case where our previous
8842                          * block is not acked either, so we must
8843                          * split the block in two.
8844                          */
8845                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8846                         if (nrsm == NULL) {
8847                                 /* failed rrs what can we do but lose the sack info? */
8848                                 goto out;
8849                         }
8850                         if ((rsm->r_flags & RACK_TLP) &&
8851                             (rsm->r_rtr_cnt > 1)) {
8852                                 /*
8853                                  * We are splitting a rxt TLP, check
8854                                  * if we need to save off the start/end
8855                                  */
8856                                 if (rack->rc_last_tlp_acked_set &&
8857                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8858                                             /*
8859                                              * We already turned this on, since this block is inside
8860                                              * the previous one: it was partially sacked and
8861                                              * now we are getting another sack (maybe all of it).
8862                                              */
8863                                             rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8864                                             /*
8865                                              * Let's make sure we have all of it though.
8866                                              */
8867                                             if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8868                                                     rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8869                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8870                                                                          rack->r_ctl.last_tlp_acked_end);
8871                                             }
8872                                             if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8873                                                     rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8874                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8875                                                                          rack->r_ctl.last_tlp_acked_end);
8876                                             }
8877                                     } else {
8878                                             rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8879                                             rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8880                                             rack->rc_last_tlp_acked_set = 1;
8881                                             rack->rc_last_tlp_past_cumack = 0;
8882                                             rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8883                                     }
8884                         }
8885                         /**
8886                          * In this case nrsm becomes
8887                          * nrsm->r_start = end;
8888                          * nrsm->r_end = rsm->r_end;
8889                          * which is un-acked.
8890                          * <and>
8891                          * rsm->r_end = nrsm->r_start;
8892                          * i.e. the remaining un-acked
8893                          * piece is left on the left
8894                          * hand side.
8895                          *
8896                          * So we start like this
8897                          * rsm      |----------| (not acked)
8898                          * sackblk  |---|
8899                          * build it so we have
8900                          * rsm      |---|         (acked)
8901                          * nrsm         |------|  (not acked)
8902                          */
8903                         counter_u64_add(rack_sack_splits, 1);
8904                         rack_clone_rsm(rack, nrsm, rsm, end);
8905                         rsm->r_flags &= (~RACK_HAS_FIN);
8906                         rsm->r_just_ret = 0;
8907                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8908 #ifdef INVARIANTS
8909                         if (insret != NULL) {
8910                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8911                                       nrsm, insret, rack, rsm);
8912                         }
8913 #endif
8914                         if (rsm->r_in_tmap) {
8915                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8916                                 nrsm->r_in_tmap = 1;
8917                         }
8918                         nrsm->r_dupack = 0;
8919                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
8920                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8921                         changed += (rsm->r_end - rsm->r_start);
8922                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8923                         if (rsm->r_in_tmap) /* should be true */
8924                                 rack_log_sack_passed(tp, rack, rsm);
8925                         /* Is reordering occurring? */
8926                         if (rsm->r_flags & RACK_SACK_PASSED) {
8927                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8928                                 counter_u64_add(rack_reorder_seen, 1);
8929                                 rack->r_ctl.rc_reorder_ts = cts;
8930                         }
8931                         if (rack->app_limited_needs_set)
8932                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8933                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8934                         rsm->r_flags |= RACK_ACKED;
8935                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
8936                         if (rsm->r_in_tmap) {
8937                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8938                                 rsm->r_in_tmap = 0;
8939                         }
8940                 }
8941         } else if (start != end) {
8942                 /*
8943                  * The block was already acked.
8944                  */
8945                 counter_u64_add(rack_sack_skipped_acked, 1);
8946                 moved++;
8947         }
8948 out:
8949         if (rsm &&
8950             ((rsm->r_flags & RACK_TLP) == 0) &&
8951             (rsm->r_flags & RACK_ACKED)) {
8952                 /*
8953                  * Now can we merge where we worked
8954                  * with either the previous or
8955                  * next block?
8956                  */
8957                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8958                 while (next) {
8959                         if (next->r_flags & RACK_TLP)
8960                                 break;
8961                         if (next->r_flags & RACK_ACKED) {
8962                                 /* Yep, this and next can be merged */
8963                                 rsm = rack_merge_rsm(rack, rsm, next);
8964                                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8965                         } else
8966                                 break;
8967                 }
8968                 /* Now what about the previous? */
8969                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8970                 while (prev) {
8971                         if (prev->r_flags & RACK_TLP)
8972                                 break;
8973                         if (prev->r_flags & RACK_ACKED) {
8974                                 /* yep the previous and this can be merged */
8975                                 rsm = rack_merge_rsm(rack, prev, rsm);
8976                                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8977                         } else
8978                                 break;
8979                 }
8980         }
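             /*
              * For example, if the tree held [100,200) acked, [200,300)
              * just sacked above, and [300,400) acked, the two loops
              * collapse all three into one [100,400) entry; blocks
              * flagged RACK_TLP are never merged, so their recorded
              * boundaries stay intact.
              */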
8981         if (used_ref == 0) {
8982                 counter_u64_add(rack_sack_proc_all, 1);
8983         } else {
8984                 counter_u64_add(rack_sack_proc_short, 1);
8985         }
8986         /* Save off the next one for quick reference. */
8987         if (rsm)
8988                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8989         else
8990                 nrsm = NULL;
8991         *prsm = rack->r_ctl.rc_sacklast = nrsm;
8992         /* Pass back the moved. */
8993         *moved_two = moved;
8994         return (changed);
8995 }
8996
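     /*
      * The peer has reneged on previously sacked data. For example, if
      * [5000, 6000) was marked RACK_ACKED and a new ack drops that sack
      * while only cum-acking below it, each such rsm is stripped of its
      * sacked state here and re-threaded, in sequence order, onto the
      * head of the send-ordered tmap so that it is again eligible for
      * retransmission.
      */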
8997 static void inline
8998 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
8999 {
9000         struct rack_sendmap *tmap;
9001
9002         tmap = NULL;
9003         while (rsm && (rsm->r_flags & RACK_ACKED)) {
9004                 /* It's no longer sacked, mark it so */
9005                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
9006 #ifdef INVARIANTS
9007                 if (rsm->r_in_tmap) {
9008                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
9009                               rack, rsm, rsm->r_flags);
9010                 }
9011 #endif
9012                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
9013                 /* Rebuild it into our tmap */
9014                 if (tmap == NULL) {
9015                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9016                         tmap = rsm;
9017                 } else {
9018                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
9019                         tmap = rsm;
9020                 }
9021                 tmap->r_in_tmap = 1;
9022                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
9023         }
9024         /*
9025          * Now let's possibly clear the sack filter so we start
9026          * recognizing sacks that cover this area.
9027          */
9028         sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
9029
9030 }
9031
9032 static void
9033 rack_do_decay(struct tcp_rack *rack)
9034 {
9035         struct timeval res;
9036
9037 #define timersub(tvp, uvp, vvp)                                         \
9038         do {                                                            \
9039                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
9040                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
9041                 if ((vvp)->tv_usec < 0) {                               \
9042                         (vvp)->tv_sec--;                                \
9043                         (vvp)->tv_usec += 1000000;                      \
9044                 }                                                       \
9045         } while (0)
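             /*
              * For example, 5.100000 - 3.900000 first yields tv_sec = 2
              * and tv_usec = -800000; the borrow in the macro's if ()
              * corrects that to 1.200000 (1 second, 200000 microseconds).
              */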
9046
9047         timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
9048 #undef timersub
9049
9050         rack->r_ctl.input_pkt++;
9051         if ((rack->rc_in_persist) ||
9052             (res.tv_sec >= 1) ||
9053             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
9054                 /*
9055                  * Check for decay of the SAD metrics;
9056                  * we want all SAD detection metrics to
9057                  * decay by 1/4 per second (or more) passed.
9058                  */
9059                 uint32_t pkt_delta;
9060
9061                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
9062                 /* Update our saved tracking values */
9063                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
9064                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
9065                 /* Now do we escape without decay? */
9066 #ifdef NETFLIX_EXP_DETECTION
9067                 if (rack->rc_in_persist ||
9068                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
9069                     (pkt_delta < tcp_sad_low_pps)) {
9070                         /*
9071                          * We don't decay idle connections
9072                          * or ones that have a low input pps.
9073                          */
9074                         return;
9075                 }
9076                 /* Decay the counters */
9077                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
9078                                                         tcp_sad_decay_val);
9079                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
9080                                                          tcp_sad_decay_val);
9081                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
9082                                                                tcp_sad_decay_val);
9083                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
9084                                                                 tcp_sad_decay_val);
9085 #endif
9086         }
9087 }
9088
9089 static void
9090 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
9091 {
9092         struct rack_sendmap *rsm, *rm;
9093
9094         /*
9095          * The ACK point is advancing to th_ack, we must drop off
9096          * the packets in the rack log and calculate any eligible
9097          * RTT's.
9098          */
9099         rack->r_wanted_output = 1;
9100
9101         /* Tend any TLP that has been marked for 1/2 the seq space (it's old) */
9102         if ((rack->rc_last_tlp_acked_set == 1) &&
9103             (rack->rc_last_tlp_past_cumack == 1) &&
9104             (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
9105                 /*
9106                  * We have reached the point where our last rack
9107                  * tlp retransmit sequence is ahead of the cum-ack.
9108                  * This can only happen when the cum-ack moves all
9109                  * the way around (it's been a full 2^31+1 bytes
9110                  * or more since we sent a retransmitted TLP). Let's
9111                  * turn off the valid flag since it's not really valid.
9112                  *
9113                  * Note since SACKs also turn on this event we have
9114                  * a complication: we have to wait to age it out until
9115                  * the cum-ack is past the TLP before checking, which is
9116                  * what the next else clause does.
9117                  */
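                /*
                 * Recall SEQ_GT() compares sequence numbers mod 2^32,
                 * so e.g. SEQ_GT(0x10, 0xfffffff0) is true; that is how
                 * the old TLP mark can end up "ahead" of the cum-ack.
                 */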
9118                 rack_log_dsack_event(rack, 9, __LINE__,
9119                                      rack->r_ctl.last_tlp_acked_start,
9120                                      rack->r_ctl.last_tlp_acked_end);
9121                 rack->rc_last_tlp_acked_set = 0;
9122                 rack->rc_last_tlp_past_cumack = 0;
9123         } else if ((rack->rc_last_tlp_acked_set == 1) &&
9124                    (rack->rc_last_tlp_past_cumack == 0) &&
9125                    (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
9126                 /*
9127                  * It is safe to start aging TLP's out.
9128                  */
9129                 rack->rc_last_tlp_past_cumack = 1;
9130         }
9131         /* We do the same for the tlp send seq as well */
9132         if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
9133             (rack->rc_last_sent_tlp_past_cumack == 1) &&
9134             (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) {
9135                 rack_log_dsack_event(rack, 9, __LINE__,
9136                                      rack->r_ctl.last_sent_tlp_seq,
9137                                      (rack->r_ctl.last_sent_tlp_seq +
9138                                       rack->r_ctl.last_sent_tlp_len));
9139                 rack->rc_last_sent_tlp_seq_valid = 0;
9140                 rack->rc_last_sent_tlp_past_cumack = 0;
9141         } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
9142                    (rack->rc_last_sent_tlp_past_cumack == 0) &&
9143                    (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
9144                 /*
9145                  * It is safe to start aging the TLP send sequence.
9146                  */
9147                 rack->rc_last_sent_tlp_past_cumack = 1;
9148         }
9149 more:
9150         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9151         if (rsm == NULL) {
9152                 if ((th_ack - 1) == tp->iss) {
9153                         /*
9154                          * For the SYN incoming case we will not
9155                          * have called tcp_output for the sending of
9156                          * the SYN, so there will be no map. All
9157                          * other cases should probably be a panic.
9158                          */
9159                         return;
9160                 }
9161                 if (tp->t_flags & TF_SENTFIN) {
9162                         /* if we sent a FIN we often will not have a map */
9163                         return;
9164                 }
9165 #ifdef INVARIANTS
9166                 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
9167                       tp,
9168                       tp->t_state, th_ack, rack,
9169                       tp->snd_una, tp->snd_max, tp->snd_nxt);
9170 #endif
9171                 return;
9172         }
9173         if (SEQ_LT(th_ack, rsm->r_start)) {
9174                 /* Huh? The map is missing this */
9175 #ifdef INVARIANTS
9176                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
9177                        rsm->r_start,
9178                        th_ack, tp->t_state, rack->r_state);
9179 #endif
9180                 return;
9181         }
9182         rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
9183
9184         /* Now was it a retransmitted TLP? */
9185         if ((rsm->r_flags & RACK_TLP) &&
9186             (rsm->r_rtr_cnt > 1)) {
9187                 /*
9188                  * Yes, this rsm was a TLP and retransmitted, remember that
9189                  * since if a DSACK comes back on this we don't want
9190                  * to think of it as a reordered segment. This may
9191                  * get updated again with possibly even other TLPs
9192                  * in flight, but that's ok. Only when we don't send
9193                  * a retransmitted TLP for 1/2 the sequence space
9194                  * will it get turned off (above).
9195                  */
9196                 if (rack->rc_last_tlp_acked_set &&
9197                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9198                         /*
9199                          * We already turned this on since the end matches,
9200                          * the previous one was a partial ack; now we
9201                          * are getting another one (maybe all of it).
9202                          */
9203                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9204                         /*
9205                          * Let's make sure we have all of it though.
9206                          */
9207                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9208                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9209                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9210                                                      rack->r_ctl.last_tlp_acked_end);
9211                         }
9212                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9213                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9214                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9215                                                      rack->r_ctl.last_tlp_acked_end);
9216                         }
9217                 } else {
9218                         rack->rc_last_tlp_past_cumack = 1;
9219                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9220                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9221                         rack->rc_last_tlp_acked_set = 1;
9222                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9223                 }
9224         }
9225         /* Now do we consume the whole thing? */
9226         if (SEQ_GEQ(th_ack, rsm->r_end)) {
9227                 /* It's all consumed. */
9228                 uint32_t left;
9229                 uint8_t newly_acked;
9230
9231                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
9232                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
9233                 rsm->r_rtr_bytes = 0;
9234                 /* Record the time of highest cumack sent */
9235                 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9236                 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
9237 #ifdef INVARIANTS
9238                 if (rm != rsm) {
9239                         panic("removing head in rack:%p rsm:%p rm:%p",
9240                               rack, rsm, rm);
9241                 }
9242 #endif
9243                 if (rsm->r_in_tmap) {
9244                         TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9245                         rsm->r_in_tmap = 0;
9246                 }
9247                 newly_acked = 1;
9248                 if (rsm->r_flags & RACK_ACKED) {
9249                         /*
9250                          * It was acked on the scoreboard -- remove
9251                          * it from total
9252                          */
9253                         rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
9254                         newly_acked = 0;
9255                 } else if (rsm->r_flags & RACK_SACK_PASSED) {
9256                         /*
9257                          * There are segments ACKED on the
9258                          * scoreboard further up. We are seeing
9259                          * reordering.
9260                          */
9261                         rsm->r_flags &= ~RACK_SACK_PASSED;
9262                         counter_u64_add(rack_reorder_seen, 1);
9263                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9264                         rsm->r_flags |= RACK_ACKED;
9265                         rack->r_ctl.rc_reorder_ts = cts;
9266                         if (rack->r_ent_rec_ns) {
9267                                 /*
9268                                  * We have sent no more, and we saw a sack
9269                                  * then an ack arrive.
9270                                  */
9271                                 rack->r_might_revert = 1;
9272                         }
9273                 }
9274                 if ((rsm->r_flags & RACK_TO_REXT) &&
9275                     (tp->t_flags & TF_RCVD_TSTMP) &&
9276                     (to->to_flags & TOF_TS) &&
9277                     (to->to_tsecr != 0) &&
9278                     (tp->t_flags & TF_PREVVALID)) {
9279                         /*
9280                          * We can use the timestamp to see
9281                          * if this retransmission was from the
9282                          * first transmit. If so we made a mistake.
9283                          */
9284                         tp->t_flags &= ~TF_PREVVALID;
9285                         if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
9286                                 /* The first transmit is what this ack is for */
9287                                 rack_cong_signal(tp, CC_RTO_ERR, th_ack);
9288                         }
9289                 }
9290                 left = th_ack - rsm->r_end;
9291                 if (rack->app_limited_needs_set && newly_acked)
9292                         rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
9293                 /* Free back to zone */
9294                 rack_free(rack, rsm);
9295                 if (left) {
9296                         goto more;
9297                 }
9298                 /* Check for reneging */
9299                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9300                 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
9301                         /*
9302                          * The peer has moved snd_una up to
9303                          * the edge of this send, i.e. one
9304                          * that it had previously acked. The only
9305                          * way that can be true is if the peer threw
9306                          * away data (space issues) that it had
9307                          * previously sacked (else it would have
9308                          * given us snd_una up to rsm->r_end).
9309                          * We need to undo the acked markings here.
9310                          *
9311                          * Note we have to look to make sure th_ack is
9312                          * our rsm->r_start in case we get an old ack
9313                          * where th_ack is behind snd_una.
9314                          */
9315                         rack_peer_reneges(rack, rsm, th_ack);
9316                 }
9317                 return;
9318         }
9319         if (rsm->r_flags & RACK_ACKED) {
9320                 /*
9321                  * It was acked on the scoreboard -- remove it from
9322                  * total for the part being cum-acked.
9323                  */
9324                 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
9325         }
9326         /*
9327          * Clear the dup ack count for
9328          * the piece that remains.
9329          */
9330         rsm->r_dupack = 0;
9331         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9332         if (rsm->r_rtr_bytes) {
9333                 /*
9334                  * It was retransmitted; adjust the
9335                  * sack holes for what was acked.
9336                  */
9337                 int ack_am;
9338
9339                 ack_am = (th_ack - rsm->r_start);
9340                 if (ack_am >= rsm->r_rtr_bytes) {
9341                         rack->r_ctl.rc_holes_rxt -= ack_am;
9342                         rsm->r_rtr_bytes -= ack_am;
9343                 }
9344         }
9345         /*
9346          * Update where the piece starts and record
9347          * the time of send of highest cumack sent.
9348          */
9349         rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9350         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
9351         /* Now we need to move our offset forward too */
9352         if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
9353                 /* Fix up the orig_m_len and possibly the mbuf offset */
9354                 rack_adjust_orig_mlen(rsm);
9355         }
9356         rsm->soff += (th_ack - rsm->r_start);
9357         rsm->r_start = th_ack;
9358         /* Now do we need to move the mbuf fwd too? */
9359         if (rsm->m) {
9360                 while (rsm->soff >= rsm->m->m_len) {
9361                         rsm->soff -= rsm->m->m_len;
9362                         rsm->m = rsm->m->m_next;
9363                         KASSERT((rsm->m != NULL),
9364                                 (" nrsm:%p hit at soff:%u null m",
9365                                  rsm, rsm->soff));
9366                 }
9367                 rsm->orig_m_len = rsm->m->m_len;
9368         }
9369         if (rack->app_limited_needs_set)
9370                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
9371 }
9372
9373 static void
9374 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
9375 {
9376         struct rack_sendmap *rsm;
9377         int sack_pass_fnd = 0;
9378
9379         if (rack->r_might_revert) {
9380                 /*
9381                  * Ok we have reordering, have not sent anything, we
9382                  * might want to revert the congestion state if nothing
9383                  * further has SACK_PASSED on it. Let's check.
9384                  *
9385                  * We also get here when we have DSACKs come in for
9386                  * all the data that we FR'd. Note that a rxt or tlp
9387                  * timer prevents this from happening.
9388                  */
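                /*
                 * If no rsm below still carries RACK_SACK_PASSED, the
                 * revert path restores the cwnd/ssthresh snapshots taken
                 * at recovery entry and exits recovery.
                 */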
9389
9390                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
9391                         if (rsm->r_flags & RACK_SACK_PASSED) {
9392                                 sack_pass_fnd = 1;
9393                                 break;
9394                         }
9395                 }
9396                 if (sack_pass_fnd == 0) {
9397                         /*
9398                          * We went into recovery
9399                          * incorrectly due to reordering!
9400                          */
9401                         int orig_cwnd;
9402
9403                         rack->r_ent_rec_ns = 0;
9404                         orig_cwnd = tp->snd_cwnd;
9405                         tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec;
9406                         tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
9407                         tp->snd_recover = tp->snd_una;
9408                         rack_log_to_prr(rack, 14, orig_cwnd);
9409                         EXIT_RECOVERY(tp->t_flags);
9410                 }
9411                 rack->r_might_revert = 0;
9412         }
9413 }
9414
9415 #ifdef NETFLIX_EXP_DETECTION
9416 static void
9417 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz)
9418 {
9419         if ((rack->do_detection || tcp_force_detection) &&
9420             tcp_sack_to_ack_thresh &&
9421             tcp_sack_to_move_thresh &&
9422             ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
9423                 /*
9424                  * We have thresholds set to find
9425                  * possible attackers and disable sack.
9426                  * Check them.
9427                  */
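                /*
                 * Both ratios below are scaled by 1000 (per-mille):
                 * e.g. 5000 sacks against 1000 acks gives an ackratio
                 * of 5000, and 300 "extra" moves out of 1000 total
                 * moves gives a moveratio of 300.
                 */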
9428                 uint64_t ackratio, moveratio, movetotal;
9429
9430                 /* Log detecting */
9431                 rack_log_sad(rack, 1);
9432                 ackratio = (uint64_t)(rack->r_ctl.sack_count);
9433                 ackratio *= (uint64_t)(1000);
9434                 if (rack->r_ctl.ack_count)
9435                         ackratio /= (uint64_t)(rack->r_ctl.ack_count);
9436                 else {
9437                         /* We really should not hit here */
9438                         ackratio = 1000;
9439                 }
9440                 if ((rack->sack_attack_disable == 0) &&
9441                     (ackratio > rack_highest_sack_thresh_seen))
9442                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
9443                 movetotal = rack->r_ctl.sack_moved_extra;
9444                 movetotal += rack->r_ctl.sack_noextra_move;
9445                 moveratio = rack->r_ctl.sack_moved_extra;
9446                 moveratio *= (uint64_t)1000;
9447                 if (movetotal)
9448                         moveratio /= movetotal;
9449                 else {
9450                         /* No moves, that's pretty good */
9451                         moveratio = 0;
9452                 }
9453                 if ((rack->sack_attack_disable == 0) &&
9454                     (moveratio > rack_highest_move_thresh_seen))
9455                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
9456                 if (rack->sack_attack_disable == 0) {
9457                         if ((ackratio > tcp_sack_to_ack_thresh) &&
9458                             (moveratio > tcp_sack_to_move_thresh)) {
9459                                 /* Disable sack processing */
9460                                 rack->sack_attack_disable = 1;
9461                                 if (rack->r_rep_attack == 0) {
9462                                         rack->r_rep_attack = 1;
9463                                         counter_u64_add(rack_sack_attacks_detected, 1);
9464                                 }
9465                                 if (tcp_attack_on_turns_on_logging) {
9466                                         /*
9467                                          * Turn on logging, used for debugging
9468                                          * false positives.
9469                                          */
9470                                         rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
9471                                 }
9472                                 /* Clamp the cwnd at flight size */
9473                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
9474                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9475                                 rack_log_sad(rack, 2);
9476                         }
9477                 } else {
9478                         /* We are sack-disabled; check for false positives */
9479                         if ((ackratio <= tcp_restoral_thresh) ||
9480                             (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
9481                                 rack->sack_attack_disable = 0;
9482                                 rack_log_sad(rack, 3);
9483                                 /* Restart counting */
9484                                 rack->r_ctl.sack_count = 0;
9485                                 rack->r_ctl.sack_moved_extra = 0;
9486                                 rack->r_ctl.sack_noextra_move = 1;
9487                                 rack->r_ctl.ack_count = max(1,
9488                                       (bytes_this_ack / segsiz));
9489
9490                                 if (rack->r_rep_reverse == 0) {
9491                                         rack->r_rep_reverse = 1;
9492                                         counter_u64_add(rack_sack_attacks_reversed, 1);
9493                                 }
9494                                 /* Restore the cwnd */
9495                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
9496                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
9497                         }
9498                 }
9499         }
9500 }
9501 #endif
9502
9503 static int
9504 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
9505 {
9506
9507         uint32_t am, l_end;
9508         int was_tlp = 0;
9509
9510         if (SEQ_GT(end, start))
9511                 am = end - start;
9512         else
9513                 am = 0;
9514         if ((rack->rc_last_tlp_acked_set) &&
9515             (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
9516             (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
9517                 /*
9518                  * The DSACK is because of a TLP, so we don't
9519                  * adjust the reordering window here, since
9520                  * it was not reordering that caused the DSACK but
9521                  * our previous TLP retransmit.
9522                  */
9523                 rack_log_dsack_event(rack, 7, __LINE__, start, end);
9524                 was_tlp = 1;
9525                 goto skip_dsack_round;
9526         }
9527         if (rack->rc_last_sent_tlp_seq_valid) {
9528                 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
9529                 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
9530                     (SEQ_LEQ(end, l_end))) {
9531                         /*
9532                          * This dsack is from the last sent TLP, ignore it
9533                          * for reordering purposes.
9534                          */
9535                         rack_log_dsack_event(rack, 7, __LINE__, start, end);
9536                         was_tlp = 1;
9537                         goto skip_dsack_round;
9538                 }
9539         }
9540         if (rack->rc_dsack_round_seen == 0) {
9541                 rack->rc_dsack_round_seen = 1;
9542                 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
9543                 rack->r_ctl.num_dsack++;
9544                 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
9545                 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
9546         }
9547 skip_dsack_round:
9548         /*
9549          * We keep track of how many DSACK blocks we get
9550          * after a recovery incident.
9551          */
9552         rack->r_ctl.dsack_byte_cnt += am;
9553         if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
9554             rack->r_ctl.retran_during_recovery &&
9555             (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
9556                 /*
9557                  * False recovery; the most likely culprit is reordering. If
9558                  * nothing else is missing we need to revert.
9559                  */
9560                 rack->r_might_revert = 1;
9561                 rack_handle_might_revert(rack->rc_tp, rack);
9562                 rack->r_might_revert = 0;
9563                 rack->r_ctl.retran_during_recovery = 0;
9564                 rack->r_ctl.dsack_byte_cnt = 0;
9565         }
9566         return (was_tlp);
9567 }
9568
9569 static void
9570 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
9571 {
9572         /* Deal with changed and PRR here (in recovery only) */
9573         uint32_t pipe, snd_una;
9574
9575         rack->r_ctl.rc_prr_delivered += changed;
9576
9577         if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
9578                 /*
9579                  * It is all outstanding, we are application limited
9580                  * and thus we don't need more room to send anything.
9581                  * Note we use tp->snd_una here and not th_ack because
9582                  * the data has not yet been cut from the sb.
9583                  */
9584                 rack->r_ctl.rc_prr_sndcnt = 0;
9585                 return;
9586         }
9587         /* Compute prr_sndcnt */
9588         if (SEQ_GT(tp->snd_una, th_ack)) {
9589                 snd_una = tp->snd_una;
9590         } else {
9591                 snd_una = th_ack;
9592         }
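        /*
         * pipe is the RFC 6937 estimate of data still in flight:
         * bytes outstanding, less what has been SACKed, plus the
         * retransmitted bytes that fill holes.
         */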
9593         pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
9594         if (pipe > tp->snd_ssthresh) {
9595                 long sndcnt;
9596
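                /*
                 * The PRR proportional part, roughly
                 * sndcnt = (prr_delivered * ssthresh / RecoverFS) + 1 - prr_out
                 * from RFC 6937. For illustration, with ssthresh worth
                 * 10 segments, a RecoverFS of 20 segments and half of it
                 * delivered, about 5 segments worth of bytes may go.
                 */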
9597                 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
9598                 if (rack->r_ctl.rc_prr_recovery_fs > 0)
9599                         sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
9600                 else {
9601                         rack->r_ctl.rc_prr_sndcnt = 0;
9602                         rack_log_to_prr(rack, 9, 0);
9603                         sndcnt = 0;
9604                 }
9605                 sndcnt++;
9606                 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
9607                         sndcnt -= rack->r_ctl.rc_prr_out;
9608                 else
9609                         sndcnt = 0;
9610                 rack->r_ctl.rc_prr_sndcnt = sndcnt;
9611                 rack_log_to_prr(rack, 10, 0);
9612         } else {
9613                 uint32_t limit;
9614
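                /*
                 * The PRR-SSRB style bound: allow at least what was
                 * newly delivered (or this ack's worth) plus one MSS,
                 * but never let pipe grow beyond ssthresh.
                 */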
9615                 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
9616                         limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
9617                 else
9618                         limit = 0;
9619                 if (changed > limit)
9620                         limit = changed;
9621                 limit += ctf_fixed_maxseg(tp);
9622                 if (tp->snd_ssthresh > pipe) {
9623                         rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
9624                         rack_log_to_prr(rack, 11, 0);
9625                 } else {
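                        /* Here pipe == ssthresh, so there is nothing extra to send. */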
9626                         rack->r_ctl.rc_prr_sndcnt = min(0, limit);
9627                         rack_log_to_prr(rack, 12, 0);
9628                 }
9629         }
9630 }
9631
9632 static void
9633 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
9634 {
9635         uint32_t changed;
9636         struct tcp_rack *rack;
9637         struct rack_sendmap *rsm;
9638         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
9639         register uint32_t th_ack;
9640         int32_t i, j, k, num_sack_blks = 0;
9641         uint32_t cts, acked, ack_point, sack_changed = 0;
9642         int loop_start = 0, moved_two = 0;
9643         uint32_t tsused;
9644
9645
9646         INP_WLOCK_ASSERT(tp->t_inpcb);
9647         if (th->th_flags & TH_RST) {
9648                 /* We don't log resets */
9649                 return;
9650         }
9651         rack = (struct tcp_rack *)tp->t_fb_ptr;
9652         cts = tcp_get_usecs(NULL);
9653         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9654         changed = 0;
9655         th_ack = th->th_ack;
9656         if (rack->sack_attack_disable == 0)
9657                 rack_do_decay(rack);
9658         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
9659                 /*
9660                  * You only get credit for
9661                  * MSS and greater (and you get extra
9662                  * credit for larger cum-ack moves).
9663                  */
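                /*
                 * For illustration: with a 1460 byte MSS, a cum-ack
                 * that advances 4380 bytes yields ac = 3.
                 */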
9664                 int ac;
9665
9666                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
9667                 rack->r_ctl.ack_count += ac;
9668                 counter_u64_add(rack_ack_total, ac);
9669         }
9670         if (rack->r_ctl.ack_count > 0xfff00000) {
9671                 /*
9672                  * reduce the number to keep us under
9673                  * a uint32_t.
9674                  */
9675                 rack->r_ctl.ack_count /= 2;
9676                 rack->r_ctl.sack_count /= 2;
9677         }
9678         if (SEQ_GT(th_ack, tp->snd_una)) {
9679                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
9680                 tp->t_acktime = ticks;
9681         }
9682         if (rsm && SEQ_GT(th_ack, rsm->r_start))
9683                 changed = th_ack - rsm->r_start;
9684         if (changed) {
9685                 rack_process_to_cumack(tp, rack, th_ack, cts, to);
9686         }
9687         if ((to->to_flags & TOF_SACK) == 0) {
9688                 /* We are done, nothing left and no sack. */
9689                 rack_handle_might_revert(tp, rack);
9690                 /*
9691                  * For cases where we struck a dup-ack
9692                  * with no SACK, add to the changes so
9693                  * PRR will work right.
9694                  */
9695                 if (dup_ack_struck && (changed == 0)) {
9696                         changed += ctf_fixed_maxseg(rack->rc_tp);
9697                 }
9698                 goto out;
9699         }
9700         /* Sack block processing */
9701         if (SEQ_GT(th_ack, tp->snd_una))
9702                 ack_point = th_ack;
9703         else
9704                 ack_point = tp->snd_una;
9705         for (i = 0; i < to->to_nsacks; i++) {
9706                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
9707                       &sack, sizeof(sack));
9708                 sack.start = ntohl(sack.start);
9709                 sack.end = ntohl(sack.end);
9710                 if (SEQ_GT(sack.end, sack.start) &&
9711                     SEQ_GT(sack.start, ack_point) &&
9712                     SEQ_LT(sack.start, tp->snd_max) &&
9713                     SEQ_GT(sack.end, ack_point) &&
9714                     SEQ_LEQ(sack.end, tp->snd_max)) {
9715                         sack_blocks[num_sack_blks] = sack;
9716                         num_sack_blks++;
9717                 } else if (SEQ_LEQ(sack.start, th_ack) &&
9718                            SEQ_LEQ(sack.end, th_ack)) {
9719                         int was_tlp;
9720
9721                         was_tlp = rack_note_dsack(rack, sack.start, sack.end);
9722                         /*
9723                          * It's a D-SACK block.
9724                          */
9725                         tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
9726                 }
9727         }
9728         if (rack->rc_dsack_round_seen) {
9729                 /* Is the dsack round over? */
9730                 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
9731                         /* Yes it is */
9732                         rack->rc_dsack_round_seen = 0;
9733                         rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
9734                 }
9735         }
9736         /*
9737          * Sort the SACK blocks so we can update the rack scoreboard with
9738          * just one pass.
9739          */
9740         num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
9741                                          num_sack_blks, th->th_ack);
9742         ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
9743         if (num_sack_blks == 0) {
9744                 /* Nothing to sack (DSACKs?) */
9745                 goto out_with_totals;
9746         }
9747         if (num_sack_blks < 2) {
9748                 /* Only one, we don't need to sort */
9749                 goto do_sack_work;
9750         }
9751         /* Sort the sacks */
9752         for (i = 0; i < num_sack_blks; i++) {
9753                 for (j = i + 1; j < num_sack_blks; j++) {
9754                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
9755                                 sack = sack_blocks[i];
9756                                 sack_blocks[i] = sack_blocks[j];
9757                                 sack_blocks[j] = sack;
9758                         }
9759                 }
9760         }
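        /*
         * A simple O(n^2) exchange sort is fine here; num_sack_blks is
         * bounded by the sack_blocks[] array (TCP_MAX_SACK + 1 entries).
         */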
9761         /*
9762          * Now are any of the sack block ends the same (yes some
9763          * implementations send these)?
9764          */
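        /*
         * For example, blocks [100, 200) and [50, 200) share an end:
         * we keep a single block starting at 50 (the larger coverage)
         * and collapse the other out, reducing num_sack_blks by one.
         */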
9765 again:
9766         if (num_sack_blks == 0)
9767                 goto out_with_totals;
9768         if (num_sack_blks > 1) {
9769                 for (i = 0; i < num_sack_blks; i++) {
9770                         for (j = i + 1; j < num_sack_blks; j++) {
9771                                 if (sack_blocks[i].end == sack_blocks[j].end) {
9772                                         /*
9773                                          * Ok, these two have the same end; we
9774                                          * want the smallest start (the larger
9775                                          * coverage), throw away the other, and
9776                                          * start again.
9777                                          */
9778                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
9779                                                 /*
9780                                                  * The second block covers
9781                                                  * more area, use that
9782                                                  */
9783                                                 sack_blocks[i].start = sack_blocks[j].start;
9784                                         }
9785                                         /*
9786                                          * Now collapse out the dup-sack and
9787                                          * lower the count
9788                                          */
9789                                         for (k = (j + 1); k < num_sack_blks; k++) {
9790                                                 sack_blocks[j].start = sack_blocks[k].start;
9791                                                 sack_blocks[j].end = sack_blocks[k].end;
9792                                                 j++;
9793                                         }
9794                                         num_sack_blks--;
9795                                         goto again;
9796                                 }
9797                         }
9798                 }
9799         }
9800 do_sack_work:
9801         /*
9802          * First lets look to see if
9803          * we have retransmitted and
9804          * can use the transmit next?
9805          */
9806         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
9807         if (rsm &&
9808             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
9809             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
9810                 /*
9811                  * We probably did the FR and the next
9812                  * SACK coming in continues as we would expect.
9813                  */
9814                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
9815                 if (acked) {
9816                         rack->r_wanted_output = 1;
9817                         changed += acked;
9818                         sack_changed += acked;
9819                 }
9820                 if (num_sack_blks == 1) {
9821                         /*
9822                          * This is what we would expect from
9823                          * a normal implementation to happen
9824                          * after we have retransmitted the FR,
9825                          * i.e. the sack-filter pushes down
9826                          * to 1 block and the next to be retransmitted
9827                          * is the sequence in the sack block (has more
9828                          * are acked). Count this as ACK'd data to boost
9829                          * up the chances of recovering any false positives.
9830                          */
9831                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
9832                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
9833                         counter_u64_add(rack_express_sack, 1);
9834                         if (rack->r_ctl.ack_count > 0xfff00000) {
9835                                 /*
9836                                  * reduce the number to keep us under
9837                                  * a uint32_t.
9838                                  */
9839                                 rack->r_ctl.ack_count /= 2;
9840                                 rack->r_ctl.sack_count /= 2;
9841                         }
9842                         goto out_with_totals;
9843                 } else {
9844                         /*
9845                          * Start the loop through the
9846                          * rest of blocks, past the first block.
9847                          */
9848                         moved_two = 0;
9849                         loop_start = 1;
9850                 }
9851         }
9852         /* It's a sack of some sort */
9853         rack->r_ctl.sack_count++;
9854         if (rack->r_ctl.sack_count > 0xfff00000) {
9855                 /*
9856                  * reduce the number to keep us under
9857                  * a uint32_t.
9858                  */
9859                 rack->r_ctl.ack_count /= 2;
9860                 rack->r_ctl.sack_count /= 2;
9861         }
9862         counter_u64_add(rack_sack_total, 1);
9863         if (rack->sack_attack_disable) {
9864                 /* An attacker disablement is in place */
9865                 if (num_sack_blks > 1) {
9866                         rack->r_ctl.sack_count += (num_sack_blks - 1);
9867                         rack->r_ctl.sack_moved_extra++;
9868                         counter_u64_add(rack_move_some, 1);
9869                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
9870                                 rack->r_ctl.sack_moved_extra /= 2;
9871                                 rack->r_ctl.sack_noextra_move /= 2;
9872                         }
9873                 }
9874                 goto out;
9875         }
9876         rsm = rack->r_ctl.rc_sacklast;
9877         for (i = loop_start; i < num_sack_blks; i++) {
9878                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
9879                 if (acked) {
9880                         rack->r_wanted_output = 1;
9881                         changed += acked;
9882                         sack_changed += acked;
9883                 }
9884                 if (moved_two) {
9885                         /*
9886                          * If we did not get a SACK for at least a MSS and
9887                          * had to move at all, or if we moved more than our
9888                          * threshold, it counts against the "extra" move.
9889                          */
9890                         rack->r_ctl.sack_moved_extra += moved_two;
9891                         counter_u64_add(rack_move_some, 1);
9892                 } else {
9893                         /*
9894                          * else we did not have to move
9895                          * any more than we would expect.
9896                          */
9897                         rack->r_ctl.sack_noextra_move++;
9898                         counter_u64_add(rack_move_none, 1);
9899                 }
9900                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
9901                         /*
9902                          * If the SACK was not a full MSS then
9903                          * we add to sack_count the number of
9904                          * MSS's (or possibly more than
9905                          * a MSS if its a TSO send) we had to skip by.
9906                          */
9907                         rack->r_ctl.sack_count += moved_two;
9908                         counter_u64_add(rack_sack_total, moved_two);
9909                 }
9910                 /*
9911                  * Now we need to setup for the next
9912                  * round. First we make sure we won't
9913                  * exceed the size of our uint32_t on
9914                  * the various counts, and then clear out
9915                  * moved_two.
9916                  */
9917                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
9918                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
9919                         rack->r_ctl.sack_moved_extra /= 2;
9920                         rack->r_ctl.sack_noextra_move /= 2;
9921                 }
9922                 if (rack->r_ctl.sack_count > 0xfff00000) {
9923                         rack->r_ctl.ack_count /= 2;
9924                         rack->r_ctl.sack_count /= 2;
9925                 }
9926                 moved_two = 0;
9927         }
9928 out_with_totals:
9929         if (num_sack_blks > 1) {
9930                 /*
9931                  * You get an extra stroke if
9932                  * you have more than one sack-blk, this
9933                  * could be where we are skipping forward
9934                  * and the sack-filter is still working, or
9935                  * it could be an attacker constantly
9936                  * moving us.
9937                  */
9938                 rack->r_ctl.sack_moved_extra++;
9939                 counter_u64_add(rack_move_some, 1);
9940         }
9941 out:
9942 #ifdef NETFLIX_EXP_DETECTION
9943         rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
9944 #endif
9945         if (changed) {
9946                 /* Something changed; cancel the rack timer */
9947                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
9948         }
9949         tsused = tcp_get_usecs(NULL);
9950         rsm = tcp_rack_output(tp, rack, tsused);
9951         if ((!IN_FASTRECOVERY(tp->t_flags)) &&
9952             rsm) {
9953                 /* Enter recovery */
9954                 rack->r_ctl.rc_rsm_start = rsm->r_start;
9955                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
9956                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
9957                 entered_recovery = 1;
9958                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
9959                 /*
9960                  * When we enter recovery we need to assure we send
9961                  * one packet.
9962                  */
9963                 if (rack->rack_no_prr == 0) {
9964                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
9965                         rack_log_to_prr(rack, 8, 0);
9966                 }
9967                 rack->r_timer_override = 1;
9968                 rack->r_early = 0;
9969                 rack->r_ctl.rc_agg_early = 0;
9970         } else if (IN_FASTRECOVERY(tp->t_flags) &&
9971                    rsm &&
9972                    (rack->r_rr_config == 3)) {
9973                 /*
9974                  * Assure we can output and we get no
9975                  * remembered pace time except the retransmit.
9976                  */
9977                 rack->r_timer_override = 1;
9978                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
9979                 rack->r_ctl.rc_resend = rsm;
9980         }
9981         if (IN_FASTRECOVERY(tp->t_flags) &&
9982             (rack->rack_no_prr == 0) &&
9983             (entered_recovery == 0)) {
9984                 rack_update_prr(tp, rack, changed, th_ack);
9985                 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
9986                      ((rack->rc_inp->inp_in_hpts == 0) &&
9987                       ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
9988                         /*
9989                          * If you are pacing output you don't want
9990                          * to override.
9991                          */
9992                         rack->r_early = 0;
9993                         rack->r_ctl.rc_agg_early = 0;
9994                         rack->r_timer_override = 1;
9995                 }
9996         }
9997 }
9998
9999 static void
10000 rack_strike_dupack(struct tcp_rack *rack)
10001 {
10002         struct rack_sendmap *rsm;
10003
10004         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
10005         while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
10006                 rsm = TAILQ_NEXT(rsm, r_tnext);
10007         }
10008         if (rsm && (rsm->r_dupack < 0xff)) {
10009                 rsm->r_dupack++;
10010                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
10011                         struct timeval tv;
10012                         uint32_t cts;
10013                         /*
10014                          * Here we see if we need to retransmit. For
10015                          * a SACK type connection if enough time has passed
10016                          * we will get a return of the rsm. For a non-sack
10017                          * connection we will get the rsm returned if the
10018                          * dupack value is 3 or more.
10019                          */
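                        /*
                         * DUP_ACK_THRESHOLD is the classic three
                         * duplicate-ACK trigger; r_dupack saturates
                         * at 0xff above so the counter cannot wrap.
                         */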
10020                         cts = tcp_get_usecs(&tv);
10021                         rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
10022                         if (rack->r_ctl.rc_resend != NULL) {
10023                                 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
10024                                         rack_cong_signal(rack->rc_tp, CC_NDUPACK,
10025                                                          rack->rc_tp->snd_una);
10026                                 }
10027                                 rack->r_wanted_output = 1;
10028                                 rack->r_timer_override = 1;
10029                                 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
10030                         }
10031                 } else {
10032                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
10033                 }
10034         }
10035 }
10036
10037 static void
10038 rack_check_bottom_drag(struct tcpcb *tp,
10039                        struct tcp_rack *rack,
10040                        struct socket *so, int32_t acked)
10041 {
10042         uint32_t segsiz, minseg;
10043
10044         segsiz = ctf_fixed_maxseg(tp);
10045         minseg = segsiz;
10046
10047         if (tp->snd_max == tp->snd_una) {
10048                 /*
10049                  * We are doing dynamic pacing and we are way
10050                  * under. Basically everything got acked while
10051                  * we were still waiting on the pacer to expire.
10052                  *
10053                  * This means we need to boost the b/w in
10054                  * addition to any earlier boosting of
10055                  * the multiplier.
10056                  */
10057                 rack->rc_dragged_bottom = 1;
10058                 rack_validate_multipliers_at_or_above100(rack);
10059                 /*
10060                  * Let's use the segment bytes acked plus
10061                  * the lowest RTT seen as the basis to
10062                  * form a b/w estimate. This will be off
10063                  * due to the fact that the true estimate
10064                  * should be around 1/2 the time of the RTT
10065                  * but we can settle for that.
10066                  */
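                /*
                 * For illustration: 29200 bytes acked against a 10000
                 * usec RTT gives calc_bw = 29200 * 1000000 / 10000,
                 * i.e. about 2.92 MB/s (roughly 23 Mbps).
                 */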
10067                 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
10068                     acked) {
10069                         uint64_t bw, calc_bw, rtt;
10070
10071                         rtt = rack->r_ctl.rack_rs.rs_us_rtt;
10072                         if (rtt == 0) {
10073                                 /* no us sample; is there a ms one? */
10074                                 if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
10075                                         rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
10076                                 } else {
10077                                         goto no_measurement;
10078                                 }
10079                         }
10080                         bw = acked;
10081                         calc_bw = bw * 1000000;
10082                         calc_bw /= rtt;
10083                         if (rack->r_ctl.last_max_bw &&
10084                             (rack->r_ctl.last_max_bw < calc_bw)) {
10085                                 /*
10086                                  * If we have a last calculated max bw
10087                                  * enforce it.
10088                                  */
10089                                 calc_bw = rack->r_ctl.last_max_bw;
10090                         }
10091                         /* now plop it in */
10092                         if (rack->rc_gp_filled == 0) {
10093                                 if (calc_bw > ONE_POINT_TWO_MEG) {
10094                                         /*
10095                                          * If we have no measurement
10096                                          * don't let us set in more than
10097                                          * 1.2Mbps. If we are still too
10098                                          * low after pacing with this we
10099                                          * will hopefully have a max b/w
10100                                          * available to sanity check things.
10101                                          */
10102                                         calc_bw = ONE_POINT_TWO_MEG;
10103                                 }
10104                                 rack->r_ctl.rc_rtt_diff = 0;
10105                                 rack->r_ctl.gp_bw = calc_bw;
10106                                 rack->rc_gp_filled = 1;
10107                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10108                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
10109                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10110                         } else if (calc_bw > rack->r_ctl.gp_bw) {
10111                                 rack->r_ctl.rc_rtt_diff = 0;
10112                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10113                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
10114                                 rack->r_ctl.gp_bw = calc_bw;
10115                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10116                         } else
10117                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10118                         if ((rack->gp_ready == 0) &&
10119                             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
10120                                 /* We have enough measurements now */
10121                                 rack->gp_ready = 1;
10122                                 rack_set_cc_pacing(rack);
10123                                 if (rack->defer_options)
10124                                         rack_apply_deferred_options(rack);
10125                         }
10126                         /*
10127                          * For acks over 1 MSS we do an extra boost to simulate
10128                          * where we would get 2 acks (we want 110 for the mul).
10129                          */
10130                         if (acked > segsiz)
10131                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10132                 } else {
10133                         /*
10134                          * a zero rtt possibly? Settle for just a plain old increase.
10135                          */
10136 no_measurement:
10137                         rack_increase_bw_mul(rack, -1, 0, 0, 1);
10138                 }
10139         } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
10140                    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
10141                                                minseg)) &&
10142                    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10143                    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10144                    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
10145                     (segsiz * rack_req_segs))) {
10146                 /*
10147                  * We are doing dynamic GP pacing and
10148                  * we have everything except 1MSS or less
10149                  * bytes left out. We are still pacing away.
10150                  * And there is data that could be sent. This
10151                  * means we are inserting delayed ack time in
10152                  * our measurements because we are pacing too slow.
10153                  */
10154                 rack_validate_multipliers_at_or_above100(rack);
10155                 rack->rc_dragged_bottom = 1;
10156                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10157         }
10158 }
10159
10160
10161
10162 static void
10163 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
10164 {
10165         /*
10166          * The fast output path is enabled and we
10167          * have moved the cumack forward. Lets see if
10168          * we can expand forward the fast path length by
10169          * that amount. What we would ideally like to
10170          * do is increase the number of bytes in the
10171          * fast path block (left_to_send) by the
10172          * acked amount. However we have to gate that
10173          * by two factors:
10174          * 1) The amount outstanding and the rwnd of the peer
10175          *    (i.e. we don't want to exceed the rwnd of the peer).
10176          *    <and>
10177          * 2) The amount of data left in the socket buffer (i.e.
10178          *    we can't send beyond what is in the buffer).
10179          *
10180          * Note that this does not take into account any increase
10181          * in the cwnd. We will only extend the fast path by
10182          * what was acked.
10183          */
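        /*
         * For illustration: with 5000 bytes newly acked, left_to_send
         * of 10000, 20000 bytes outstanding, 40000 in the sb and a
         * 50000 byte rwnd, gating_val = min(40000 - 20000,
         * 50000 - 20000) = 20000; the new total of 15000 fits, so
         * left_to_send grows.
         */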
10184         uint32_t new_total, gating_val;
10185
10186         new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
10187         gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
10188                          (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
10189         if (new_total <= gating_val) {
10190                 /* We can increase left_to_send by the acked amount */
10191                 counter_u64_add(rack_extended_rfo, 1);
10192                 rack->r_ctl.fsb.left_to_send = new_total;
10193                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
10194                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
10195                          rack, rack->r_ctl.fsb.left_to_send,
10196                          sbavail(&rack->rc_inp->inp_socket->so_snd),
10197                          (tp->snd_max - tp->snd_una)));
10198
10199         }
10200 }
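/*
 * A minimal sketch of the gating rule above, with assumed illustrative
 * numbers (min() as in sys/param.h; this is not live kernel state):
 */
#if 0
        uint32_t sb_avail = 10000;      /* bytes sitting in so_snd */
        uint32_t inflight = 4000;       /* snd_max - snd_una */
        uint32_t snd_wnd = 8000;        /* peer's advertised window */
        uint32_t left_to_send = 1000, acked_amount = 2000;
        uint32_t new_total = acked_amount + left_to_send;       /* 3000 */
        uint32_t gating_val = min(sb_avail - inflight,          /* 6000 */
            snd_wnd - inflight);                                /* 4000 */

        if (new_total <= gating_val)
                left_to_send = new_total;       /* fast path extended */
#endif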
10201
10202 static void
10203 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
10204 {
10205         /*
10206          * Here any sendmap entry that points to the
10207          * beginning mbuf must be adjusted to the correct
10208          * offset. This must be called with:
10209          * 1) The socket buffer locked
10210          * 2) snd_una adjusted to its new position.
10211          *
10212          * Note that (2) implies rack_ack_received has also
10213          * been called.
10214          *
10215          * We grab the first mbuf in the socket buffer and
10216          * then go through the front of the sendmap, recalculating
10217          * the stored offset for any sendmap entry that has
10218          * that mbuf. We must use the sb functions to do this
10219          * since it is possible an add was done as well as
10220          * the subtraction we may have just completed. This should
10221          * not be a penalty though, since we just referenced the sb
10222          * to go in and trim off the mbufs that we freed (of course
10223          * there will be a penalty for the sendmap references though).
10224          */
10225         struct mbuf *m;
10226         struct rack_sendmap *rsm;
10227
10228         SOCKBUF_LOCK_ASSERT(sb);
10229         m = sb->sb_mb;
10230         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
10231         if ((rsm == NULL) || (m == NULL)) {
10232                 /* Nothing outstanding */
10233                 return;
10234         }
10235         while (rsm->m && (rsm->m == m)) {
10236                 /* one to adjust */
10237 #ifdef INVARIANTS
10238                 struct mbuf *tm;
10239                 uint32_t soff;
10240
10241                 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
10242                 if (rsm->orig_m_len != m->m_len) {
10243                         rack_adjust_orig_mlen(rsm);
10244                 }
10245                 if (rsm->soff != soff) {
10246                         /*
10247                          * This is not a fatal error, we anticipate it
10248                          * might happen (the else code), so we count it here
10249                          * so that under invariant we can see that it really
10250                          * does happen.
10251                          */
10252                         counter_u64_add(rack_adjust_map_bw, 1);
10253                 }
10254                 rsm->m = tm;
10255                 rsm->soff = soff;
10256                 if (tm)
10257                         rsm->orig_m_len = rsm->m->m_len;
10258                 else
10259                         rsm->orig_m_len = 0;
10260 #else
10261                 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
10262                 if (rsm->m)
10263                         rsm->orig_m_len = rsm->m->m_len;
10264                 else
10265                         rsm->orig_m_len = 0;
10266 #endif
10267                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
10268                               rsm);
10269                 if (rsm == NULL)
10270                         break;
10271         }
10272 }
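/*
 * Offset sketch for the adjustment above (hedged, simplified): suppose
 * snd_una advanced 500 bytes and sbcut_locked() trimmed those bytes
 * from the head mbuf. An rsm whose r_start sat 500 bytes into the old
 * head had soff == 500; its data now begins at offset 0 of the
 * (possibly new) head mbuf, which is exactly what
 * sbsndmbuf(sb, rsm->r_start - snd_una, &soff) recomputes.
 */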
10273
10274 /*
10275  * Return value of 1, we do not need to call rack_process_data().
10276  * Return value of 0, rack_process_data() can be called.
10277  * For ret_val: if it is 0 the TCP is locked, if it is non-zero
10278  * it is unlocked and probably unsafe to touch the TCB.
10279  */
10280 static int
10281 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
10282     struct tcpcb *tp, struct tcpopt *to,
10283     uint32_t tiwin, int32_t tlen,
10284     int32_t * ofia, int32_t thflags, int32_t *ret_val)
10285 {
10286         int32_t ourfinisacked = 0;
10287         int32_t nsegs, acked_amount;
10288         int32_t acked;
10289         struct mbuf *mfree;
10290         struct tcp_rack *rack;
10291         int32_t under_pacing = 0;
10292         int32_t recovery = 0;
10293
10294         rack = (struct tcp_rack *)tp->t_fb_ptr;
10295         if (SEQ_GT(th->th_ack, tp->snd_max)) {
10296                 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
10297                                       &rack->r_ctl.challenge_ack_ts,
10298                                       &rack->r_ctl.challenge_ack_cnt);
10299                 rack->r_wanted_output = 1;
10300                 return (1);
10301         }
10302         if (rack->gp_ready &&
10303             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
10304                 under_pacing = 1;
10305         }
10306         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
10307                 int in_rec, dup_ack_struck = 0;
10308
10309                 in_rec = IN_FASTRECOVERY(tp->t_flags);
10310                 if (rack->rc_in_persist) {
10311                         tp->t_rxtshift = 0;
10312                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10313                                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10314                 }
10315                 if ((th->th_ack == tp->snd_una) &&
10316                     (tiwin == tp->snd_wnd) &&
10317                     ((to->to_flags & TOF_SACK) == 0)) {
10318                         rack_strike_dupack(rack);
10319                         dup_ack_struck = 1;
10320                 }
10321                 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
10322         }
10323         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
10324                 /*
10325                  * Old ack, behind (or duplicate to) the last one rcv'd
10326                  * Note: we mark that reordering is occurring if the ack
10327                  * is less than snd_una and we have not closed our window.
10328                  */
10329                 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
10330                         counter_u64_add(rack_reorder_seen, 1);
10331                         rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
10332                 }
10333                 return (0);
10334         }
10335         /*
10336          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
10337          * something we sent.
10338          */
10339         if (tp->t_flags & TF_NEEDSYN) {
10340                 /*
10341                  * T/TCP: Connection was half-synchronized, and our SYN has
10342                  * been ACK'd (so connection is now fully synchronized).  Go
10343                  * to non-starred state, increment snd_una for ACK of SYN,
10344                  * and check if we can do window scaling.
10345                  */
10346                 tp->t_flags &= ~TF_NEEDSYN;
10347                 tp->snd_una++;
10348                 /* Do window scaling? */
10349                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
10350                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
10351                         tp->rcv_scale = tp->request_r_scale;
10352                         /* Send window already scaled. */
10353                 }
10354         }
10355         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10356         INP_WLOCK_ASSERT(tp->t_inpcb);
10357
10358         acked = BYTES_THIS_ACK(tp, th);
10359         if (acked) {
10360                 /* 
10361                  * Any time we move the cum-ack forward, clear the
10362                  * keep-alive-tied probe-not-answered flag. The
10363                  * persist timer clears its own on entry.
10364                  */
10365                 rack->probe_not_answered = 0;
10366         }
10367         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
10368         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
10369         /*
10370          * If we just performed our first retransmit, and the ACK arrives
10371          * within our recovery window, then it was a mistake to do the
10372          * retransmit in the first place.  Recover our original cwnd and
10373          * ssthresh, and proceed to transmit where we left off.
10374          */
10375         if ((tp->t_flags & TF_PREVVALID) &&
10376             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
10377                 tp->t_flags &= ~TF_PREVVALID;
10378                 if (tp->t_rxtshift == 1 &&
10379                     (int)(ticks - tp->t_badrxtwin) < 0)
10380                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
10381         }
10382         if (acked) {
10383                 /* assure we are not backed off */
10384                 tp->t_rxtshift = 0;
10385                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10386                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10387                 rack->rc_tlp_in_progress = 0;
10388                 rack->r_ctl.rc_tlp_cnt_out = 0;
10389                 /*
10390                  * If it is the RXT timer we want to
10391                  * stop it, so we can restart a TLP.
10392                  */
10393                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
10394                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10395 #ifdef NETFLIX_HTTP_LOGGING
10396                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
10397 #endif
10398         }
10399         /*
10400          * If we have a timestamp reply, update smoothed round trip time. If
10401          * no timestamp is present but transmit timer is running and timed
10402          * sequence number was acked, update smoothed round trip time. Since
10403          * we now have an rtt measurement, cancel the timer backoff (cf.,
10404          * Phil Karn's retransmit alg.). Recompute the initial retransmit
10405          * timer.
10406          *
10407          * Some boxes send broken timestamp replies during the SYN+ACK
10408          * phase, ignore timestamps of 0 or we could calculate a huge RTT
10409          * and blow up the retransmit timer.
10410          */
10411         /*
10412          * If all outstanding data is acked, stop retransmit timer and
10413          * remember to restart (more output or persist). If there is more
10414          * data to be acked, restart retransmit timer, using current
10415          * (possibly backed-off) value.
10416          */
10417         if (acked == 0) {
10418                 if (ofia)
10419                         *ofia = ourfinisacked;
10420                 return (0);
10421         }
10422         if (IN_RECOVERY(tp->t_flags)) {
10423                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
10424                     (SEQ_LT(th->th_ack, tp->snd_max))) {
10425                         tcp_rack_partialack(tp);
10426                 } else {
10427                         rack_post_recovery(tp, th->th_ack);
10428                         recovery = 1;
10429                 }
10430         }
10431         /*
10432          * Let the congestion control algorithm update congestion control
10433          * related information. This typically means increasing the
10434          * congestion window.
10435          */
10436         rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
10437         SOCKBUF_LOCK(&so->so_snd);
10438         acked_amount = min(acked, (int)sbavail(&so->so_snd));
10439         tp->snd_wnd -= acked_amount;
10440         mfree = sbcut_locked(&so->so_snd, acked_amount);
10441         if ((sbused(&so->so_snd) == 0) &&
10442             (acked > acked_amount) &&
10443             (tp->t_state >= TCPS_FIN_WAIT_1) &&
10444             (tp->t_flags & TF_SENTFIN)) {
10445                 /*
10446                  * We must be sure our fin
10447                  * was sent and acked (we can be
10448                  * in FIN_WAIT_1 without having
10449                  * sent the fin).
10450                  */
10451                 ourfinisacked = 1;
10452         }
10453         tp->snd_una = th->th_ack;
10454         if (acked_amount && sbavail(&so->so_snd))
10455                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
10456         rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
10457         /* NB: sowwakeup_locked() does an implicit unlock. */
10458         sowwakeup_locked(so);
10459         m_freem(mfree);
10460         if (SEQ_GT(tp->snd_una, tp->snd_recover))
10461                 tp->snd_recover = tp->snd_una;
10462
10463         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
10464                 tp->snd_nxt = tp->snd_una;
10465         }
10466         if (under_pacing &&
10467             (rack->use_fixed_rate == 0) &&
10468             (rack->in_probe_rtt == 0) &&
10469             rack->rc_gp_dyn_mul &&
10470             rack->rc_always_pace) {
10471                 /* Check if we are dragging bottom */
10472                 rack_check_bottom_drag(tp, rack, so, acked);
10473         }
10474         if (tp->snd_una == tp->snd_max) {
10475                 /* Nothing left outstanding */
10476                 tp->t_flags &= ~TF_PREVVALID;
10477                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
10478                 rack->r_ctl.retran_during_recovery = 0;
10479                 rack->r_ctl.dsack_byte_cnt = 0;
10480                 if (rack->r_ctl.rc_went_idle_time == 0)
10481                         rack->r_ctl.rc_went_idle_time = 1;
10482                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
10483                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
10484                         tp->t_acktime = 0;
10485                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10486                 /* Set need output so persist might get set */
10487                 rack->r_wanted_output = 1;
10488                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
10489                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
10490                     (sbavail(&so->so_snd) == 0) &&
10491                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
10492                         /*
10493                          * The socket was gone and the
10494                          * peer sent data (now or in the past); time to
10495                          * reset the connection.
10496                          */
10497                         *ret_val = 1;
10498                         /* tcp_close will kill the inp pre-log the Reset */
10499                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
10500                         tp = tcp_close(tp);
10501                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
10502                         return (1);
10503                 }
10504         }
10505         if (ofia)
10506                 *ofia = ourfinisacked;
10507         return (0);
10508 }
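/*
 * Recovery bookkeeping example for rack_process_ack() (illustrative
 * sequence numbers): with snd_una == 1000, snd_recover == 9000 and
 * snd_max == 12000, an ack of 5000 is a partial ack and goes through
 * tcp_rack_partialack(), while an ack at or above 9000 ends recovery
 * via rack_post_recovery() before the congestion-control update in
 * rack_ack_received() runs.
 */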
10509
10510 static void
10511 rack_collapsed_window(struct tcp_rack *rack)
10512 {
10513         /*
10514          * Now we must walk the send map,
10515          * splitting at the collapse point,
10516          * and mark the entries left stranded.
10517          * These entries can't cause us to abort
10518          * the connection and are really
10519          * "unsent". However, a buggy
10520          * peer may actually have kept some
10521          * of the data, i.e. collapsed the win,
10522          * refused to ack, and then opened
10523          * the win and acked that data. We
10524          * would then get into an ack war, so
10525          * the simpler method of just pretending
10526          * we did not send those segments
10527          * will not work.
10528          */
10529         struct rack_sendmap *rsm, *nrsm, fe, *insret;
10530         tcp_seq max_seq;
10531
10532         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
10533         memset(&fe, 0, sizeof(fe));
10534         fe.r_start = max_seq;
10535         /* Find the first seq at or past max_seq */
10536         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
10537         if (rsm == NULL) {
10538                 /* Nothing to do, strange */
10539                 rack->rc_has_collapsed = 0;
10540                 return;
10541         }
10542         /*
10543          * Now do we need to split at
10544          * the collapse point?
10545          */
10546         if (SEQ_GT(max_seq, rsm->r_start)) {
10547                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
10548                 if (nrsm == NULL) {
10549                         /* We can't get a rsm, mark all? */
10550                         nrsm = rsm;
10551                         goto no_split;
10552                 }
10553                 /* Clone it */
10554                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
10555                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
10556 #ifdef INVARIANTS
10557                 if (insret != NULL) {
10558                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
10559                               nrsm, insret, rack, rsm);
10560                 }
10561 #endif
10562                 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
10563                 if (rsm->r_in_tmap) {
10564                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10565                         nrsm->r_in_tmap = 1;
10566                 }
10567                 /*
10568                  * Set in the new RSM as the
10569                  * collapsed starting point
10570                  */
10571                 rsm = nrsm;
10572         }
10573 no_split:
10574         counter_u64_add(rack_collapsed_win, 1);
10575         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
10576                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
10577         }
10578         rack->rc_has_collapsed = 1;
10579 }
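/*
 * Split example (hedged, illustrative sequence numbers): with
 * snd_una == 1000 and snd_wnd == 2000, max_seq == 3000. An rsm
 * covering [2500, 4000) straddles the collapse point, so it is cloned
 * at 3000 into [2500, 3000) and [3000, 4000); the latter and every
 * later entry are then flagged RACK_RWND_COLLAPSED.
 */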
10580
10581 static void
10582 rack_un_collapse_window(struct tcp_rack *rack)
10583 {
10584         struct rack_sendmap *rsm;
10585
10586         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
10587                 if (rsm->r_flags & RACK_RWND_COLLAPSED)
10588                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
10589                 else
10590                         break;
10591         }
10592         rack->rc_has_collapsed = 0;
10593 }
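/*
 * Note the reverse walk above: collapsed entries were flagged from the
 * collapse point to the end of the map, so walking backwards lets us
 * stop at the first entry that is not flagged instead of scanning the
 * whole tree.
 */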
10594
10595 static void
10596 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
10597                         int32_t tlen, int32_t tfo_syn)
10598 {
10599         if (DELAY_ACK(tp, tlen) || tfo_syn) {
10600                 if (rack->rc_dack_mode &&
10601                     (tlen > 500) &&
10602                     (rack->rc_dack_toggle == 1)) {
10603                         goto no_delayed_ack;
10604                 }
10605                 rack_timer_cancel(tp, rack,
10606                                   rack->r_ctl.rc_rcvtime, __LINE__);
10607                 tp->t_flags |= TF_DELACK;
10608         } else {
10609 no_delayed_ack:
10610                 rack->r_wanted_output = 1;
10611                 tp->t_flags |= TF_ACKNOW;
10612                 if (rack->rc_dack_mode) {
10613                         if (tp->t_flags & TF_DELACK)
10614                                 rack->rc_dack_toggle = 1;
10615                         else
10616                                 rack->rc_dack_toggle = 0;
10617                 }
10618         }
10619 }
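/*
 * Delayed-ack toggle sketch (our reading, hedged): with rc_dack_mode
 * enabled, a large (> 500 byte) in-order segment arriving while
 * rc_dack_toggle is set bypasses the delayed-ack path and is acked
 * immediately; the toggle is then rearmed based on whether a delayed
 * ack was still pending. The effect is to keep acks flowing at least
 * every other large segment rather than letting delack stretch them.
 */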
10620
10621 static void
10622 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
10623 {
10624         /*
10625          * If fast output is in progress, lets validate that
10626          * the new window did not shrink on us and make it
10627          * so fast output should end.
10628          */
10629         if (rack->r_fast_output) {
10630                 uint32_t out;
10631
10632                 /*
10633                  * Calculate what we will send if left as is
10634                  * and compare that to our send window.
10635                  */
10636                 out = ctf_outstanding(tp);
10637                 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
10638                         /* ok we have an issue */
10639                         if (out >= tp->snd_wnd) {
10640                                 /* Turn off fast output, the window is met or collapsed */
10641                                 rack->r_fast_output = 0;
10642                         } else {
10643                                 /* we have some room left */
10644                                 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
10645                                 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
10646                                         /* If not at least 1 full segment, never mind */
10647                                         rack->r_fast_output = 0;
10648                                 }
10649                         }
10650                 }
10651         }
10652 }
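/*
 * Worked numbers for the validation above (illustrative only):
 * out == 5000 bytes outstanding, fsb.left_to_send == 4000 and the peer
 * shrinks snd_wnd to 7000. Since 5000 + 4000 > 7000 but out < snd_wnd,
 * left_to_send is clipped to 7000 - 5000 == 2000; had that remainder
 * been below one full segment, fast output would have been turned off
 * instead.
 */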
10653
10654
10655 /*
10656  * Return value of 1, the TCB is unlocked and most
10657  * likely gone, return value of 0, the TCP is still
10658  * locked.
10659  */
10660 static int
10661 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
10662     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
10663     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
10664 {
10665         /*
10666          * Update window information. Don't look at window if no ACK: TACs
10667          * send garbage on first SYN.
10668          */
10669         int32_t nsegs;
10670         int32_t tfo_syn;
10671         struct tcp_rack *rack;
10672
10673         rack = (struct tcp_rack *)tp->t_fb_ptr;
10674         INP_WLOCK_ASSERT(tp->t_inpcb);
10675         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10676         if ((thflags & TH_ACK) &&
10677             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
10678             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
10679             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
10680                 /* keep track of pure window updates */
10681                 if (tlen == 0 &&
10682                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
10683                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
10684                 tp->snd_wnd = tiwin;
10685                 rack_validate_fo_sendwin_up(tp, rack);
10686                 tp->snd_wl1 = th->th_seq;
10687                 tp->snd_wl2 = th->th_ack;
10688                 if (tp->snd_wnd > tp->max_sndwnd)
10689                         tp->max_sndwnd = tp->snd_wnd;
10690                 rack->r_wanted_output = 1;
10691         } else if (thflags & TH_ACK) {
10692                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
10693                         tp->snd_wnd = tiwin;
10694                         rack_validate_fo_sendwin_up(tp, rack);
10695                         tp->snd_wl1 = th->th_seq;
10696                         tp->snd_wl2 = th->th_ack;
10697                 }
10698         }
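        /*
         * The wl1/wl2 checks above implement the standard "newer
         * segment" test: accept the advertised window only from a
         * segment that is newer (snd_wl1 < th_seq), equally new with
         * a newer ack, or the same segment/ack pair advertising a
         * larger window. This protects snd_wnd from stale, reordered
         * segments.
         */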
10699         if (tp->snd_wnd < ctf_outstanding(tp))
10700                 /* The peer collapsed the window */
10701                 rack_collapsed_window(rack);
10702         else if (rack->rc_has_collapsed)
10703                 rack_un_collapse_window(rack);
10704         /* Was persist timer active and now we have window space? */
10705         if ((rack->rc_in_persist != 0) &&
10706             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
10707                                 rack->r_ctl.rc_pace_min_segs))) {
10708                 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10709                 tp->snd_nxt = tp->snd_max;
10710                 /* Make sure we output to start the timer */
10711                 rack->r_wanted_output = 1;
10712         }
10713         /* Do we enter persists? */
10714         if ((rack->rc_in_persist == 0) &&
10715             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
10716             TCPS_HAVEESTABLISHED(tp->t_state) &&
10717             (tp->snd_max == tp->snd_una) &&
10718             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
10719             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
10720                 /*
10721                  * Here the rwnd is less than
10722                  * the pacing size, we are established,
10723                  * nothing is outstanding, and there is
10724                  * data to send. Enter persists.
10725                  */
10726                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10727         }
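        /*
         * Persist-entry example (illustrative values): with
         * rc_high_rwnd == 65535 and rc_pace_min_segs == 4344, the
         * threshold is min(32767, 4344) == 4344. A peer rwnd of 1000
         * bytes is below that, so once nothing is outstanding and more
         * than 1000 bytes are queued we enter persists rather than
         * dribble out sub-MSS sends.
         */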
10728         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
10729                 m_freem(m);
10730                 return (0);
10731         }
10732         /*
10733          * Don't process the URG bit; ignore it and just drag
10734          * along the urgent pointer.
10735          */
10736         tp->rcv_up = tp->rcv_nxt;
10737         INP_WLOCK_ASSERT(tp->t_inpcb);
10738
10739         /*
10740          * Process the segment text, merging it into the TCP sequencing
10741          * queue, and arranging for acknowledgment of receipt if necessary.
10742          * This process logically involves adjusting tp->rcv_wnd as data is
10743          * presented to the user (this happens in tcp_usrreq.c, case
10744          * PRU_RCVD).  If a FIN has already been received on this connection
10745          * then we just ignore the text.
10746          */
10747         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
10748                    IS_FASTOPEN(tp->t_flags));
10749         if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
10750             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
10751                 tcp_seq save_start = th->th_seq;
10752                 tcp_seq save_rnxt  = tp->rcv_nxt;
10753                 int     save_tlen  = tlen;
10754
10755                 m_adj(m, drop_hdrlen);  /* delayed header drop */
10756                 /*
10757                  * Insert segment which includes th into TCP reassembly
10758                  * queue with control block tp.  Set thflags to whether
10759                  * reassembly now includes a segment with FIN.  This handles
10760                  * the common case inline (segment is the next to be
10761                  * received on an established connection, and the queue is
10762                  * empty), avoiding linkage into and removal from the queue
10763                  * and repetition of various conversions. Set DELACK for
10764                  * segments received in order, but ack immediately when
10765                  * segments are out of order (so fast retransmit can work).
10766                  */
10767                 if (th->th_seq == tp->rcv_nxt &&
10768                     SEGQ_EMPTY(tp) &&
10769                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
10770                     tfo_syn)) {
10771 #ifdef NETFLIX_SB_LIMITS
10772                         u_int mcnt, appended;
10773
10774                         if (so->so_rcv.sb_shlim) {
10775                                 mcnt = m_memcnt(m);
10776                                 appended = 0;
10777                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
10778                                     CFO_NOSLEEP, NULL) == false) {
10779                                         counter_u64_add(tcp_sb_shlim_fails, 1);
10780                                         m_freem(m);
10781                                         return (0);
10782                                 }
10783                         }
10784 #endif
10785                         rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
10786                         tp->rcv_nxt += tlen;
10787                         if (tlen &&
10788                             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
10789                             (tp->t_fbyte_in == 0)) {
10790                                 tp->t_fbyte_in = ticks;
10791                                 if (tp->t_fbyte_in == 0)
10792                                         tp->t_fbyte_in = 1;
10793                                 if (tp->t_fbyte_out && tp->t_fbyte_in)
10794                                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
10795                         }
10796                         thflags = th->th_flags & TH_FIN;
10797                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
10798                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
10799                         SOCKBUF_LOCK(&so->so_rcv);
10800                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10801                                 m_freem(m);
10802                         } else
10803 #ifdef NETFLIX_SB_LIMITS
10804                                 appended =
10805 #endif
10806                                         sbappendstream_locked(&so->so_rcv, m, 0);
10807
10808                         rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
10809                         /* NB: sorwakeup_locked() does an implicit unlock. */
10810                         sorwakeup_locked(so);
10811 #ifdef NETFLIX_SB_LIMITS
10812                         if (so->so_rcv.sb_shlim && appended != mcnt)
10813                                 counter_fo_release(so->so_rcv.sb_shlim,
10814                                     mcnt - appended);
10815 #endif
10816                 } else {
10817                         /*
10818                          * XXX: Due to the header drop above "th" is
10819                          * theoretically invalid by now.  Fortunately
10820                          * m_adj() doesn't actually free any mbufs when
10821                          * trimming from the head.
10822                          */
10823                         tcp_seq temp = save_start;
10824
10825                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
10826                         tp->t_flags |= TF_ACKNOW;
10827                         if (tp->t_flags & TF_WAKESOR) {
10828                                 tp->t_flags &= ~TF_WAKESOR;
10829                                 /* NB: sorwakeup_locked() does an implicit unlock. */
10830                                 sorwakeup_locked(so);
10831                         }
10832                 }
10833                 if ((tp->t_flags & TF_SACK_PERMIT) &&
10834                     (save_tlen > 0) &&
10835                     TCPS_HAVEESTABLISHED(tp->t_state)) {
10836                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
10837                                 /*
10838                                  * DSACK actually handled in the fastpath
10839                                  * above.
10840                                  */
10841                                 RACK_OPTS_INC(tcp_sack_path_1);
10842                                 tcp_update_sack_list(tp, save_start,
10843                                     save_start + save_tlen);
10844                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
10845                                 if ((tp->rcv_numsacks >= 1) &&
10846                                     (tp->sackblks[0].end == save_start)) {
10847                                         /*
10848                                          * Partial overlap, recorded at todrop
10849                                          * above.
10850                                          */
10851                                         RACK_OPTS_INC(tcp_sack_path_2a);
10852                                         tcp_update_sack_list(tp,
10853                                             tp->sackblks[0].start,
10854                                             tp->sackblks[0].end);
10855                                 } else {
10856                                         RACK_OPTS_INC(tcp_sack_path_2b);
10857                                         tcp_update_dsack_list(tp, save_start,
10858                                             save_start + save_tlen);
10859                                 }
10860                         } else if (tlen >= save_tlen) {
10861                                 /* Update of sackblks. */
10862                                 RACK_OPTS_INC(tcp_sack_path_3);
10863                                 tcp_update_dsack_list(tp, save_start,
10864                                     save_start + save_tlen);
10865                         } else if (tlen > 0) {
10866                                 RACK_OPTS_INC(tcp_sack_path_4);
10867                                 tcp_update_dsack_list(tp, save_start,
10868                                     save_start + tlen);
10869                         }
10870                 }
10871         } else {
10872                 m_freem(m);
10873                 thflags &= ~TH_FIN;
10874         }
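        /*
         * D-SACK reporting sketch (our reading, illustrative numbers):
         * a retransmitted segment [1000, 1500) arrives after rcv_nxt
         * has already advanced past 1500. tcp_reass() consumes nothing
         * (tlen == 0 on return) and save_start < save_rnxt, so the
         * duplicate block [1000, 1500) is handed to the SACK machinery
         * and the next ACK carries a D-SACK, telling the peer its
         * retransmit was unnecessary.
         */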
10875
10876         /*
10877          * If FIN is received ACK the FIN and let the user know that the
10878          * connection is closing.
10879          */
10880         if (thflags & TH_FIN) {
10881                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
10882                         /* The socket upcall is handled by socantrcvmore. */
10883                         socantrcvmore(so);
10884                         /*
10885                          * If connection is half-synchronized (ie NEEDSYN
10886                          * flag on) then delay ACK, so it may be piggybacked
10887                          * when SYN is sent. Otherwise, since we received a
10888                          * FIN then no more input can be expected, send ACK
10889                          * now.
10890                          */
10891                         if (tp->t_flags & TF_NEEDSYN) {
10892                                 rack_timer_cancel(tp, rack,
10893                                     rack->r_ctl.rc_rcvtime, __LINE__);
10894                                 tp->t_flags |= TF_DELACK;
10895                         } else {
10896                                 tp->t_flags |= TF_ACKNOW;
10897                         }
10898                         tp->rcv_nxt++;
10899                 }
10900                 switch (tp->t_state) {
10901                         /*
10902                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
10903                          * CLOSE_WAIT state.
10904                          */
10905                 case TCPS_SYN_RECEIVED:
10906                         tp->t_starttime = ticks;
10907                         /* FALLTHROUGH */
10908                 case TCPS_ESTABLISHED:
10909                         rack_timer_cancel(tp, rack,
10910                             rack->r_ctl.rc_rcvtime, __LINE__);
10911                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
10912                         break;
10913
10914                         /*
10915                          * If still in FIN_WAIT_1 STATE FIN has not been
10916                          * acked so enter the CLOSING state.
10917                          */
10918                 case TCPS_FIN_WAIT_1:
10919                         rack_timer_cancel(tp, rack,
10920                             rack->r_ctl.rc_rcvtime, __LINE__);
10921                         tcp_state_change(tp, TCPS_CLOSING);
10922                         break;
10923
10924                         /*
10925                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
10926                          * starting the time-wait timer, turning off the
10927                          * other standard timers.
10928                          */
10929                 case TCPS_FIN_WAIT_2:
10930                         rack_timer_cancel(tp, rack,
10931                             rack->r_ctl.rc_rcvtime, __LINE__);
10932                         tcp_twstart(tp);
10933                         return (1);
10934                 }
10935         }
10936         /*
10937          * Return any desired output.
10938          */
10939         if ((tp->t_flags & TF_ACKNOW) ||
10940             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
10941                 rack->r_wanted_output = 1;
10942         }
10943         INP_WLOCK_ASSERT(tp->t_inpcb);
10944         return (0);
10945 }
10946
10947 /*
10948  * Nothing here is really faster; we have simply
10949  * broken out the fast-data path, just like
10950  * the fast-ack path.
10951  */
10952 static int
10953 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
10954     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10955     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
10956 {
10957         int32_t nsegs;
10958         int32_t newsize = 0;    /* automatic sockbuf scaling */
10959         struct tcp_rack *rack;
10960 #ifdef NETFLIX_SB_LIMITS
10961         u_int mcnt, appended;
10962 #endif
10963 #ifdef TCPDEBUG
10964         /*
10965          * The size of tcp_saveipgen must be the size of the max ip header,
10966          * now IPv6.
10967          */
10968         u_char tcp_saveipgen[IP6_HDR_LEN];
10969         struct tcphdr tcp_savetcp;
10970         short ostate = 0;
10971
10972 #endif
10973         /*
10974          * If last ACK falls within this segment's sequence numbers, record
10975          * the timestamp. NOTE that the test is modified according to the
10976          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
10977          */
10978         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
10979                 return (0);
10980         }
10981         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
10982                 return (0);
10983         }
10984         if (tiwin && tiwin != tp->snd_wnd) {
10985                 return (0);
10986         }
10987         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
10988                 return (0);
10989         }
10990         if (__predict_false((to->to_flags & TOF_TS) &&
10991             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
10992                 return (0);
10993         }
10994         if (__predict_false((th->th_ack != tp->snd_una))) {
10995                 return (0);
10996         }
10997         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
10998                 return (0);
10999         }
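        /*
         * To reach here the segment is strictly in order (th_seq ==
         * rcv_nxt), we are not retransmitting, the advertised window
         * is unchanged, no SYN/FIN is pending, the timestamp is sane,
         * it acks nothing new (th_ack == snd_una) and it fits in the
         * receive buffer -- the pure bulk-receive case.
         */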
11000         if ((to->to_flags & TOF_TS) != 0 &&
11001             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
11002                 tp->ts_recent_age = tcp_ts_getticks();
11003                 tp->ts_recent = to->to_tsval;
11004         }
11005         rack = (struct tcp_rack *)tp->t_fb_ptr;
11006         /*
11007          * This is a pure, in-sequence data packet with nothing on the
11008          * reassembly queue and we have enough buffer space to take it.
11009          */
11010         nsegs = max(1, m->m_pkthdr.lro_nsegs);
11011
11012 #ifdef NETFLIX_SB_LIMITS
11013         if (so->so_rcv.sb_shlim) {
11014                 mcnt = m_memcnt(m);
11015                 appended = 0;
11016                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
11017                     CFO_NOSLEEP, NULL) == false) {
11018                         counter_u64_add(tcp_sb_shlim_fails, 1);
11019                         m_freem(m);
11020                         return (1);
11021                 }
11022         }
11023 #endif
11024         /* Clean receiver SACK report if present */
11025         if (tp->rcv_numsacks)
11026                 tcp_clean_sackreport(tp);
11027         KMOD_TCPSTAT_INC(tcps_preddat);
11028         tp->rcv_nxt += tlen;
11029         if (tlen &&
11030             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
11031             (tp->t_fbyte_in == 0)) {
11032                 tp->t_fbyte_in = ticks;
11033                 if (tp->t_fbyte_in == 0)
11034                         tp->t_fbyte_in = 1;
11035                 if (tp->t_fbyte_out && tp->t_fbyte_in)
11036                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
11037         }
11038         /*
11039          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
11040          */
11041         tp->snd_wl1 = th->th_seq;
11042         /*
11043          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
11044          */
11045         tp->rcv_up = tp->rcv_nxt;
11046         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
11047         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
11048 #ifdef TCPDEBUG
11049         if (so->so_options & SO_DEBUG)
11050                 tcp_trace(TA_INPUT, ostate, tp,
11051                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
11052 #endif
11053         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
11054
11055         /* Add data to socket buffer. */
11056         SOCKBUF_LOCK(&so->so_rcv);
11057         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11058                 m_freem(m);
11059         } else {
11060                 /*
11061                  * Set new socket buffer size. Give up when limit is
11062                  * reached.
11063                  */
11064                 if (newsize)
11065                         if (!sbreserve_locked(&so->so_rcv,
11066                             newsize, so, NULL))
11067                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
11068                 m_adj(m, drop_hdrlen);  /* delayed header drop */
11069 #ifdef NETFLIX_SB_LIMITS
11070                 appended =
11071 #endif
11072                         sbappendstream_locked(&so->so_rcv, m, 0);
11073                 ctf_calc_rwin(so, tp);
11074         }
11075         rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
11076         /* NB: sorwakeup_locked() does an implicit unlock. */
11077         sorwakeup_locked(so);
11078 #ifdef NETFLIX_SB_LIMITS
11079         if (so->so_rcv.sb_shlim && mcnt != appended)
11080                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
11081 #endif
11082         rack_handle_delayed_ack(tp, rack, tlen, 0);
11083         if (tp->snd_una == tp->snd_max)
11084                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
11085         return (1);
11086 }
11087
11088 /*
11089  * This subfunction is used to try to highly optimize the
11090  * fast path. We again allow window updates that are
11091  * in sequence to remain in the fast-path. We also add
11092  * in the __predict's to attempt to help the compiler.
11093  * in the __predict hints to attempt to help the compiler.
11094  * it and the caller should push the packet into the
11095  * slow-path.
11096  */
11097 static int
11098 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
11099     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11100     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
11101 {
11102         int32_t acked;
11103         int32_t nsegs;
11104 #ifdef TCPDEBUG
11105         /*
11106          * The size of tcp_saveipgen must be the size of the max ip header,
11107          * now IPv6.
11108          */
11109         u_char tcp_saveipgen[IP6_HDR_LEN];
11110         struct tcphdr tcp_savetcp;
11111         short ostate = 0;
11112 #endif
11113         int32_t under_pacing = 0;
11114         struct tcp_rack *rack;
11115
11116         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
11117                 /* Old ack, behind (or duplicate to) the last one rcv'd */
11118                 return (0);
11119         }
11120         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
11121                 /* Above what we have sent? */
11122                 return (0);
11123         }
11124         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
11125                 /* We are retransmitting */
11126                 return (0);
11127         }
11128         if (__predict_false(tiwin == 0)) {
11129                 /* zero window */
11130                 return (0);
11131         }
11132         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
11133                 /* We need a SYN or a FIN, unlikely.. */
11134                 return (0);
11135         }
11136         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
11137                 /* Timestamp is behind .. old ack with seq wrap? */
11138                 return (0);
11139         }
11140         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
11141                 /* Still recovering */
11142                 return (0);
11143         }
11144         rack = (struct tcp_rack *)tp->t_fb_ptr;
11145         if (rack->r_ctl.rc_sacked) {
11146                 /* We have sack holes on our scoreboard */
11147                 return (0);
11148         }
11149         /* Ok if we reach here, we can process a fast-ack */
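        /*
         * Summarizing the gauntlet above: the ack must be strictly
         * new, at or below snd_max, with no retransmit outstanding, a
         * non-zero window, no pending SYN/FIN, a sane timestamp, no
         * recovery in progress and an empty sack scoreboard -- the
         * common bulk-transfer case.
         */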
11150         if (rack->gp_ready &&
11151             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11152                 under_pacing = 1;
11153         }
11154         nsegs = max(1, m->m_pkthdr.lro_nsegs);
11155         rack_log_ack(tp, to, th, 0, 0);
11156         /* Did the window get updated? */
11157         if (tiwin != tp->snd_wnd) {
11158                 tp->snd_wnd = tiwin;
11159                 rack_validate_fo_sendwin_up(tp, rack);
11160                 tp->snd_wl1 = th->th_seq;
11161                 if (tp->snd_wnd > tp->max_sndwnd)
11162                         tp->max_sndwnd = tp->snd_wnd;
11163         }
11164         /* Do we exit persists? */
11165         if ((rack->rc_in_persist != 0) &&
11166             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
11167                                rack->r_ctl.rc_pace_min_segs))) {
11168                 rack_exit_persist(tp, rack, cts);
11169         }
11170         /* Do we enter persists? */
11171         if ((rack->rc_in_persist == 0) &&
11172             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
11173             TCPS_HAVEESTABLISHED(tp->t_state) &&
11174             (tp->snd_max == tp->snd_una) &&
11175             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
11176             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
11177                 /*
11178                  * Here the rwnd is less than
11179                  * the pacing size, we are established,
11180                  * nothing is outstanding, and there is
11181                  * data to send. Enter persists.
11182                  */
11183                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
11184         }
11185         /*
11186          * If last ACK falls within this segment's sequence numbers, record
11187          * the timestamp. NOTE that the test is modified according to the
11188          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
11189          */
11190         if ((to->to_flags & TOF_TS) != 0 &&
11191             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
11192                 tp->ts_recent_age = tcp_ts_getticks();
11193                 tp->ts_recent = to->to_tsval;
11194         }
11195         /*
11196          * This is a pure ack for outstanding data.
11197          */
11198         KMOD_TCPSTAT_INC(tcps_predack);
11199
11200         /*
11201          * "bad retransmit" recovery.
11202          */
11203         if ((tp->t_flags & TF_PREVVALID) &&
11204             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
11205                 tp->t_flags &= ~TF_PREVVALID;
11206                 if (tp->t_rxtshift == 1 &&
11207                     (int)(ticks - tp->t_badrxtwin) < 0)
11208                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
11209         }
11210         /*
11211          * Recalculate the transmit timer / rtt.
11212          *
11213          * Some boxes send broken timestamp replies during the SYN+ACK
11214          * phase, ignore timestamps of 0 or we could calculate a huge RTT
11215          * and blow up the retransmit timer.
11216          */
11217         acked = BYTES_THIS_ACK(tp, th);
11218
11219 #ifdef TCP_HHOOK
11220         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
11221         hhook_run_tcp_est_in(tp, th, to);
11222 #endif
11223         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
11224         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
11225         if (acked) {
11226                 struct mbuf *mfree;
11227
11228                 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
11229                 SOCKBUF_LOCK(&so->so_snd);
11230                 mfree = sbcut_locked(&so->so_snd, acked);
11231                 tp->snd_una = th->th_ack;
11232                 /* Note we want to hold the sb lock through the sendmap adjust */
11233                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
11234                 /* Wake up the socket if we have room to write more */
11235                 rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
11236                 sowwakeup_locked(so);
11237                 m_freem(mfree);
11238                 tp->t_rxtshift = 0;
11239                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11240                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11241                 rack->rc_tlp_in_progress = 0;
11242                 rack->r_ctl.rc_tlp_cnt_out = 0;
11243                 /*
11244                  * If it is the RXT timer we want to
11245                  * stop it, so we can restart a TLP.
11246                  */
11247                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
11248                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11249 #ifdef NETFLIX_HTTP_LOGGING
11250                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
11251 #endif
11252         }
11253         /*
11254          * Let the congestion control algorithm update congestion control
11255          * related information. This typically means increasing the
11256          * congestion window.
11257          */
11258         if (tp->snd_wnd < ctf_outstanding(tp)) {
11259                 /* The peer collapsed the window */
11260                 rack_collapsed_window(rack);
11261         } else if (rack->rc_has_collapsed)
11262                 rack_un_collapse_window(rack);
11263
11264         /*
11265          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
11266          */
11267         tp->snd_wl2 = th->th_ack;
11268         tp->t_dupacks = 0;
11269         m_freem(m);
11270         /* ND6_HINT(tp);         *//* Some progress has been made. */
11271
11272         /*
11273          * If all outstanding data are acked, stop retransmit timer,
11274          * otherwise restart timer using current (possibly backed-off)
11275          * value. If process is waiting for space, wakeup/selwakeup/signal.
11276          * If data are ready to send, let tcp_output decide between more
11277          * output or persist.
11278          */
11279 #ifdef TCPDEBUG
11280         if (so->so_options & SO_DEBUG)
11281                 tcp_trace(TA_INPUT, ostate, tp,
11282                     (void *)tcp_saveipgen,
11283                     &tcp_savetcp, 0);
11284 #endif
11285         if (under_pacing &&
11286             (rack->use_fixed_rate == 0) &&
11287             (rack->in_probe_rtt == 0) &&
11288             rack->rc_gp_dyn_mul &&
11289             rack->rc_always_pace) {
11290                 /* Check if we are dragging bottom */
11291                 rack_check_bottom_drag(tp, rack, so, acked);
11292         }
11293         if (tp->snd_una == tp->snd_max) {
11294                 tp->t_flags &= ~TF_PREVVALID;
11295                 rack->r_ctl.retran_during_recovery = 0;
11296                 rack->r_ctl.dsack_byte_cnt = 0;
11297                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
11298                 if (rack->r_ctl.rc_went_idle_time == 0)
11299                         rack->r_ctl.rc_went_idle_time = 1;
11300                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
11301                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
11302                         tp->t_acktime = 0;
11303                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11304         }
11305         if (acked && rack->r_fast_output)
11306                 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
11307         if (sbavail(&so->so_snd)) {
11308                 rack->r_wanted_output = 1;
11309         }
11310         return (1);
11311 }
11312
11313 /*
11314  * Return value of 1, the TCB is unlocked and most
11315  * likely gone, return value of 0, the TCP is still
11316  * locked.
11317  */
11318 static int
11319 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
11320     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11321     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11322 {
11323         int32_t ret_val = 0;
11324         int32_t todrop;
11325         int32_t ourfinisacked = 0;
11326         struct tcp_rack *rack;
11327
11328         ctf_calc_rwin(so, tp);
11329         /*
11330          * If the state is SYN_SENT: if the segment contains an ACK,
11331          * but not for our SYN, drop the input. If it contains a RST,
11332          * drop the connection. If it does not contain a SYN, drop it.
11333          * Otherwise this is an acceptable SYN segment: initialize
11334          * tp->rcv_nxt and tp->irs; if it contains an ACK, advance
11335          * tp->snd_una; if it contains an ECE and ECN support is
11336          * enabled, the stream is ECN capable. If the SYN has been
11337          * acked, change to ESTABLISHED, else to SYN_RCVD; arrange for
11338          * the segment to be acked (eventually); continue processing data/controls.
11339          */
11340         if ((thflags & TH_ACK) &&
11341             (SEQ_LEQ(th->th_ack, tp->iss) ||
11342             SEQ_GT(th->th_ack, tp->snd_max))) {
11343                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11344                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11345                 return (1);
11346         }
11347         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
11348                 TCP_PROBE5(connect__refused, NULL, tp,
11349                     mtod(m, const char *), tp, th);
11350                 tp = tcp_drop(tp, ECONNREFUSED);
11351                 ctf_do_drop(m, tp);
11352                 return (1);
11353         }
11354         if (thflags & TH_RST) {
11355                 ctf_do_drop(m, tp);
11356                 return (1);
11357         }
11358         if (!(thflags & TH_SYN)) {
11359                 ctf_do_drop(m, tp);
11360                 return (1);
11361         }
11362         tp->irs = th->th_seq;
11363         tcp_rcvseqinit(tp);
11364         rack = (struct tcp_rack *)tp->t_fb_ptr;
11365         if (thflags & TH_ACK) {
11366                 int tfo_partial = 0;
11367
11368                 KMOD_TCPSTAT_INC(tcps_connects);
11369                 soisconnected(so);
11370 #ifdef MAC
11371                 mac_socketpeer_set_from_mbuf(m, so);
11372 #endif
11373                 /* Do window scaling on this connection? */
11374                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11375                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11376                         tp->rcv_scale = tp->request_r_scale;
11377                 }
11378                 tp->rcv_adv += min(tp->rcv_wnd,
11379                     TCP_MAXWIN << tp->rcv_scale);
11380                 /*
11381                  * If not all the data that was sent in the TFO SYN
11382                  * has been acked, resend the remainder right away.
11383                  */
11384                 if (IS_FASTOPEN(tp->t_flags) &&
11385                     (tp->snd_una != tp->snd_max)) {
11386                         tp->snd_nxt = th->th_ack;
11387                         tfo_partial = 1;
11388                 }
11389                 /*
11390                  * If there's data, delay the ACK; if there's also a FIN, ACKNOW
11391                  * will be turned on later.
11392                  */
11393                 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
11394                         rack_timer_cancel(tp, rack,
11395                                           rack->r_ctl.rc_rcvtime, __LINE__);
11396                         tp->t_flags |= TF_DELACK;
11397                 } else {
11398                         rack->r_wanted_output = 1;
11399                         tp->t_flags |= TF_ACKNOW;
11400                         rack->rc_dack_toggle = 0;
11401                 }
11402                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
11403                     (V_tcp_do_ecn == 1)) {
11404                         tp->t_flags2 |= TF2_ECN_PERMIT;
11405                         KMOD_TCPSTAT_INC(tcps_ecn_shs);
11406                 }
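                /*
                 * Annotation (sketch, not stack code): per RFC 3168, an
                 * ECN-capable passive opener answers the ECE|CWR we put in
                 * our SYN with a SYN|ACK carrying ECE alone, which is why
                 * only that exact flag combination enables ECN here:
                 *
                 *   // ECE set and CWR clear => peer negotiated ECN
                 *   if ((thflags & (TH_CWR | TH_ECE)) == TH_ECE)
                 *           peer_is_ecn_capable = 1;  // hypothetical flag
                 */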
11407                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
11408                         /*
11409                          * We advance snd_una for the
11410                          * fast open case. If th_ack is
11411                          * acknowledging data beyond
11412                          * snd_una we can't just call
11413                          * ack-processing since the
11414                          * data stream in our send-map
11415                          * will start at snd_una + 1 (one
11416                          * beyond the SYN). If it's just
11417                          * equal we don't need to do that
11418                          * and there is no send_map.
11419                          */
11420                         tp->snd_una++;
11421                 }
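                /*
                 * Worked example (annotation only, assumed numbers): the SYN
                 * consumes one sequence number. With iss = 100, the SYN sits
                 * at seq 100, any TFO payload starts at 101, and the
                 * send-map also begins at 101. A plain SYN|ACK acks 101, so
                 * bumping snd_una from 100 to 101 fully accounts for it; a
                 * TFO SYN|ACK might ack 151, in which case the regular ack
                 * processing below consumes bytes 101..150 from the send-map
                 * after the bump.
                 */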
11422                 /*
11423                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
11424                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
11425                  */
11426                 tp->t_starttime = ticks;
11427                 if (tp->t_flags & TF_NEEDFIN) {
11428                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
11429                         tp->t_flags &= ~TF_NEEDFIN;
11430                         thflags &= ~TH_SYN;
11431                 } else {
11432                         tcp_state_change(tp, TCPS_ESTABLISHED);
11433                         TCP_PROBE5(connect__established, NULL, tp,
11434                             mtod(m, const char *), tp, th);
11435                         rack_cc_conn_init(tp);
11436                 }
11437         } else {
11438                 /*
11439                  * Received initial SYN in SYN-SENT[*] state => simultaneous
11440          * open.  If the segment contains a CC option and there is a
11441          * cached CC, apply the TAO test; if it succeeds, the connection
11442          * is half-synchronized. Otherwise, do a 3-way handshake:
11443          * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
11444          * there was no CC option, clear the cached CC value.
11445                  */
11446                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
11447                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
11448         }
11449         INP_WLOCK_ASSERT(tp->t_inpcb);
11450         /*
11451          * Advance th->th_seq to correspond to first data byte. If data,
11452          * trim to stay within window, dropping FIN if necessary.
11453          */
11454         th->th_seq++;
11455         if (tlen > tp->rcv_wnd) {
11456                 todrop = tlen - tp->rcv_wnd;
11457                 m_adj(m, -todrop);
11458                 tlen = tp->rcv_wnd;
11459                 thflags &= ~TH_FIN;
11460                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
11461                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
11462         }
11463         tp->snd_wl1 = th->th_seq - 1;
11464         tp->rcv_up = th->th_seq;
11465         /*
11466          * Client side of transaction: already sent SYN and data. If the
11467          * remote host used T/TCP to validate the SYN, our data will be
11468          * ACK'd; if so, enter normal data segment processing in the middle
11469          * of step 5, ack processing. Otherwise, goto step 6.
11470          */
11471         if (thflags & TH_ACK) {
11472                 /* For syn-sent we need to possibly update the rtt */
11473                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11474                         uint32_t t, mcts;
11475
11476                         mcts = tcp_ts_getticks();
11477                         t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
11478                         if (!tp->t_rttlow || tp->t_rttlow > t)
11479                                 tp->t_rttlow = t;
11480                         rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
11481                         tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11482                         tcp_rack_xmit_timer_commit(rack, tp);
11483                 }
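                /*
                 * Annotation (sketch, assumed example values): to->to_tsecr
                 * echoes our own tcp_ts_getticks() stamp from the SYN, so
                 * the sample is a millisecond-tick delta scaled to usecs:
                 *
                 *   uint32_t mcts = 5010;   // now, in ms ticks (assumed)
                 *   uint32_t tsecr = 5000;  // echoed stamp from our SYN
                 *   uint32_t t = (mcts - tsecr) * HPTS_USEC_IN_MSEC;
                 *   // t == 10 * 1000 == 10000 usec, i.e. a 10 ms RTT;
                 *   // t + 1 is passed to tcp_rack_xmit_timer(), keeping
                 *   // a same-tick reply from producing a zero sample.
                 */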
11484                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
11485                         return (ret_val);
11486                 /* We may have changed to FIN_WAIT_1 above */
11487                 if (tp->t_state == TCPS_FIN_WAIT_1) {
11488                         /*
11489                          * In FIN_WAIT_1 STATE in addition to the processing
11490                          * for the ESTABLISHED state if our FIN is now
11491                          * acknowledged then enter FIN_WAIT_2.
11492                          */
11493                         if (ourfinisacked) {
11494                                 /*
11495                                  * If we can't receive any more data, then
11496                                  * closing user can proceed. Starting the
11497                                  * timer is contrary to the specification,
11498                                  * but if we don't get a FIN we'll hang
11499                                  * forever.
11500                                  *
11501                                  * XXXjl: we should release the tp also, and
11502                                  * use a compressed state.
11503                                  */
11504                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11505                                         soisdisconnected(so);
11506                                         tcp_timer_activate(tp, TT_2MSL,
11507                                             (tcp_fast_finwait2_recycle ?
11508                                             tcp_finwait2_timeout :
11509                                             TP_MAXIDLE(tp)));
11510                                 }
11511                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
11512                         }
11513                 }
11514         }
11515         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11516            tiwin, thflags, nxt_pkt));
11517 }
11518
11519 /*
11520  * A return value of 1 means the TCB is unlocked and
11521  * most likely gone; a return value of 0 means the TCP
11522  * is still locked.
11523  */
11524 static int
11525 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
11526     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11527     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11528 {
11529         struct tcp_rack *rack;
11530         int32_t ret_val = 0;
11531         int32_t ourfinisacked = 0;
11532
11533         ctf_calc_rwin(so, tp);
11534         if ((thflags & TH_ACK) &&
11535             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
11536             SEQ_GT(th->th_ack, tp->snd_max))) {
11537                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11538                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11539                 return (1);
11540         }
11541         rack = (struct tcp_rack *)tp->t_fb_ptr;
11542         if (IS_FASTOPEN(tp->t_flags)) {
11543                 /*
11544                  * When a TFO connection is in SYN_RECEIVED, the
11545                  * only valid packets are the initial SYN, a
11546                  * retransmit/copy of the initial SYN (possibly with
11547                  * a subset of the original data), a valid ACK, a
11548                  * FIN, or a RST.
11549                  */
11550                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
11551                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11552                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11553                         return (1);
11554                 } else if (thflags & TH_SYN) {
11555                         /* non-initial SYN is ignored */
11556                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
11557                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
11558                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
11559                                 ctf_do_drop(m, NULL);
11560                                 return (0);
11561                         }
11562                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
11563                         ctf_do_drop(m, NULL);
11564                         return (0);
11565                 }
11566         }
11567         if ((thflags & TH_RST) ||
11568             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11569                 return (ctf_process_rst(m, th, so, tp));
11570         /*
11571          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11572          * it's less than ts_recent, drop it.
11573          */
11574         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11575             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11576                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11577                         return (ret_val);
11578         }
11579         /*
11580          * In the SYN-RECEIVED state, validate that the packet belongs to
11581          * this connection before trimming the data to fit the receive
11582          * window.  Check the sequence number versus IRS since we know the
11583          * sequence numbers haven't wrapped.  This is a partial fix for the
11584          * "LAND" DoS attack.
11585          */
11586         if (SEQ_LT(th->th_seq, tp->irs)) {
11587                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11588                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11589                 return (1);
11590         }
11591         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11592                               &rack->r_ctl.challenge_ack_ts,
11593                               &rack->r_ctl.challenge_ack_cnt)) {
11594                 return (ret_val);
11595         }
11596         /*
11597          * If last ACK falls within this segment's sequence numbers, record
11598          * its timestamp. NOTE: 1) That the test incorporates suggestions
11599          * from the latest proposal of the tcplw@cray.com list (Braden
11600          * 1993/04/26). 2) That updating only on newer timestamps interferes
11601          * with our earlier PAWS tests, so this check should be solely
11602          * predicated on the sequence space of this segment. 3) That we
11603          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11604          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11605          * SEG.Len. This modified check allows us to overcome RFC1323's
11606          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11607          * p.869. In such cases, we can still calculate the RTT correctly
11608          * when RCV.NXT == Last.ACK.Sent.
11609          */
11610         if ((to->to_flags & TOF_TS) != 0 &&
11611             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11612             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11613             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11614                 tp->ts_recent_age = tcp_ts_getticks();
11615                 tp->ts_recent = to->to_tsval;
11616         }
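        /*
         * Worked example (annotation only): with last_ack_sent == 1000 and
         * an in-window pure ACK (th_seq == 1000, tlen == 0, no SYN/FIN),
         * RFC1323's strict test Last.ACK.Sent < SEG.SEQ + SEG.Len is
         * 1000 < 1000 and fails, while the modified test
         * Last.ACK.Sent <= SEG.SEQ + SEG.Len is 1000 <= 1000 and passes,
         * so ts_recent stays fresh when RCV.NXT == Last.ACK.Sent.
         */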
11617         tp->snd_wnd = tiwin;
11618         rack_validate_fo_sendwin_up(tp, rack);
11619         /*
11620          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11621          * is on (half-synchronized state), then queue data for later
11622          * processing; else drop segment and return.
11623          */
11624         if ((thflags & TH_ACK) == 0) {
11625                 if (IS_FASTOPEN(tp->t_flags)) {
11626                         rack_cc_conn_init(tp);
11627                 }
11628                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11629                     tiwin, thflags, nxt_pkt));
11630         }
11631         KMOD_TCPSTAT_INC(tcps_connects);
11632         soisconnected(so);
11633         /* Do window scaling? */
11634         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11635             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11636                 tp->rcv_scale = tp->request_r_scale;
11637         }
11638         /*
11639          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
11640          * FIN-WAIT-1
11641          */
11642         tp->t_starttime = ticks;
11643         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
11644                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
11645                 tp->t_tfo_pending = NULL;
11646         }
11647         if (tp->t_flags & TF_NEEDFIN) {
11648                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
11649                 tp->t_flags &= ~TF_NEEDFIN;
11650         } else {
11651                 tcp_state_change(tp, TCPS_ESTABLISHED);
11652                 TCP_PROBE5(accept__established, NULL, tp,
11653                     mtod(m, const char *), tp, th);
11654                 /*
11655                  * TFO connections call cc_conn_init() during SYN
11656                  * processing.  Calling it again here for such connections
11657                  * is not harmless as it would undo the snd_cwnd reduction
11658                  * that occurs when a TFO SYN|ACK is retransmitted.
11659                  */
11660                 if (!IS_FASTOPEN(tp->t_flags))
11661                         rack_cc_conn_init(tp);
11662         }
11663         /*
11664          * Account for the ACK of our SYN prior to
11665          * regular ACK processing below, except for
11666          * simultaneous SYN, which is handled later.
11667          */
11668         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
11669                 tp->snd_una++;
11670         /*
11671          * If segment contains data or ACK, will call tcp_reass() later; if
11672          * not, do so now to pass queued data to user.
11673          */
11674         if (tlen == 0 && (thflags & TH_FIN) == 0) {
11675                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
11676                     (struct mbuf *)0);
11677                 if (tp->t_flags & TF_WAKESOR) {
11678                         tp->t_flags &= ~TF_WAKESOR;
11679                         /* NB: sorwakeup_locked() does an implicit unlock. */
11680                         sorwakeup_locked(so);
11681                 }
11682         }
11683         tp->snd_wl1 = th->th_seq - 1;
11684         /* For syn-recv we need to possibly update the rtt */
11685         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11686                 uint32_t t, mcts;
11687
11688                 mcts = tcp_ts_getticks();
11689                 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
11690                 if (!tp->t_rttlow || tp->t_rttlow > t)
11691                         tp->t_rttlow = t;
11692                 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
11693                 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11694                 tcp_rack_xmit_timer_commit(rack, tp);
11695         }
11696         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
11697                 return (ret_val);
11698         }
11699         if (tp->t_state == TCPS_FIN_WAIT_1) {
11700                 /* We could have gone to FIN_WAIT_1 (or EST) above */
11701                 /*
11702                  * In FIN_WAIT_1 STATE in addition to the processing for the
11703                  * ESTABLISHED state if our FIN is now acknowledged then
11704                  * enter FIN_WAIT_2.
11705                  */
11706                 if (ourfinisacked) {
11707                         /*
11708                          * If we can't receive any more data, then closing
11709                          * user can proceed. Starting the timer is contrary
11710                          * to the specification, but if we don't get a FIN
11711                          * we'll hang forever.
11712                          *
11713                          * XXXjl: we should release the tp also, and use a
11714                          * compressed state.
11715                          */
11716                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11717                                 soisdisconnected(so);
11718                                 tcp_timer_activate(tp, TT_2MSL,
11719                                     (tcp_fast_finwait2_recycle ?
11720                                     tcp_finwait2_timeout :
11721                                     TP_MAXIDLE(tp)));
11722                         }
11723                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
11724                 }
11725         }
11726         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11727             tiwin, thflags, nxt_pkt));
11728 }
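
/*
 * Usage sketch (annotation only; 'handler' is a hypothetical stand-in for
 * any of the rack_do_*() state functions): a dispatcher must honor the
 * return convention documented on these handlers and stop touching tp
 * whenever 1 comes back:
 *
 *   if ((*handler)(m, th, so, tp, &to, drop_hdrlen, tlen,
 *                  tiwin, thflags, nxt_pkt, iptos)) {
 *           return;                 // TCB unlocked; tp may be gone
 *   }
 *   INP_WLOCK_ASSERT(tp->t_inpcb);  // on 0 the lock is still held
 */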
11729
11730 /*
11731  * A return value of 1 means the TCB is unlocked and
11732  * most likely gone; a return value of 0 means the TCP
11733  * is still locked.
11734  */
11735 static int
11736 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
11737     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11738     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11739 {
11740         int32_t ret_val = 0;
11741         struct tcp_rack *rack;
11742
11743         /*
11744          * Header prediction: check for the two common cases of a
11745          * uni-directional data xfer.  If the packet has no control flags,
11746          * is in-sequence, the window didn't change and we're not
11747          * retransmitting, it's a candidate.  If the length is zero and the
11748          * ack moved forward, we're the sender side of the xfer.  Just free
11749          * the data acked & wake any higher level process that was blocked
11750          * waiting for space.  If the length is non-zero and the ack didn't
11751          * move, we're the receiver side.  If we're getting packets in-order
11752          * (the reassembly queue is empty), add the data to the socket
11753          * buffer and note that we need a delayed ack. Make sure that the
11754          * hidden state-flags are also off. Since we check for
11755          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
11756          */
11757         rack = (struct tcp_rack *)tp->t_fb_ptr;
11758         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
11759             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
11760             __predict_true(SEGQ_EMPTY(tp)) &&
11761             __predict_true(th->th_seq == tp->rcv_nxt)) {
11762                 if (tlen == 0) {
11763                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
11764                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
11765                                 return (0);
11766                         }
11767                 } else {
11768                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
11769                             tiwin, nxt_pkt, iptos)) {
11770                                 return (0);
11771                         }
11772                 }
11773         }
11774         ctf_calc_rwin(so, tp);
11775
11776         if ((thflags & TH_RST) ||
11777             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11778                 return (ctf_process_rst(m, th, so, tp));
11779
11780         /*
11781          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11782          * synchronized state.
11783          */
11784         if (thflags & TH_SYN) {
11785                 ctf_challenge_ack(m, th, tp, &ret_val);
11786                 return (ret_val);
11787         }
11788         /*
11789          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11790          * it's less than ts_recent, drop it.
11791          */
11792         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11793             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11794                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11795                         return (ret_val);
11796         }
11797         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11798                               &rack->r_ctl.challenge_ack_ts,
11799                               &rack->r_ctl.challenge_ack_cnt)) {
11800                 return (ret_val);
11801         }
11802         /*
11803          * If last ACK falls within this segment's sequence numbers, record
11804          * its timestamp. NOTE: 1) That the test incorporates suggestions
11805          * from the latest proposal of the tcplw@cray.com list (Braden
11806          * 1993/04/26). 2) That updating only on newer timestamps interferes
11807          * with our earlier PAWS tests, so this check should be solely
11808          * predicated on the sequence space of this segment. 3) That we
11809          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11810          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11811          * SEG.Len. This modified check allows us to overcome RFC1323's
11812          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11813          * p.869. In such cases, we can still calculate the RTT correctly
11814          * when RCV.NXT == Last.ACK.Sent.
11815          */
11816         if ((to->to_flags & TOF_TS) != 0 &&
11817             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11818             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11819             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11820                 tp->ts_recent_age = tcp_ts_getticks();
11821                 tp->ts_recent = to->to_tsval;
11822         }
11823         /*
11824          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11825          * is on (half-synchronized state), then queue data for later
11826          * processing; else drop segment and return.
11827          */
11828         if ((thflags & TH_ACK) == 0) {
11829                 if (tp->t_flags & TF_NEEDSYN) {
11830                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11831                             tiwin, thflags, nxt_pkt));
11832
11833                 } else if (tp->t_flags & TF_ACKNOW) {
11834                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11835                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11836                         return (ret_val);
11837                 } else {
11838                         ctf_do_drop(m, NULL);
11839                         return (0);
11840                 }
11841         }
11842         /*
11843          * Ack processing.
11844          */
11845         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11846                 return (ret_val);
11847         }
11848         if (sbavail(&so->so_snd)) {
11849                 if (ctf_progress_timeout_check(tp, true)) {
11850                         rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
11851                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
11852                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11853                         return (1);
11854                 }
11855         }
11856         /* State changes only happen in rack_process_data() */
11857         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11858             tiwin, thflags, nxt_pkt));
11859 }
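
/*
 * Annotation (sketch): the fast-path gate at the top of
 * rack_do_established() distills header prediction into four cheap tests;
 * a hypothetical helper expressing the same predicate would be:
 *
 *   static inline int
 *   rack_hdr_predict_ok(struct tcpcb *tp, struct tcpopt *to,
 *       int32_t thflags, struct tcphdr *th)
 *   {
 *           return (((to->to_flags & TOF_SACK) == 0) &&    // no SACK work
 *               ((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
 *               SEGQ_EMPTY(tp) &&                 // nothing queued out of order
 *               (th->th_seq == tp->rcv_nxt));     // exactly the next segment
 *   }
 */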
11860
11861 /*
11862  * A return value of 1 means the TCB is unlocked and
11863  * most likely gone; a return value of 0 means the TCP
11864  * is still locked.
11865  */
11866 static int
11867 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
11868     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11869     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11870 {
11871         int32_t ret_val = 0;
11872         struct tcp_rack *rack;
11873
11874         rack = (struct tcp_rack *)tp->t_fb_ptr;
11875         ctf_calc_rwin(so, tp);
11876         if ((thflags & TH_RST) ||
11877             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11878                 return (ctf_process_rst(m, th, so, tp));
11879         /*
11880          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11881          * synchronized state.
11882          */
11883         if (thflags & TH_SYN) {
11884                 ctf_challenge_ack(m, th, tp, &ret_val);
11885                 return (ret_val);
11886         }
11887         /*
11888          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11889          * it's less than ts_recent, drop it.
11890          */
11891         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11892             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11893                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11894                         return (ret_val);
11895         }
11896         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11897                               &rack->r_ctl.challenge_ack_ts,
11898                               &rack->r_ctl.challenge_ack_cnt)) {
11899                 return (ret_val);
11900         }
11901         /*
11902          * If last ACK falls within this segment's sequence numbers, record
11903          * its timestamp. NOTE: 1) That the test incorporates suggestions
11904          * from the latest proposal of the tcplw@cray.com list (Braden
11905          * 1993/04/26). 2) That updating only on newer timestamps interferes
11906          * with our earlier PAWS tests, so this check should be solely
11907          * predicated on the sequence space of this segment. 3) That we
11908          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11909          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11910          * SEG.Len. This modified check allows us to overcome RFC1323's
11911          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11912          * p.869. In such cases, we can still calculate the RTT correctly
11913          * when RCV.NXT == Last.ACK.Sent.
11914          */
11915         if ((to->to_flags & TOF_TS) != 0 &&
11916             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11917             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11918             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11919                 tp->ts_recent_age = tcp_ts_getticks();
11920                 tp->ts_recent = to->to_tsval;
11921         }
11922         /*
11923          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11924          * is on (half-synchronized state), then queue data for later
11925          * processing; else drop segment and return.
11926          */
11927         if ((thflags & TH_ACK) == 0) {
11928                 if (tp->t_flags & TF_NEEDSYN) {
11929                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11930                             tiwin, thflags, nxt_pkt));
11931
11932                 } else if (tp->t_flags & TF_ACKNOW) {
11933                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11934                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11935                         return (ret_val);
11936                 } else {
11937                         ctf_do_drop(m, NULL);
11938                         return (0);
11939                 }
11940         }
11941         /*
11942          * Ack processing.
11943          */
11944         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11945                 return (ret_val);
11946         }
11947         if (sbavail(&so->so_snd)) {
11948                 if (ctf_progress_timeout_check(tp, true)) {
11949                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
11950                                                 tp, tick, PROGRESS_DROP, __LINE__);
11951                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
11952                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11953                         return (1);
11954                 }
11955         }
11956         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11957             tiwin, thflags, nxt_pkt));
11958 }
11959
11960 static int
11961 rack_check_data_after_close(struct mbuf *m,
11962     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
11963 {
11964         struct tcp_rack *rack;
11965
11966         rack = (struct tcp_rack *)tp->t_fb_ptr;
11967         if (rack->rc_allow_data_af_clo == 0) {
11968         close_now:
11969                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11970                 /* tcp_close() will kill the inp; pre-log the reset */
11971                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
11972                 tp = tcp_close(tp);
11973                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
11974                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
11975                 return (1);
11976         }
11977         if (sbavail(&so->so_snd) == 0)
11978                 goto close_now;
11979         /* Ok, we allow data that is ignored, and a follow-up reset */
11980         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11981         tp->rcv_nxt = th->th_seq + *tlen;
11982         tp->t_flags2 |= TF2_DROP_AF_DATA;
11983         rack->r_wanted_output = 1;
11984         *tlen = 0;
11985         return (0);
11986 }
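
/*
 * Usage sketch (annotation only): callers invoke
 * rack_check_data_after_close() only when payload arrives after the user
 * has closed, and treat its return like the state handlers' convention:
 *
 *   if ((so->so_state & SS_NOFDREF) && tlen) {
 *           if (rack_check_data_after_close(m, tp, &tlen, th, so))
 *                   return (1);     // tp was tcp_close()d, reset sent
 *           // otherwise tlen is now 0 and processing continues
 *   }
 */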
11987
11988 /*
11989  * A return value of 1 means the TCB is unlocked and
11990  * most likely gone; a return value of 0 means the TCP
11991  * is still locked.
11992  */
11993 static int
11994 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
11995     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11996     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11997 {
11998         int32_t ret_val = 0;
11999         int32_t ourfinisacked = 0;
12000         struct tcp_rack *rack;
12001
12002         rack = (struct tcp_rack *)tp->t_fb_ptr;
12003         ctf_calc_rwin(so, tp);
12004
12005         if ((thflags & TH_RST) ||
12006             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12007                 return (ctf_process_rst(m, th, so, tp));
12008         /*
12009          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12010          * synchronized state.
12011          */
12012         if (thflags & TH_SYN) {
12013                 ctf_challenge_ack(m, th, tp, &ret_val);
12014                 return (ret_val);
12015         }
12016         /*
12017          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12018          * it's less than ts_recent, drop it.
12019          */
12020         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12021             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12022                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12023                         return (ret_val);
12024         }
12025         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12026                               &rack->r_ctl.challenge_ack_ts,
12027                               &rack->r_ctl.challenge_ack_cnt)) {
12028                 return (ret_val);
12029         }
12030         /*
12031          * If new data are received on a connection after the user processes
12032          * are gone, then RST the other end.
12033          */
12034         if ((so->so_state & SS_NOFDREF) && tlen) {
12035                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12036                         return (1);
12037         }
12038         /*
12039          * If last ACK falls within this segment's sequence numbers, record
12040          * its timestamp. NOTE: 1) That the test incorporates suggestions
12041          * from the latest proposal of the tcplw@cray.com list (Braden
12042          * 1993/04/26). 2) That updating only on newer timestamps interferes
12043          * with our earlier PAWS tests, so this check should be solely
12044          * predicated on the sequence space of this segment. 3) That we
12045          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12046          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12047          * SEG.Len. This modified check allows us to overcome RFC1323's
12048          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12049          * p.869. In such cases, we can still calculate the RTT correctly
12050          * when RCV.NXT == Last.ACK.Sent.
12051          */
12052         if ((to->to_flags & TOF_TS) != 0 &&
12053             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12054             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12055             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12056                 tp->ts_recent_age = tcp_ts_getticks();
12057                 tp->ts_recent = to->to_tsval;
12058         }
12059         /*
12060          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12061          * is on (half-synchronized state), then queue data for later
12062          * processing; else drop segment and return.
12063          */
12064         if ((thflags & TH_ACK) == 0) {
12065                 if (tp->t_flags & TF_NEEDSYN) {
12066                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12067                             tiwin, thflags, nxt_pkt));
12068                 } else if (tp->t_flags & TF_ACKNOW) {
12069                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12070                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12071                         return (ret_val);
12072                 } else {
12073                         ctf_do_drop(m, NULL);
12074                         return (0);
12075                 }
12076         }
12077         /*
12078          * Ack processing.
12079          */
12080         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12081                 return (ret_val);
12082         }
12083         if (ourfinisacked) {
12084                 /*
12085                  * If we can't receive any more data, then closing user can
12086                  * proceed. Starting the timer is contrary to the
12087                  * specification, but if we don't get a FIN we'll hang
12088                  * forever.
12089                  *
12090                  * XXXjl: we should release the tp also, and use a
12091                  * compressed state.
12092                  */
12093                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12094                         soisdisconnected(so);
12095                         tcp_timer_activate(tp, TT_2MSL,
12096                             (tcp_fast_finwait2_recycle ?
12097                             tcp_finwait2_timeout :
12098                             TP_MAXIDLE(tp)));
12099                 }
12100                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
12101         }
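        /*
         * Annotation (sketch): the guard timeout armed above comes from
         * two knobs:
         *
         *   timeout = tcp_fast_finwait2_recycle ?
         *       tcp_finwait2_timeout :  // aggressive FIN_WAIT_2 recycling
         *       TP_MAXIDLE(tp);         // the connection's max idle time
         *
         * It runs on the TT_2MSL slot, so a socket that can no longer
         * receive cannot sit in FIN_WAIT_2 forever waiting for a FIN.
         */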
12102         if (sbavail(&so->so_snd)) {
12103                 if (ctf_progress_timeout_check(tp, true)) {
12104                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12105                                                 tp, tick, PROGRESS_DROP, __LINE__);
12106                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12107                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12108                         return (1);
12109                 }
12110         }
12111         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12112             tiwin, thflags, nxt_pkt));
12113 }
12114
12115 /*
12116  * A return value of 1 means the TCB is unlocked and
12117  * most likely gone; a return value of 0 means the TCP
12118  * is still locked.
12119  */
12120 static int
12121 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
12122     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12123     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12124 {
12125         int32_t ret_val = 0;
12126         int32_t ourfinisacked = 0;
12127         struct tcp_rack *rack;
12128
12129         rack = (struct tcp_rack *)tp->t_fb_ptr;
12130         ctf_calc_rwin(so, tp);
12131
12132         if ((thflags & TH_RST) ||
12133             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12134                 return (ctf_process_rst(m, th, so, tp));
12135         /*
12136          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12137          * synchronized state.
12138          */
12139         if (thflags & TH_SYN) {
12140                 ctf_challenge_ack(m, th, tp, &ret_val);
12141                 return (ret_val);
12142         }
12143         /*
12144          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12145          * it's less than ts_recent, drop it.
12146          */
12147         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12148             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12149                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12150                         return (ret_val);
12151         }
12152         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12153                               &rack->r_ctl.challenge_ack_ts,
12154                               &rack->r_ctl.challenge_ack_cnt)) {
12155                 return (ret_val);
12156         }
12157         /*
12158          * If new data are received on a connection after the user processes
12159          * are gone, then RST the other end.
12160          */
12161         if ((so->so_state & SS_NOFDREF) && tlen) {
12162                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12163                         return (1);
12164         }
12165         /*
12166          * If last ACK falls within this segment's sequence numbers, record
12167          * its timestamp. NOTE: 1) That the test incorporates suggestions
12168          * from the latest proposal of the tcplw@cray.com list (Braden
12169          * 1993/04/26). 2) That updating only on newer timestamps interferes
12170          * with our earlier PAWS tests, so this check should be solely
12171          * predicated on the sequence space of this segment. 3) That we
12172          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12173          * SEG.Len. This modified check allows us to overcome RFC1323's
12174          * SEG.Len, This modified check allows us to overcome RFC1323's
12175          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12176          * p.869. In such cases, we can still calculate the RTT correctly
12177          * when RCV.NXT == Last.ACK.Sent.
12178          */
12179         if ((to->to_flags & TOF_TS) != 0 &&
12180             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12181             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12182             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12183                 tp->ts_recent_age = tcp_ts_getticks();
12184                 tp->ts_recent = to->to_tsval;
12185         }
12186         /*
12187          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12188          * is on (half-synchronized state), then queue data for later
12189          * processing; else drop segment and return.
12190          */
12191         if ((thflags & TH_ACK) == 0) {
12192                 if (tp->t_flags & TF_NEEDSYN) {
12193                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12194                             tiwin, thflags, nxt_pkt));
12195                 } else if (tp->t_flags & TF_ACKNOW) {
12196                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12197                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12198                         return (ret_val);
12199                 } else {
12200                         ctf_do_drop(m, NULL);
12201                         return (0);
12202                 }
12203         }
12204         /*
12205          * Ack processing.
12206          */
12207         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12208                 return (ret_val);
12209         }
12210         if (ourfinisacked) {
12211                 tcp_twstart(tp);
12212                 m_freem(m);
12213                 return (1);
12214         }
12215         if (sbavail(&so->so_snd)) {
12216                 if (ctf_progress_timeout_check(tp, true)) {
12217                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12218                                                 tp, tick, PROGRESS_DROP, __LINE__);
12219                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12220                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12221                         return (1);
12222                 }
12223         }
12224         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12225             tiwin, thflags, nxt_pkt));
12226 }
12227
12228 /*
12229  * A return value of 1 means the TCB is unlocked and
12230  * most likely gone; a return value of 0 means the TCP
12231  * is still locked.
12232  */
12233 static int
12234 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12235     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12236     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12237 {
12238         int32_t ret_val = 0;
12239         int32_t ourfinisacked = 0;
12240         struct tcp_rack *rack;
12241
12242         rack = (struct tcp_rack *)tp->t_fb_ptr;
12243         ctf_calc_rwin(so, tp);
12244
12245         if ((thflags & TH_RST) ||
12246             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12247                 return (ctf_process_rst(m, th, so, tp));
12248         /*
12249          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12250          * synchronized state.
12251          */
12252         if (thflags & TH_SYN) {
12253                 ctf_challenge_ack(m, th, tp, &ret_val);
12254                 return (ret_val);
12255         }
12256         /*
12257          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12258          * it's less than ts_recent, drop it.
12259          */
12260         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12261             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12262                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12263                         return (ret_val);
12264         }
12265         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12266                               &rack->r_ctl.challenge_ack_ts,
12267                               &rack->r_ctl.challenge_ack_cnt)) {
12268                 return (ret_val);
12269         }
12270         /*
12271          * If new data are received on a connection after the user processes
12272          * are gone, then RST the other end.
12273          */
12274         if ((so->so_state & SS_NOFDREF) && tlen) {
12275                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12276                         return (1);
12277         }
12278         /*
12279          * If last ACK falls within this segment's sequence numbers, record
12280          * its timestamp. NOTE: 1) That the test incorporates suggestions
12281          * from the latest proposal of the tcplw@cray.com list (Braden
12282          * 1993/04/26). 2) That updating only on newer timestamps interferes
12283          * with our earlier PAWS tests, so this check should be solely
12284          * predicated on the sequence space of this segment. 3) That we
12285          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12286          * SEG.Len. This modified check allows us to overcome RFC1323's
12287          * SEG.Len, This modified check allows us to overcome RFC1323's
12288          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12289          * p.869. In such cases, we can still calculate the RTT correctly
12290          * when RCV.NXT == Last.ACK.Sent.
12291          */
12292         if ((to->to_flags & TOF_TS) != 0 &&
12293             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12294             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12295             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12296                 tp->ts_recent_age = tcp_ts_getticks();
12297                 tp->ts_recent = to->to_tsval;
12298         }
12299         /*
12300          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12301          * is on (half-synchronized state), then queue data for later
12302          * processing; else drop segment and return.
12303          */
12304         if ((thflags & TH_ACK) == 0) {
12305                 if (tp->t_flags & TF_NEEDSYN) {
12306                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12307                             tiwin, thflags, nxt_pkt));
12308                 } else if (tp->t_flags & TF_ACKNOW) {
12309                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12310                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12311                         return (ret_val);
12312                 } else {
12313                         ctf_do_drop(m, NULL);
12314                         return (0);
12315                 }
12316         }
12317         /*
12318          * case TCPS_LAST_ACK: Ack processing.
12319          */
12320         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12321                 return (ret_val);
12322         }
12323         if (ourfinisacked) {
12324                 tp = tcp_close(tp);
12325                 ctf_do_drop(m, tp);
12326                 return (1);
12327         }
12328         if (sbavail(&so->so_snd)) {
12329                 if (ctf_progress_timeout_check(tp, true)) {
12330                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12331                                                 tp, tick, PROGRESS_DROP, __LINE__);
12332                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12333                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12334                         return (1);
12335                 }
12336         }
12337         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12338             tiwin, thflags, nxt_pkt));
12339 }
12340
12341 /*
12342  * A return value of 1 means the TCB is unlocked and
12343  * most likely gone; a return value of 0 means the TCP
12344  * is still locked.
12345  */
12346 static int
12347 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
12348     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12349     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12350 {
12351         int32_t ret_val = 0;
12352         int32_t ourfinisacked = 0;
12353         struct tcp_rack *rack;
12354
12355         rack = (struct tcp_rack *)tp->t_fb_ptr;
12356         ctf_calc_rwin(so, tp);
12357
12359         if ((thflags & TH_RST) ||
12360             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12361                 return (ctf_process_rst(m, th, so, tp));
12362         /*
12363          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12364          * synchronized state.
12365          */
12366         if (thflags & TH_SYN) {
12367                 ctf_challenge_ack(m, th, tp, &ret_val);
12368                 return (ret_val);
12369         }
12370         /*
12371          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12372          * it's less than ts_recent, drop it.
12373          */
12374         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12375             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12376                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12377                         return (ret_val);
12378         }
12379         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12380                               &rack->r_ctl.challenge_ack_ts,
12381                               &rack->r_ctl.challenge_ack_cnt)) {
12382                 return (ret_val);
12383         }
12384         /*
12385          * If new data are received on a connection after the user processes
12386          * are gone, then RST the other end.
12387          */
12388         if ((so->so_state & SS_NOFDREF) &&
12389             tlen) {
12390                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12391                         return (1);
12392         }
12393         /*
12394          * If last ACK falls within this segment's sequence numbers, record
12395          * its timestamp. NOTE: 1) That the test incorporates suggestions
12396          * from the latest proposal of the tcplw@cray.com list (Braden
12397          * 1993/04/26). 2) That updating only on newer timestamps interferes
12398          * with our earlier PAWS tests, so this check should be solely
12399          * predicated on the sequence space of this segment. 3) That we
12400          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12401          * SEG.Len. This modified check allows us to overcome RFC1323's
12402          * SEG.Len, This modified check allows us to overcome RFC1323's
12403          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12404          * p.869. In such cases, we can still calculate the RTT correctly
12405          * when RCV.NXT == Last.ACK.Sent.
12406          */
12407         if ((to->to_flags & TOF_TS) != 0 &&
12408             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12409             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12410             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12411                 tp->ts_recent_age = tcp_ts_getticks();
12412                 tp->ts_recent = to->to_tsval;
12413         }
12414         /*
12415          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12416          * is on (half-synchronized state), then queue data for later
12417          * processing; else drop segment and return.
12418          */
12419         if ((thflags & TH_ACK) == 0) {
12420                 if (tp->t_flags & TF_NEEDSYN) {
12421                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12422                             tiwin, thflags, nxt_pkt));
12423                 } else if (tp->t_flags & TF_ACKNOW) {
12424                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12425                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12426                         return (ret_val);
12427                 } else {
12428                         ctf_do_drop(m, NULL);
12429                         return (0);
12430                 }
12431         }
12432         /*
12433          * Ack processing.
12434          */
12435         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12436                 return (ret_val);
12437         }
12438         if (sbavail(&so->so_snd)) {
12439                 if (ctf_progress_timeout_check(tp, true)) {
12440                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12441                                                 tp, tick, PROGRESS_DROP, __LINE__);
12442                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12443                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12444                         return (1);
12445                 }
12446         }
12447         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12448             tiwin, thflags, nxt_pkt));
12449 }
12450
12451 static inline void
12452 rack_clear_rate_sample(struct tcp_rack *rack)
12453 {
12454         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
12455         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
12456         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
12457 }
12458
12459 static void
12460 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
12461 {
12462         uint64_t bw_est, rate_wanted;
12463         int chged = 0;
12464         uint32_t user_max, orig_min, orig_max;
12465
12466         orig_min = rack->r_ctl.rc_pace_min_segs;
12467         orig_max = rack->r_ctl.rc_pace_max_segs;
12468         user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
12469         if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
12470                 chged = 1;
12471         rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
12472         if (rack->use_fixed_rate || rack->rc_force_max_seg) {
12473                 if (user_max != rack->r_ctl.rc_pace_max_segs)
12474                         chged = 1;
12475         }
12476         if (rack->rc_force_max_seg) {
12477                 rack->r_ctl.rc_pace_max_segs = user_max;
12478         } else if (rack->use_fixed_rate) {
12479                 bw_est = rack_get_bw(rack);
12480                 if ((rack->r_ctl.crte == NULL) ||
12481                     (bw_est != rack->r_ctl.crte->rate)) {
12482                         rack->r_ctl.rc_pace_max_segs = user_max;
12483                 } else {
12484                         /* We are pacing right at the hardware rate */
12485                         uint32_t segsiz;
12486
12487                         segsiz = min(ctf_fixed_maxseg(tp),
12488                                      rack->r_ctl.rc_pace_min_segs);
12489                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
12490                                                            tp, bw_est, segsiz, 0,
12491                                                            rack->r_ctl.crte, NULL);
12492                 }
12493         } else if (rack->rc_always_pace) {
12494                 if (rack->r_ctl.gp_bw ||
12495 #ifdef NETFLIX_PEAKRATE
12496                     rack->rc_tp->t_maxpeakrate ||
12497 #endif
12498                     rack->r_ctl.init_rate) {
12499                         /* We have a rate of some sort set */
12500                         uint32_t  orig;
12501
12502                         bw_est = rack_get_bw(rack);
12503                         orig = rack->r_ctl.rc_pace_max_segs;
12504                         if (fill_override)
12505                                 rate_wanted = *fill_override;
12506                         else
12507                                 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
12508                         if (rate_wanted) {
12509                                 /* We have something */
12510                                 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
12511                                                                                    rate_wanted,
12512                                                                                    ctf_fixed_maxseg(rack->rc_tp));
12513                         } else
12514                                 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
12515                         if (orig != rack->r_ctl.rc_pace_max_segs)
12516                                 chged = 1;
12517                 } else if ((rack->r_ctl.gp_bw == 0) &&
12518                            (rack->r_ctl.rc_pace_max_segs == 0)) {
12519                         /*
12520                          * If we have nothing, limit us to bursting
12521                          * out IW-sized pieces.
12522                          */
12523                         chged = 1;
12524                         rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
12525                 }
12526         }
12527         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
12528                 chged = 1;
12529                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
12530         }
12531         if (chged)
12532                 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
12533 }
12534
12535
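      /*
       * Prebuild the constant TCP/IP (and optional UDP tunneling) header
       * template for the fast send block, so the output path can copy a
       * ready-made header instead of rebuilding one on every transmit.
       */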
12536 static void
12537 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
12538 {
12539 #ifdef INET6
12540         struct ip6_hdr *ip6 = NULL;
12541 #endif
12542 #ifdef INET
12543         struct ip *ip = NULL;
12544 #endif
12545         struct udphdr *udp = NULL;
12546
12547         /* Ok, let's fill in the fast send block; it can only be used when there are no IP options! */
12548 #ifdef INET6
12549         if (rack->r_is_v6) {
12550                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
12551                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
12552                 if (tp->t_port) {
12553                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12554                         udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
12555                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12556                         udp->uh_dport = tp->t_port;
12557                         rack->r_ctl.fsb.udp = udp;
12558                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12559                 } else
12560                 {
12561                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
12562                         rack->r_ctl.fsb.udp = NULL;
12563                 }
12564                 tcpip_fillheaders(rack->rc_inp,
12565                                   tp->t_port,
12566                                   ip6, rack->r_ctl.fsb.th);
12567         } else
12568 #endif                          /* INET6 */
12569         {
12570                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
12571                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
12572                 if (tp->t_port) {
12573                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12574                         udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
12575                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12576                         udp->uh_dport = tp->t_port;
12577                         rack->r_ctl.fsb.udp = udp;
12578                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12579                 } else
12580                 {
12581                         rack->r_ctl.fsb.udp = NULL;
12582                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
12583                 }
12584                 tcpip_fillheaders(rack->rc_inp,
12585                                   tp->t_port,
12586                                   ip, rack->r_ctl.fsb.th);
12587         }
12588         rack->r_fsb_inited = 1;
12589 }
12590
12591 static int
12592 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
12593 {
12594         /*
12595          * Allocate the larger of the two header spaces: V6 if available,
12596          * else just V4, and include a udphdr in either case (overbook).
12597          */
12598 #ifdef INET6
12599         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
12600 #else
12601         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
12602 #endif
12603         rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
12604                                             M_TCPFSB, M_NOWAIT|M_ZERO);
12605         if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
12606                 return (ENOMEM);
12607         }
12608         rack->r_fsb_inited = 0;
12609         return (0);
12610 }
12611
12612 static int
12613 rack_init(struct tcpcb *tp)
12614 {
12615         struct tcp_rack *rack = NULL;
12616         struct rack_sendmap *insret;
12617         uint32_t iwin, snt, us_cts;
12618         int err;
12619
12620         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
12621         if (tp->t_fb_ptr == NULL) {
12622                 /*
12623                  * We need to allocate memory but can't. The INP and INP_INFO
12624                  * locks are held and they are recursive (this happens during
12625                  * setup), so a scheme to drop the locks fails :(
12626                  *
12627                  */
12628                 return (ENOMEM);
12629         }
12630         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
12631
12632         rack = (struct tcp_rack *)tp->t_fb_ptr;
12633         RB_INIT(&rack->r_ctl.rc_mtree);
12634         TAILQ_INIT(&rack->r_ctl.rc_free);
12635         TAILQ_INIT(&rack->r_ctl.rc_tmap);
12636         rack->rc_tp = tp;
12637         rack->rc_inp = tp->t_inpcb;
12638         /* Set the flag */
12639         rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
12640         /* Probably not needed but let's be sure */
12641         rack_clear_rate_sample(rack);
12642         /*
12643          * Save off the default values, socket options will poke
12644          * at these if pacing is not on or we have not yet
12645          * reached where pacing is on (gp_ready/fixed enabled).
12646          * When they get set into the CC module (when gp_ready
12647          * is enabled or we enable fixed) then we will set these
12648          * values into the CC and place in here the old values
12649          * so we have a restoral. Then we will set the flag
12650          * rc_pacing_cc_set. That way whenever we turn off pacing
12651          * or switch off this stack, we will know to go restore
12652          * the saved values.
12653          */
12654         rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
12655         rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
12656         /* We want abe like behavior as well */
12657         rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
12658         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
12659         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
12660         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
12661         rack->r_ctl.roundends = tp->snd_max;
12662         if (use_rack_rr)
12663                 rack->use_rack_rr = 1;
12664         if (V_tcp_delack_enabled)
12665                 tp->t_delayed_ack = 1;
12666         else
12667                 tp->t_delayed_ack = 0;
12668 #ifdef TCP_ACCOUNTING
12669         if (rack_tcp_accounting) {
12670                 tp->t_flags2 |= TF2_TCP_ACCOUNTING;
12671         }
12672 #endif
12673         if (rack_enable_shared_cwnd)
12674                 rack->rack_enable_scwnd = 1;
12675         rack->rc_user_set_max_segs = rack_hptsi_segments;
12676         rack->rc_force_max_seg = 0;
12677         if (rack_use_imac_dack)
12678                 rack->rc_dack_mode = 1;
12679         TAILQ_INIT(&rack->r_ctl.opt_list);
12680         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
12681         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
12682         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
12683         rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
12684         rack->r_ctl.rc_highest_us_rtt = 0;
12685         rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
12686         rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
12687         if (rack_use_cmp_acks)
12688                 rack->r_use_cmp_ack = 1;
12689         if (rack_disable_prr)
12690                 rack->rack_no_prr = 1;
12691         if (rack_gp_no_rec_chg)
12692                 rack->rc_gp_no_rec_chg = 1;
12693         if (rack_pace_every_seg && tcp_can_enable_pacing()) {
12694                 rack->rc_always_pace = 1;
12695                 if (rack->use_fixed_rate || rack->gp_ready)
12696                         rack_set_cc_pacing(rack);
12697         } else
12698                 rack->rc_always_pace = 0;
12699         if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
12700                 rack->r_mbuf_queue = 1;
12701         else
12702                 rack->r_mbuf_queue = 0;
12703         if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
12704                 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
12705         else
12706                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
12707         rack_set_pace_segments(tp, rack, __LINE__, NULL);
12708         if (rack_limits_scwnd)
12709                 rack->r_limit_scw = 1;
12710         else
12711                 rack->r_limit_scw = 0;
12712         rack->rc_labc = V_tcp_abc_l_var;
12713         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
12714         rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
12715         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
12716         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
12717         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
12718         rack->r_ctl.rc_min_to = rack_min_to;
12719         microuptime(&rack->r_ctl.act_rcv_time);
12720         rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
12721         rack->r_running_late = 0;
12722         rack->r_running_early = 0;
12723         rack->rc_init_win = rack_default_init_window;
12724         rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
12725         if (rack_hw_up_only)
12726                 rack->r_up_only = 1;
12727         if (rack_do_dyn_mul) {
12728                 /* When dynamic adjustment is on, CA needs to start at 100% */
12729                 rack->rc_gp_dyn_mul = 1;
12730                 if (rack_do_dyn_mul >= 100)
12731                         rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
12732         } else
12733                 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
12734         rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
12735         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
12736         rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
12737         setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
12738                                 rack_probertt_filter_life);
12739         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
12740         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
12741         rack->r_ctl.rc_time_of_last_probertt = us_cts;
12742         rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
12743         rack->r_ctl.rc_time_probertt_starts = 0;
12744         if (rack_dsack_std_based & 0x1) {
12745                 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
12746                 rack->rc_rack_tmr_std_based = 1;
12747         }
12748         if (rack_dsack_std_based & 0x2) {
12749                 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
12750                 rack->rc_rack_use_dsack = 1;
12751         }
12752         /* We require at least one measurement, even if the sysctl is 0 */
12753         if (rack_req_measurements)
12754                 rack->r_ctl.req_measurements = rack_req_measurements;
12755         else
12756                 rack->r_ctl.req_measurements = 1;
12757         if (rack_enable_hw_pacing)
12758                 rack->rack_hdw_pace_ena = 1;
12759         if (rack_hw_rate_caps)
12760                 rack->r_rack_hw_rate_caps = 1;
12761         /* Do we force on detection? */
12762 #ifdef NETFLIX_EXP_DETECTION
12763         if (tcp_force_detection)
12764                 rack->do_detection = 1;
12765         else
12766 #endif
12767                 rack->do_detection = 0;
12768         if (rack_non_rxt_use_cr)
12769                 rack->rack_rec_nonrxt_use_cr = 1;
12770         err = rack_init_fsb(tp, rack);
12771         if (err) {
12772                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12773                 tp->t_fb_ptr = NULL;
12774                 return (err);
12775         }
12776         if (tp->snd_una != tp->snd_max) {
12777                 /* Create a send map for the current outstanding data */
12778                 struct rack_sendmap *rsm;
12779
12780                 rsm = rack_alloc(rack);
12781                 if (rsm == NULL) {
12782                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12783                         tp->t_fb_ptr = NULL;
12784                         return (ENOMEM);
12785                 }
12786                 rsm->r_no_rtt_allowed = 1;
12787                 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
12788                 rsm->r_rtr_cnt = 1;
12789                 rsm->r_rtr_bytes = 0;
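                      /*
                       * A sent FIN occupies one sequence number, so back
                       * r_end off by one and flag the map entry as having
                       * the FIN.
                       */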
12790                 if (tp->t_flags & TF_SENTFIN) {
12791                         rsm->r_end = tp->snd_max - 1;
12792                         rsm->r_flags |= RACK_HAS_FIN;
12793                 } else {
12794                         rsm->r_end = tp->snd_max;
12795                 }
12796                 if (tp->snd_una == tp->iss) {
12797                         /* The data space is one beyond snd_una */
12798                         rsm->r_flags |= RACK_HAS_SYN;
12799                         rsm->r_start = tp->iss;
12800                         rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
12801                 } else
12802                         rsm->r_start = tp->snd_una;
12803                 rsm->r_dupack = 0;
12804                 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
12805                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
12806                         if (rsm->m)
12807                                 rsm->orig_m_len = rsm->m->m_len;
12808                         else
12809                                 rsm->orig_m_len = 0;
12810                 } else {
12811                         /*
12812                          * This can happen if we have a stand-alone FIN or
12813                          *  SYN.
12814                          */
12815                         rsm->m = NULL;
12816                         rsm->orig_m_len = 0;
12817                         rsm->soff = 0;
12818                 }
12819                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12820 #ifdef INVARIANTS
12821                 if (insret != NULL) {
12822                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
12823                               insret, rack, rsm);
12824                 }
12825 #endif
12826                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
12827                 rsm->r_in_tmap = 1;
12828         }
12829         /*
12830          * Timers in Rack are kept in microseconds, so let's
12831          * convert any initial incoming variables
12832          * from ticks into usecs. Note that we
12833          * also change the values of t_srtt and t_rttvar, if
12834          * they are non-zero. They are kept with a 5
12835          * bit decimal so we have to carefully convert
12836          * these to get the full precision.
12837          */
12838         rack_convert_rtts(tp);
12839         tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
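              /*
               * If requested by the sysctl, push the hystart setting into
               * the newreno CC module through its ctl_output hook.
               */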
12840         if (rack_do_hystart) {
12841                 struct sockopt sopt;
12842                 struct cc_newreno_opts opt;
12843
12844                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
12845                 sopt.sopt_dir = SOPT_SET;
12846                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
12847                 opt.val = rack_do_hystart;
12848                 if (CC_ALGO(tp)->ctl_output != NULL)
12849                         (void)CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
12850         }
12851         if (rack_def_profile)
12852                 rack_set_profile(rack, rack_def_profile);
12853         /* Cancel the GP measurement in progress */
12854         tp->t_flags &= ~TF_GPUTINPROG;
12855         if (SEQ_GT(tp->snd_max, tp->iss))
12856                 snt = tp->snd_max - tp->iss;
12857         else
12858                 snt = 0;
12859         iwin = rc_init_window(rack);
12860         if (snt < iwin) {
12861                 /* We are not past the initial window
12862                  * so we need to make sure cwnd is
12863                  * correct.
12864                  */
12865                 if (tp->snd_cwnd < iwin)
12866                         tp->snd_cwnd = iwin;
12867                 /*
12868                  * If we are within the initial window
12869                  * we want ssthresh to be unlimited. Setting
12870                  * it to the rwnd (which the default stack does
12871                  * and older racks) is not really a good idea
12872                  * since we want to be in SS and grow both the
12873                  * cwnd and the rwnd (via dynamic rwnd growth). If
12874                  * we set it to the rwnd then as the peer grows its
12875                  * rwnd we will be stuck in CA and never hit SS.
12876                  *
12877                  * It's far better to raise it up high (this takes the
12878                  * risk that there has been a loss already; probably
12879                  * we should have an indicator of loss in all stacks,
12880                  * but we don't), but considering the normal use this
12881                  * is a risk worth taking. The consequences of not
12882                  * hitting SS are far worse than going one more time
12883                  * into it early on (before we have sent even an IW).
12884                  * It is highly unlikely that we will have had a loss
12885                  * before getting the IW out.
12886                  */
12887                 tp->snd_ssthresh = 0xffffffff;
12888         }
12889         rack_stop_all_timers(tp);
12890         /* Start the hpts pacing/output timer */
12891         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
12892         rack_log_rtt_shrinks(rack,  us_cts,  tp->t_rxtcur,
12893                              __LINE__, RACK_RTTS_INIT);
12894         return (0);
12895 }
12896
12897 static int
12898 rack_handoff_ok(struct tcpcb *tp)
12899 {
12900         if ((tp->t_state == TCPS_CLOSED) ||
12901             (tp->t_state == TCPS_LISTEN)) {
12902                 /* Sure no problem though it may not stick */
12903                 return (0);
12904         }
12905         if ((tp->t_state == TCPS_SYN_SENT) ||
12906             (tp->t_state == TCPS_SYN_RECEIVED)) {
12907                 /*
12908                  * We really don't know if the peer supports SACK;
12909                  * you have to get to ESTAB or beyond to tell.
12910                  */
12911                 return (EAGAIN);
12912         }
12913         if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
12914                 /*
12915                  * Rack will only send a FIN after all data is acknowledged.
12916                  * So in this case we have more data outstanding. We can't
12917                  * switch stacks until either all data and only the FIN
12918                  * is left (in which case rack_init() now knows how
12919                  * to deal with that) <or> all is acknowledged and we
12920                  * are only left with incoming data, though why you
12921                  * would want to switch to rack after all data is acknowledged
12922                  * I have no idea (rrs)!
12923                  */
12924                 return (EAGAIN);
12925         }
12926         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
12927                 return (0);
12928         }
12929         /*
12930          * If we reach here we don't do SACK on this connection so we can
12931          * never do rack.
12932          */
12933         return (EINVAL);
12934 }
12935
12936
12937 static void
12938 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
12939 {
12940         int ack_cmp = 0;
12941
12942         if (tp->t_fb_ptr) {
12943                 struct tcp_rack *rack;
12944                 struct rack_sendmap *rsm, *nrsm, *rm;
12945
12946                 rack = (struct tcp_rack *)tp->t_fb_ptr;
12947                 if (tp->t_in_pkt) {
12948                         /*
12949                          * It is unsafe to process the packets since a
12950                          * reset may be lurking in them (it's rare but it
12951                          * can occur). If we were to find a RST, then we
12952                          * would end up dropping the connection and the
12953                          * INP lock, so when we return the caller (tcp_usrreq)
12954                          * will blow up when it tries to unlock the inp.
12955                          */
12956                         struct mbuf *save, *m;
12957
12958                         m = tp->t_in_pkt;
12959                         tp->t_in_pkt = NULL;
12960                         tp->t_tail_pkt = NULL;
12961                         while (m) {
12962                                 save = m->m_nextpkt;
12963                                 m->m_nextpkt = NULL;
12964                                 m_freem(m);
12965                                 m = save;
12966                         }
12967                         if ((tp->t_inpcb) &&
12968                             (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP))
12969                                 ack_cmp = 1;
12970                         if (ack_cmp) {
12971                                 /* Count whether a large or small mbuf was used (ack-cmp in use). */
12972                                 if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS)
12973                                         counter_u64_add(rack_large_ackcmp, 1);
12974                                 else
12975                                         counter_u64_add(rack_small_ackcmp, 1);
12976                         }
12977                 }
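                      /* rack does not use TF_FORCEDATA, but another stack may have set it; clear it */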
12978                 tp->t_flags &= ~TF_FORCEDATA;
12979 #ifdef NETFLIX_SHARED_CWND
12980                 if (rack->r_ctl.rc_scw) {
12981                         uint32_t limit;
12982
12983                         if (rack->r_limit_scw)
12984                                 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
12985                         else
12986                                 limit = 0;
12987                         tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
12988                                                   rack->r_ctl.rc_scw_index,
12989                                                   limit);
12990                         rack->r_ctl.rc_scw = NULL;
12991                 }
12992 #endif
12993                 if (rack->r_ctl.fsb.tcp_ip_hdr) {
12994                         free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
12995                         rack->r_ctl.fsb.tcp_ip_hdr = NULL;
12996                         rack->r_ctl.fsb.th = NULL;
12997                 }
12998                 /* Convert back to ticks, preserving the 5 bit fractional part */
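                      /*
                       * For example, with hz = 1000 and t_srtt = 12345 usecs:
                       * val = 12345 / 1000 = 12 ticks, frac = 12345 % 1000 = 345,
                       * and 345 * 32 / 1000 = 11, so t_srtt becomes
                       * (12 << 5) + 11 = 395, i.e. about 12.34 ticks in the
                       * 5 bit decimal format the default stack expects.
                       */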
12999                 if (tp->t_srtt > 1) {
13000                         uint32_t val, frac;
13001
13002                         val = USEC_2_TICKS(tp->t_srtt);
13003                         frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
13004                         tp->t_srtt = val << TCP_RTT_SHIFT;
13005                         /*
13006                          * frac is the fractional part that is left
13007                          * over from converting to hz and shifting.
13008                          * We need to convert this to the 5 bit
13009                          * remainder.
13010                          */
13011                         if (frac) {
13012                                 if (hz == 1000) {
13013                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
13014                                 } else {
13015                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
13016                                 }
13017                                 tp->t_srtt += frac;
13018                         }
13019                 }
13020                 if (tp->t_rttvar) {
13021                         uint32_t val, frac;
13022
13023                         val = USEC_2_TICKS(tp->t_rttvar);
13024                         frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
13025                         tp->t_rttvar = val <<  TCP_RTTVAR_SHIFT;
13026                         /*
13027                          * frac is the fractional part that is left
13028                          * over from converting to hz and shifting.
13029                          * We need to convert this to the 5 bit
13030                          * remainder.
13031                          */
13032                         if (frac) {
13033                                 if (hz == 1000) {
13034                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
13035                                 } else {
13036                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
13037                                 }
13038                                 tp->t_rttvar += frac;
13039                         }
13040                 }
13041                 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
13042                 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
13043                 if (rack->rc_always_pace) {
13044                         tcp_decrement_paced_conn();
13045                         rack_undo_cc_pacing(rack);
13046                         rack->rc_always_pace = 0;
13047                 }
13048                 /* Clean up any options if they were not applied */
13049                 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
13050                         struct deferred_opt_list *dol;
13051
13052                         dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
13053                         TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
13054                         free(dol, M_TCPDO);
13055                 }
13056                 /* Release any hardware pacing rate we may still hold */
13057                 if (rack->r_ctl.crte != NULL) {
13058                         tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
13059                         rack->rack_hdrw_pacing = 0;
13060                         rack->r_ctl.crte = NULL;
13061                 }
13062 #ifdef TCP_BLACKBOX
13063                 tcp_log_flowend(tp);
13064 #endif
13065                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
13066                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
13067 #ifdef INVARIANTS
13068                         if (rm != rsm) {
13069                                 panic("At fini, rack:%p rsm:%p rm:%p",
13070                                       rack, rsm, rm);
13071                         }
13072 #endif
13073                         uma_zfree(rack_zone, rsm);
13074                 }
13075                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
13076                 while (rsm) {
13077                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
13078                         uma_zfree(rack_zone, rsm);
13079                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
13080                 }
13081                 rack->rc_free_cnt = 0;
13082                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
13083                 tp->t_fb_ptr = NULL;
13084         }
13085         if (tp->t_inpcb) {
13086                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
13087                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
13088                 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
13089                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP;
13090                 /* Cancel the GP measurement in progress */
13091                 tp->t_flags &= ~TF_GPUTINPROG;
13092                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS;
13093         }
13094         /* Make sure snd_nxt is correctly set */
13095         tp->snd_nxt = tp->snd_max;
13096 }
13097
13098 static void
13099 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
13100 {
13101         if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
13102                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
13103         }
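              /*
               * Map the TCP state onto the matching rack substate handler;
               * inbound segments are dispatched through r_substate.
               */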
13104         switch (tp->t_state) {
13105         case TCPS_SYN_SENT:
13106                 rack->r_state = TCPS_SYN_SENT;
13107                 rack->r_substate = rack_do_syn_sent;
13108                 break;
13109         case TCPS_SYN_RECEIVED:
13110                 rack->r_state = TCPS_SYN_RECEIVED;
13111                 rack->r_substate = rack_do_syn_recv;
13112                 break;
13113         case TCPS_ESTABLISHED:
13114                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13115                 rack->r_state = TCPS_ESTABLISHED;
13116                 rack->r_substate = rack_do_established;
13117                 break;
13118         case TCPS_CLOSE_WAIT:
13119                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13120                 rack->r_state = TCPS_CLOSE_WAIT;
13121                 rack->r_substate = rack_do_close_wait;
13122                 break;
13123         case TCPS_FIN_WAIT_1:
13124                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13125                 rack->r_state = TCPS_FIN_WAIT_1;
13126                 rack->r_substate = rack_do_fin_wait_1;
13127                 break;
13128         case TCPS_CLOSING:
13129                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13130                 rack->r_state = TCPS_CLOSING;
13131                 rack->r_substate = rack_do_closing;
13132                 break;
13133         case TCPS_LAST_ACK:
13134                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13135                 rack->r_state = TCPS_LAST_ACK;
13136                 rack->r_substate = rack_do_lastack;
13137                 break;
13138         case TCPS_FIN_WAIT_2:
13139                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13140                 rack->r_state = TCPS_FIN_WAIT_2;
13141                 rack->r_substate = rack_do_fin_wait_2;
13142                 break;
13143         case TCPS_LISTEN:
13144         case TCPS_CLOSED:
13145         case TCPS_TIME_WAIT:
13146         default:
13147                 break;
13148         };
13149         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
13150                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
13151
13152 }
13153
13154 static void
13155 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
13156 {
13157         /*
13158          * We received an ack, and then did not
13159          * call send or were bounced out because the
13160          * hpts was running. Now a timer is up as well; is
13161          * it the right timer?
13162          */
13163         struct rack_sendmap *rsm;
13164         int tmr_up;
13165
13166         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
13167         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
13168                 return;
13169         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
13170         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
13171             (tmr_up == PACE_TMR_RXT)) {
13172                 /* Should be an RXT */
13173                 return;
13174         }
13175         if (rsm == NULL) {
13176                 /* Nothing outstanding? */
13177                 if (tp->t_flags & TF_DELACK) {
13178                         if (tmr_up == PACE_TMR_DELACK)
13179                                 /* We are supposed to have delayed ack up and we do */
13180                                 return;
13181                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
13182                         /*
13183                          * If we hit ENOBUFS then we would expect the possibility
13184                          * of nothing outstanding and the RXT up (and the hptsi timer).
13185                          */
13186                         return;
13187                 } else if (((V_tcp_always_keepalive ||
13188                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
13189                             (tp->t_state <= TCPS_CLOSING)) &&
13190                            (tmr_up == PACE_TMR_KEEP) &&
13191                            (tp->snd_max == tp->snd_una)) {
13192                         /* We should have keep alive up and we do */
13193                         return;
13194                 }
13195         }
13196         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
13197                    ((tmr_up == PACE_TMR_TLP) ||
13198                     (tmr_up == PACE_TMR_RACK) ||
13199                     (tmr_up == PACE_TMR_RXT))) {
13200                 /*
13201                  * Either a Rack, TLP or RXT is fine if  we
13202                  * have outstanding data.
13203                  */
13204                 return;
13205         } else if (tmr_up == PACE_TMR_DELACK) {
13206                 /*
13207                  * If the delayed ack was going to go off
13208                  * before the rtx/tlp/rack timer were going to
13209                  * expire, then that would be the timer in control.
13210                  * Note we don't check the time here, trusting that
13211                  * the code is correct.
13212                  */
13213                 return;
13214         }
13215         /*
13216          * Ok the timer originally started is not what we want now.
13217          * We will force the hpts to be stopped if any, and restart
13218          * with the slot set to what was in the saved slot.
13219          */
13220         if (rack->rc_inp->inp_in_hpts) {
13221                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
13222                         uint32_t us_cts;
13223
13224                         us_cts = tcp_get_usecs(NULL);
13225                         if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
13226                                 rack->r_early = 1;
13227                                 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
13228                         }
13229                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
13230                 }
13231                 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
13232         }
13233         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13234         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
13235 }
13236
13237
13238 static void
13239 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
13240 {
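              /*
               * Standard window-update acceptability test: accept the
               * advertised window only from a segment that is newer by seq,
               * equal in seq but newer by ack, or equal in both but
               * advertising a larger window.
               */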
13241         if ((SEQ_LT(tp->snd_wl1, seq) ||
13242             (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
13243             (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
13244                 /* keep track of pure window updates */
13245                 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
13246                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
13247                 tp->snd_wnd = tiwin;
13248                 rack_validate_fo_sendwin_up(tp, rack);
13249                 tp->snd_wl1 = seq;
13250                 tp->snd_wl2 = ack;
13251                 if (tp->snd_wnd > tp->max_sndwnd)
13252                         tp->max_sndwnd = tp->snd_wnd;
13253                 rack->r_wanted_output = 1;
13254         } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
13255                 tp->snd_wnd = tiwin;
13256                 rack_validate_fo_sendwin_up(tp, rack);
13257                 tp->snd_wl1 = seq;
13258                 tp->snd_wl2 = ack;
13259         } else {
13260                 /* Not a valid win update */
13261                 return;
13262         }
13263         if (tp->snd_wnd > tp->max_sndwnd)
13264                 tp->max_sndwnd = tp->snd_wnd;
13265         if (tp->snd_wnd < (tp->snd_max - high_seq)) {
13266                 /* The peer collapsed the window */
13267                 rack_collapsed_window(rack);
13268         } else if (rack->rc_has_collapsed)
13269                 rack_un_collapse_window(rack);
13270         /* Do we exit persists? */
13271         if ((rack->rc_in_persist != 0) &&
13272             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
13273                                 rack->r_ctl.rc_pace_min_segs))) {
13274                 rack_exit_persist(tp, rack, cts);
13275         }
13276         /* Do we enter persists? */
13277         if ((rack->rc_in_persist == 0) &&
13278             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
13279             TCPS_HAVEESTABLISHED(tp->t_state) &&
13280             (tp->snd_max == tp->snd_una) &&
13281             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
13282             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
13283                 /*
13284                  * Here the rwnd is less than
13285                  * the pacing size, we are established,
13286                  * nothing is outstanding, and there is
13287                  * data to send. Enter persists.
13288                  */
13289                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
13290         }
13291 }
13292
13293 static void
13294 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
13295 {
13296
13297         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13298                 union tcp_log_stackspecific log;
13299                 struct timeval ltv;
13300                 char tcp_hdr_buf[60];
13301                 struct tcphdr *th;
13302                 struct timespec ts;
13303                 uint32_t orig_snd_una;
13304                 uint8_t xx = 0;
13305
13306 #ifdef NETFLIX_HTTP_LOGGING
13307                 struct http_sendfile_track *http_req;
13308
13309                 if (SEQ_GT(ae->ack, tp->snd_una)) {
13310                         http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
13311                 } else {
13312                         http_req = tcp_http_find_req_for_seq(tp, ae->ack);
13313                 }
13314 #endif
13315                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13316                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
13317                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
13318                 if (rack->rack_no_prr == 0)
13319                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
13320                 else
13321                         log.u_bbr.flex1 = 0;
13322                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
13323                 log.u_bbr.use_lt_bw <<= 1;
13324                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
13325                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
13326                 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
13327                 log.u_bbr.pkts_out = tp->t_maxseg;
13328                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
13329                 log.u_bbr.flex7 = 1;
13330                 log.u_bbr.lost = ae->flags;
13331                 log.u_bbr.cwnd_gain = ackval;
13332                 log.u_bbr.pacing_gain = 0x2;
13333                 if (ae->flags & TSTMP_HDWR) {
13334                         /* Record the hardware timestamp if present */
13335                         log.u_bbr.flex3 = M_TSTMP;
13336                         ts.tv_sec = ae->timestamp / 1000000000;
13337                         ts.tv_nsec = ae->timestamp % 1000000000;
13338                         ltv.tv_sec = ts.tv_sec;
13339                         ltv.tv_usec = ts.tv_nsec / 1000;
13340                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
13341                 } else if (ae->flags & TSTMP_LRO) {
13342                         /* Record the LRO arrival timestamp */
13343                         log.u_bbr.flex3 = M_TSTMP_LRO;
13344                         ts.tv_sec = ae->timestamp / 1000000000;
13345                         ts.tv_nsec = ae->timestamp % 1000000000;
13346                         ltv.tv_sec = ts.tv_sec;
13347                         ltv.tv_usec = ts.tv_nsec / 1000;
13348                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
13349                 }
13350                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
13351                 /* Log the rcv time */
13352                 log.u_bbr.delRate = ae->timestamp;
13353 #ifdef NETFLIX_HTTP_LOGGING
13354                 log.u_bbr.applimited = tp->t_http_closed;
13355                 log.u_bbr.applimited <<= 8;
13356                 log.u_bbr.applimited |= tp->t_http_open;
13357                 log.u_bbr.applimited <<= 8;
13358                 log.u_bbr.applimited |= tp->t_http_req;
13359                 if (http_req) {
13360                         /* Copy out any client req info */
13361                         /* seconds */
13362                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
13363                         /* useconds */
13364                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
13365                         log.u_bbr.rttProp = http_req->timestamp;
13366                         log.u_bbr.cur_del_rate = http_req->start;
13367                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
13368                                 log.u_bbr.flex8 |= 1;
13369                         } else {
13370                                 log.u_bbr.flex8 |= 2;
13371                                 log.u_bbr.bw_inuse = http_req->end;
13372                         }
13373                         log.u_bbr.flex6 = http_req->start_seq;
13374                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
13375                                 log.u_bbr.flex8 |= 4;
13376                                 log.u_bbr.epoch = http_req->end_seq;
13377                         }
13378                 }
13379 #endif
13380                 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
13381                 th = (struct tcphdr *)tcp_hdr_buf;
13382                 th->th_seq = ae->seq;
13383                 th->th_ack = ae->ack;
13384                 th->th_win = ae->win;
13385                 /* Now fill in the ports */
13386                 th->th_sport = tp->t_inpcb->inp_fport;
13387                 th->th_dport = tp->t_inpcb->inp_lport;
13388                 th->th_flags = ae->flags & 0xff;
13389                 /* Now do we have a timestamp option? */
13390                 if (ae->flags & HAS_TSTMP) {
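                              /*
                               * Reconstruct the standard 12 byte timestamp
                               * appendix: two NOPs for alignment, the
                               * kind/length bytes, then the 32 bit ts_value
                               * and ts_echo fields.
                               */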
13391                         u_char *cp;
13392                         uint32_t val;
13393
13394                         th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
13395                         cp = (u_char *)(th + 1);
13396                         *cp = TCPOPT_NOP;
13397                         cp++;
13398                         *cp = TCPOPT_NOP;
13399                         cp++;
13400                         *cp = TCPOPT_TIMESTAMP;
13401                         cp++;
13402                         *cp = TCPOLEN_TIMESTAMP;
13403                         cp++;
13404                         val = htonl(ae->ts_value);
13405                         bcopy((char *)&val,
13406                               (char *)cp, sizeof(uint32_t));
13407                         val = htonl(ae->ts_echo);
13408                         bcopy((char *)&val,
13409                               (char *)(cp + 4), sizeof(uint32_t));
13410                 } else
13411                         th->th_off = (sizeof(struct tcphdr) >> 2);
13412
13413                 /*
13414                  * For sane logging we need to play a little trick.
13415                  * If the ack were fully processed we would have moved
13416                  * snd_una to high_seq, but since compressed acks are
13417                  * processed in two phases, at this point (logging) snd_una
13418                  * won't be advanced. So we would see multiple acks showing
13419                  * the advancement. We can prevent that by "pretending" that
13420                  * snd_una was advanced and then un-advancing it so that the
13421                  * logging code has the right value for tlb_snd_una.
13422                  */
13423                 if (tp->snd_una != high_seq) {
13424                         orig_snd_una = tp->snd_una;
13425                         tp->snd_una = high_seq;
13426                         xx = 1;
13427                 } else
13428                         xx = 0;
13429                 TCP_LOG_EVENTP(tp, th,
13430                                &tp->t_inpcb->inp_socket->so_rcv,
13431                                &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
13432                                0, &log, true, &ltv);
13433                 if (xx) {
13434                         tp->snd_una = orig_snd_una;
13435                 }
13436         }
13437
13438 }
13439
13440 static void
13441 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
13442 {
13443         uint32_t us_rtt;
13444         /*
13445          * A persist or keep-alive was forced out; update our
13446          * min rtt time. Note we now worry about lost responses.
13447          * When a subsequent keep-alive or persist times out
13448          * and forced_ack is still on, then the last probe
13449          * was not responded to. In such cases we have a
13450          * sysctl that controls the behavior: either we apply
13451          * the rtt but with reduced confidence (0), or we just
13452          * plain don't apply the rtt estimate. Having data flow
13453          * will clear the probe_not_answered flag, i.e. the cum-ack
13454          * moving forward <or> exiting and reentering persists.
13455          */
13456
13457         rack->forced_ack = 0;
13458         rack->rc_tp->t_rxtshift = 0;
13459         if ((rack->rc_in_persist &&
13460              (tiwin == rack->rc_tp->snd_wnd)) ||
13461             (rack->rc_in_persist == 0)) {
13462                 /*
13463                  * In persists only apply the RTT update if this is
13464                  * a response to our window probe. And that
13465                  * means the rwnd sent must match the current
13466                  * snd_wnd. If it does not, then we got a
13467                  * window update ack instead. For keepalive
13468                  * we allow the answer no matter what the window.
13469                  *
13470                  * Note that if the probe_not_answered is set then
13471                  * the forced_ack_ts is the oldest one i.e. the first
13472                  * probe sent that might have been lost. This assures
13473                  * us that if we do calculate an RTT it is longer not
13474                  * some short thing.
13475                  */
13476                 if (rack->rc_in_persist)
13477                         counter_u64_add(rack_persists_acks, 1);
13478                 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
13479                 if (us_rtt == 0)
13480                         us_rtt = 1;
13481                 if (rack->probe_not_answered == 0) {
13482                         rack_apply_updated_usrtt(rack, us_rtt, us_cts);
13483                         tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
13484                 } else {
13485                         /* We have a retransmitted probe here too */
13486                         if (rack_apply_rtt_with_reduced_conf) {
13487                                 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
13488                                 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
13489                         }
13490                 }
13491         }
13492 }
13493
13494
13495 static int
13496 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
13497 {
13498         /*
13499          * Handle a "special" compressed ack mbuf. Each incoming
13500          * ack has only four possible dispositions:
13501          *
13502          * A) It moves the cum-ack forward
13503          * B) It is behind the cum-ack.
13504          * C) It is a window-update ack.
13505          * D) It is a dup-ack.
13506          *
13507          * Note that we can have anywhere from 1 to TCP_COMP_ACK_ENTRIES
13508          * acks in the incoming mbuf. We also need to still pay attention
13509          * to nxt_pkt since there may be another packet after this
13510          * one.
13511          */
13512 #ifdef TCP_ACCOUNTING
13513         uint64_t ts_val;
13514         uint64_t rdstc;
13515 #endif
13516         int segsiz;
13517         struct timespec ts;
13518         struct tcp_rack *rack;
13519         struct tcp_ackent *ae;
13520         uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
13521         int cnt, i, did_out, ourfinisacked = 0;
13522         struct tcpopt to_holder, *to = NULL;
13523         int win_up_req = 0;
13524         int nsegs = 0;
13525         int under_pacing = 1;
13526         int recovery = 0;
13527         int idx;
13528 #ifdef TCP_ACCOUNTING
13529         sched_pin();
13530 #endif
13531         rack = (struct tcp_rack *)tp->t_fb_ptr;
13532         if (rack->gp_ready &&
13533             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
13534                 under_pacing = 0;
13535         else
13536                 under_pacing = 1;
13537
13538         if (rack->r_state != tp->t_state)
13539                 rack_set_state(tp, rack);
13540         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13541             (tp->t_flags & TF_GPUTINPROG)) {
13542                 /*
13543                  * We have a goodput in progress
13544                  * and we have entered a late state.
13545                  * Do we have enough data in the sb
13546                  * to handle the GPUT request?
13547                  */
13548                 uint32_t bytes;
13549
13550                 bytes = tp->gput_ack - tp->gput_seq;
13551                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
13552                         bytes += tp->gput_seq - tp->snd_una;
13553                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
13554                         /*
13555                          * There are not enough bytes in the socket
13556                          * buffer that have been sent to cover this
13557                          * measurement. Cancel it.
13558                          */
13559                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
13560                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
13561                                                    tp->gput_seq,
13562                                                    0, 0, 18, __LINE__, NULL, 0);
13563                         tp->t_flags &= ~TF_GPUTINPROG;
13564                 }
13565         }
13566         to = &to_holder;
13567         to->to_flags = 0;
13568         KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
13569                 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
13570         cnt = m->m_len / sizeof(struct tcp_ackent);
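              /*
               * Bucket the batch size for stats: one counter per group of
               * five ack entries, clamped to the last bucket.
               */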
13571         idx = cnt / 5;
13572         if (idx >= MAX_NUM_OF_CNTS)
13573                 idx = MAX_NUM_OF_CNTS - 1;
13574         counter_u64_add(rack_proc_comp_ack[idx], 1);
13575         counter_u64_add(rack_multi_single_eq, cnt);
13576         high_seq = tp->snd_una;
13577         the_win = tp->snd_wnd;
13578         win_seq = tp->snd_wl1;
13579         win_upd_ack = tp->snd_wl2;
13580         cts = tcp_tv_to_usectick(tv);
13581         ms_cts = tcp_tv_to_mssectick(tv);
13582         segsiz = ctf_fixed_maxseg(tp);
13583         if ((rack->rc_gp_dyn_mul) &&
13584             (rack->use_fixed_rate == 0) &&
13585             (rack->rc_always_pace)) {
13586                 /* Check in on probertt */
13587                 rack_check_probe_rtt(rack, cts);
13588         }
13589         for (i = 0; i < cnt; i++) {
13590 #ifdef TCP_ACCOUNTING
13591                 ts_val = get_cyclecount();
13592 #endif
13593                 rack_clear_rate_sample(rack);
13594                 ae = ((mtod(m, struct tcp_ackent *)) + i);
13595                 /* Setup the window */
13596                 tiwin = ae->win << tp->snd_scale;
13597                 /* figure out the type of ack */
13598                 if (SEQ_LT(ae->ack, high_seq)) {
13599                         /* Case B*/
13600                         ae->ack_val_set = ACK_BEHIND;
13601                 } else if (SEQ_GT(ae->ack, high_seq)) {
13602                         /* Case A */
13603                         ae->ack_val_set = ACK_CUMACK;
13604                 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
13605                         /* Case D */
13606                         ae->ack_val_set = ACK_DUPACK;
13607                 } else {
13608                         /* Case C */
13609                         ae->ack_val_set = ACK_RWND;
13610                 }
13611                 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
13612                 /* Validate timestamp */
13613                 if (ae->flags & HAS_TSTMP) {
13614                         /* Setup for a timestamp */
13615                         to->to_flags = TOF_TS;
13616                         ae->ts_echo -= tp->ts_offset;
13617                         to->to_tsecr = ae->ts_echo;
13618                         to->to_tsval = ae->ts_value;
13619                         /*
13620                          * If echoed timestamp is later than the current time, fall back to
13621                          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
13622                          * were used when this connection was established.
13623                          */
13624                         if (TSTMP_GT(ae->ts_echo, ms_cts))
13625                                 to->to_tsecr = 0;
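                              /*
                               * PAWS-style check: an ack entry whose timestamp
                               * is older than ts_recent may be dropped, per the
                               * ctf_ts_check_ac() rules.
                               */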
13626                         if (tp->ts_recent &&
13627                             TSTMP_LT(ae->ts_value, tp->ts_recent)) {
13628                                 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
13629 #ifdef TCP_ACCOUNTING
13630                                         rdstc = get_cyclecount();
13631                                         if (rdstc > ts_val) {
13632                                                 counter_u64_add(tcp_proc_time[ae->ack_val_set] ,
13633                                                                 (rdstc - ts_val));
13634                                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13635                                                         tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13636                                                 }
13637                                         }
13638 #endif
13639                                         continue;
13640                                 }
13641                         }
13642                         if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
13643                             SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
13644                                 tp->ts_recent_age = tcp_ts_getticks();
13645                                 tp->ts_recent = ae->ts_value;
13646                         }
13647                 } else {
13648                         /* Setup for no options */
13649                         to->to_flags = 0;
13650                 }
13651                 /* Update the rcv time and possibly perform idle reduction */
13652                 if (tp->t_idle_reduce &&
13653                      (tp->snd_max == tp->snd_una) &&
13654                      ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
13655                         counter_u64_add(rack_input_idle_reduces, 1);
13656                         rack_cc_after_idle(rack, tp);
13657                 }
13658                 tp->t_rcvtime = ticks;
13659                 /* Now what about ECN? */
13660                 if (tp->t_flags2 & TF2_ECN_PERMIT) {
13661                         if (ae->flags & TH_CWR) {
13662                                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
13663                                 tp->t_flags |= TF_ACKNOW;
13664                         }
13665                         switch (ae->codepoint & IPTOS_ECN_MASK) {
13666                         case IPTOS_ECN_CE:
13667                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
13668                                 KMOD_TCPSTAT_INC(tcps_ecn_ce);
13669                                 break;
13670                         case IPTOS_ECN_ECT0:
13671                                 KMOD_TCPSTAT_INC(tcps_ecn_ect0);
13672                                 break;
13673                         case IPTOS_ECN_ECT1:
13674                                 KMOD_TCPSTAT_INC(tcps_ecn_ect1);
13675                                 break;
13676                         }
13677
13678                         /* Process a packet differently from RFC3168. */
13679                         cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint);
13680                         /* Congestion experienced. */
13681                         if (ae->flags & TH_ECE) {
13682                                 rack_cong_signal(tp,  CC_ECN, ae->ack);
13683                         }
13684                 }
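                      /*
                       * A short sketch of how the two ECN bits of the TOS byte
                       * decode to the cases above (values per RFC 3168; the
                       * helper name is illustrative only):
                       *
                       *        #include <stdint.h>
                       *
                       *        static const char *
                       *        ecn_name(uint8_t tos)
                       *        {
                       *                switch (tos & 0x03) {
                       *                case 0x0: return ("Not-ECT");
                       *                case 0x1: return ("ECT(1)");
                       *                case 0x2: return ("ECT(0)");
                       *                default:  return ("CE");
                       *                }
                       *        }
                       */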
13685 #ifdef TCP_ACCOUNTING
13686                 /* Count in the specific type of ack */
13687                 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1);
13688                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13689                         tp->tcp_cnt_counters[ae->ack_val_set]++;
13690                 }
13691 #endif
13692                 /*
13693                  * Note how we could move these up into the determination
13694                  * above, but we don't, so that the timestamp (and ECN)
13695                  * checks are done first, before we do any processing on the ACK.
13696                  * The non-compressed path through the code has this
13697                  * weakness (noted by @jtl) that it actually does some
13698                  * processing before verifying the timestamp information.
13699                  * We don't take that path here, which is why we set
13700                  * the ack_val_set first, do the timestamp and ecn
13701                  * processing, and then look at what we have set up.
13702                  */
13703                 if (ae->ack_val_set == ACK_BEHIND) {
13704                         /*
13705                          * Case B: flag reordering, if the window is not closed,
13706                          * or it could be a keep-alive or persists probe.
13707                          */
13708                         if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
13709                                 counter_u64_add(rack_reorder_seen, 1);
13710                                 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
13711                         }
13712                 } else if (ae->ack_val_set == ACK_DUPACK) {
13713                         /* Case D */
13714                         rack_strike_dupack(rack);
13715                 } else if (ae->ack_val_set == ACK_RWND) {
13716                         /* Case C */
13717                         if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13718                                 ts.tv_sec = ae->timestamp / 1000000000;
13719                                 ts.tv_nsec = ae->timestamp % 1000000000;
13720                                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13721                                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13722                         } else {
13723                                 rack->r_ctl.act_rcv_time = *tv;
13724                         }
13725                         if (rack->forced_ack) {
13726                                 rack_handle_probe_response(rack, tiwin,
13727                                                            tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
13728                         }
13729                         win_up_req = 1;
13730                         win_upd_ack = ae->ack;
13731                         win_seq = ae->seq;
13732                         the_win = tiwin;
13733                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13734                 } else {
13735                         /* Case A */
13736                         if (SEQ_GT(ae->ack, tp->snd_max)) {
13737                                 /*
13738                                  * We just send an ack since the incoming
13739                                  * ack is beyond the largest seq we sent.
13740                                  */
13741                                 if ((tp->t_flags & TF_ACKNOW) == 0) {
13742                                         ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
13743                                         if (tp->t_flags & TF_ACKNOW)
13744                                                 rack->r_wanted_output = 1;
13745                                 }
13746                         } else {
13747                                 nsegs++;
13748                                 /* If the window changed setup to update */
13749                                 if (tiwin != tp->snd_wnd) {
13750                                         win_upd_ack = ae->ack;
13751                                         win_seq = ae->seq;
13752                                         the_win = tiwin;
13753                                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13754                                 }
13755 #ifdef TCP_ACCOUNTING
13756                                 /* Account for the acks */
13757                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13758                                         tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
13759                                 }
13760                                 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
13761                                                 (((ae->ack - high_seq) + segsiz - 1) / segsiz));
13762 #endif
13763                                 high_seq = ae->ack;
13764                                 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends)) {
13765                                         rack->r_ctl.current_round++;
13766                                         rack->r_ctl.roundends = tp->snd_max;
13767                                         if (CC_ALGO(tp)->newround != NULL) {
13768                                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
13769                                         }
13770                                 }
13771                                 /* Setup our act_rcv_time */
13772                                 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13773                                         ts.tv_sec = ae->timestamp / 1000000000;
13774                                         ts.tv_nsec = ae->timestamp % 1000000000;
13775                                         rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13776                                         rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13777                                 } else {
13778                                         rack->r_ctl.act_rcv_time = *tv;
13779                                 }
13780                                 rack_process_to_cumack(tp, rack, ae->ack, cts, to);
13781                                 if (rack->rc_dsack_round_seen) {
13782                                         /* Is the dsack round over? */
13783                                         if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
13784                                                 /* Yes it is */
13785                                                 rack->rc_dsack_round_seen = 0;
13786                                                 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
13787                                         }
13788                                 }
13789                         }
13790                 }
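                      /*
                       * The ack accounting in Case A credits one "ack" per
                       * segment-sized chunk of newly acked data via the usual
                       * integer-ceiling idiom. A stand-alone sketch with
                       * illustrative names:
                       *
                       *        #include <stdint.h>
                       *
                       *        static uint32_t
                       *        acks_credited(uint32_t ack, uint32_t high_seq,
                       *            uint32_t segsiz)
                       *        {
                       *                // ceil((ack - high_seq) / segsiz), integers only
                       *                return (((ack - high_seq) + segsiz - 1) / segsiz);
                       *        }
                       *
                       * E.g. with segsiz = 1448, a cum-ack advancing 2900 bytes
                       * counts as ceil(2900 / 1448) = 3 segment acks.
                       */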
13791                 /* And let's be sure to commit the rtt measurements for this ack */
13792                 tcp_rack_xmit_timer_commit(rack, tp);
13793 #ifdef TCP_ACCOUNTING
13794                 rdstc = get_cyclecount();
13795                 if (rdstc > ts_val) {
13796                         counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val));
13797                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13798                                 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13799                                 if (ae->ack_val_set == ACK_CUMACK)
13800                                         tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
13801                         }
13802                 }
13803 #endif
13804         }
13805 #ifdef TCP_ACCOUNTING
13806         ts_val = get_cyclecount();
13807 #endif
13808         acked_amount = acked = (high_seq - tp->snd_una);
13809         if (acked) {
13810                 /* 
13811                  * Clear the probe not answered flag
13812                  * since cum-ack moved forward.
13813                  */
13814                 rack->probe_not_answered = 0;
13815                 if (rack->sack_attack_disable == 0)
13816                         rack_do_decay(rack);
13817                 if (acked >= segsiz) {
13818                         /*
13819                          * You only get credit for
13820                          * MSS and greater (and you get extra
13821                          * credit for larger cum-ack moves).
13822                          */
13823                         int ac;
13824
13825                         ac = acked / segsiz;
13826                         rack->r_ctl.ack_count += ac;
13827                         counter_u64_add(rack_ack_total, ac);
13828                 }
13829                 if (rack->r_ctl.ack_count > 0xfff00000) {
13830                         /*
13831                          * halve the counters to keep them from
13832                          * overflowing a uint32_t.
13833                          */
13834                         rack->r_ctl.ack_count /= 2;
13835                         rack->r_ctl.sack_count /= 2;
13836                 }
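                      /*
                       * A tiny sketch of the decay above: both counters are
                       * halved together, which keeps headroom under UINT32_MAX
                       * while preserving the ack/sack ratio that the
                       * sack-attack detection compares (illustrative only):
                       *
                       *        #include <stdint.h>
                       *
                       *        static void
                       *        decay_counts(uint32_t *acks, uint32_t *sacks)
                       *        {
                       *                if (*acks > 0xfff00000) {
                       *                        *acks /= 2;
                       *                        *sacks /= 2;
                       *                }
                       *        }
                       */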
13837                 if (tp->t_flags & TF_NEEDSYN) {
13838                         /*
13839                          * T/TCP: Connection was half-synchronized, and our SYN has
13840                          * been ACK'd (so connection is now fully synchronized).  Go
13841                          * to non-starred state, increment snd_una for ACK of SYN,
13842                          * and check if we can do window scaling.
13843                          */
13844                         tp->t_flags &= ~TF_NEEDSYN;
13845                         tp->snd_una++;
13846                         acked_amount = acked = (high_seq - tp->snd_una);
13847                 }
13848                 if (acked > sbavail(&so->so_snd))
13849                         acked_amount = sbavail(&so->so_snd);
13850 #ifdef NETFLIX_EXP_DETECTION
13851                 /*
13852                  * We only care about a cum-ack move if we are in a sack-disabled
13853                  * state. We have already added in to the ack_count, and we never
13854                  * would disable on a cum-ack move, so we only care to do the
13855                  * detection if it may "undo" it, i.e., we were already disabled.
13856                  */
13857                 if (rack->sack_attack_disable)
13858                         rack_do_detection(tp, rack, acked_amount, segsiz);
13859 #endif
13860                 if (IN_FASTRECOVERY(tp->t_flags) &&
13861                     (rack->rack_no_prr == 0))
13862                         rack_update_prr(tp, rack, acked_amount, high_seq);
13863                 if (IN_RECOVERY(tp->t_flags)) {
13864                         if (SEQ_LT(high_seq, tp->snd_recover) &&
13865                             (SEQ_LT(high_seq, tp->snd_max))) {
13866                                 tcp_rack_partialack(tp);
13867                         } else {
13868                                 rack_post_recovery(tp, high_seq);
13869                                 recovery = 1;
13870                         }
13871                 }
13872                 /* Handle the rack-log-ack part (sendmap) */
13873                 if ((sbused(&so->so_snd) == 0) &&
13874                     (acked > acked_amount) &&
13875                     (tp->t_state >= TCPS_FIN_WAIT_1) &&
13876                     (tp->t_flags & TF_SENTFIN)) {
13877                         /*
13878                          * We must be sure our fin
13879                          * was sent and acked (we can be
13880                          * in FIN_WAIT_1 without having
13881                          * sent the fin).
13882                          */
13883                         ourfinisacked = 1;
13884                         /*
13885                          * Let's make sure snd_una is updated
13886                          * since most likely acked_amount = 0 (it
13887                          * should be).
13888                          */
13889                         tp->snd_una = high_seq;
13890                 }
13891                 /* Did we make a RTO error? */
13892                 if ((tp->t_flags & TF_PREVVALID) &&
13893                     ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
13894                         tp->t_flags &= ~TF_PREVVALID;
13895                         if (tp->t_rxtshift == 1 &&
13896                             (int)(ticks - tp->t_badrxtwin) < 0)
13897                                 rack_cong_signal(tp, CC_RTO_ERR, high_seq);
13898                 }
13899                 /* Handle the data in the socket buffer */
13900                 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
13901                 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
13902                 if (acked_amount > 0) {
13903                         struct mbuf *mfree;
13904
13905                         rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
13906                         SOCKBUF_LOCK(&so->so_snd);
13907                         mfree = sbcut_locked(&so->so_snd, acked_amount);
13908                         tp->snd_una = high_seq;
13909                         /* Note we want to hold the sb lock through the sendmap adjust */
13910                         rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
13911                         /* Wake up the socket if we have room to write more */
13912                         rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
13913                         sowwakeup_locked(so);
13914                         m_freem(mfree);
13915                 }
13916                 /* update progress */
13917                 tp->t_acktime = ticks;
13918                 rack_log_progress_event(rack, tp, tp->t_acktime,
13919                                         PROGRESS_UPDATE, __LINE__);
13920                 /* Clear out shifts and such */
13921                 tp->t_rxtshift = 0;
13922                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
13923                                    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
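                      /*
                       * RACK_TCPT_RANGESET() above clamps the recomputed RXT
                       * value between the stack's min and max RTO, with the
                       * timer slop added in. A minimal sketch of that clamp,
                       * assuming all arguments share the same time units
                       * (names are illustrative):
                       *
                       *        #include <stdint.h>
                       *
                       *        static uint32_t
                       *        rto_rangeset(uint32_t rexmt, uint32_t rto_min,
                       *            uint32_t rto_max, uint32_t slop)
                       *        {
                       *                uint32_t v = rexmt + slop;
                       *
                       *                if (v < rto_min)
                       *                        v = rto_min;
                       *                else if (v > rto_max)
                       *                        v = rto_max;
                       *                return (v);
                       *        }
                       */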
13924                 rack->rc_tlp_in_progress = 0;
13925                 rack->r_ctl.rc_tlp_cnt_out = 0;
13926                 /* snd_recover and snd_nxt must be dragged along */
13927                 if (SEQ_GT(tp->snd_una, tp->snd_recover))
13928                         tp->snd_recover = tp->snd_una;
13929                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
13930                         tp->snd_nxt = tp->snd_una;
13931                 /*
13932                  * If the RXT timer is running we want to
13933                  * stop it, so we can restart a TLP (or new RXT).
13934                  */
13935                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
13936                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13937 #ifdef NETFLIX_HTTP_LOGGING
13938                 tcp_http_check_for_comp(rack->rc_tp, high_seq);
13939 #endif
13940                 tp->snd_wl2 = high_seq;
13941                 tp->t_dupacks = 0;
13942                 if (under_pacing &&
13943                     (rack->use_fixed_rate == 0) &&
13944                     (rack->in_probe_rtt == 0) &&
13945                     rack->rc_gp_dyn_mul &&
13946                     rack->rc_always_pace) {
13947                         /* Check if we are dragging bottom */
13948                         rack_check_bottom_drag(tp, rack, so, acked);
13949                 }
13950                 if (tp->snd_una == tp->snd_max) {
13951                         tp->t_flags &= ~TF_PREVVALID;
13952                         rack->r_ctl.retran_during_recovery = 0;
13953                         rack->r_ctl.dsack_byte_cnt = 0;
13954                         rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
13955                         if (rack->r_ctl.rc_went_idle_time == 0)
13956                                 rack->r_ctl.rc_went_idle_time = 1;
13957                         rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
13958                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
13959                                 tp->t_acktime = 0;
13960                         /* Set so we might enter persists... */
13961                         rack->r_wanted_output = 1;
13962                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13963                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
13964                         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13965                             (sbavail(&so->so_snd) == 0) &&
13966                             (tp->t_flags2 & TF2_DROP_AF_DATA)) {
13967                                 /*
13968                                  * The socket was gone and the
13969                                  * peer sent data (not now, in the past); time to
13970                                  * reset him.
13971                                  */
13972                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13973                                 /* tcp_close will kill the inp pre-log the Reset */
13974                                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13975 #ifdef TCP_ACCOUNTING
13976                                 rdstc = get_cyclecount();
13977                                 if (rdstc > ts_val) {
13978                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13979                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13980                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13981                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13982                                         }
13983                                 }
13984 #endif
13985                                 m_freem(m);
13986                                 tp = tcp_close(tp);
13987                                 if (tp == NULL) {
13988 #ifdef TCP_ACCOUNTING
13989                                         sched_unpin();
13990 #endif
13991                                         return (1);
13992                                 }
13993                                 /*
13994                                  * We would normally do drop-with-reset which would
13995                                  * send back a reset. We can't since we don't have
13996                                  * all the needed bits. Instead let's arrange for
13997                                  * a call to tcp_output(). That way, since we
13998                                  * are in the closed state, we will generate a reset.
13999                                  *
14000                                  * Note if tcp_accounting is on we don't unpin since
14001                                  * we do that after the goto label.
14002                                  */
14003                                 goto send_out_a_rst;
14004                         }
14005                         if ((sbused(&so->so_snd) == 0) &&
14006                             (tp->t_state >= TCPS_FIN_WAIT_1) &&
14007                             (tp->t_flags & TF_SENTFIN)) {
14008                                 /*
14009                                  * If we can't receive any more data, then the closing user can
14010                                  * proceed. Starting the timer is contrary to the
14011                                  * specification, but if we don't get a FIN we'll hang
14012                                  * forever.
14013                                  *
14014                                  */
14015                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
14016                                         soisdisconnected(so);
14017                                         tcp_timer_activate(tp, TT_2MSL,
14018                                                            (tcp_fast_finwait2_recycle ?
14019                                                             tcp_finwait2_timeout :
14020                                                             TP_MAXIDLE(tp)));
14021                                 }
14022                                 if (ourfinisacked == 0) {
14023                                         /*
14024                                          * We don't change to fin-wait-2 if we have our fin acked,
14025                                          * which means we are probably in TCPS_CLOSING.
14026                                          */
14027                                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
14028                                 }
14029                         }
14030                 }
14031                 /* Wake up the socket if we have room to write more */
14032                 if (sbavail(&so->so_snd)) {
14033                         rack->r_wanted_output = 1;
14034                         if (ctf_progress_timeout_check(tp, true)) {
14035                                 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
14036                                                         tp, ticks, PROGRESS_DROP, __LINE__);
14037                                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
14038                                 /*
14039                                  * We cheat here and don't send a RST; we should send one
14040                                  * when the pacer drops the connection.
14041                                  */
14042 #ifdef TCP_ACCOUNTING
14043                                 rdstc = get_cyclecount();
14044                                 if (rdstc > ts_val) {
14045                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
14046                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14047                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
14048                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14049                                         }
14050                                 }
14051                                 sched_unpin();
14052 #endif
14053                                 INP_WUNLOCK(rack->rc_inp);
14054                                 m_freem(m);
14055                                 return (1);
14056                         }
14057                 }
14058                 if (ourfinisacked) {
14059                         switch(tp->t_state) {
14060                         case TCPS_CLOSING:
14061 #ifdef TCP_ACCOUNTING
14062                                 rdstc = get_cyclecount();
14063                                 if (rdstc > ts_val) {
14064                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
14065                                                         (rdstc - ts_val));
14066                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14067                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
14068                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14069                                         }
14070                                 }
14071                                 sched_unpin();
14072 #endif
14073                                 tcp_twstart(tp);
14074                                 m_freem(m);
14075                                 return (1);
14076                                 break;
14077                         case TCPS_LAST_ACK:
14078 #ifdef TCP_ACCOUNTING
14079                                 rdstc = get_cyclecount();
14080                                 if (rdstc > ts_val) {
14081                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
14082                                                         (rdstc - ts_val));
14083                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14084                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
14085                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14086                                         }
14087                                 }
14088                                 sched_unpin();
14089 #endif
14090                                 tp = tcp_close(tp);
14091                                 ctf_do_drop(m, tp);
14092                                 return (1);
14093                                 break;
14094                         case TCPS_FIN_WAIT_1:
14095 #ifdef TCP_ACCOUNTING
14096                                 rdstc = get_cyclecount();
14097                                 if (rdstc > ts_val) {
14098                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
14099                                                         (rdstc - ts_val));
14100                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14101                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
14102                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14103                                         }
14104                                 }
14105 #endif
14106                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
14107                                         soisdisconnected(so);
14108                                         tcp_timer_activate(tp, TT_2MSL,
14109                                                            (tcp_fast_finwait2_recycle ?
14110                                                             tcp_finwait2_timeout :
14111                                                             TP_MAXIDLE(tp)));
14112                                 }
14113                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
14114                                 break;
14115                         default:
14116                                 break;
14117                         }
14118                 }
14119                 if (rack->r_fast_output) {
14120                         /*
14121                          * We're doing fast output... can we expand that?
14122                          */
14123                         rack_gain_for_fastoutput(rack, tp, so, acked_amount);
14124                 }
14125 #ifdef TCP_ACCOUNTING
14126                 rdstc = get_cyclecount();
14127                 if (rdstc > ts_val) {
14128                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
14129                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14130                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
14131                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14132                         }
14133                 }
14134
14135         } else if (win_up_req) {
14136                 rdstc = get_cyclecount();
14137                 if (rdstc > ts_val) {
14138                         counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val));
14139                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14140                                 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
14141                         }
14142                 }
14143 #endif
14144         }
14145         /* Now, is there a next packet? If so, we are done */
14146         m_freem(m);
14147         did_out = 0;
14148         if (nxt_pkt) {
14149 #ifdef TCP_ACCOUNTING
14150                 sched_unpin();
14151 #endif
14152                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
14153                 return (0);
14154         }
14155         rack_handle_might_revert(tp, rack);
14156         ctf_calc_rwin(so, tp);
14157         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14158         send_out_a_rst:
14159                 (void)tp->t_fb->tfb_tcp_output(tp);
14160                 did_out = 1;
14161         }
14162         rack_free_trim(rack);
14163 #ifdef TCP_ACCOUNTING
14164         sched_unpin();
14165 #endif
14166         rack_timer_audit(tp, rack, &so->so_snd);
14167         rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
14168         return (0);
14169 }
14170
14171
14172 static int
14173 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
14174     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
14175     int32_t nxt_pkt, struct timeval *tv)
14176 {
14177 #ifdef TCP_ACCOUNTING
14178         uint64_t ts_val;
14179 #endif
14180         int32_t thflags, retval, did_out = 0;
14181         int32_t way_out = 0;
14182         /*
14183          * cts - is the current time from tv (caller gets ts) in microseconds.
14184          * ms_cts - is the current time from tv in milliseconds.
14185          * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
14186          */
14187         uint32_t cts, us_cts, ms_cts;
14188         uint32_t tiwin;
14189         struct timespec ts;
14190         struct tcpopt to;
14191         struct tcp_rack *rack;
14192         struct rack_sendmap *rsm;
14193         int32_t prev_state = 0;
14194 #ifdef TCP_ACCOUNTING
14195         int ack_val_set = 0xf;
14196 #endif
14197         int nsegs;
14198         /*
14199          * tv passed from common code is from either M_TSTMP_LRO or
14200          * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
14201          */
14202         rack = (struct tcp_rack *)tp->t_fb_ptr;
14203         if (m->m_flags & M_ACKCMP) {
14204                 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
14205         }
14206         if (m->m_flags & M_ACKCMP) {
14207                 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
14208         }
14209         cts = tcp_tv_to_usectick(tv);
14210         ms_cts = tcp_tv_to_mssectick(tv);
14211         nsegs = m->m_pkthdr.lro_nsegs;
14212         counter_u64_add(rack_proc_non_comp_ack, 1);
14213         thflags = th->th_flags;
14214 #ifdef TCP_ACCOUNTING
14215         sched_pin();
14216         if (thflags & TH_ACK)
14217                 ts_val = get_cyclecount();
14218 #endif
14219         if ((m->m_flags & M_TSTMP) ||
14220             (m->m_flags & M_TSTMP_LRO)) {
14221                 mbuf_tstmp2timespec(m, &ts);
14222                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
14223                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
14224         } else
14225                 rack->r_ctl.act_rcv_time = *tv;
14226         kern_prefetch(rack, &prev_state);
14227         prev_state = 0;
14228         /*
14229          * Unscale the window into a 32-bit value. For the SYN_SENT state
14230          * the scale is zero.
14231          */
14232         tiwin = th->th_win << tp->snd_scale;
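              /*
               * Worked example of the unscaling above (RFC 7323): the 16-bit
               * window field is shifted left by the negotiated scale, so with
               * th_win = 512 and snd_scale = 7 the effective window is
               * 512 << 7 = 65536 bytes; with a scale of zero the raw value
               * is used as-is.
               */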
14233 #ifdef TCP_ACCOUNTING
14234         if (thflags & TH_ACK) {
14235                 /*
14236                  * We have a tradeoff here. We can either do what we are
14237                  * doing i.e. pinning to this CPU and then doing the accounting
14238                  * <or> we could do a critical enter, setup the rdtsc and cpu
14239                  * as in below, and then validate we are on the same CPU on
14240                  * exit. I have chosen to not do the critical enter since
14241                  * that often will gain you a context switch, and instead lock
14242                  * us (line above this if) to the same CPU with sched_pin(). This
14243                  * means we may be context switched out for a higher priority
14244                  * interrupt but we won't be moved to another CPU.
14245                  *
14246                  * If this occurs (which it won't very often since we most likely
14247                  * are running this code in interrupt context and only a higher
14248                  * priority will bump us ... clock?) we will falsely add in
14249                  * the interrupt processing time plus the ack processing
14250                  * time. This is ok since it's a rare event.
14251                  */
14252                 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
14253                                                     ctf_fixed_maxseg(tp));
14254         }
14255 #endif
14256         /*
14257          * Parse options on any incoming segment.
14258          */
14259         memset(&to, 0, sizeof(to));
14260         tcp_dooptions(&to, (u_char *)(th + 1),
14261             (th->th_off << 2) - sizeof(struct tcphdr),
14262             (thflags & TH_SYN) ? TO_SYN : 0);
14263         NET_EPOCH_ASSERT();
14264         INP_WLOCK_ASSERT(tp->t_inpcb);
14265         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
14266             __func__));
14267         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
14268             __func__));
14269         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
14270             (tp->t_flags & TF_GPUTINPROG)) {
14271                 /*
14272                  * We have a goodput in progress
14273                  * and we have entered a late state.
14274                  * Do we have enough data in the sb
14275                  * to handle the GPUT request?
14276                  */
14277                 uint32_t bytes;
14278
14279                 bytes = tp->gput_ack - tp->gput_seq;
14280                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
14281                         bytes += tp->gput_seq - tp->snd_una;
14282                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
14283                         /*
14284                          * There are not enough bytes in the socket
14285                          * buffer that have been sent to cover this
14286                          * measurement. Cancel it.
14287                          */
14288                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
14289                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
14290                                                    tp->gput_seq,
14291                                                    0, 0, 18, __LINE__, NULL, 0);
14292                         tp->t_flags &= ~TF_GPUTINPROG;
14293                 }
14294         }
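              /*
               * A small numeric sketch of the check above: the measurement
               * needs (gput_ack - gput_seq) bytes, plus the still-unsent gap
               * when gput_seq is ahead of snd_una. With gput_seq = 1000,
               * gput_ack = 6000 and snd_una = 400, bytes = 5000 + 600 = 5600;
               * if less than that remains in the send buffer the sample can
               * never complete, so it is cancelled.
               */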
14295         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
14296                 union tcp_log_stackspecific log;
14297                 struct timeval ltv;
14298 #ifdef NETFLIX_HTTP_LOGGING
14299                 struct http_sendfile_track *http_req;
14300
14301                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
14302                         http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
14303                 } else {
14304                         http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
14305                 }
14306 #endif
14307                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14308                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
14309                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
14310                 if (rack->rack_no_prr == 0)
14311                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
14312                 else
14313                         log.u_bbr.flex1 = 0;
14314                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
14315                 log.u_bbr.use_lt_bw <<= 1;
14316                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
14317                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
14318                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14319                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
14320                 log.u_bbr.flex3 = m->m_flags;
14321                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
14322                 log.u_bbr.lost = thflags;
14323                 log.u_bbr.pacing_gain = 0x1;
14324 #ifdef TCP_ACCOUNTING
14325                 log.u_bbr.cwnd_gain = ack_val_set;
14326 #endif
14327                 log.u_bbr.flex7 = 2;
14328                 if (m->m_flags & M_TSTMP) {
14329                         /* Record the hardware timestamp if present */
14330                         mbuf_tstmp2timespec(m, &ts);
14331                         ltv.tv_sec = ts.tv_sec;
14332                         ltv.tv_usec = ts.tv_nsec / 1000;
14333                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
14334                 } else if (m->m_flags & M_TSTMP_LRO) {
14335                         /* Record the LRO arrival timestamp */
14336                         mbuf_tstmp2timespec(m, &ts);
14337                         ltv.tv_sec = ts.tv_sec;
14338                         ltv.tv_usec = ts.tv_nsec / 1000;
14339                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
14340                 }
14341                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
14342                 /* Log the rcv time */
14343                 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
14344 #ifdef NETFLIX_HTTP_LOGGING
14345                 log.u_bbr.applimited = tp->t_http_closed;
14346                 log.u_bbr.applimited <<= 8;
14347                 log.u_bbr.applimited |= tp->t_http_open;
14348                 log.u_bbr.applimited <<= 8;
14349                 log.u_bbr.applimited |= tp->t_http_req;
14350                 if (http_req) {
14351                         /* Copy out any client req info */
14352                         /* seconds */
14353                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
14354                         /* useconds */
14355                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
14356                         log.u_bbr.rttProp = http_req->timestamp;
14357                         log.u_bbr.cur_del_rate = http_req->start;
14358                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
14359                                 log.u_bbr.flex8 |= 1;
14360                         } else {
14361                                 log.u_bbr.flex8 |= 2;
14362                                 log.u_bbr.bw_inuse = http_req->end;
14363                         }
14364                         log.u_bbr.flex6 = http_req->start_seq;
14365                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
14366                                 log.u_bbr.flex8 |= 4;
14367                                 log.u_bbr.epoch = http_req->end_seq;
14368                         }
14369                 }
14370 #endif
14371                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
14372                     tlen, &log, true, &ltv);
14373         }
14374         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
14375                 way_out = 4;
14376                 retval = 0;
14377                 m_freem(m);
14378                 goto done_with_input;
14379         }
14380         /*
14381          * If a segment with the ACK-bit set arrives in the SYN-SENT state
14382          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
14383          */
14384         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
14385             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
14386                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
14387                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
14388 #ifdef TCP_ACCOUNTING
14389                 sched_unpin();
14390 #endif
14391                 return (1);
14392         }
14393         /*
14394          * If timestamps were negotiated during SYN/ACK and a
14395          * segment without a timestamp is received, silently drop
14396          * the segment, unless it is a RST segment or missing timestamps are
14397          * tolerated.
14398          * See section 3.2 of RFC 7323.
14399          */
14400         if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
14401             ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
14402                 way_out = 5;
14403                 retval = 0;
14404                 m_freem(m);
14405                 goto done_with_input;
14406         }
14407
14408         /*
14409          * Segment received on connection. Reset idle time and keep-alive
14410          * timer. XXX: This should be done after segment validation to
14411          * ignore broken/spoofed segs.
14412          */
14413         if (tp->t_idle_reduce &&
14414              (tp->snd_max == tp->snd_una) &&
14415              ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
14416                 counter_u64_add(rack_input_idle_reduces, 1);
14417                 rack_cc_after_idle(rack, tp);
14418         }
14419         tp->t_rcvtime = ticks;
14420 #ifdef STATS
14421         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
14422 #endif
14423         if (tiwin > rack->r_ctl.rc_high_rwnd)
14424                 rack->r_ctl.rc_high_rwnd = tiwin;
14425         /*
14426          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
14427          * this to occur after we've validated the segment.
14428          */
14429         if (tp->t_flags2 & TF2_ECN_PERMIT) {
14430                 if (thflags & TH_CWR) {
14431                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
14432                         tp->t_flags |= TF_ACKNOW;
14433                 }
14434                 switch (iptos & IPTOS_ECN_MASK) {
14435                 case IPTOS_ECN_CE:
14436                         tp->t_flags2 |= TF2_ECN_SND_ECE;
14437                         KMOD_TCPSTAT_INC(tcps_ecn_ce);
14438                         break;
14439                 case IPTOS_ECN_ECT0:
14440                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
14441                         break;
14442                 case IPTOS_ECN_ECT1:
14443                         KMOD_TCPSTAT_INC(tcps_ecn_ect1);
14444                         break;
14445                 }
14446
14447                 /* Process a packet differently from RFC3168. */
14448                 cc_ecnpkt_handler(tp, th, iptos);
14449
14450                 /* Congestion experienced. */
14451                 if (thflags & TH_ECE) {
14452                         rack_cong_signal(tp, CC_ECN, th->th_ack);
14453                 }
14454         }
14455
14456         /*
14457          * If echoed timestamp is later than the current time, fall back to
14458          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
14459          * were used when this connection was established.
14460          */
14461         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
14462                 to.to_tsecr -= tp->ts_offset;
14463                 if (TSTMP_GT(to.to_tsecr, ms_cts))
14464                         to.to_tsecr = 0;
14465         }
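              /*
               * A minimal sketch of why a "future" tsecr is zeroed above:
               * the RFC 7323 RTT sample is (now - tsecr), so an echoed value
               * later than the current millisecond clock would underflow into
               * a huge bogus RTT. Illustrative names only:
               *
               *        #include <stdint.h>
               *
               *        static uint32_t
               *        rtt_from_tsecr(uint32_t ms_now, uint32_t tsecr)
               *        {
               *                // caller must have zeroed any tsecr > ms_now
               *                return (ms_now - tsecr);
               *        }
               */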
14466
14467         /*
14468          * If it's the first time in, we need to take care of options and
14469          * verify we can do SACK for rack!
14470          */
14471         if (rack->r_state == 0) {
14472                 /* Should be init'd by rack_init() */
14473                 KASSERT(rack->rc_inp != NULL,
14474                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
14475                 if (rack->rc_inp == NULL) {
14476                         rack->rc_inp = tp->t_inpcb;
14477                 }
14478
14479                 /*
14480                  * Process options only when we get SYN/ACK back. The SYN
14481                  * case for incoming connections is handled in tcp_syncache.
14482                  * According to RFC1323 the window field in a SYN (i.e., a
14483                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
14484                  * this is traditional behavior, may need to be cleaned up.
14485                  */
14486                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
14487                         /* Handle parallel SYN for ECN */
14488                         if (!(thflags & TH_ACK) &&
14489                             ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) &&
14490                             ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) {
14491                                 tp->t_flags2 |= TF2_ECN_PERMIT;
14492                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
14493                                 TCPSTAT_INC(tcps_ecn_shs);
14494                         }
14495                         if ((to.to_flags & TOF_SCALE) &&
14496                             (tp->t_flags & TF_REQ_SCALE)) {
14497                                 tp->t_flags |= TF_RCVD_SCALE;
14498                                 tp->snd_scale = to.to_wscale;
14499                         } else
14500                                 tp->t_flags &= ~TF_REQ_SCALE;
14501                         /*
14502                          * Initial send window.  It will be updated with the
14503                          * next incoming segment to the scaled value.
14504                          */
14505                         tp->snd_wnd = th->th_win;
14506                         rack_validate_fo_sendwin_up(tp, rack);
14507                         if ((to.to_flags & TOF_TS) &&
14508                             (tp->t_flags & TF_REQ_TSTMP)) {
14509                                 tp->t_flags |= TF_RCVD_TSTMP;
14510                                 tp->ts_recent = to.to_tsval;
14511                                 tp->ts_recent_age = cts;
14512                         } else
14513                                 tp->t_flags &= ~TF_REQ_TSTMP;
14514                         if (to.to_flags & TOF_MSS) {
14515                                 tcp_mss(tp, to.to_mss);
14516                         }
14517                         if ((tp->t_flags & TF_SACK_PERMIT) &&
14518                             (to.to_flags & TOF_SACKPERM) == 0)
14519                                 tp->t_flags &= ~TF_SACK_PERMIT;
14520                         if (IS_FASTOPEN(tp->t_flags)) {
14521                                 if (to.to_flags & TOF_FASTOPEN) {
14522                                         uint16_t mss;
14523
14524                                         if (to.to_flags & TOF_MSS)
14525                                                 mss = to.to_mss;
14526                                         else
14527                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
14528                                                         mss = TCP6_MSS;
14529                                                 else
14530                                                         mss = TCP_MSS;
14531                                         tcp_fastopen_update_cache(tp, mss,
14532                                             to.to_tfo_len, to.to_tfo_cookie);
14533                                 } else
14534                                         tcp_fastopen_disable_path(tp);
14535                         }
14536                 }
14537                 /*
14538                  * At this point we are at the initial call. Here we decide
14539                  * if we are doing RACK or not. We do this by seeing if
14540                  * TF_SACK_PERMIT is set and the sack-not-required is clear.
14541          * The code now does dup-ack counting, so if you don't
14542                  * switch back you won't get rack & TLP, but you will still
14543                  * get this stack.
14544                  */
14545
14546                 if ((rack_sack_not_required == 0) &&
14547                     ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
14548                         tcp_switch_back_to_default(tp);
14549                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
14550                             tlen, iptos);
14551 #ifdef TCP_ACCOUNTING
14552                         sched_unpin();
14553 #endif
14554                         return (1);
14555                 }
14556                 tcp_set_hpts(tp->t_inpcb);
14557                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
14558         }
14559         if (thflags & TH_FIN)
14560                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
14561         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
14562         if ((rack->rc_gp_dyn_mul) &&
14563             (rack->use_fixed_rate == 0) &&
14564             (rack->rc_always_pace)) {
14565                 /* Check in on probertt */
14566                 rack_check_probe_rtt(rack, us_cts);
14567         }
14568         rack_clear_rate_sample(rack);
14569         if (rack->forced_ack) {
14570                 rack_handle_probe_response(rack, tiwin, us_cts);
14571         }
14572         /*
14573          * This is the one exception case where we set the rack state
14574          * always. All other times (timers etc) we must have a rack-state
14575          * set (so we assure we have done the checks above for SACK).
14576          */
14577         rack->r_ctl.rc_rcvtime = cts;
14578         if (rack->r_state != tp->t_state)
14579                 rack_set_state(tp, rack);
14580         if (SEQ_GT(th->th_ack, tp->snd_una) &&
14581             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
14582                 kern_prefetch(rsm, &prev_state);
14583         prev_state = rack->r_state;
14584         retval = (*rack->r_substate) (m, th, so,
14585             tp, &to, drop_hdrlen,
14586             tlen, tiwin, thflags, nxt_pkt, iptos);
14587 #ifdef INVARIANTS
14588         if ((retval == 0) &&
14589             (tp->t_inpcb == NULL)) {
14590                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
14591                     retval, tp, prev_state);
14592         }
14593 #endif
14594         if (retval == 0) {
14595                 /*
14596                  * If retval is 1 the tcb is unlocked and most likely the tp
14597                  * is gone.
14598                  */
14599                 INP_WLOCK_ASSERT(tp->t_inpcb);
14600                 if ((rack->rc_gp_dyn_mul) &&
14601                     (rack->rc_always_pace) &&
14602                     (rack->use_fixed_rate == 0) &&
14603                     rack->in_probe_rtt &&
14604                     (rack->r_ctl.rc_time_probertt_starts == 0)) {
14605                         /*
14606                          * If we are going for target, lets recheck before
14607                          * we output.
14608                          */
14609                         rack_check_probe_rtt(rack, us_cts);
14610                 }
14611                 if (rack->set_pacing_done_a_iw == 0) {
14612                         /* How much has been acked? */
14613                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
14614                                 /* We have enough to set in the pacing segment size */
14615                                 rack->set_pacing_done_a_iw = 1;
14616                                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14617                         }
14618                 }
14619                 tcp_rack_xmit_timer_commit(rack, tp);
14620 #ifdef TCP_ACCOUNTING
14621                 /*
14622                  * If we set ack_val_set to what ack processing
14623                  * we are doing, we also want to track how many cycles we burned. Note
14624                  * the bits after tcp_output we let be "free". This is because
14625                  * we are also tracking the tcp_output times as well. Note the
14626                  * use of 0xf here since we only have 11 counters (0 - 0xa) and
14627                  * 0xf cannot be returned and is what we initialize it to, to
14628                  * indicate we are not doing the tabulations.
14629                  */
14630                 if (ack_val_set != 0xf) {
14631                         uint64_t crtsc;
14632
14633                         crtsc = get_cyclecount();
14634                         counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
14635                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14636                                 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
14637                         }
14638                 }
14639 #endif
14640                 if (nxt_pkt == 0) {
14641                         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14642 do_output_now:
14643                                 did_out = 1;
14644                                 (void)tp->t_fb->tfb_tcp_output(tp);
14645                         }
14646                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
14647                         rack_free_trim(rack);
14648                 }
14649                 /* Update any rounds needed */
14650                 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends)) {
14651                         rack->r_ctl.current_round++;
14652                         rack->r_ctl.roundends = tp->snd_max;
14653                         if (CC_ALGO(tp)->newround != NULL) {
14654                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
14655                         }
14656                 }
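                      /*
                       * Round-tracking sketch: a round ends when the cum-ack
                       * reaches the snd_max recorded when the round began, and
                       * the next round then ends at the current snd_max. A
                       * stand-alone version of the logic above, reusing the
                       * kernel's wrap-safe SEQ_GEQ() (names illustrative):
                       *
                       *        static void
                       *        maybe_new_round(uint32_t snd_una, uint32_t snd_max,
                       *            uint32_t *roundends, uint32_t *round)
                       *        {
                       *                if (SEQ_GEQ(snd_una, *roundends)) {
                       *                        (*round)++;
                       *                        *roundends = snd_max;
                       *                }
                       *        }
                       */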
14657                 if ((nxt_pkt == 0) &&
14658                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
14659                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
14660                      (tp->t_flags & TF_DELACK) ||
14661                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
14662                       (tp->t_state <= TCPS_CLOSING)))) {
14663                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
14664                         if ((tp->snd_max == tp->snd_una) &&
14665                             ((tp->t_flags & TF_DELACK) == 0) &&
14666                             (rack->rc_inp->inp_in_hpts) &&
14667                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
14668                                 /* keep-alive not needed while we are still queued for hpts output */
14669                                 ;
14670                         } else {
14671                                 int late = 0;
14672                                 if (rack->rc_inp->inp_in_hpts) {
14673                                         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14674                                                 us_cts = tcp_get_usecs(NULL);
14675                                                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
14676                                                         rack->r_early = 1;
14677                                                         rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
14678                                                 } else
14679                                                         late = 1;
14680                                                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
14681                                         }
14682                                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
14683                                 }
14684                                 if (late && (did_out == 0)) {
14685                                         /*
14686                                          * We are late sending and we did
14687                                          * not call the output routine
14688                                          * (this probably should not happen).
14689                                          */
14690                                         goto do_output_now;
14691                                 }
14692                                 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
14693                         }
14694                         way_out = 1;
14695                 } else if (nxt_pkt == 0) {
14696                         /* Do we have the correct timer running? */
14697                         rack_timer_audit(tp, rack, &so->so_snd);
14698                         way_out = 2;
14699                 }
14700         done_with_input:
14701                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
14702                 if (did_out)
14703                         rack->r_wanted_output = 0;
14704 #ifdef INVARIANTS
14705                 if (tp->t_inpcb == NULL) {
14706                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
14707                               did_out,
14708                               retval, tp, prev_state);
14709                 }
14710 #endif
14711 #ifdef TCP_ACCOUNTING
14712         } else {
14713                 /*
14714                  * Track the time (see above).
14715                  */
14716                 if (ack_val_set != 0xf) {
14717                         uint64_t crtsc;
14718
14719                         crtsc = get_cyclecount();
14720                         counter_u64_add(tcp_proc_time[ack_val_set], (crtsc - ts_val));
14721                         /*
14722                          * Note we *DO NOT* increment the per-tcb counters since
14723                          * in this else branch the TP may be gone!
14724                          */
14725                 }
14726 #endif
14727         }
14728 #ifdef TCP_ACCOUNTING
14729         sched_unpin();
14730 #endif
14731         return (retval);
14732 }
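/*
 * Editor's note (inferred from the caller below, not authoritative):
 * a return of 0 from rack_do_segment_nounlock() appears to mean the
 * inp is still locked and the caller must INP_WUNLOCK(); a non-zero
 * return means the lock was already dropped (e.g. the connection went
 * away) and the tp/inp must not be touched again.
 */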
14733
14734 void
14735 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
14736     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
14737 {
14738         struct timeval tv;
14739
14740         /* First let's see if we have old packets */
14741         if (tp->t_in_pkt) {
14742                 if (ctf_do_queued_segments(so, tp, 1)) {
14743                         m_freem(m);
14744                         return;
14745                 }
14746         }
14747         if (m->m_flags & M_TSTMP_LRO) {
14748                 tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
14749                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
14750         } else {
14751                 /* Should not happen; should we KASSERT instead? */
14752                 tcp_get_usecs(&tv);
14753         }
14754         if (rack_do_segment_nounlock(m, th, so, tp,
14755                                      drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
14756                 INP_WUNLOCK(tp->t_inpcb);
14757         }
14758 }
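/*
 * Illustrative sketch (editor's addition): the M_TSTMP_LRO branch
 * above splits a nanosecond hardware timestamp into a timeval.
 * With an assumed rcv_tstmp of 1234567890123 ns:
 *
 *	tv_sec  = 1234567890123 / 1000000000          = 1234
 *	tv_usec = (1234567890123 % 1000000000) / 1000 = 567890
 *
 * i.e. whole seconds first, then the sub-second remainder truncated
 * from nanoseconds to microseconds.
 */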
14759
14760 struct rack_sendmap *
14761 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
14762 {
14763         struct rack_sendmap *rsm = NULL;
14764         int32_t idx;
14765         uint32_t srtt = 0, thresh = 0, ts_low = 0;
14766
14767         /* Return the next guy to be re-transmitted */
14768         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
14769                 return (NULL);
14770         }
14771         if (tp->t_flags & TF_SENTFIN) {
14772                 /* retransmit the FIN at the end? */
14773                 return (NULL);
14774         }
14775         /* OK, let's look at this one */
14776         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
14777         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
14778                 goto check_it;
14779         }
14780         rsm = rack_find_lowest_rsm(rack);
14781         if (rsm == NULL) {
14782                 return (NULL);
14783         }
14784 check_it:
14785         if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
14786             (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
14787                 /*
14788                  * No SACK, so we automatically do the 3 strikes and
14789                  * retransmit (no rack timer would be started).
14790                  */
14791
14792                 return (rsm);
14793         }
14794         if (rsm->r_flags & RACK_ACKED) {
14795                 return (NULL);
14796         }
14797         if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
14798             (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
14799                 /* It's not yet ready */
14800                 return (NULL);
14801         }
14802         srtt = rack_grab_rtt(tp, rack);
14803         idx = rsm->r_rtr_cnt - 1;
14804         ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
14805         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
14806         if ((tsused == ts_low) ||
14807             (TSTMP_LT(tsused, ts_low))) {
14808                 /* No time has passed since sending */
14809                 return (NULL);
14810         }
14811         if ((tsused - ts_low) < thresh) {
14812                 /* It has not been long enough yet */
14813                 return (NULL);
14814         }
14815         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
14816             ((rsm->r_flags & RACK_SACK_PASSED) &&
14817              (rack->sack_attack_disable == 0))) {
14818                 /*
14819                  * We have passed the dup-ack threshold <or>
14820                  * a SACK has indicated this is missing.
14821                  * Note that if you are a declared attacker
14822                  * it is only the dup-ack threshold that
14823                  * will cause retransmits.
14824                  */
14825                 /* log retransmit reason */
14826                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
14827                 rack->r_fast_output = 0;
14828                 return (rsm);
14829         }
14830         return (NULL);
14831 }
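/*
 * Worked example of the eligibility test above (editor's addition,
 * assumed numbers): suppose rack_calc_thresh_rack() returned
 * thresh = 50000 usecs and the segment was last sent at
 * ts_low = 100000. Then:
 *
 *	tsused = 120000 -> (tsused - ts_low) = 20000 < thresh, not yet
 *	tsused = 155000 -> (tsused - ts_low) = 55000 >= thresh, and if
 *	    the rsm is SACK-passed (or has reached DUP_ACK_THRESHOLD
 *	    dup-acks) it is returned for retransmission.
 */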
14832
14833 static void
14834 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
14835                            uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
14836                            int line, struct rack_sendmap *rsm, uint8_t quality)
14837 {
14838         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
14839                 union tcp_log_stackspecific log;
14840                 struct timeval tv;
14841
14842                 memset(&log, 0, sizeof(log));
14843                 log.u_bbr.flex1 = slot;
14844                 log.u_bbr.flex2 = len;
14845                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
14846                 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
14847                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
14848                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
14849                 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
14850                 log.u_bbr.use_lt_bw <<= 1;
14851                 log.u_bbr.use_lt_bw |= rack->r_late;
14852                 log.u_bbr.use_lt_bw <<= 1;
14853                 log.u_bbr.use_lt_bw |= rack->r_early;
14854                 log.u_bbr.use_lt_bw <<= 1;
14855                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
14856                 log.u_bbr.use_lt_bw <<= 1;
14857                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
14858                 log.u_bbr.use_lt_bw <<= 1;
14859                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
14860                 log.u_bbr.use_lt_bw <<= 1;
14861                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
14862                 log.u_bbr.use_lt_bw <<= 1;
14863                 log.u_bbr.use_lt_bw |= rack->gp_ready;
14864                 log.u_bbr.pkt_epoch = line;
14865                 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
14866                 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
14867                 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
14868                 log.u_bbr.bw_inuse = bw_est;
14869                 log.u_bbr.delRate = bw;
14870                 if (rack->r_ctl.gp_bw == 0)
14871                         log.u_bbr.cur_del_rate = 0;
14872                 else
14873                         log.u_bbr.cur_del_rate = rack_get_bw(rack);
14874                 log.u_bbr.rttProp = len_time;
14875                 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
14876                 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
14877                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
14878                 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
14879                         /* We are in slow start */
14880                         log.u_bbr.flex7 = 1;
14881                 } else {
14882                         /* we are on congestion avoidance */
14883                         log.u_bbr.flex7 = 0;
14884                 }
14885                 log.u_bbr.flex8 = method;
14886                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14887                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14888                 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
14889                 log.u_bbr.cwnd_gain <<= 1;
14890                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
14891                 log.u_bbr.cwnd_gain <<= 1;
14892                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
14893                 log.u_bbr.bbr_substate = quality;
14894                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
14895                     &rack->rc_inp->inp_socket->so_rcv,
14896                     &rack->rc_inp->inp_socket->so_snd,
14897                     BBR_LOG_HPTSI_CALC, 0,
14898                     0, &log, false, &tv);
14899         }
14900 }
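/*
 * For reference (editor's addition, derived from the shifts above):
 * use_lt_bw is packed MSB-first as
 *	bit 7: rc_ack_can_sendout_data	bit 3: rc_gp_filled
 *	bit 6: r_late			bit 2: measure_saw_probe_rtt
 *	bit 5: r_early			bit 1: in_probe_rtt
 *	bit 4: app_limited_needs_set	bit 0: gp_ready
 * and cwnd_gain as bit 2: rc_gp_saw_rec, bit 1: rc_gp_saw_ss,
 * bit 0: rc_gp_saw_ca.
 */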
14901
14902 static uint32_t
14903 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
14904 {
14905         uint32_t new_tso, user_max;
14906
14907         user_max = rack->rc_user_set_max_segs * mss;
14908         if (rack->rc_force_max_seg) {
14909                 return (user_max);
14910         }
14911         if (rack->use_fixed_rate &&
14912             ((rack->r_ctl.crte == NULL) ||
14913              (bw != rack->r_ctl.crte->rate))) {
14914                 /* Use the user mss since we are not exactly matched */
14915                 return (user_max);
14916         }
14917         new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
14918         if (new_tso > user_max)
14919                 new_tso = user_max;
14920         return (new_tso);
14921 }
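/*
 * Example (editor's addition, assumed numbers): with
 * rc_user_set_max_segs = 64 and an mss of 1448, user_max is
 * 92672 bytes. If tcp_get_pacing_burst_size() suggested a larger
 * burst (say 128 kB) from the b/w, the returned TSO length is
 * clipped to the 92672-byte user cap; a smaller suggestion is
 * returned unchanged.
 */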
14922
14923 static int32_t
14924 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
14925 {
14926         uint64_t lentim, fill_bw;
14927
14928         /* Let's first see if we are full; if so, continue with the normal rate */
14929         rack->r_via_fill_cw = 0;
14930         if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
14931                 return (slot);
14932         if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
14933                 return (slot);
14934         if (rack->r_ctl.rc_last_us_rtt == 0)
14935                 return (slot);
14936         if (rack->rc_pace_fill_if_rttin_range &&
14937             (rack->r_ctl.rc_last_us_rtt >=
14938              (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
14939                 /* The rtt is huge (N * smallest); let's not fill */
14940                 return (slot);
14941         }
14942         /*
14943          * First let's calculate the b/w based on the last us-rtt
14944          * and the sndwnd.
14945          */
14946         fill_bw = rack->r_ctl.cwnd_to_use;
14947         /* Take the rwnd if it's smaller */
14948         if (fill_bw > rack->rc_tp->snd_wnd)
14949                 fill_bw = rack->rc_tp->snd_wnd;
14950         if (rack->r_fill_less_agg) {
14951                 /*
14952                  * Now take away the inflight (this will reduce our
14953                  * aggressiveness and yeah, if we get that much out in 1RTT
14954                  * we will have had acks come back and still be behind).
14955                  */
14956                 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14957         }
14958         /* Now let's make it into a b/w */
14959         fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
14960         fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
14961         /* We are below the min b/w */
14962         if (non_paced)
14963                 *rate_wanted = fill_bw;
14964         if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
14965                 return (slot);
14966         if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
14967                 fill_bw = rack->r_ctl.bw_rate_cap;
14968         rack->r_via_fill_cw = 1;
14969         if (rack->r_rack_hw_rate_caps &&
14970             (rack->r_ctl.crte != NULL)) {
14971                 uint64_t high_rate;
14972
14973                 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
14974                 if (fill_bw > high_rate) {
14975                         /* We are capping bw at the highest rate table entry */
14976                         if (*rate_wanted > high_rate) {
14977                                 /* The original rate was also capped */
14978                                 rack->r_via_fill_cw = 0;
14979                         }
14980                         rack_log_hdwr_pacing(rack,
14981                                              fill_bw, high_rate, __LINE__,
14982                                              0, 3);
14983                         fill_bw = high_rate;
14984                         if (capped)
14985                                 *capped = 1;
14986                 }
14987         } else if ((rack->r_ctl.crte == NULL) &&
14988                    (rack->rack_hdrw_pacing == 0) &&
14989                    (rack->rack_hdw_pace_ena) &&
14990                    rack->r_rack_hw_rate_caps &&
14991                    (rack->rack_attempt_hdwr_pace == 0) &&
14992                    (rack->rc_inp->inp_route.ro_nh != NULL) &&
14993                    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
14994                 /*
14995                  * OK, we may have a first attempt that is greater than our top rate;
14996                  * let's check.
14997                  */
14998                 uint64_t high_rate;
14999
15000                 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
15001                 if (high_rate) {
15002                         if (fill_bw > high_rate) {
15003                                 fill_bw = high_rate;
15004                                 if (capped)
15005                                         *capped = 1;
15006                         }
15007                 }
15008         }
15009         /*
15010          * OK, fill_bw holds our mythical b/w to fill the cwnd
15011          * in an rtt; what does that equate to time-wise?
15012          */
15013         lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
15014         lentim /= fill_bw;
15015         *rate_wanted = fill_bw;
15016         if (non_paced || (lentim < slot)) {
15017                 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
15018                                            0, lentim, 12, __LINE__, NULL, 0);
15019                 return ((int32_t)lentim);
15020         } else
15021                 return (slot);
15022 }
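/*
 * Worked example of the fill-cw math above (editor's addition,
 * assumed numbers): with cwnd_to_use = 100000 bytes, a larger
 * snd_wnd, and rc_last_us_rtt = 10000 usecs:
 *
 *	fill_bw = 100000 * 1000000 / 10000 = 10000000 bytes/sec
 *
 * For a len of 14480 bytes, the time to send at that rate is
 *
 *	lentim = 14480 * 1000000 / 10000000 = 1448 usecs
 *
 * so a caller-supplied slot of, say, 2000 usecs would be reduced
 * to 1448 so that the cwnd is filled in roughly one RTT.
 */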
15023
15024 static int32_t
15025 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
15026 {
15027         uint64_t srtt;
15028         int32_t slot = 0;
15029         int can_start_hw_pacing = 1;
15030         int err;
15031
15032         if (rack->rc_always_pace == 0) {
15033                 /*
15034                  * We use the most optimistic possible cwnd/srtt for
15035                  * sending calculations. This will make our
15036                  * calculation anticipate getting more through
15037                  * more quickly than is possible. But that's OK; we
15038                  * don't want the peer to have a gap in data sending.
15039                  */
15040                 uint64_t cwnd, tr_perms = 0;
15041                 int32_t reduce = 0;
15042
15043         old_method:
15044                 /*
15045                  * We keep no precise pacing with the old method;
15046                  * instead we use the pacer to mitigate bursts.
15047                  */
15048                 if (rack->r_ctl.rc_rack_min_rtt)
15049                         srtt = rack->r_ctl.rc_rack_min_rtt;
15050                 else
15051                         srtt = max(tp->t_srtt, 1);
15052                 if (rack->r_ctl.rc_rack_largest_cwnd)
15053                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
15054                 else
15055                         cwnd = rack->r_ctl.cwnd_to_use;
15056                 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
15057                 tr_perms = (cwnd * 1000) / srtt;
15058                 if (tr_perms == 0) {
15059                         tr_perms = ctf_fixed_maxseg(tp);
15060                 }
15061                 /*
15062                  * Calculate how long this will take to drain. If
15063                  * the calculation comes out to zero, that's OK; we
15064                  * will use send_a_lot to possibly spin around for
15065                  * more, increasing tot_len_this_send to the point
15066                  * that it is going to require a pace, or we hit
15067                  * the cwnd, in which case we are just waiting for
15068                  * an ACK.
15069                  */
15070                 slot = len / tr_perms;
15071                 /* Now do we reduce the time so we don't run dry? */
15072                 if (slot && rack_slot_reduction) {
15073                         reduce = (slot / rack_slot_reduction);
15074                         if (reduce < slot) {
15075                                 slot -= reduce;
15076                         } else
15077                                 slot = 0;
15078                 }
15079                 slot *= HPTS_USEC_IN_MSEC;
15080                 if (rack->rc_pace_to_cwnd) {
15081                         uint64_t rate_wanted = 0;
15082
15083                         slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
15084                         rack->rc_ack_can_sendout_data = 1;
15085                         rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
15086                 } else
15087                         rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
15088         } else {
15089                 uint64_t bw_est, res, lentim, rate_wanted;
15090                 uint32_t orig_val, segs, oh;
15091                 int capped = 0;
15092                 int prev_fill;
15093
15094                 if ((rack->r_rr_config == 1) && rsm) {
15095                         return (rack->r_ctl.rc_min_to);
15096                 }
15097                 if (rack->use_fixed_rate) {
15098                         rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
15099                 } else if ((rack->r_ctl.init_rate == 0) &&
15100 #ifdef NETFLIX_PEAKRATE
15101                            (rack->rc_tp->t_maxpeakrate == 0) &&
15102 #endif
15103                            (rack->r_ctl.gp_bw == 0)) {
15104                         /* no way yet to do an estimate */
15105                         bw_est = rate_wanted = 0;
15106                 } else {
15107                         bw_est = rack_get_bw(rack);
15108                         rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
15109                 }
15110                 if ((bw_est == 0) || (rate_wanted == 0) ||
15111                     ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
15112                         /*
15113                          * No way yet to make a b/w estimate, or
15114                          * our rate is set incorrectly.
15115                          */
15116                         goto old_method;
15117                 }
15118                 /* We need to account for all the overheads */
15119                 segs = (len + segsiz - 1) / segsiz;
15120                 /*
15121                  * We need the diff between 1514 bytes (e-mtu with e-hdr)
15122                  * and how much data we put in each packet. Yes, this
15123                  * means we may be off if our frames are larger or smaller
15124                  * than 1500 bytes. But this just makes us more conservative.
15125                  */
15126                 if (rack_hw_rate_min &&
15127                     (bw_est < rack_hw_rate_min))
15128                         can_start_hw_pacing = 0;
15129                 if (ETHERNET_SEGMENT_SIZE > segsiz)
15130                         oh = ETHERNET_SEGMENT_SIZE - segsiz;
15131                 else
15132                         oh = 0;
15133                 segs *= oh;
15134                 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
15135                 res = lentim / rate_wanted;
15136                 slot = (uint32_t)res;
15137                 orig_val = rack->r_ctl.rc_pace_max_segs;
15138                 if (rack->r_ctl.crte == NULL) {
15139                         /*
15140                          * Only do this if we are not hardware pacing,
15141                          * since if we are doing hw-pacing below we will
15142                          * make this call after setting up or changing
15143                          * the rate.
15144                          */
15145                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
15146                 } else if (rack->rc_inp->inp_snd_tag == NULL) {
15147                         /*
15148                          * We lost our rate somehow; this can happen
15149                          * if the interface changed underneath us.
15150                          */
15151                         tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
15152                         rack->r_ctl.crte = NULL;
15153                         /* Let's re-allow attempting to set up pacing */
15154                         rack->rack_hdrw_pacing = 0;
15155                         rack->rack_attempt_hdwr_pace = 0;
15156                         rack_log_hdwr_pacing(rack,
15157                                              rate_wanted, bw_est, __LINE__,
15158                                              0, 6);
15159                 }
15160                 /* Did we change the TSO size, if so log it */
15161                 if (rack->r_ctl.rc_pace_max_segs != orig_val)
15162                         rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0);
15163                 prev_fill = rack->r_via_fill_cw;
15164                 if ((rack->rc_pace_to_cwnd) &&
15165                     (capped == 0) &&
15166                     (rack->use_fixed_rate == 0) &&
15167                     (rack->in_probe_rtt == 0) &&
15168                     (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
15169                         /*
15170                          * We want to pace at our rate *or* faster to
15171                          * fill the cwnd to the max if it's not full.
15172                          */
15173                         slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
15174                 }
15175                 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
15176                     (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
15177                         if ((rack->rack_hdw_pace_ena) &&
15178                             (can_start_hw_pacing > 0) &&
15179                             (rack->rack_hdrw_pacing == 0) &&
15180                             (rack->rack_attempt_hdwr_pace == 0)) {
15181                                 /*
15182                                  * Lets attempt to turn on hardware pacing
15183                                  * if we can.
15184                                  */
15185                                 rack->rack_attempt_hdwr_pace = 1;
15186                                 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
15187                                                                        rack->rc_inp->inp_route.ro_nh->nh_ifp,
15188                                                                        rate_wanted,
15189                                                                        RS_PACING_GEQ,
15190                                                                        &err, &rack->r_ctl.crte_prev_rate);
15191                                 if (rack->r_ctl.crte) {
15192                                         rack->rack_hdrw_pacing = 1;
15193                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
15194                                                                                                  0, rack->r_ctl.crte,
15195                                                                                                  NULL);
15196                                         rack_log_hdwr_pacing(rack,
15197                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15198                                                              err, 0);
15199                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15200                                 } else {
15201                                         counter_u64_add(rack_hw_pace_init_fail, 1);
15202                                 }
15203                         } else if (rack->rack_hdrw_pacing &&
15204                                    (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
15205                                 /* Do we need to adjust our rate? */
15206                                 const struct tcp_hwrate_limit_table *nrte;
15207
15208                                 if (rack->r_up_only &&
15209                                     (rate_wanted < rack->r_ctl.crte->rate)) {
15210                                         /**
15211                                          * We have four possible states here
15212                                          * having to do with the previous time
15213                                          * and this time.
15214                                          *   previous  |  this-time
15215                                          * A)     0      |     0   -- fill_cw not in the picture
15216                                          * B)     1      |     0   -- we were doing a fill-cw but now are not
15217                                          * C)     1      |     1   -- all rates from fill_cw
15218                                          * D)     0      |     1   -- we were doing non-fill and now we are filling
15219                                          *
15220                                          * For cases A, C and D we don't allow a drop. But for
15221                                          * case B, where we are now on our steady rate, we do
15222                                          * allow a drop.
15223                                          *
15224                                          */
15225                                         if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
15226                                                 goto done_w_hdwr;
15227                                 }
15228                                 if ((rate_wanted > rack->r_ctl.crte->rate) ||
15229                                     (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
15230                                         if (rack_hw_rate_to_low &&
15231                                             (bw_est < rack_hw_rate_to_low)) {
15232                                                 /*
15233                                                  * The pacing rate is too low for hardware, but
15234                                                  * do allow hardware pacing to be restarted.
15235                                                  */
15236                                                 rack_log_hdwr_pacing(rack,
15237                                                              bw_est, rack->r_ctl.crte->rate, __LINE__,
15238                                                              0, 5);
15239                                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
15240                                                 rack->r_ctl.crte = NULL;
15241                                                 rack->rack_attempt_hdwr_pace = 0;
15242                                                 rack->rack_hdrw_pacing = 0;
15243                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15244                                                 goto done_w_hdwr;
15245                                         }
15246                                         nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
15247                                                                    rack->rc_tp,
15248                                                                    rack->rc_inp->inp_route.ro_nh->nh_ifp,
15249                                                                    rate_wanted,
15250                                                                    RS_PACING_GEQ,
15251                                                                    &err, &rack->r_ctl.crte_prev_rate);
15252                                         if (nrte == NULL) {
15253                                                 /* Lost the rate */
15254                                                 rack->rack_hdrw_pacing = 0;
15255                                                 rack->r_ctl.crte = NULL;
15256                                                 rack_log_hdwr_pacing(rack,
15257                                                                      rate_wanted, 0, __LINE__,
15258                                                                      err, 1);
15259                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15260                                                 counter_u64_add(rack_hw_pace_lost, 1);
15261                                         } else if (nrte != rack->r_ctl.crte) {
15262                                                 rack->r_ctl.crte = nrte;
15263                                                 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
15264                                                                                                          segsiz, 0,
15265                                                                                                          rack->r_ctl.crte,
15266                                                                                                          NULL);
15267                                                 rack_log_hdwr_pacing(rack,
15268                                                                      rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15269                                                                      err, 2);
15270                                                 rack->r_ctl.last_hw_bw_req = rate_wanted;
15271                                         }
15272                                 } else {
15273                                         /* We just need to adjust the segment size */
15274                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15275                                         rack_log_hdwr_pacing(rack,
15276                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15277                                                              0, 4);
15278                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15279                                 }
15280                         }
15281                 }
15282                 if ((rack->r_ctl.crte != NULL) &&
15283                     (rack->r_ctl.crte->rate == rate_wanted)) {
15284                         /*
15285                          * We need to add an extra pace time if the rates
15286                          * are exactly matched. The idea is that
15287                          * we want the software to make sure the
15288                          * queue is empty before adding more; this
15289                          * gives us N MSS extra pace times, where
15290                          * N is our sysctl.
15291                          */
15292                         slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
15293                 }
15294 done_w_hdwr:
15295                 if (rack_limit_time_with_srtt &&
15296                     (rack->use_fixed_rate == 0) &&
15297 #ifdef NETFLIX_PEAKRATE
15298                     (rack->rc_tp->t_maxpeakrate == 0) &&
15299 #endif
15300                     (rack->rack_hdrw_pacing == 0)) {
15301                         /*
15302                          * Sanity check: we do not allow the pacing delay
15303                          * to be longer than the SRTT of the path. If it is
15304                          * a slow path, then adding a packet should increase
15305                          * the RTT and compensate for this, i.e. the srtt will
15306                          * be greater so the allowed pacing time will be greater.
15307                          *
15308                          * Note this restriction does not apply when a peak rate
15309                          * is set, or when we are doing fixed or hardware pacing.
15310                          */
15311                         if (rack->rc_tp->t_srtt)
15312                                 srtt = rack->rc_tp->t_srtt;
15313                         else
15314                                 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;    /* it's in ms; convert to usecs */
15315                         if (srtt < (uint64_t)slot) {
15316                                 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
15317                                 slot = srtt;
15318                         }
15319                 }
15320                 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
15321         }
15322         if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
15323                 /*
15324                  * If this rate is seeing enobufs when it
15325                  * goes to send, then either the NIC is out
15326                  * of gas or we are mis-estimating the time
15327                  * somehow and not letting the queue empty
15328                  * completely. Let's add to the pacing time.
15329                  */
15330                 int hw_boost_delay;
15331
15332                 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
15333                 if (hw_boost_delay > rack_enobuf_hw_max)
15334                         hw_boost_delay = rack_enobuf_hw_max;
15335                 else if (hw_boost_delay < rack_enobuf_hw_min)
15336                         hw_boost_delay = rack_enobuf_hw_min;
15337                 slot += hw_boost_delay;
15338         }
15339         if (slot)
15340                 counter_u64_add(rack_calc_nonzero, 1);
15341         else
15342                 counter_u64_add(rack_calc_zero, 1);
15343         return (slot);
15344 }
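/*
 * Worked example of the non-paced "old method" above (editor's
 * addition, assumed numbers): with cwnd = 50000 bytes and
 * srtt = 25000 usecs,
 *
 *	tr_perms = (50000 * 1000) / 25000 = 2000 bytes/msec
 *
 * so len = 14480 gives slot = 14480 / 2000 = 7 msecs; with an
 * assumed rack_slot_reduction of 4 that drops by 7/4 = 1 to
 * 6 msecs, i.e. a 6000 usec pacing delay.
 */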
15345
15346 static void
15347 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
15348     tcp_seq startseq, uint32_t sb_offset)
15349 {
15350         struct rack_sendmap *my_rsm = NULL;
15351         struct rack_sendmap fe;
15352
15353         if (tp->t_state < TCPS_ESTABLISHED) {
15354                 /*
15355                  * We don't start any measurements if we are
15356                  * not at least established.
15357                  */
15358                 return;
15359         }
15360         if (tp->t_state >= TCPS_FIN_WAIT_1) {
15361                 /*
15362                  * We will get no more data into the SB;
15363                  * this means we need to have the data available
15364                  * before we start a measurement.
15365                  */
15366
15367                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) <
15368                     max(rc_init_window(rack),
15369                         (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
15370                         /* Nope not enough data */
15371                         return;
15372                 }
15373         }
15374         tp->t_flags |= TF_GPUTINPROG;
15375         rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
15376         rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
15377         tp->gput_seq = startseq;
15378         rack->app_limited_needs_set = 0;
15379         if (rack->in_probe_rtt)
15380                 rack->measure_saw_probe_rtt = 1;
15381         else if ((rack->measure_saw_probe_rtt) &&
15382                  (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
15383                 rack->measure_saw_probe_rtt = 0;
15384         if (rack->rc_gp_filled)
15385                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
15386         else {
15387                 /* Special case initial measurement */
15388                 struct timeval tv;
15389
15390                 tp->gput_ts = tcp_get_usecs(&tv);
15391                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15392         }
15393         /*
15394          * We take a guess out into the future:
15395          * if we have no measurement and no
15396          * initial rate, we measure the first
15397          * initial-window's worth of data to
15398          * speed up getting some GP measurement and
15399          * thus start pacing.
15400          */
15401         if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
15402                 rack->app_limited_needs_set = 1;
15403                 tp->gput_ack = startseq + max(rc_init_window(rack),
15404                                               (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
15405                 rack_log_pacing_delay_calc(rack,
15406                                            tp->gput_seq,
15407                                            tp->gput_ack,
15408                                            0,
15409                                            tp->gput_ts,
15410                                            rack->r_ctl.rc_app_limited_cnt,
15411                                            9,
15412                                            __LINE__, NULL, 0);
15413                 return;
15414         }
15415         if (sb_offset) {
15416                 /*
15417                  * We are out somewhere in the sb;
15418                  * can we use the already outstanding data?
15419                  */
15420                 if (rack->r_ctl.rc_app_limited_cnt == 0) {
15421                         /*
15422                          * Yes, the first one is good, and in this case
15423                          * the tp->gput_ts is correctly set based on
15424                          * the last ack that arrived (no need to
15425                          * set things up when an ack comes in).
15426                          */
15427                         my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
15428                         if ((my_rsm == NULL) ||
15429                             (my_rsm->r_rtr_cnt != 1)) {
15430                                 /* retransmission? */
15431                                 goto use_latest;
15432                         }
15433                 } else {
15434                         if (rack->r_ctl.rc_first_appl == NULL) {
15435                                 /*
15436                                  * If rc_first_appl is NULL
15437                                  * then the cnt should be 0.
15438                                  * This is probably an error; maybe
15439                                  * a KASSERT would be appropriate.
15440                                  */
15441                                 goto use_latest;
15442                         }
15443                         /*
15444                          * If we have a marker pointer to the last one that is
15445                          * app limited we can use that, but we need to set
15446                          * things up so that when it gets ack'ed we record
15447                          * the ack time (if its not already acked).
15448                          */
15449                         rack->app_limited_needs_set = 1;
15450                         /*
15451                          * We want to get to the rsm that is either the
15452                          * next one with space (i.e. over 1 MSS) or the
15453                          * one after that (after the app-limited one).
15454                          */
15455                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15456                                          rack->r_ctl.rc_first_appl);
15457                         if (my_rsm) {
15458                                 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
15459                                         /* Have to use the next one */
15460                                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15461                                                          my_rsm);
15462                                 else {
15463                                         /* Use after the first MSS of it is acked */
15464                                         tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
15465                                         goto start_set;
15466                                 }
15467                         }
15468                         if ((my_rsm == NULL) ||
15469                             (my_rsm->r_rtr_cnt != 1)) {
15470                                 /*
15471                                  * Either its a retransmit or
15472                                  * the last is the app-limited one.
15473                                  */
15474                                 goto use_latest;
15475                         }
15476                 }
15477                 tp->gput_seq = my_rsm->r_start;
15478 start_set:
15479                 if (my_rsm->r_flags & RACK_ACKED) {
15480                         /*
15481                          * This one has been acked; use the ack arrival time.
15482                          */
15483                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15484                         rack->app_limited_needs_set = 0;
15485                 }
15486                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15487                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
15488                 rack_log_pacing_delay_calc(rack,
15489                                            tp->gput_seq,
15490                                            tp->gput_ack,
15491                                            (uint64_t)my_rsm,
15492                                            tp->gput_ts,
15493                                            rack->r_ctl.rc_app_limited_cnt,
15494                                            9,
15495                                            __LINE__, NULL, 0);
15496                 return;
15497         }
15498
15499 use_latest:
15500         /*
15501          * We don't know how long we may have been
15502          * idle or if this is the first send. Let's
15503          * set up the flag so we will trim off
15504          * the first ack'd data and get a true
15505          * measurement.
15506          */
15507         rack->app_limited_needs_set = 1;
15508         tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
15509         /* Find this guy so we can pull the send time */
15510         fe.r_start = startseq;
15511         my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
15512         if (my_rsm) {
15513                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15514                 if (my_rsm->r_flags & RACK_ACKED) {
15515                         /*
15516                          * Unlikely, since it's probably what was
15517                          * just transmitted (but I am paranoid).
15518                          */
15519                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15520                         rack->app_limited_needs_set = 0;
15521                 }
15522                 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
15523                         /* This also is unlikely */
15524                         tp->gput_seq = my_rsm->r_start;
15525                 }
15526         } else {
15527                 /*
15528                  * TSNH unless we have some send-map limit,
15529                  * and even then it should not be hitting
15530                  * that limit (we should have stopped sending).
15531                  */
15532                 struct timeval tv;
15533
15534                 microuptime(&tv);
15535                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15536         }
15537         rack_log_pacing_delay_calc(rack,
15538                                    tp->gput_seq,
15539                                    tp->gput_ack,
15540                                    (uint64_t)my_rsm,
15541                                    tp->gput_ts,
15542                                    rack->r_ctl.rc_app_limited_cnt,
15543                                    9, __LINE__, NULL, 0);
15544 }
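/*
 * Sketch of the measurement this sets up (editor's addition):
 * once acks advance past tp->gput_ack, the goodput sample is
 * roughly (gput_ack - gput_seq) bytes over the time elapsed
 * since gput_ts. E.g. (assumed numbers) 145000 bytes acked over
 * 50000 usecs is about 2.9 MB/s, which then feeds the gp b/w
 * estimate used for pacing.
 */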
15545
15546 static inline uint32_t
15547 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
15548     uint32_t avail, int32_t sb_offset)
15549 {
15550         uint32_t len;
15551         uint32_t sendwin;
15552
15553         if (tp->snd_wnd > cwnd_to_use)
15554                 sendwin = cwnd_to_use;
15555         else
15556                 sendwin = tp->snd_wnd;
15557         if (ctf_outstanding(tp) >= tp->snd_wnd) {
15558                 /* We never want to go over our peer's rcv-window */
15559                 len = 0;
15560         } else {
15561                 uint32_t flight;
15562
15563                 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
15564                 if (flight >= sendwin) {
15565                         /*
15566                          * We have in flight what we are allowed by cwnd (if
15567                          * it was rwnd blocking, it would have hit the
15568                          * >= tp->snd_wnd check above).
15569                          */
15570                         return (0);
15571                 }
15572                 len = sendwin - flight;
15573                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
15574                         /* We would send too much (beyond the rwnd) */
15575                         len = tp->snd_wnd - ctf_outstanding(tp);
15576                 }
15577                 if ((len + sb_offset) > avail) {
15578                         /*
15579                          * We don't have that much in the SB, how much is
15580                          * there?
15581                          */
15582                         len = avail - sb_offset;
15583                 }
15584         }
15585         return (len);
15586 }
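/*
 * Worked example (editor's addition, assumed numbers): with
 * cwnd_to_use = 40000, snd_wnd = 60000, flight = outstanding =
 * 30000, avail = 35000 and sb_offset = 30000:
 *
 *	sendwin = min(40000, 60000) = 40000
 *	len     = 40000 - 30000    = 10000
 *	rwnd check: 10000 + 30000 <= 60000, no clip
 *	sb check:   10000 + 30000 > 35000 -> len = 35000 - 30000 = 5000
 *
 * so only the 5000 bytes actually present in the sb are sendable.
 */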
15587
15588 static void
15589 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
15590              unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
15591              int rsm_is_null, int optlen, int line, uint16_t mode)
15592 {
15593         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
15594                 union tcp_log_stackspecific log;
15595                 struct timeval tv;
15596
15597                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15598                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
15599                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
15600                 log.u_bbr.flex1 = error;
15601                 log.u_bbr.flex2 = flags;
15602                 log.u_bbr.flex3 = rsm_is_null;
15603                 log.u_bbr.flex4 = ipoptlen;
15604                 log.u_bbr.flex5 = tp->rcv_numsacks;
15605                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
15606                 log.u_bbr.flex7 = optlen;
15607                 log.u_bbr.flex8 = rack->r_fsb_inited;
15608                 log.u_bbr.applimited = rack->r_fast_output;
15609                 log.u_bbr.bw_inuse = rack_get_bw(rack);
15610                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
15611                 log.u_bbr.cwnd_gain = mode;
15612                 log.u_bbr.pkts_out = orig_len;
15613                 log.u_bbr.lt_epoch = len;
15614                 log.u_bbr.delivered = line;
15615                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15616                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15617                 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
15618                                len, &log, false, NULL, NULL, 0, &tv);
15619         }
15620 }
15621
15622
15623 static struct mbuf *
15624 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
15625                    struct rack_fast_send_blk *fsb,
15626                    int32_t seglimit, int32_t segsize, int hw_tls)
15627 {
15628 #ifdef KERN_TLS
15629         struct ktls_session *tls, *ntls;
15630         struct mbuf *start;
15631 #endif
15632         struct mbuf *m, *n, **np, *smb;
15633         struct mbuf *top;
15634         int32_t off, soff;
15635         int32_t len = *plen;
15636         int32_t fragsize;
15637         int32_t len_cp = 0;
15638         uint32_t mlen, frags;
15639
15640         soff = off = the_off;
15641         smb = m = the_m;
15642         np = &top;
15643         top = NULL;
15644 #ifdef KERN_TLS
15645         if (hw_tls && (m->m_flags & M_EXTPG))
15646                 tls = m->m_epg_tls;
15647         else
15648                 tls = NULL;
15649         start = m;
15650 #endif
15651         while (len > 0) {
15652                 if (m == NULL) {
15653                         *plen = len_cp;
15654                         break;
15655                 }
15656 #ifdef KERN_TLS
15657                 if (hw_tls) {
15658                         if (m->m_flags & M_EXTPG)
15659                                 ntls = m->m_epg_tls;
15660                         else
15661                                 ntls = NULL;
15662
15663                         /*
15664                          * Avoid mixing TLS records with handshake
15665                          * data or TLS records from different
15666                          * sessions.
15667                          */
15668                         if (tls != ntls) {
15669                                 MPASS(m != start);
15670                                 *plen = len_cp;
15671                                 break;
15672                         }
15673                 }
15674 #endif
15675                 mlen = min(len, m->m_len - off);
15676                 if (seglimit) {
15677                         /*
15678                          * For M_EXTPG mbufs, add 3 segments
15679                          * + 1 in case we are crossing page boundaries
15680                          * + 2 in case the TLS hdr/trailer are used
15681                          * It is cheaper to just add the segments
15682                          * than it is to take the cache miss to look
15683                          * at the mbuf ext_pgs state in detail.
15684                          */
15685                         if (m->m_flags & M_EXTPG) {
15686                                 fragsize = min(segsize, PAGE_SIZE);
15687                                 frags = 3;
15688                         } else {
15689                                 fragsize = segsize;
15690                                 frags = 0;
15691                         }
15692
15693                         /* Break if we really can't fit anymore. */
15694                         if ((frags + 1) >= seglimit) {
15695                                 *plen = len_cp;
15696                                 break;
15697                         }
15698
15699                         /*
15700                          * Reduce size if you can't copy the whole
15701                          * mbuf. If we can't copy the whole mbuf, also
15702                          * adjust len so the loop will end after this
15703                          * mbuf.
15704                          */
15705                         if ((frags + howmany(mlen, fragsize)) >= seglimit) {
15706                                 mlen = (seglimit - frags - 1) * fragsize;
15707                                 len = mlen;
15708                                 *plen = len_cp + len;
15709                         }
15710                         frags += howmany(mlen, fragsize);
15711                         if (frags == 0)
15712                                 frags++;
15713                         seglimit -= frags;
15714                         KASSERT(seglimit > 0,
15715                             ("%s: seglimit went too low", __func__));
15716                 }
15717                 n = m_get(M_NOWAIT, m->m_type);
15718                 *np = n;
15719                 if (n == NULL)
15720                         goto nospace;
15721                 n->m_len = mlen;
15722                 soff += mlen;
15723                 len_cp += n->m_len;
15724                 if (m->m_flags & (M_EXT|M_EXTPG)) {
15725                         n->m_data = m->m_data + off;
15726                         mb_dupcl(n, m);
15727                 } else {
15728                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
15729                             (u_int)n->m_len);
15730                 }
15731                 len -= n->m_len;
15732                 off = 0;
15733                 m = m->m_next;
15734                 np = &n->m_next;
15735                 if (len || (soff == smb->m_len)) {
15736                         /*
15737                          * We have more, so we move forward, or
15738                          * we have consumed the entire mbuf and
15739                          * len has fallen to 0.
15740                          */
15741                         soff = 0;
15742                         smb = m;
15743                 }
15744
15745         }
15746         if (fsb != NULL) {
15747                 fsb->m = smb;
15748                 fsb->off = soff;
15749                 if (smb) {
15750                         /*
15751                          * Save off the size of the mbuf. We do
15752                          * this so that we can recognize when it
15753                          * has been trimmed by sbcut() as acks
15754                          * come in.
15755                          */
15756                         fsb->o_m_len = smb->m_len;
15757                 } else {
15758                         /*
15759                          * This is the case where the next mbuf went to NULL. This
15760                          * means with this copy we have sent everything in the sb.
15761                          * In theory we could clear the fast_output flag, but let's
15762                          * not, since it's possible that we could get more added
15763                          * and acks that call the extend function, which would let
15764                          * us send more.
15765                          */
15766                         fsb->o_m_len = 0;
15767                 }
15768         }
15769         return (top);
15770 nospace:
15771         if (top)
15772                 m_freem(top);
15773         return (NULL);
15774
15775 }
15776
15777 /*
15778  * This is a copy of m_copym(), taking the TSO segment size/limit
15779  * constraints into account, and advancing the sndptr as it goes.
15780  */
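/*
 * A note on the contract (our reading of the code, not a formal API):
 * *plen is in/out -- it holds the bytes we want on entry and the bytes
 * actually queued on return -- while *s_mb/*s_soff record where in the
 * socket buffer the copy started so rack_log_output() can log it.
 */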
15781 static struct mbuf *
15782 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
15783                 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
15784 {
15785         struct mbuf *m, *n;
15786         int32_t soff;
15787
15788         soff = rack->r_ctl.fsb.off;
15789         m = rack->r_ctl.fsb.m;
15790         if (rack->r_ctl.fsb.o_m_len > m->m_len) {
15791                 /*
15792                  * The mbuf had the front of it chopped off by an ack,
15793                  * so we need to adjust soff/off by that difference.
15794                  */
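                /*
                 * For example (values hypothetical): if o_m_len was saved
                 * as 1448 and an ack consumed 448 bytes, m->m_len is now
                 * 1000; delta == 448, and pulling soff back by 448 keeps
                 * it pointing at the same unsent byte.
                 */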
15795                 uint32_t delta;
15796
15797                 delta = rack->r_ctl.fsb.o_m_len - m->m_len;
15798                 soff -= delta;
15799         } else if (rack->r_ctl.fsb.o_m_len < m->m_len) {
15800                 /*
15801                  * The mbuf was expanded, probably by
15802                  * an m_compress. Just update o_m_len.
15803                  */
15804                 rack->r_ctl.fsb.o_m_len = m->m_len;
15805         }
15806         KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
15807         KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
15808         KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
15809                                  __FUNCTION__,
15810                                  rack, *plen, m, m->m_len));
15811         /* Save off the right location before we copy and advance */
15812         *s_soff = soff;
15813         *s_mb = rack->r_ctl.fsb.m;
15814         n = rack_fo_base_copym(m, soff, plen,
15815                                &rack->r_ctl.fsb,
15816                                seglimit, segsize, rack->r_ctl.fsb.hw_tls);
15817         return (n);
15818 }
15819
15820 static int
15821 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
15822                      uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
15823 {
15824         /*
15825          * Enter the fast retransmit path. We are given that a sched_pin is
15826          * in place (if accounting is compiled in) and the cycle count taken
15827          * at entry is in ts_val. The concept here is that the rsm
15828          * now holds the mbuf offsets and such, so we can directly transmit
15829          * without a lot of overhead. The len field is already set for
15830          * us to prohibit us from sending too much (usually it is 1 MSS).
15831          */
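        /*
         * In other words (a sketch of the invariants as we read them):
         * rsm->m and rsm->soff already point at the retransmit data
         * inside the socket buffer, so no sbsndptr()-style walk is
         * needed, and the prebuilt fsb header template below only needs
         * seq/ack/win/flags patched before the copy.
         */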
15832         struct ip *ip = NULL;
15833         struct udphdr *udp = NULL;
15834         struct tcphdr *th = NULL;
15835         struct mbuf *m = NULL;
15836         struct inpcb *inp;
15837         uint8_t *cpto;
15838         struct tcp_log_buffer *lgb;
15839 #ifdef TCP_ACCOUNTING
15840         uint64_t crtsc;
15841         int cnt_thru = 1;
15842 #endif
15843         struct tcpopt to;
15844         u_char opt[TCP_MAXOLEN];
15845         uint32_t hdrlen, optlen;
15846         int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0;
15847         uint32_t us_cts;
15848         uint32_t if_hw_tsomaxsegcount = 0, startseq;
15849         uint32_t if_hw_tsomaxsegsize;
15850
15851 #ifdef INET6
15852         struct ip6_hdr *ip6 = NULL;
15853
15854         if (rack->r_is_v6) {
15855                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
15856                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
15857         } else
15858 #endif                          /* INET6 */
15859         {
15860                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
15861                 hdrlen = sizeof(struct tcpiphdr);
15862         }
15863         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
15864                 goto failed;
15865         }
15866         if (doing_tlp) {
15867                 /* It's a TLP, add the flag; it may already be there, but be sure */
15868                 rsm->r_flags |= RACK_TLP;
15869         } else {
15870                 /* If it was a TLP, it is not one on this retransmit */
15871                 rsm->r_flags &= ~RACK_TLP;
15872         }
15873         startseq = rsm->r_start;
15874         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
15875         inp = rack->rc_inp;
15876         to.to_flags = 0;
15877         flags = tcp_outflags[tp->t_state];
15878         if (flags & (TH_SYN|TH_RST)) {
15879                 goto failed;
15880         }
15881         if (rsm->r_flags & RACK_HAS_FIN) {
15882                 /* We can't send a FIN here */
15883                 goto failed;
15884         }
15885         if (flags & TH_FIN) {
15886                 /* We never send a FIN */
15887                 flags &= ~TH_FIN;
15888         }
15889         if (tp->t_flags & TF_RCVD_TSTMP) {
15890                 to.to_tsval = ms_cts + tp->ts_offset;
15891                 to.to_tsecr = tp->ts_recent;
15892                 to.to_flags = TOF_TS;
15893         }
15894         optlen = tcp_addoptions(&to, opt);
15895         hdrlen += optlen;
15896         udp = rack->r_ctl.fsb.udp;
15897         if (udp)
15898                 hdrlen += sizeof(struct udphdr);
15899         if (rack->r_ctl.rc_pace_max_segs)
15900                 max_val = rack->r_ctl.rc_pace_max_segs;
15901         else if (rack->rc_user_set_max_segs)
15902                 max_val = rack->rc_user_set_max_segs * segsiz;
15903         else
15904                 max_val = len;
15905         if ((tp->t_flags & TF_TSO) &&
15906             V_tcp_do_tso &&
15907             (len > segsiz) &&
15908             (tp->t_port == 0))
15909                 tso = 1;
15910 #ifdef INET6
15911         if (MHLEN < hdrlen + max_linkhdr)
15912                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
15913         else
15914 #endif
15915                 m = m_gethdr(M_NOWAIT, MT_DATA);
15916         if (m == NULL)
15917                 goto failed;
15918         m->m_data += max_linkhdr;
15919         m->m_len = hdrlen;
15920         th = rack->r_ctl.fsb.th;
15921         /* Establish the len to send */
15922         if (len > max_val)
15923                 len = max_val;
15924         if ((tso) && (len + optlen > tp->t_maxseg)) {
15925                 uint32_t if_hw_tsomax;
15926                 int32_t max_len;
15927
15928                 /* extract TSO information */
15929                 if_hw_tsomax = tp->t_tsomax;
15930                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
15931                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
15932                 /*
15933                  * Check if we should limit by maximum payload
15934                  * length:
15935                  */
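                /*
                 * Worked example (values hypothetical): with
                 * if_hw_tsomax == 65535, hdrlen == 52 (v6 + timestamps)
                 * and max_linkhdr == 16, max_len == 65535 - 52 - 16 ==
                 * 65467, so a larger request is trimmed to that before
                 * the TSO burst is built.
                 */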
15936                 if (if_hw_tsomax != 0) {
15937                         /* compute maximum TSO length */
15938                         max_len = (if_hw_tsomax - hdrlen -
15939                                    max_linkhdr);
15940                         if (max_len <= 0) {
15941                                 goto failed;
15942                         } else if (len > max_len) {
15943                                 len = max_len;
15944                         }
15945                 }
15946                 if (len <= segsiz) {
15947                         /*
15948                          * In case there are too many small fragments don't
15949                          * use TSO:
15950                          */
15951                         tso = 0;
15952                 }
15953         } else {
15954                 tso = 0;
15955         }
15956         if ((tso == 0) && (len > segsiz))
15957                 len = segsiz;
15958         us_cts = tcp_get_usecs(tv);
15959         if ((len == 0) ||
15960             (len <= MHLEN - hdrlen - max_linkhdr)) {
15961                 goto failed;
15962         }
15963         th->th_seq = htonl(rsm->r_start);
15964         th->th_ack = htonl(tp->rcv_nxt);
15965         /*
15966          * The PUSH bit should only be applied
15967          * if the full retransmission is made. If
15968          * we are sending less, then this is the
15969          * left-hand edge and it should not have
15970          * the PUSH bit.
15971          */
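        /*
         * E.g. (hypothetical): an rsm covering [1000, 3896) is 2896
         * bytes; only a send with len == 2896 keeps TH_PUSH. Sending
         * just the first 1448 bytes is the left edge of the block and
         * goes out without it.
         */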
15972         if ((rsm->r_flags & RACK_HAD_PUSH) &&
15973             (len == (rsm->r_end - rsm->r_start)))
15974                 flags |= TH_PUSH;
15975         th->th_flags = flags;
15976         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
15977         if (th->th_win == 0) {
15978                 tp->t_sndzerowin++;
15979                 tp->t_flags |= TF_RXWIN0SENT;
15980         } else
15981                 tp->t_flags &= ~TF_RXWIN0SENT;
15982         if (rsm->r_flags & RACK_TLP) {
15983                 /*
15984                  * TLP should not count in retran count, but
15985                  * in its own bin
15986                  */
15987                 counter_u64_add(rack_tlp_retran, 1);
15988                 counter_u64_add(rack_tlp_retran_bytes, len);
15989         } else {
15990                 tp->t_sndrexmitpack++;
15991                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
15992                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
15993         }
15994 #ifdef STATS
15995         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
15996                                  len);
15997 #endif
15998         if (rsm->m == NULL)
15999                 goto failed;
16000         if (rsm->orig_m_len != rsm->m->m_len) {
16001                 /* Fix up the orig_m_len and possibly the mbuf offset */
16002                 rack_adjust_orig_mlen(rsm);
16003         }
16004         m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
16005         if (len <= segsiz) {
16006                 /*
16007                  * We must have run out of mbufs for the copy;
16008                  * shorten it so it no longer needs tso. Let's
16009                  * not put on sendalot since we are low on
16010                  * mbufs.
16011                  */
16012                 tso = 0;
16013         }
16014         if ((m->m_next == NULL) || (len <= 0)) {
16015                 goto failed;
16016         }
16017         if (udp) {
16018                 if (rack->r_is_v6)
16019                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
16020                 else
16021                         ulen = hdrlen + len - sizeof(struct ip);
16022                 udp->uh_ulen = htons(ulen);
16023         }
16024         m->m_pkthdr.rcvif = (struct ifnet *)0;
16025         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
16026 #ifdef INET6
16027         if (rack->r_is_v6) {
16028                 if (tp->t_port) {
16029                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
16030                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16031                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
16032                         th->th_sum = htons(0);
16033                         UDPSTAT_INC(udps_opackets);
16034                 } else {
16035                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
16036                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16037                         th->th_sum = in6_cksum_pseudo(ip6,
16038                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
16039                                                       0);
16040                 }
16041         }
16042 #endif
16043 #if defined(INET6) && defined(INET)
16044         else
16045 #endif
16046 #ifdef INET
16047         {
16048                 if (tp->t_port) {
16049                         m->m_pkthdr.csum_flags = CSUM_UDP;
16050                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16051                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
16052                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
16053                         th->th_sum = htons(0);
16054                         UDPSTAT_INC(udps_opackets);
16055                 } else {
16056                         m->m_pkthdr.csum_flags = CSUM_TCP;
16057                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16058                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
16059                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
16060                                                                         IPPROTO_TCP + len + optlen));
16061                 }
16062                 /* IP version must be set here for ipv4/ipv6 checking later */
16063                 KASSERT(ip->ip_v == IPVERSION,
16064                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
16065         }
16066 #endif
16067         if (tso) {
16068                 KASSERT(len > tp->t_maxseg - optlen,
16069                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
16070                 m->m_pkthdr.csum_flags |= CSUM_TSO;
16071                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
16072         }
16073 #ifdef INET6
16074         if (rack->r_is_v6) {
16075                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
16076                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
16077                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
16078                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16079                 else
16080                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16081         }
16082 #endif
16083 #if defined(INET) && defined(INET6)
16084         else
16085 #endif
16086 #ifdef INET
16087         {
16088                 ip->ip_len = htons(m->m_pkthdr.len);
16089                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
16090                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
16091                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16092                         if (tp->t_port == 0 || len < V_tcp_minmss) {
16093                                 ip->ip_off |= htons(IP_DF);
16094                         }
16095                 } else {
16096                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16097                 }
16098         }
16099 #endif
16100         /* Time to copy in our header */
16101         cpto = mtod(m, uint8_t *);
16102         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
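        /*
         * fsb.th points inside the template at fsb.tcp_ip_hdr, so the
         * same byte offset applied to the fresh copy at cpto locates the
         * TCP header within this mbuf.
         */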
16103         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
16104         if (optlen) {
16105                 bcopy(opt, th + 1, optlen);
16106                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
16107         } else {
16108                 th->th_off = sizeof(struct tcphdr) >> 2;
16109         }
16110         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
16111                 union tcp_log_stackspecific log;
16112
16113                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16114                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
16115                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
16116                 if (rack->rack_no_prr)
16117                         log.u_bbr.flex1 = 0;
16118                 else
16119                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16120                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
16121                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
16122                 log.u_bbr.flex4 = max_val;
16123                 log.u_bbr.flex5 = 0;
16124                 /* Save off the early/late values */
16125                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
16126                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
16127                 log.u_bbr.bw_inuse = rack_get_bw(rack);
16128                 if (doing_tlp == 0)
16129                         log.u_bbr.flex8 = 1;
16130                 else
16131                         log.u_bbr.flex8 = 2;
16132                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
16133                 log.u_bbr.flex7 = 55;
16134                 log.u_bbr.pkts_out = tp->t_maxseg;
16135                 log.u_bbr.timeStamp = cts;
16136                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16137                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
16138                 log.u_bbr.delivered = 0;
16139                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
16140                                      len, &log, false, NULL, NULL, 0, tv);
16141         } else
16142                 lgb = NULL;
16143 #ifdef INET6
16144         if (rack->r_is_v6) {
16145                 error = ip6_output(m, NULL,
16146                                    &inp->inp_route6,
16147                                    0, NULL, NULL, inp);
16148         }
16149 #endif
16150 #if defined(INET) && defined(INET6)
16151         else
16152 #endif
16153 #ifdef INET
16154         {
16155                 error = ip_output(m, NULL,
16156                                   &inp->inp_route,
16157                                   0, 0, inp);
16158         }
16159 #endif
16160         m = NULL;
16161         if (lgb) {
16162                 lgb->tlb_errno = error;
16163                 lgb = NULL;
16164         }
16165         if (error) {
16166                 goto failed;
16167         }
16168         rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
16169                         rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls);
16170         if (doing_tlp && (rack->fast_rsm_hack == 0)) {
16171                 rack->rc_tlp_in_progress = 1;
16172                 rack->r_ctl.rc_tlp_cnt_out++;
16173         }
16174         if (error == 0) {
16175                 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
16176                 if (doing_tlp) {
16177                         rack->rc_last_sent_tlp_past_cumack = 0;
16178                         rack->rc_last_sent_tlp_seq_valid = 1;
16179                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
16180                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
16181                 }
16182         }
16183         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16184         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16185         if (IN_FASTRECOVERY(tp->t_flags) && rsm)
16186                 rack->r_ctl.retran_during_recovery += len;
16187         {
16188                 int idx;
16189
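                /*
                 * Bucketing note (our reading): len/segsiz counts full
                 * MSS units and the +3 skews that into the rack_out_size
                 * histogram -- e.g. a two-MSS send lands in slot 5 --
                 * with anything at or past TCP_MSS_ACCT_ATIMER clamped
                 * into the top bucket.
                 */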
16190                 idx = (len / segsiz) + 3;
16191                 if (idx >= TCP_MSS_ACCT_ATIMER)
16192                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16193                 else
16194                         counter_u64_add(rack_out_size[idx], 1);
16195         }
16196         if (tp->t_rtttime == 0) {
16197                 tp->t_rtttime = ticks;
16198                 tp->t_rtseq = startseq;
16199                 KMOD_TCPSTAT_INC(tcps_segstimed);
16200         }
16201         counter_u64_add(rack_fto_rsm_send, 1);
16202         if (error && (error == ENOBUFS)) {
16203                 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
16204                 if (rack->rc_enobuf < 0x7f)
16205                         rack->rc_enobuf++;
16206                 if (slot < (10 * HPTS_USEC_IN_MSEC))
16207                         slot = 10 * HPTS_USEC_IN_MSEC;
16208         } else
16209                 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
16210         if ((slot == 0) ||
16211             (rack->rc_always_pace == 0) ||
16212             (rack->r_rr_config == 1)) {
16213                 /*
16214                  * We have no pacing set or we
16215                  * are using old-style rack or
16216                  * we are overridden to use the old 1ms pacing.
16217                  */
16218                 slot = rack->r_ctl.rc_min_to;
16219         }
16220         rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
16221         if (rack->r_must_retran) {
16222                 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
16223                 if ((SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) ||
16224                     ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
16225                         /*
16226                          * We have retransmitted all we need. If
16227                          * RACK_MUST_RXT is not set, then we must
16228                          * not retransmit this one.
16229                          */
16230                         rack->r_must_retran = 0;
16231                         rack->r_ctl.rc_out_at_rto = 0;
16232                         if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
16233                                 /* Not one we should rxt */
16234                                 goto failed;
16235                         } else {
16236                                 /* Clear the flag */
16237                                 rsm->r_flags &= ~RACK_MUST_RXT;
16238                         }
16239                 } else {
16240                         /* Remove the flag */
16241                         rsm->r_flags &= ~RACK_MUST_RXT;
16242                 }
16243         }
16244 #ifdef TCP_ACCOUNTING
16245         crtsc = get_cyclecount();
16246         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16247                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16248         }
16249         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16250         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16251                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16252         }
16253         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16254         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16255                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
16256         }
16257         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
16258         sched_unpin();
16259 #endif
16260         return (0);
16261 failed:
16262         if (m)
16263                 m_free(m);
16264         return (-1);
16265 }
16266
16267 static void
16268 rack_sndbuf_autoscale(struct tcp_rack *rack)
16269 {
16270         /*
16271          * Automatic sizing of send socket buffer.  Often the send buffer
16272          * size is not optimally adjusted to the actual network conditions
16273          * at hand (delay bandwidth product).  Setting the buffer size too
16274          * small limits throughput on links with high bandwidth and high
16275          * delay (e.g. trans-continental/oceanic links).  Setting the
16276          * buffer size too big consumes too much real kernel memory,
16277          * especially with many connections on busy servers.
16278          *
16279          * The criteria to step up the send buffer one notch are:
16280          *  1. receive window of remote host is larger than send buffer
16281          *     (with a fudge factor of 5/4th);
16282          *  2. send buffer is filled to 7/8th with data (so we actually
16283          *     have data to make use of it);
16284          *  3. send buffer fill has not hit maximal automatic size;
16285          *  4. our send window (slow start and congestion controlled) is
16286          *     larger than sent but unacknowledged data in send buffer.
16287          *
16288          * Note that the rack version moves things much faster since
16289          * we want to avoid hitting cache lines in the rack_fast_output()
16290          * path, so this is called much less often and thus moves
16291          * the SB forward by a percentage.
16292          */
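/*
 * A worked pass (all numbers hypothetical): with sb_hiwat 64k and the
 * peer advertising snd_wnd 56k, 56k * 5/4 == 70k >= 64k, so criterion 1
 * holds; if 60k of the buffer is used (>= 7/8 of 64k) and the send
 * window covers the unsent backlog, the buffer steps up -- with
 * rack_autosndbuf_inc == 25, by 25% of hiwat (16k) -- clamped to
 * V_tcp_autosndbuf_max.
 */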
16293         struct socket *so;
16294         struct tcpcb *tp;
16295         uint32_t sendwin, scaleup;
16296
16297         tp = rack->rc_tp;
16298         so = rack->rc_inp->inp_socket;
16299         sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
16300         if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
16301                 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
16302                     sbused(&so->so_snd) >=
16303                     (so->so_snd.sb_hiwat / 8 * 7) &&
16304                     sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
16305                     sendwin >= (sbused(&so->so_snd) -
16306                     (tp->snd_nxt - tp->snd_una))) {
16307                         if (rack_autosndbuf_inc)
16308                                 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
16309                         else
16310                                 scaleup = V_tcp_autosndbuf_inc;
16311                         if (scaleup < V_tcp_autosndbuf_inc)
16312                                 scaleup = V_tcp_autosndbuf_inc;
16313                         scaleup += so->so_snd.sb_hiwat;
16314                         if (scaleup > V_tcp_autosndbuf_max)
16315                                 scaleup = V_tcp_autosndbuf_max;
16316                         if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread))
16317                                 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
16318                 }
16319         }
16320 }
16321
16322 static int
16323 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
16324                  uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
16325 {
16326         /*
16327          * Enter to do fast output. We are given that the sched_pin is
16328          * in place (if accounting is compiled in) and the cycle count taken
16329          * at entry is in ts_val. The idea here is that
16330          * we know how many more bytes need to be sent (presumably either
16331          * during pacing or to fill the cwnd, and that was greater than
16332          * the max-burst). We have how much to send and all the info we
16333          * need to just send.
16334          */
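        /*
         * Roughly (our summary of the flow below): fsb.left_to_send is
         * the budget set up by the slow path; each trip through the
         * again: loop carves off at most max_val (or one segsiz chunk
         * when TSO is off), sends it from the prebuilt header template,
         * and loops until the budget runs out.
         */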
16335         struct ip *ip = NULL;
16336         struct udphdr *udp = NULL;
16337         struct tcphdr *th = NULL;
16338         struct mbuf *m, *s_mb;
16339         struct inpcb *inp;
16340         uint8_t *cpto;
16341         struct tcp_log_buffer *lgb;
16342 #ifdef TCP_ACCOUNTING
16343         uint64_t crtsc;
16344 #endif
16345         struct tcpopt to;
16346         u_char opt[TCP_MAXOLEN];
16347         uint32_t hdrlen, optlen;
16348         int cnt_thru = 1;
16349         int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0;
16350         uint32_t us_cts, s_soff;
16351         uint32_t if_hw_tsomaxsegcount = 0, startseq;
16352         uint32_t if_hw_tsomaxsegsize;
16353         uint16_t add_flag = RACK_SENT_FP;
16354 #ifdef INET6
16355         struct ip6_hdr *ip6 = NULL;
16356
16357         if (rack->r_is_v6) {
16358                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
16359                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
16360         } else
16361 #endif                          /* INET6 */
16362         {
16363                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
16364                 hdrlen = sizeof(struct tcpiphdr);
16365         }
16366         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
16367                 m = NULL;
16368                 goto failed;
16369         }
16370         startseq = tp->snd_max;
16371         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
16372         inp = rack->rc_inp;
16373         len = rack->r_ctl.fsb.left_to_send;
16374         to.to_flags = 0;
16375         flags = rack->r_ctl.fsb.tcp_flags;
16376         if (tp->t_flags & TF_RCVD_TSTMP) {
16377                 to.to_tsval = ms_cts + tp->ts_offset;
16378                 to.to_tsecr = tp->ts_recent;
16379                 to.to_flags = TOF_TS;
16380         }
16381         optlen = tcp_addoptions(&to, opt);
16382         hdrlen += optlen;
16383         udp = rack->r_ctl.fsb.udp;
16384         if (udp)
16385                 hdrlen += sizeof(struct udphdr);
16386         if (rack->r_ctl.rc_pace_max_segs)
16387                 max_val = rack->r_ctl.rc_pace_max_segs;
16388         else if (rack->rc_user_set_max_segs)
16389                 max_val = rack->rc_user_set_max_segs * segsiz;
16390         else
16391                 max_val = len;
16392         if ((tp->t_flags & TF_TSO) &&
16393             V_tcp_do_tso &&
16394             (len > segsiz) &&
16395             (tp->t_port == 0))
16396                 tso = 1;
16397 again:
16398 #ifdef INET6
16399         if (MHLEN < hdrlen + max_linkhdr)
16400                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
16401         else
16402 #endif
16403                 m = m_gethdr(M_NOWAIT, MT_DATA);
16404         if (m == NULL)
16405                 goto failed;
16406         m->m_data += max_linkhdr;
16407         m->m_len = hdrlen;
16408         th = rack->r_ctl.fsb.th;
16409         /* Establish the len to send */
16410         if (len > max_val)
16411                 len = max_val;
16412         if ((tso) && (len + optlen > tp->t_maxseg)) {
16413                 uint32_t if_hw_tsomax;
16414                 int32_t max_len;
16415
16416                 /* extract TSO information */
16417                 if_hw_tsomax = tp->t_tsomax;
16418                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
16419                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
16420                 /*
16421                  * Check if we should limit by maximum payload
16422                  * length:
16423                  */
16424                 if (if_hw_tsomax != 0) {
16425                         /* compute maximum TSO length */
16426                         max_len = (if_hw_tsomax - hdrlen -
16427                                    max_linkhdr);
16428                         if (max_len <= 0) {
16429                                 goto failed;
16430                         } else if (len > max_len) {
16431                                 len = max_len;
16432                         }
16433                 }
16434                 if (len <= segsiz) {
16435                         /*
16436                          * In case there are too many small fragments don't
16437                          * use TSO:
16438                          */
16439                         tso = 0;
16440                 }
16441         } else {
16442                 tso = 0;
16443         }
16444         if ((tso == 0) && (len > segsiz))
16445                 len = segsiz;
16446         us_cts = tcp_get_usecs(tv);
16447         if ((len == 0) ||
16448             (len <= MHLEN - hdrlen - max_linkhdr)) {
16449                 goto failed;
16450         }
16451         sb_offset = tp->snd_max - tp->snd_una;
16452         th->th_seq = htonl(tp->snd_max);
16453         th->th_ack = htonl(tp->rcv_nxt);
16454         th->th_flags = flags;
16455         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
16456         if (th->th_win == 0) {
16457                 tp->t_sndzerowin++;
16458                 tp->t_flags |= TF_RXWIN0SENT;
16459         } else
16460                 tp->t_flags &= ~TF_RXWIN0SENT;
16461         tp->snd_up = tp->snd_una;       /* drag it along, it's deprecated */
16462         KMOD_TCPSTAT_INC(tcps_sndpack);
16463         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
16464 #ifdef STATS
16465         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
16466                                  len);
16467 #endif
16468         if (rack->r_ctl.fsb.m == NULL)
16469                 goto failed;
16470
16471         /* s_mb and s_soff are saved for rack_log_output */
16472         m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
16473                                     &s_mb, &s_soff);
16474         if (len <= segsiz) {
16475                 /*
16476                  * We must have run out of mbufs for the copy;
16477                  * shorten it so it no longer needs tso. Let's
16478                  * not put on sendalot since we are low on
16479                  * mbufs.
16480                  */
16481                 tso = 0;
16482         }
16483         if (rack->r_ctl.fsb.rfo_apply_push &&
16484             (len == rack->r_ctl.fsb.left_to_send)) {
16485                 th->th_flags |= TH_PUSH;
16486                 add_flag |= RACK_HAD_PUSH;
16487         }
16488         if ((m->m_next == NULL) || (len <= 0)) {
16489                 goto failed;
16490         }
16491         if (udp) {
16492                 if (rack->r_is_v6)
16493                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
16494                 else
16495                         ulen = hdrlen + len - sizeof(struct ip);
16496                 udp->uh_ulen = htons(ulen);
16497         }
16498         m->m_pkthdr.rcvif = (struct ifnet *)0;
16499         if (tp->t_state == TCPS_ESTABLISHED &&
16500             (tp->t_flags2 & TF2_ECN_PERMIT)) {
16501                 /*
16502                  * If the peer has ECN, mark data packets with ECN capable
16503                  * transmission (ECT). Ignore pure ack packets and
16504                  * retransmissions.
16505                  */
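                /*
                 * E.g. a fresh 1448-byte data segment (snd_nxt at
                 * snd_max) gets ECT(0) in the TOS/flow label below,
                 * while a pure ack or a rewound snd_nxt (retransmission)
                 * stays Not-ECT.
                 */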
16506                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
16507 #ifdef INET6
16508                         if (rack->r_is_v6)
16509                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
16510                         else
16511 #endif
16512                                 ip->ip_tos |= IPTOS_ECN_ECT0;
16513                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
16514                         /*
16515                          * Reply with proper ECN notifications.
16516                          * Only set CWR on new data segments.
16517                          */
16518                         if (tp->t_flags2 & TF2_ECN_SND_CWR) {
16519                                 flags |= TH_CWR;
16520                                 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
16521                         }
16522                 }
16523                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
16524                         flags |= TH_ECE;
                /*
                 * th->th_flags was set from flags before this block, so
                 * propagate any CWR/ECE set above into the header
                 * template before it is copied out.
                 */
                th->th_flags = flags;
16525         }
16526         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
16527 #ifdef INET6
16528         if (rack->r_is_v6) {
16529                 if (tp->t_port) {
16530                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
16531                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16532                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
16533                         th->th_sum = htons(0);
16534                         UDPSTAT_INC(udps_opackets);
16535                 } else {
16536                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
16537                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16538                         th->th_sum = in6_cksum_pseudo(ip6,
16539                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
16540                                                       0);
16541                 }
16542         }
16543 #endif
16544 #if defined(INET6) && defined(INET)
16545         else
16546 #endif
16547 #ifdef INET
16548         {
16549                 if (tp->t_port) {
16550                         m->m_pkthdr.csum_flags = CSUM_UDP;
16551                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16552                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
16553                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
16554                         th->th_sum = htons(0);
16555                         UDPSTAT_INC(udps_opackets);
16556                 } else {
16557                         m->m_pkthdr.csum_flags = CSUM_TCP;
16558                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16559                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
16560                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
16561                                                                         IPPROTO_TCP + len + optlen));
16562                 }
16563                 /* IP version must be set here for ipv4/ipv6 checking later */
16564                 KASSERT(ip->ip_v == IPVERSION,
16565                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
16566         }
16567 #endif
16568         if (tso) {
16569                 KASSERT(len > tp->t_maxseg - optlen,
16570                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
16571                 m->m_pkthdr.csum_flags |= CSUM_TSO;
16572                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
16573         }
16574 #ifdef INET6
16575         if (rack->r_is_v6) {
16576                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
16577                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
16578                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
16579                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16580                 else
16581                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16582         }
16583 #endif
16584 #if defined(INET) && defined(INET6)
16585         else
16586 #endif
16587 #ifdef INET
16588         {
16589                 ip->ip_len = htons(m->m_pkthdr.len);
16590                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
16591                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
16592                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16593                         if (tp->t_port == 0 || len < V_tcp_minmss) {
16594                                 ip->ip_off |= htons(IP_DF);
16595                         }
16596                 } else {
16597                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16598                 }
16599         }
16600 #endif
16601         /* Time to copy in our header */
16602         cpto = mtod(m, uint8_t *);
16603         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
16604         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
16605         if (optlen) {
16606                 bcopy(opt, th + 1, optlen);
16607                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
16608         } else {
16609                 th->th_off = sizeof(struct tcphdr) >> 2;
16610         }
16611         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
16612                 union tcp_log_stackspecific log;
16613
16614                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16615                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
16616                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
16617                 if (rack->rack_no_prr)
16618                         log.u_bbr.flex1 = 0;
16619                 else
16620                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16621                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
16622                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
16623                 log.u_bbr.flex4 = max_val;
16624                 log.u_bbr.flex5 = 0;
16625                 /* Save off the early/late values */
16626                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
16627                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
16628                 log.u_bbr.bw_inuse = rack_get_bw(rack);
16629                 log.u_bbr.flex8 = 0;
16630                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
16631                 log.u_bbr.flex7 = 44;
16632                 log.u_bbr.pkts_out = tp->t_maxseg;
16633                 log.u_bbr.timeStamp = cts;
16634                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16635                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
16636                 log.u_bbr.delivered = 0;
16637                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
16638                                      len, &log, false, NULL, NULL, 0, tv);
16639         } else
16640                 lgb = NULL;
16641 #ifdef INET6
16642         if (rack->r_is_v6) {
16643                 error = ip6_output(m, NULL,
16644                                    &inp->inp_route6,
16645                                    0, NULL, NULL, inp);
16646         }
16647 #endif
16648 #if defined(INET) && defined(INET6)
16649         else
16650 #endif
16651 #ifdef INET
16652         {
16653                 error = ip_output(m, NULL,
16654                                   &inp->inp_route,
16655                                   0, 0, inp);
16656         }
16657 #endif
16658         if (lgb) {
16659                 lgb->tlb_errno = error;
16660                 lgb = NULL;
16661         }
16662         if (error) {
16663                 *send_err = error;
16664                 m = NULL;
16665                 goto failed;
16666         }
16667         rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
16668                         NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls);
16669         m = NULL;
16670         if (tp->snd_una == tp->snd_max) {
16671                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
16672                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
16673                 tp->t_acktime = ticks;
16674         }
16675         if (error == 0)
16676                 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
16677
16678         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16679         tot_len += len;
16680         if ((tp->t_flags & TF_GPUTINPROG) == 0)
16681                 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
16682         tp->snd_max += len;
16683         tp->snd_nxt = tp->snd_max;
16684         {
16685                 int idx;
16686
16687                 idx = (len / segsiz) + 3;
16688                 if (idx >= TCP_MSS_ACCT_ATIMER)
16689                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16690                 else
16691                         counter_u64_add(rack_out_size[idx], 1);
16692         }
16693         if (len <= rack->r_ctl.fsb.left_to_send)
16694                 rack->r_ctl.fsb.left_to_send -= len;
16695         else
16696                 rack->r_ctl.fsb.left_to_send = 0;
16697         if (rack->r_ctl.fsb.left_to_send < segsiz) {
16698                 rack->r_fast_output = 0;
16699                 rack->r_ctl.fsb.left_to_send = 0;
16700                 /* At the end of fast_output scale up the sb */
16701                 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
16702                 rack_sndbuf_autoscale(rack);
16703                 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
16704         }
16705         if (tp->t_rtttime == 0) {
16706                 tp->t_rtttime = ticks;
16707                 tp->t_rtseq = startseq;
16708                 KMOD_TCPSTAT_INC(tcps_segstimed);
16709         }
16710         if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
16711             (max_val > len) &&
16712             (tso == 0)) {
16713                 max_val -= len;
16714                 len = segsiz;
16715                 th = rack->r_ctl.fsb.th;
16716                 cnt_thru++;
16717                 goto again;
16718         }
16719         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16720         counter_u64_add(rack_fto_send, 1);
16721         slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
16722         rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
16723 #ifdef TCP_ACCOUNTING
16724         crtsc = get_cyclecount();
16725         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16726                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16727         }
16728         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16729         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16730                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16731         }
16732         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16733         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16734                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
16735         }
16736         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz));
16737         sched_unpin();
16738 #endif
16739         return (0);
16740 failed:
16741         if (m)
16742                 m_free(m);
16743         rack->r_fast_output = 0;
16744         return (-1);
16745 }
16746
16747 static int
16748 rack_output(struct tcpcb *tp)
16749 {
16750         struct socket *so;
16751         uint32_t recwin;
16752         uint32_t sb_offset, s_moff = 0;
16753         int32_t len, flags, error = 0;
16754         struct mbuf *m, *s_mb = NULL;
16755         struct mbuf *mb;
16756         uint32_t if_hw_tsomaxsegcount = 0;
16757         uint32_t if_hw_tsomaxsegsize;
16758         int32_t segsiz, minseg;
16759         long tot_len_this_send = 0;
16760 #ifdef INET
16761         struct ip *ip = NULL;
16762 #endif
16763 #ifdef TCPDEBUG
16764         struct ipovly *ipov = NULL;
16765 #endif
16766         struct udphdr *udp = NULL;
16767         struct tcp_rack *rack;
16768         struct tcphdr *th;
16769         uint8_t pass = 0;
16770         uint8_t mark = 0;
16771         uint8_t wanted_cookie = 0;
16772         u_char opt[TCP_MAXOLEN];
16773         unsigned ipoptlen, optlen, hdrlen, ulen=0;
16774         uint32_t rack_seq;
16775
16776 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
16777         unsigned ipsec_optlen = 0;
16778
16779 #endif
16780         int32_t idle, sendalot;
16781         int32_t sub_from_prr = 0;
16782         volatile int32_t sack_rxmit;
16783         struct rack_sendmap *rsm = NULL;
16784         int32_t tso, mtu;
16785         struct tcpopt to;
16786         int32_t slot = 0;
16787         int32_t sup_rack = 0;
16788         uint32_t cts, ms_cts, delayed, early;
16789         uint16_t add_flag = RACK_SENT_SP;
16790         /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
16791         uint8_t hpts_calling,  doing_tlp = 0;
16792         uint32_t cwnd_to_use, pace_max_seg;
16793         int32_t do_a_prefetch = 0;
16794         int32_t prefetch_rsm = 0;
16795         int32_t orig_len = 0;
16796         struct timeval tv;
16797         int32_t prefetch_so_done = 0;
16798         struct tcp_log_buffer *lgb;
16799         struct inpcb *inp;
16800         struct sockbuf *sb;
16801         uint64_t ts_val = 0;
16802 #ifdef TCP_ACCOUNTING
16803         uint64_t crtsc;
16804 #endif
16805 #ifdef INET6
16806         struct ip6_hdr *ip6 = NULL;
16807         int32_t isipv6;
16808 #endif
16809         uint8_t filled_all = 0;
16810         bool hw_tls = false;
16811
16812         /* setup and take the cache hits here */
16813         rack = (struct tcp_rack *)tp->t_fb_ptr;
16814 #ifdef TCP_ACCOUNTING
16815         sched_pin();
16816         ts_val = get_cyclecount();
16817 #endif
16818         hpts_calling = rack->rc_inp->inp_hpts_calls;
16819         NET_EPOCH_ASSERT();
16820         INP_WLOCK_ASSERT(rack->rc_inp);
16821 #ifdef TCP_OFFLOAD
16822         if (tp->t_flags & TF_TOE) {
16823 #ifdef TCP_ACCOUNTING
16824                 sched_unpin();
16825 #endif
16826                 return (tcp_offload_output(tp));
16827         }
16828 #endif
16829         /*
16830          * For TFO connections in SYN_RECEIVED, only allow the initial
16831          * SYN|ACK and those sent by the retransmit timer.
16832          */
16833         if (IS_FASTOPEN(tp->t_flags) &&
16834             (tp->t_state == TCPS_SYN_RECEIVED) &&
16835             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
16836             (rack->r_ctl.rc_resend == NULL)) {         /* not a retransmit */
16837 #ifdef TCP_ACCOUNTING
16838                 sched_unpin();
16839 #endif
16840                 return (0);
16841         }
16842 #ifdef INET6
16843         if (rack->r_state) {
16844                 /* Use the cache line loaded if possible */
16845                 isipv6 = rack->r_is_v6;
16846         } else {
16847                 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
16848         }
16849 #endif
16850         early = 0;
16851         cts = tcp_get_usecs(&tv);
16852         ms_cts = tcp_tv_to_mssectick(&tv);
16853         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
16854             rack->rc_inp->inp_in_hpts) {
16855                 /*
16856                  * We are on the hpts for some timer but not hptsi output.
16857                  * Remove from the hpts unconditionally.
16858                  */
16859                 rack_timer_cancel(tp, rack, cts, __LINE__);
16860         }
16861         /* Are we pacing and late? */
16862         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16863             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
16864                 /* We are delayed */
16865                 delayed = cts - rack->r_ctl.rc_last_output_to;
16866         } else {
16867                 delayed = 0;
16868         }
16869         /* Do the timers, which may override the pacer */
16870         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
16871                 if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) {
16872                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
16873 #ifdef TCP_ACCOUNTING
16874                         sched_unpin();
16875 #endif
16876                         return (0);
16877                 }
16878         }
16879         if (rack->rc_in_persist) {
16880                 if (rack->rc_inp->inp_in_hpts == 0) {
16881                         /* Timer is not running */
16882                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16883                 }
16884 #ifdef TCP_ACCOUNTING
16885                 sched_unpin();
16886 #endif
16887                 return (0);
16888         }
16889         if ((rack->r_timer_override) ||
16890             (rack->rc_ack_can_sendout_data) ||
16891             (delayed) ||
16892             (tp->t_state < TCPS_ESTABLISHED)) {
16893                 rack->rc_ack_can_sendout_data = 0;
16894                 if (rack->rc_inp->inp_in_hpts)
16895                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
16896         } else if (rack->rc_inp->inp_in_hpts) {
16897                 /*
16898                  * While on the hpts you can't pass, even if ACKNOW is on; we will
16899                  * send when the hpts fires.
16900                  */
16901 #ifdef TCP_ACCOUNTING
16902                 crtsc = get_cyclecount();
16903                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16904                         tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
16905                 }
16906                 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val));
16907                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16908                         tp->tcp_cnt_counters[SND_BLOCKED]++;
16909                 }
16910                 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1);
16911                 sched_unpin();
16912 #endif
16913                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
16914                 return (0);
16915         }
16916         rack->rc_inp->inp_hpts_calls = 0;
16917         /* Finish out both pacing early and late accounting */
16918         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16919             TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
16920                 early = rack->r_ctl.rc_last_output_to - cts;
16921         } else
16922                 early = 0;
16923         if (delayed) {
16924                 rack->r_ctl.rc_agg_delayed += delayed;
16925                 rack->r_late = 1;
16926         } else if (early) {
16927                 rack->r_ctl.rc_agg_early += early;
16928                 rack->r_early = 1;
16929         }
16930         /* Now that early/late accounting is done turn off the flag */
16931         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
16932         rack->r_wanted_output = 0;
16933         rack->r_timer_override = 0;
16934         if ((tp->t_state != rack->r_state) &&
16935             TCPS_HAVEESTABLISHED(tp->t_state)) {
16936                 rack_set_state(tp, rack);
16937         }
16938         if ((rack->r_fast_output) &&
16939             (doing_tlp == 0) &&
16940             (tp->rcv_numsacks == 0)) {
16941                 int ret;
16942
16943                 error = 0;
16944                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
16945                 if (ret >= 0)
16946                         return(ret);
16947                 else if (error) {
16948                         inp = rack->rc_inp;
16949                         so = inp->inp_socket;
16950                         sb = &so->so_snd;
16951                         goto nomore;
16952                 }
16953         }
16954         inp = rack->rc_inp;
16955         /*
16956          * For TFO connections in SYN_SENT or SYN_RECEIVED,
16957          * only allow the initial SYN or SYN|ACK and those sent
16958          * by the retransmit timer.
16959          */
16960         if (IS_FASTOPEN(tp->t_flags) &&
16961             ((tp->t_state == TCPS_SYN_RECEIVED) ||
16962              (tp->t_state == TCPS_SYN_SENT)) &&
16963             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
16964             (tp->t_rxtshift == 0)) {              /* not a retransmit */
16965                 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
16966                 so = inp->inp_socket;
16967                 sb = &so->so_snd;
16968                 goto just_return_nolock;
16969         }
16970         /*
16971          * Determine length of data that should be transmitted, and flags
16972          * that will be used. If there is some data or critical controls
16973          * (SYN, RST) to send, then transmit; otherwise, investigate
16974          * further.
16975          */
16976         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
16977         if (tp->t_idle_reduce) {
16978                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
16979                         rack_cc_after_idle(rack, tp);
16980         }
16981         tp->t_flags &= ~TF_LASTIDLE;
16982         if (idle) {
16983                 if (tp->t_flags & TF_MORETOCOME) {
16984                         tp->t_flags |= TF_LASTIDLE;
16985                         idle = 0;
16986                 }
16987         }
16988         if ((tp->snd_una == tp->snd_max) &&
16989             rack->r_ctl.rc_went_idle_time &&
16990             TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
16991                 idle = cts - rack->r_ctl.rc_went_idle_time;
16992                 if (idle > rack_min_probertt_hold) {
16993                         /* Count as a probe rtt */
16994                         if (rack->in_probe_rtt == 0) {
16995                                 rack->r_ctl.rc_lower_rtt_us_cts = cts;
16996                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
16997                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
16998                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
16999                         } else {
17000                                 rack_exit_probertt(rack, cts);
17001                         }
17002                 }
17003                 idle = 0;
17004         }
17005         if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED))
17006                 rack_init_fsb_block(tp, rack);
17007 again:
17008         /*
17009          * If we've recently taken a timeout, snd_max will be greater than
17010          * snd_nxt.  There may be SACK information that allows us to avoid
17011          * resending already delivered data.  Adjust snd_nxt accordingly.
17012          */
17013         sendalot = 0;
17014         cts = tcp_get_usecs(&tv);
17015         ms_cts = tcp_tv_to_mssectick(&tv);
17016         tso = 0;
17017         mtu = 0;
17018         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
17019         minseg = segsiz;
17020         if (rack->r_ctl.rc_pace_max_segs == 0)
17021                 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
17022         else
17023                 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
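        /*
         * Worked example (illustrative values only): with
         * ctf_fixed_maxseg() == 1460 and rc_pace_min_segs == 1460,
         * segsiz = minseg = 1460; with no rc_pace_max_segs override and
         * rc_user_set_max_segs == 40, pace_max_seg = 40 * 1460 = 58400
         * bytes per pacing burst.
         */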
17024         sb_offset = tp->snd_max - tp->snd_una;
17025         cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
17026         flags = tcp_outflags[tp->t_state];
17027         while (rack->rc_free_cnt < rack_free_cache) {
17028                 rsm = rack_alloc(rack);
17029                 if (rsm == NULL) {
17030                         if (inp->inp_hpts_calls)
17031                                 /* Retry in a ms */
17032                                 slot = (1 * HPTS_USEC_IN_MSEC);
17033                         so = inp->inp_socket;
17034                         sb = &so->so_snd;
17035                         goto just_return_nolock;
17036                 }
17037                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
17038                 rack->rc_free_cnt++;
17039                 rsm = NULL;
17040         }
17041         if (inp->inp_hpts_calls)
17042                 inp->inp_hpts_calls = 0;
17043         sack_rxmit = 0;
17044         len = 0;
17045         rsm = NULL;
17046         if (flags & TH_RST) {
17047                 SOCKBUF_LOCK(&inp->inp_socket->so_snd);
17048                 so = inp->inp_socket;
17049                 sb = &so->so_snd;
17050                 goto send;
17051         }
17052         if (rack->r_ctl.rc_resend) {
17053                 /* Retransmit timer */
17054                 rsm = rack->r_ctl.rc_resend;
17055                 rack->r_ctl.rc_resend = NULL;
17056                 len = rsm->r_end - rsm->r_start;
17057                 sack_rxmit = 1;
17058                 sendalot = 0;
17059                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
17060                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
17061                          __func__, __LINE__,
17062                          rsm->r_start, tp->snd_una, tp, rack, rsm));
17063                 sb_offset = rsm->r_start - tp->snd_una;
17064                 if (len >= segsiz)
17065                         len = segsiz;
17066         } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
17067                 /* We have a retransmit that takes precedence */
17068                 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
17069                     ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
17070                         /* Enter recovery if not induced by a time-out */
17071                         rack->r_ctl.rc_rsm_start = rsm->r_start;
17072                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
17073                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
17074                         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
17075                 }
17076 #ifdef INVARIANTS
17077                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
17078                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
17079                               tp, rack, rsm, rsm->r_start, tp->snd_una);
17080                 }
17081 #endif
17082                 len = rsm->r_end - rsm->r_start;
17083                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
17084                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
17085                          __func__, __LINE__,
17086                          rsm->r_start, tp->snd_una, tp, rack, rsm));
17087                 sb_offset = rsm->r_start - tp->snd_una;
17088                 sendalot = 0;
17089                 if (len >= segsiz)
17090                         len = segsiz;
17091                 if (len > 0) {
17092                         sack_rxmit = 1;
17093                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
17094                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
17095                             min(len, segsiz));
17096                         counter_u64_add(rack_rtm_prr_retran, 1);
17097                 }
17098         } else if (rack->r_ctl.rc_tlpsend) {
17099                 /* Tail loss probe */
17100                 long cwin;
17101                 long tlen;
17102
17103                 /*
17104                  * Check if we can do a TLP with a RACK'd packet.
17105                  * This can happen if we are not doing the rack
17106                  * cheat and we skipped to a TLP and it
17107                  * went off.
17108                  */
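                /*
                 * Example of the clamping below (hypothetical sizes): an
                 * rsm covering 4380 bytes is first clamped to one segment
                 * (tlen = segsiz = 1460) and then to the peer's window,
                 * len = cwin = min(tp->snd_wnd, tlen).
                 */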
17109                 rsm = rack->r_ctl.rc_tlpsend;
17110                 /* We are doing a TLP; make sure the flag is present */
17111                 rsm->r_flags |= RACK_TLP;
17112                 rack->r_ctl.rc_tlpsend = NULL;
17113                 sack_rxmit = 1;
17114                 tlen = rsm->r_end - rsm->r_start;
17115                 if (tlen > segsiz)
17116                         tlen = segsiz;
17117                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
17118                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
17119                          __func__, __LINE__,
17120                          rsm->r_start, tp->snd_una, tp, rack, rsm));
17121                 sb_offset = rsm->r_start - tp->snd_una;
17122                 cwin = min(tp->snd_wnd, tlen);
17123                 len = cwin;
17124         }
17125         if (rack->r_must_retran &&
17126             (rsm == NULL)) {
17127                 /*
17128                  * Either non-SACK and we had an RTO, or SACK/non-SACK
17129                  * and an MTU change; we need to retransmit until we
17130                  * reach the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
17131                  */
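                /*
                 * E.g. (hypothetical numbers): rc_snd_max_at_rto == 50000
                 * with snd_una == 30000 means the 20000 bytes in between
                 * are re-walked via rc_tmap, one segsiz chunk per pass,
                 * as long as flight stays under min(snd_wnd, snd_cwnd).
                 */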
17132                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
17133                         int sendwin, flight;
17134
17135                         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
17136                         flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
17137                         if (flight >= sendwin) {
17138                                 so = inp->inp_socket;
17139                                 sb = &so->so_snd;
17140                                 goto just_return_nolock;
17141                         }
17142                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
17143                         if (rsm == NULL) {
17144                                 /* TSNH -- this should never happen */
17145                                 rack->r_must_retran = 0;
17146                                 rack->r_ctl.rc_out_at_rto = 0;
17148                                 so = inp->inp_socket;
17149                                 sb = &so->so_snd;
17150                                 goto just_return_nolock;
17151                         }
17152                         if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
17153                                 /* It does not have the flag, we are done */
17154                                 rack->r_must_retran = 0;
17155                                 rack->r_ctl.rc_out_at_rto = 0;
17156                         } else {
17157                                 sack_rxmit = 1;
17158                                 len = rsm->r_end - rsm->r_start;
17159                                 sendalot = 0;
17160                                 sb_offset = rsm->r_start - tp->snd_una;
17161                                 if (len >= segsiz)
17162                                         len = segsiz;
17163                                 /*
17164                                  * Delay removing the RACK_MUST_RXT flag so
17165                                  * that the retransmit fastpath will
17166                                  * work with this rsm.
17167                                  */
17169                         }
17170                 } else {
17171                         /* We must be done if there is nothing outstanding */
17172                         rack->r_must_retran = 0;
17173                         rack->r_ctl.rc_out_at_rto = 0;
17174                 }
17175         }
17176         /*
17177          * Enforce a connection sendmap count limit if set
17178          * as long as we are not retransmitting.
17179          */
17180         if ((rsm == NULL) &&
17181             (rack->do_detection == 0) &&
17182             (V_tcp_map_entries_limit > 0) &&
17183             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
17184                 counter_u64_add(rack_to_alloc_limited, 1);
17185                 if (!rack->alloc_limit_reported) {
17186                         rack->alloc_limit_reported = 1;
17187                         counter_u64_add(rack_alloc_limited_conns, 1);
17188                 }
17189                 so = inp->inp_socket;
17190                 sb = &so->so_snd;
17191                 goto just_return_nolock;
17192         }
17193         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
17194                 /* we are retransmitting the fin */
17195                 len--;
17196                 if (len) {
17197                         /*
17198                          * When retransmitting data do *not* include the
17199                          * FIN. This could happen from a TLP probe.
17200                          */
17201                         flags &= ~TH_FIN;
17202                 }
17203         }
17204 #ifdef INVARIANTS
17205         /* For debugging */
17206         rack->r_ctl.rc_rsm_at_retran = rsm;
17207 #endif
17208         if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
17209             ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
17210                 int ret;
17211
17212                 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
17213                 if (ret == 0)
17214                         return (0);
17215         }
17216         if (rsm && (rsm->r_flags & RACK_MUST_RXT)) {
17217                 /*
17218                  * Clear the flag in prep for the send.
17219                  * Note that if we can't get an mbuf
17220                  * and fail, we won't retransmit this
17221                  * rsm, but that should be OK (it's rare).
17222                  */
17223                 rsm->r_flags &= ~RACK_MUST_RXT;
17224         }
17225         so = inp->inp_socket;
17226         sb = &so->so_snd;
17227         if (do_a_prefetch == 0) {
17228                 kern_prefetch(sb, &do_a_prefetch);
17229                 do_a_prefetch = 1;
17230         }
17231 #ifdef NETFLIX_SHARED_CWND
17232         if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
17233             rack->rack_enable_scwnd) {
17234                 /* We are doing cwnd sharing */
17235                 if (rack->gp_ready &&
17236                     (rack->rack_attempted_scwnd == 0) &&
17237                     (rack->r_ctl.rc_scw == NULL) &&
17238                     tp->t_lib) {
17239                         /* The pcbid is in, let's make an attempt */
17240                         counter_u64_add(rack_try_scwnd, 1);
17241                         rack->rack_attempted_scwnd = 1;
17242                         rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
17243                                                                    &rack->r_ctl.rc_scw_index,
17244                                                                    segsiz);
17245                 }
17246                 if (rack->r_ctl.rc_scw &&
17247                     (rack->rack_scwnd_is_idle == 1) &&
17248                     sbavail(&so->so_snd)) {
17249                         /* we are no longer out of data */
17250                         tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
17251                         rack->rack_scwnd_is_idle = 0;
17252                 }
17253                 if (rack->r_ctl.rc_scw) {
17254                         /* First let's update and get the cwnd */
17255                         rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
17256                                                                     rack->r_ctl.rc_scw_index,
17257                                                                     tp->snd_cwnd, tp->snd_wnd, segsiz);
17258                 }
17259         }
17260 #endif
17261         /*
17262          * Get standard flags, and add SYN or FIN if requested by 'hidden'
17263          * state flags.
17264          */
17265         if (tp->t_flags & TF_NEEDFIN)
17266                 flags |= TH_FIN;
17267         if (tp->t_flags & TF_NEEDSYN)
17268                 flags |= TH_SYN;
17269         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
17270                 void *end_rsm;
17271                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
17272                 if (end_rsm)
17273                         kern_prefetch(end_rsm, &prefetch_rsm);
17274                 prefetch_rsm = 1;
17275         }
17276         SOCKBUF_LOCK(sb);
17277         /*
17278          * If snd_nxt == snd_max and we have transmitted a FIN, the
17279          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
17280          * negative length.  This can also occur when TCP opens up its
17281          * congestion window while receiving additional duplicate acks after
17282          * fast-retransmit because TCP will reset snd_nxt to snd_max after
17283          * the fast-retransmit.
17284          *
17285          * In the normal retransmit-FIN-only case, however, snd_nxt will be
17286          * set to snd_una, the sb_offset will be 0, and the length may wind
17287          * up 0.
17288          *
17289          * If sack_rxmit is true we are retransmitting from the scoreboard
17290          * in which case len is already set.
17291          */
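        /*
         * Concrete instance of the warning above (hypothetical numbers):
         * with so_snd empty and a sent-but-unacked FIN, snd_max ==
         * snd_una + 1, so sb_offset would be 1 while sbavail(sb) == 0,
         * and a naive "avail - sb_offset" would go negative.
         */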
17292         if ((sack_rxmit == 0) &&
17293             (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
17294                 uint32_t avail;
17295
17296                 avail = sbavail(sb);
17297                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
17298                         sb_offset = tp->snd_nxt - tp->snd_una;
17299                 else
17300                         sb_offset = 0;
17301                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
17302                         if (rack->r_ctl.rc_tlp_new_data) {
17303                                 /* TLP is forcing out new data */
17304                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
17305                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
17306                                 }
17307                                 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
17308                                         if (tp->snd_wnd > sb_offset)
17309                                                 len = tp->snd_wnd - sb_offset;
17310                                         else
17311                                                 len = 0;
17312                                 } else {
17313                                         len = rack->r_ctl.rc_tlp_new_data;
17314                                 }
17315                         } else {
17316                                 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
17317                         }
17318                         if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
17319                                 /*
17320                                  * For prr=off, we need to send only 1 MSS
17321                                  * at a time. We do this because another sack could
17322                                  * be arriving that causes us to send retransmits and
17323                                  * we don't want to be on a long pace due to a larger send
17324                                  * that keeps us from sending out the retransmit.
17325                                  */
17326                                 len = segsiz;
17327                         }
17328                 } else {
17329                         uint32_t outstanding;
17330                         /*
17331                          * We are inside of a fast recovery episode; this
17332                          * is caused by a SACK or 3 dup acks. At this point
17333                          * we have sent all the retransmissions and we rely
17334                          * on PRR to dictate what we will send in the form of
17335                          * new data.
17336                          */
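                        /*
                         * PRR arithmetic sketch (hypothetical numbers): with
                         * snd_wnd == 65535, outstanding == 64000 and
                         * rc_prr_sndcnt == 2000, the window allows
                         * len = 65535 - 64000 = 1535; len stays under
                         * rc_prr_sndcnt but exceeds segsiz == 1460, so
                         * without rc_prr_sendalot it is cut back to a
                         * single segment.
                         */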
17337
17338                         outstanding = tp->snd_max - tp->snd_una;
17339                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
17340                                 if (tp->snd_wnd > outstanding) {
17341                                         len = tp->snd_wnd - outstanding;
17342                                         /* Check to see if we have the data */
17343                                         if ((sb_offset + len) > avail) {
17344                                                 /* It does not all fit */
17345                                                 if (avail > sb_offset)
17346                                                         len = avail - sb_offset;
17347                                                 else
17348                                                         len = 0;
17349                                         }
17350                                 } else {
17351                                         len = 0;
17352                                 }
17353                         } else if (avail > sb_offset) {
17354                                 len = avail - sb_offset;
17355                         } else {
17356                                 len = 0;
17357                         }
17358                         if (len > 0) {
17359                                 if (len > rack->r_ctl.rc_prr_sndcnt) {
17360                                         len = rack->r_ctl.rc_prr_sndcnt;
17361                                 }
17362                                 if (len > 0) {
17363                                         sub_from_prr = 1;
17364                                         counter_u64_add(rack_rtm_prr_newdata, 1);
17365                                 }
17366                         }
17367                         if (len > segsiz) {
17368                                 /*
17369                                  * We should never send more than a MSS when
17370                                  * retransmitting or sending new data in prr
17371                                  * mode unless the override flag is on. Most
17372                                  * likely the PRR algorithm is not going to
17373                                  * let us send a lot as well :-)
17374                                  */
17375                                 if (rack->r_ctl.rc_prr_sendalot == 0) {
17376                                         len = segsiz;
17377                                 }
17378                         } else if (len < segsiz) {
17379                                 /*
17380                                  * Do we send any? The idea here is if the
17381                                  * send empties the socket buffer we want to
17382                                  * do it. However, if not, then let's just wait
17383                                  * for our prr_sndcnt to get bigger.
17384                                  */
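                                /*
                                 * E.g. (hypothetical): 5000 bytes in the sb
                                 * with sb_offset == 4200 and len == 800
                                 * gives leftinsb == 800, the send drains
                                 * the buffer and goes out; with 6000 bytes
                                 * queued leftinsb == 1800 > len, so len is
                                 * zeroed and we wait for more prr_sndcnt.
                                 */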
17385                                 long leftinsb;
17386
17387                                 leftinsb = sbavail(sb) - sb_offset;
17388                                 if (leftinsb > len) {
17389                                         /* This send does not empty the sb */
17390                                         len = 0;
17391                                 }
17392                         }
17393                 }
17394         } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
17395                 /*
17396                  * If the connection is not established
17397                  * and we are not doing fast open,
17398                  * send no data.
17399                  */
17400                 if ((sack_rxmit == 0) &&
17401                     (!IS_FASTOPEN(tp->t_flags))){
17402                         len = 0;
17403                         sb_offset = 0;
17404                 }
17405         }
17406         if (prefetch_so_done == 0) {
17407                 kern_prefetch(so, &prefetch_so_done);
17408                 prefetch_so_done = 1;
17409         }
17410         /*
17411          * Lop off SYN bit if it has already been sent.  However, if this is
17412          * SYN-SENT state and if segment contains data and if we don't know
17413          * that foreign host supports TAO, suppress sending segment.
17414          */
17415         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
17416             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
17417                 /*
17418                  * When sending additional segments following a TFO SYN|ACK,
17419                  * do not include the SYN bit.
17420                  */
17421                 if (IS_FASTOPEN(tp->t_flags) &&
17422                     (tp->t_state == TCPS_SYN_RECEIVED))
17423                         flags &= ~TH_SYN;
17424         }
17425         /*
17426          * Be careful not to send data and/or FIN on SYN segments. This
17427          * measure is needed to prevent interoperability problems with not
17428          * fully conformant TCP implementations.
17429          */
17430         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
17431                 len = 0;
17432                 flags &= ~TH_FIN;
17433         }
17434         /*
17435          * On TFO sockets, ensure no data is sent in the following cases:
17436          *
17437          *  - When retransmitting SYN|ACK on a passively-created socket
17438          *
17439          *  - When retransmitting SYN on an actively created socket
17440          *
17441          *  - When sending a zero-length cookie (cookie request) on an
17442          *    actively created socket
17443          *
17444          *  - When the socket is in the CLOSED state (RST is being sent)
17445          */
17446         if (IS_FASTOPEN(tp->t_flags) &&
17447             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
17448              ((tp->t_state == TCPS_SYN_SENT) &&
17449               (tp->t_tfo_client_cookie_len == 0)) ||
17450              (flags & TH_RST))) {
17451                 sack_rxmit = 0;
17452                 len = 0;
17453         }
17454         /* Without fast-open there should never be data sent on a SYN */
17455         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
17456                 tp->snd_nxt = tp->iss;
17457                 len = 0;
17458         }
17459         if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
17460                 /* We only send 1 MSS if we have a DSACK block */
17461                 add_flag |= RACK_SENT_W_DSACK;
17462                 len = segsiz;
17463         }
17464         orig_len = len;
17465         if (len <= 0) {
17466                 /*
17467                  * If FIN has been sent but not acked, but we haven't been
17468                  * called to retransmit, len will be < 0.  Otherwise, window
17469                  * shrank after we sent into it.  If window shrank to 0,
17470                  * cancel pending retransmit, pull snd_nxt back to (closed)
17471                  * window, and set the persist timer if it isn't already
17472                  * going.  If the window didn't close completely, just wait
17473                  * for an ACK.
17474                  *
17475                  * We also do a general check here to ensure that we will
17476                  * set the persist timer when we have data to send, but a
17477                  * 0-byte window. This makes sure the persist timer is set
17478                  * even if the packet hits one of the "goto send" lines
17479                  * below.
17480                  */
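                /*
                 * E.g. (hypothetical): the peer shrinks snd_wnd to 0 while
                 * 4000 bytes sit unsent in the sb and nothing is in flight
                 * (snd_una == snd_max); we arm the persist timer here so a
                 * zero-window probe eventually goes out even if one of the
                 * "goto send" paths below is taken.
                 */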
17481                 len = 0;
17482                 if ((tp->snd_wnd == 0) &&
17483                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17484                     (tp->snd_una == tp->snd_max) &&
17485                     (sb_offset < (int)sbavail(sb))) {
17486                         rack_enter_persist(tp, rack, cts);
17487                 }
17488         } else if ((rsm == NULL) &&
17489                    (doing_tlp == 0) &&
17490                    (len < pace_max_seg)) {
17491                 /*
17492                  * We are not sending a maximum sized segment for
17493                  * some reason. Should we not send anything (think
17494                  * sws or persists)?
17495                  */
17496                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17497                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17498                     (len < minseg) &&
17499                     (len < (int)(sbavail(sb) - sb_offset))) {
17500                         /*
17501                          * Here the rwnd is less than
17502                          * the minimum pacing size, this is not a retransmit,
17503                          * we are established and
17504                          * the send is not the last in the socket buffer:
17505                          * we send nothing, and we may enter persists
17506                          * if nothing is outstanding.
17507                          */
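                        /*
                         * E.g. (hypothetical, minseg == 1460):
                         * rc_high_rwnd == 40000 gives a floor of
                         * min(20000, 1460) == 1460, so a peer advertising
                         * snd_wnd == 1000 trips this test and the sub-MSS
                         * dribble is held back.
                         */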
17508                         len = 0;
17509                         if (tp->snd_max == tp->snd_una) {
17510                                 /*
17511                                  * Nothing is outstanding; we can
17512                                  * go into persists.
17513                                  */
17514                                 rack_enter_persist(tp, rack, cts);
17515                         }
17516                 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
17517                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17518                            (len < (int)(sbavail(sb) - sb_offset)) &&
17519                            (len < minseg)) {
17520                         /*
17521                          * Here we are not retransmitting, and
17522                          * the cwnd is not so small that we could
17523                          * not send at least a min size (rxt timer
17524                          * not having gone off), we have 2 segments or
17525                          * more already in flight, it's not the tail end
17526                          * of the socket buffer and the cwnd is blocking
17527                          * us from sending out a minimum pacing segment size.
17528                          * Let's not send anything.
17529                          */
17530                         len = 0;
17531                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
17532                             min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17533                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17534                            (len < (int)(sbavail(sb) - sb_offset)) &&
17535                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
17536                         /*
17537                          * Here we have a send window but we have
17538                          * filled it up and we can't send another pacing segment.
17539                          * We also have in flight more than 2 segments
17540                          * and we are not completing the sb, i.e. we allow
17541                          * the last bytes of the sb to go out even if
17542                          * it's not a full pacing segment.
17543                          */
17544                         len = 0;
17545                 } else if ((rack->r_ctl.crte != NULL) &&
17546                            (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
17547                            (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
17548                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
17549                            (len < (int)(sbavail(sb) - sb_offset))) {
17550                         /*
17551                          * Here we are doing hardware pacing, this is not a TLP,
17552                          * we are not sending a pace max segment size, there is rwnd
17553                          * room to send at least N pace_max_seg, the cwnd is greater
17554                          * than or equal to a full pacing segment plus 4 mss and we have 2 or
17555                          * more segments in flight and it's not the tail of the socket buffer.
17556                          *
17557                          * We don't want to send; instead we need to get more acks in to
17558                          * allow us to send a full pacing segment. Normally, if we are pacing
17559                          * about the right speed, we should have finished our pacing
17560                          * send as most of the acks have come back if we are at the
17561                          * right rate. This is a bit fuzzy since return path delay
17562                          * can delay the acks, which is why we want to make sure we
17563                          * have cwnd space to have a bit more than a max pace segment in flight.
17564                          *
17565                          * If we have not gotten our acks back, we are pacing at too high a
17566                          * rate; delaying will not hurt and will bring our GP estimate down by
17567                          * injecting the delay. If we don't do this we will send
17568                          * 2 MSS out in response to the acks being clocked in which
17569                          * defeats the point of hw-pacing (i.e. to help us get
17570                          * larger TSO's out).
17571                          */
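                        /*
                         * Gating sketch (hypothetical numbers): with
                         * pace_max_seg == 14600, rack_hw_rwnd_factor == 2
                         * and segsiz == 1460, the tests above require
                         * snd_wnd >= 29200 and cwnd_to_use >=
                         * 14600 + 4 * 1460 == 20440 before a short
                         * leftover send is deferred to wait for more acks.
                         */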
17572                         len = 0;
17573
17574                 }
17575
17576         }
17577         /* len will be >= 0 after this point. */
17578         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
17579         rack_sndbuf_autoscale(rack);
17580         /*
17581          * Decide if we can use TCP Segmentation Offloading (if supported by
17582          * hardware).
17583          *
17584          * TSO may only be used if we are in a pure bulk sending state.  The
17585          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
17586          * options prevent using TSO.  With TSO the TCP header is the same
17587          * (except for the sequence number) for all generated packets.  This
17588          * makes it impossible to transmit any options which vary per
17589          * generated segment or packet.
17590          *
17591          * IPv4 handling has a clear separation of ip options and ip header
17592          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
17593          * the right thing below to provide length of just ip options and thus
17594          * checking for ipoptlen is enough to decide if ip options are present.
17595          */
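        /*
         * E.g. (hypothetical): a 43800 byte send (30 full segments) with
         * no IP options, no TCP-MD5 signature and no SACK state qualifies
         * for TSO below; a single SACK retransmit (sack_rxmit == 1) drops
         * us back to segsiz-sized sends.
         */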
17596         ipoptlen = 0;
17597 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17598         /*
17599          * Pre-calculate here as we save another lookup into the darknesses
17600          * of IPsec that way and can actually decide if TSO is ok.
17601          */
17602 #ifdef INET6
17603         if (isipv6 && IPSEC_ENABLED(ipv6))
17604                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
17605 #ifdef INET
17606         else
17607 #endif
17608 #endif                          /* INET6 */
17609 #ifdef INET
17610                 if (IPSEC_ENABLED(ipv4))
17611                         ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
17612 #endif                          /* INET */
17613 #endif
17614
17615 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17616         ipoptlen += ipsec_optlen;
17617 #endif
17618         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
17619             (tp->t_port == 0) &&
17620             ((tp->t_flags & TF_SIGNATURE) == 0) &&
17621             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
17622             ipoptlen == 0)
17623                 tso = 1;
17624         {
17625                 uint32_t outstanding;
17626
17627                 outstanding = tp->snd_max - tp->snd_una;
17628                 if (tp->t_flags & TF_SENTFIN) {
17629                         /*
17630                          * If we sent a fin, snd_max is 1 higher than
17631                          * snd_una
17632                          */
17633                         outstanding--;
17634                 }
17635                 if (sack_rxmit) {
17636                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
17637                                 flags &= ~TH_FIN;
17638                 } else {
17639                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
17640                                    sbused(sb)))
17641                                 flags &= ~TH_FIN;
17642                 }
17643         }
17644         recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
17645             (long)TCP_MAXWIN << tp->rcv_scale);
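        /*
         * E.g. (hypothetical): sbspace(&so->so_rcv) == 200000 with
         * rcv_scale == 1 is capped at TCP_MAXWIN << 1 == 131070, while a
         * negative sbspace() (an over-committed buffer) clamps to 0.
         */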
17646
17647         /*
17648          * Sender silly window avoidance.  We transmit under the following
17649          * conditions when len is non-zero:
17650          *
17651          * - We have a full segment (or more with TSO)
17652          * - This is the last buffer in a write()/send() and we are either
17653          *   idle or running NODELAY
17654          * - We've timed out (e.g. persist timer)
17655          * - We have more than 1/2 the maximum send window's worth of data
17656          *   (the receiver may be limiting the window size)
              * - We need to retransmit
              */
17657         if (len) {
17658                 if (len >= segsiz) {
17659                         goto send;
17660                 }
17661                 /*
17662                  * NOTE! On localhost connections an 'ack' from the remote
17663                  * end may occur synchronously with the output and cause us
17664                  * to flush a buffer queued with moretocome.  XXX
17665                  */
17667                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
17668                     (idle || (tp->t_flags & TF_NODELAY)) &&
17669                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17670                     (tp->t_flags & TF_NOPUSH) == 0) {
17671                         pass = 2;
17672                         goto send;
17673                 }
17674                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
17675                         pass = 22;
17676                         goto send;
17677                 }
17678                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
17679                         pass = 4;
17680                         goto send;
17681                 }
17682                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
17683                         pass = 5;
17684                         goto send;
17685                 }
17686                 if (sack_rxmit) {
17687                         pass = 6;
17688                         goto send;
17689                 }
17690                 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
17691                     (ctf_outstanding(tp) < (segsiz * 2))) {
17692                         /*
17693                          * We have less than two MSS outstanding (delayed ack)
17694                          * and our rwnd will not let us send a full sized
17695                          * MSS. Let's go ahead and let this small segment
17696                          * out because we want to try to have at least two
17697                          * packets inflight to not be caught by delayed ack.
17698                          */
17699                         pass = 12;
17700                         goto send;
17701                 }
17702         }
17703         /*
17704          * Sending of standalone window updates.
17705          *
17706          * Window updates are important when we close our window due to a
17707          * full socket buffer and are opening it again after the application
17708          * reads data from it.  Once the window has opened again and the
17709          * remote end starts to send again the ACK clock takes over and
17710          * provides the most current window information.
17711          *
17712          * We must avoid the silly window syndrome where every read from
17713          * the receive buffer, no matter how small, causes a window update
17714          * to be sent.  We also should avoid sending a flurry of window
17715          * updates when the socket buffer had queued a lot of data and the
17716          * application is doing small reads.
17717          *
17718          * Prevent a flurry of pointless window updates by only sending an
17719          * update when we can increase the advertised window by more than
17720          * 1/4th of the socket buffer capacity.  When the buffer is getting
17721          * full or is very small be more aggressive and send an update
17722          * whenever we can increase by two mss sized segments. In all other
17723          * situations the ACK's to new incoming data will carry further
17724          * window increases.
17725          *
17726          * Don't send an independent window update if a delayed ACK is
17727          * pending (it will get piggy-backed on it) or the remote side
17728          * already has done a half-close and won't send more data.  Skip
17729          * this if the connection is in T/TCP half-open state.
17730          */
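        /*
         * Threshold sketch for the update test below (hypothetical
         * numbers): with sb_hiwat == 65536 and segsiz == 1460 an update
         * goes out only once adv >= 2 * 1460 == 2920 and either
         * adv >= 16384 (a quarter of the buffer), recwin <= 8192 (buffer
         * getting full), or sb_hiwat <= 8 * 1460 == 11680 (a tiny buffer).
         */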
17731         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
17732             !(tp->t_flags & TF_DELACK) &&
17733             !TCPS_HAVERCVDFIN(tp->t_state)) {
17734                 /*
17735                  * "adv" is the amount we could increase the window, taking
17736                  * into account that we are limited by TCP_MAXWIN <<
17737                  * tp->rcv_scale.
17738                  */
17739                 int32_t adv;
17740                 int oldwin;
17741
17742                 adv = recwin;
17743                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
17744                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
17745                         if (adv > oldwin) {
17746                                 adv -= oldwin;
17747                         } else {
17748                                 /* We can't increase the window */
17749                                 adv = 0;
17750                         }
17751                 } else
17752                         oldwin = 0;
17753
17754                 /*
17755                  * If the new window size ends up being the same as or less
17756                  * than the old size when it is scaled, then don't force
17757                  * a window update.
17758                  */
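                /*
                 * E.g. (hypothetical): with rcv_scale == 7 the window is
                 * advertised in 128-byte units, so oldwin == 12800 and
                 * adv == 100 both scale to the same wire value (100) and
                 * are not worth a standalone update.
                 */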
17759                 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
17760                         goto dontupdate;
17761
17762                 if (adv >= (int32_t)(2 * segsiz) &&
17763                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
17764                      recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
17765                      so->so_rcv.sb_hiwat <= 8 * segsiz)) {
17766                         pass = 7;
17767                         goto send;
17768                 }
17769                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
17770                         pass = 23;
17771                         goto send;
17772                 }
17773         }
17774 dontupdate:
17775
17776         /*
17777          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
17778          * is also a catch-all for the retransmit timer timeout case.
17779          */
17780         if (tp->t_flags & TF_ACKNOW) {
17781                 pass = 8;
17782                 goto send;
17783         }
17784         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
17785                 pass = 9;
17786                 goto send;
17787         }
17788         /*
17789          * If our state indicates that FIN should be sent and we have not
17790          * yet done so, then we need to send.
17791          */
17792         if ((flags & TH_FIN) &&
17793             (tp->snd_nxt == tp->snd_una)) {
17794                 pass = 11;
17795                 goto send;
17796         }
17797         /*
17798          * No reason to send a segment, just return.
17799          */
17800 just_return:
17801         SOCKBUF_UNLOCK(sb);
17802 just_return_nolock:
17803         {
17804                 int app_limited = CTF_JR_SENT_DATA;
17805
17806                 if (tot_len_this_send > 0) {
17808                         rack->r_ctl.fsb.recwin = recwin;
17809                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
17810                         if ((error == 0) &&
17811                             rack_use_rfo &&
17812                             ((flags & (TH_SYN|TH_FIN)) == 0) &&
17813                             (ipoptlen == 0) &&
17814                             (tp->snd_nxt == tp->snd_max) &&
17815                             (tp->rcv_numsacks == 0) &&
17816                             rack->r_fsb_inited &&
17817                             TCPS_HAVEESTABLISHED(tp->t_state) &&
17818                             (rack->r_must_retran == 0) &&
17819                             ((tp->t_flags & TF_NEEDFIN) == 0) &&
17820                             (len > 0) && (orig_len > 0) &&
17821                             (orig_len > len) &&
17822                             ((orig_len - len) >= segsiz) &&
17823                             ((optlen == 0) ||
17824                              ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
17825                                 /* We can send at least one more MSS using our fsb */
17826
17827                                 rack->r_fast_output = 1;
17828                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
17829                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
17830                                 rack->r_ctl.fsb.tcp_flags = flags;
17831                                 rack->r_ctl.fsb.left_to_send = orig_len - len;
17832                                 if (hw_tls)
17833                                         rack->r_ctl.fsb.hw_tls = 1;
17834                                 else
17835                                         rack->r_ctl.fsb.hw_tls = 0;
17836                                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
17837                                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
17838                                         rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
17839                                          (tp->snd_max - tp->snd_una)));
17840                                 if (rack->r_ctl.fsb.left_to_send < segsiz)
17841                                         rack->r_fast_output = 0;
17842                                 else {
17843                                         if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
17844                                                 rack->r_ctl.fsb.rfo_apply_push = 1;
17845                                         else
17846                                                 rack->r_ctl.fsb.rfo_apply_push = 0;
17847                                 }
17848                         } else
17849                                 rack->r_fast_output = 0;
17850
17851
17852                         rack_log_fsb(rack, tp, so, flags,
17853                                      ipoptlen, orig_len, len, 0,
17854                                      1, optlen, __LINE__, 1);
                        /* Make sure snd_nxt is up to max */
17855                         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
17856                                 tp->snd_nxt = tp->snd_max;
17857                 } else {
17858                         int end_window = 0;
17859                         uint32_t seq = tp->gput_ack;
17860
17861                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17862                         if (rsm) {
17863                                 /*
17864                                  * Mark the last rsm sent as just-returned (hinting
17865                                  * that delayed ack may play a role in any rtt measurement).
17866                                  */
17867                                 rsm->r_just_ret = 1;
17868                         }
17869                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
17870                         rack->r_ctl.rc_agg_delayed = 0;
17871                         rack->r_early = 0;
17872                         rack->r_late = 0;
17873                         rack->r_ctl.rc_agg_early = 0;
17874                         if ((ctf_outstanding(tp) +
17875                              min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
17876                                  minseg)) >= tp->snd_wnd) {
17877                                 /* We are limited by the rwnd */
17878                                 app_limited = CTF_JR_RWND_LIMITED;
17879                                 if (IN_FASTRECOVERY(tp->t_flags))
17880                                     rack->r_ctl.rc_prr_sndcnt = 0;
17881                         } else if (ctf_outstanding(tp) >= sbavail(sb)) {
17882                                 /* We are limited by what's available -- app limited */
17883                                 app_limited = CTF_JR_APP_LIMITED;
17884                                 if (IN_FASTRECOVERY(tp->t_flags))
17885                                     rack->r_ctl.rc_prr_sndcnt = 0;
17886                         } else if ((idle == 0) &&
17887                                    ((tp->t_flags & TF_NODELAY) == 0) &&
17888                                    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17889                                    (len < segsiz)) {
17890                                 /*
17891                                  * TF_NODELAY is not on and the
17892                                  * user is sending less than 1 MSS. This
17893                                  * brings out SWS avoidance so we
17894                                  * don't send. Another app-limited case.
17895                                  */
17896                                 app_limited = CTF_JR_APP_LIMITED;
17897                         } else if (tp->t_flags & TF_NOPUSH) {
17898                                 /*
17899                                  * The user has requested no push of
17900                                  * the last segment and we are
17901                                  * at the last segment. Another app
17902                                  * limited case.
17903                                  */
17904                                 app_limited = CTF_JR_APP_LIMITED;
17905                         } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
17906                                 /* It's the cwnd */
17907                                 app_limited = CTF_JR_CWND_LIMITED;
17908                         } else if (IN_FASTRECOVERY(tp->t_flags) &&
17909                                    (rack->rack_no_prr == 0) &&
17910                                    (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
17911                                 app_limited = CTF_JR_PRR;
17912                         } else {
17913                                 /* Now, why are we not sending? */
17914 #ifdef NOW
17915 #ifdef INVARIANTS
17916                                 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
17917 #endif
17918 #endif
17919                                 app_limited = CTF_JR_ASSESSING;
17920                         }
17921                         /*
17922                          * App limited in some fashion; for our pacing GP
17923                          * measurements we don't want any gap (even cwnd).
17924                          * Close down the measurement window.
17925                          */
17926                         if (rack_cwnd_block_ends_measure &&
17927                             ((app_limited == CTF_JR_CWND_LIMITED) ||
17928                              (app_limited == CTF_JR_PRR))) {
17929                                 /*
17930                                  * The reason we are not sending is
17931                                  * the cwnd (or prr). We have been configured
17932                                  * to end the measurement window in
17933                                  * this case.
17934                                  */
17935                                 end_window = 1;
17936                         } else if (rack_rwnd_block_ends_measure &&
17937                                    (app_limited == CTF_JR_RWND_LIMITED)) {
17938                                 /*
17939                                  * We are rwnd limited and have been
17940                                  * configured to end the measurement
17941                                  * window in this case.
17942                                  */
17943                                 end_window = 1;
17944                         } else if (app_limited == CTF_JR_APP_LIMITED) {
17945                                 /*
17946                                  * A true application limited period; we have
17947                                  * run out of data.
17948                                  */
17949                                 end_window = 1;
17950                         } else if (app_limited == CTF_JR_ASSESSING) {
17951                                 /*
17952                                  * In the assessing case we hit the end of
17953                                  * the if/else and had no known reason.
17954                                  * This will panic us under invariants.
17955                                  *
17956                                  * If we see this in logs we need to
17957                                  * investigate which reason we missed.
17958                                  */
17959                                 end_window = 1;
17960                         }
17961                         if (end_window) {
17962                                 uint8_t log = 0;
17963
17964                                 /* Adjust the Gput measurement */
17965                                 if ((tp->t_flags & TF_GPUTINPROG) &&
17966                                     SEQ_GT(tp->gput_ack, tp->snd_max)) {
17967                                         tp->gput_ack = tp->snd_max;
17968                                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
17969                                                 /*
17970                                                  * There is not enough to measure.
17971                                                  */
17972                                                 tp->t_flags &= ~TF_GPUTINPROG;
17973                                                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
17974                                                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
17975                                                                            tp->gput_seq,
17976                                                                            0, 0, 18, __LINE__, NULL, 0);
17977                                         } else
17978                                                 log = 1;
17979                                 }
17980                                 /* Mark the last packet as app limited */
17981                                 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17982                                 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
17983                                         if (rack->r_ctl.rc_app_limited_cnt == 0)
17984                                                 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
17985                                         else {
17986                                                 /*
17987                                                  * Go out to the last app-limited rsm, mark
17988                                                  * this new one as its next, and move end_appl
17989                                                  * up to this one.
17990                                                  */
17991                                                 if (rack->r_ctl.rc_end_appl)
17992                                                         rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
17993                                                 rack->r_ctl.rc_end_appl = rsm;
17994                                         }
17995                                         rsm->r_flags |= RACK_APP_LIMITED;
17996                                         rack->r_ctl.rc_app_limited_cnt++;
17997                                 }
17998                                 if (log)
17999                                         rack_log_pacing_delay_calc(rack,
18000                                                                    rack->r_ctl.rc_app_limited_cnt, seq,
18001                                                                    tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
18002                         }
18003                 }
18004                 if (slot) {
18005                         /* set the rack tcb into the slot N */
18006                         counter_u64_add(rack_paced_segments, 1);
18007                 } else if (tot_len_this_send) {
18008                         counter_u64_add(rack_unpaced_segments, 1);
18009                 }
18010                 /* Check if we need to go into persists or not */
18011                 if ((tp->snd_max == tp->snd_una) &&
18012                     TCPS_HAVEESTABLISHED(tp->t_state) &&
18013                     sbavail(sb) &&
18014                     (sbavail(sb) > tp->snd_wnd) &&
18015                     (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
18016                         /* Yes, let's make sure to move to persists before the timer starts */
18017                         rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
18018                 }
18019                 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
18020                 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
18021         }
18022 #ifdef NETFLIX_SHARED_CWND
18023         if ((sbavail(sb) == 0) &&
18024             rack->r_ctl.rc_scw) {
18025                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
18026                 rack->rack_scwnd_is_idle = 1;
18027         }
18028 #endif
18029 #ifdef TCP_ACCOUNTING
18030         if (tot_len_this_send > 0) {
18031                 crtsc = get_cyclecount();
18032                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18033                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
18034                 }
18035                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
18036                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18037                         tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
18038                 }
18039                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
18040                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18041                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
18042                 }
18043                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz));
18044         } else {
18045                 crtsc = get_cyclecount();
18046                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18047                         tp->tcp_cnt_counters[SND_LIMITED]++;
18048                 }
18049                 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1);
18050                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18051                         tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
18052                 }
18053                 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val));
18054         }
18055         sched_unpin();
18056 #endif
18057         return (0);
18058
18059 send:
18060         if (rsm || sack_rxmit)
18061                 counter_u64_add(rack_nfto_resend, 1);
18062         else
18063                 counter_u64_add(rack_non_fto_send, 1);
18064         if ((flags & TH_FIN) &&
18065             sbavail(sb)) {
18066                 /*
18067                  * We do not transmit a FIN
18068                  * with data outstanding. We
18069                  * need to make it so all data
18070                  * is acked first.
18071                  */
18072                 flags &= ~TH_FIN;
18073         }
18074         /* Enforce stack imposed max seg size if we have one */
18075         if (rack->r_ctl.rc_pace_max_segs &&
18076             (len > rack->r_ctl.rc_pace_max_segs)) {
18077                 mark = 1;
18078                 len = rack->r_ctl.rc_pace_max_segs;
18079         }
18080         SOCKBUF_LOCK_ASSERT(sb);
18081         if (len > 0) {
18082                 if (len >= segsiz)
18083                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
18084                 else
18085                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
18086         }
18087         /*
18088          * Before ESTABLISHED, force sending of initial options unless TCP
18089          * set not to do any options. NOTE: we assume that the IP/TCP header
18090          * plus TCP options always fit in a single mbuf, leaving room for a
18091          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
18092          * + optlen <= MCLBYTES
18093          */
18094         optlen = 0;
18095 #ifdef INET6
18096         if (isipv6)
18097                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18098         else
18099 #endif
18100                 hdrlen = sizeof(struct tcpiphdr);
18101
18102         /*
18103          * Compute options for segment. We only have to care about SYN and
18104          * established connection segments.  Options for SYN-ACK segments
18105          * are handled in TCP syncache.
18106          */
18107         to.to_flags = 0;
18108         if ((tp->t_flags & TF_NOOPT) == 0) {
18109                 /* Maximum segment size. */
18110                 if (flags & TH_SYN) {
18111                         tp->snd_nxt = tp->iss;
18112                         to.to_mss = tcp_mssopt(&inp->inp_inc);
18113                         if (tp->t_port)
18114                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
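                              /*
                               * The shrunken MSS keeps the peer's full-sized
                               * segments fitting within the path once the UDP
                               * encapsulation header is added back on the wire.
                               */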
18115                         to.to_flags |= TOF_MSS;
18116
18117                         /*
18118                          * On SYN or SYN|ACK transmits on TFO connections,
18119                          * only include the TFO option if it is not a
18120                          * retransmit, as the presence of the TFO option may
18121                          * have caused the original SYN or SYN|ACK to have
18122                          * been dropped by a middlebox.
18123                          */
18124                         if (IS_FASTOPEN(tp->t_flags) &&
18125                             (tp->t_rxtshift == 0)) {
18126                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
18127                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
18128                                         to.to_tfo_cookie =
18129                                                 (u_int8_t *)&tp->t_tfo_cookie.server;
18130                                         to.to_flags |= TOF_FASTOPEN;
18131                                         wanted_cookie = 1;
18132                                 } else if (tp->t_state == TCPS_SYN_SENT) {
18133                                         to.to_tfo_len =
18134                                                 tp->t_tfo_client_cookie_len;
18135                                         to.to_tfo_cookie =
18136                                                 tp->t_tfo_cookie.client;
18137                                         to.to_flags |= TOF_FASTOPEN;
18138                                         wanted_cookie = 1;
18139                                         /*
18140                                          * If we wind up having more data to
18141                                          * send with the SYN than can fit in
18142                                          * one segment, don't send any more
18143                                          * until the SYN|ACK comes back from
18144                                          * the other end.
18145                                          */
18146                                         sendalot = 0;
18147                                 }
18148                         }
18149                 }
18150                 /* Window scaling. */
18151                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
18152                         to.to_wscale = tp->request_r_scale;
18153                         to.to_flags |= TOF_SCALE;
18154                 }
18155                 /* Timestamps. */
18156                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
18157                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
18158                         to.to_tsval = ms_cts + tp->ts_offset;
18159                         to.to_tsecr = tp->ts_recent;
18160                         to.to_flags |= TOF_TS;
18161                 }
18162                 /* Set receive buffer autosizing timestamp. */
18163                 if (tp->rfbuf_ts == 0 &&
18164                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
18165                         tp->rfbuf_ts = tcp_ts_getticks();
18166                 /* Selective ACK's. */
18167                 if (tp->t_flags & TF_SACK_PERMIT) {
18168                         if (flags & TH_SYN)
18169                                 to.to_flags |= TOF_SACKPERM;
18170                         else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18171                                  tp->rcv_numsacks > 0) {
18172                                 to.to_flags |= TOF_SACK;
18173                                 to.to_nsacks = tp->rcv_numsacks;
18174                                 to.to_sacks = (u_char *)tp->sackblks;
18175                         }
18176                 }
18177 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18178                 /* TCP-MD5 (RFC2385). */
18179                 if (tp->t_flags & TF_SIGNATURE)
18180                         to.to_flags |= TOF_SIGNATURE;
18181 #endif                          /* TCP_SIGNATURE */
18182
18183                 /* Processing the options. */
18184                 hdrlen += optlen = tcp_addoptions(&to, opt);
18185                 /*
18186                  * If we wanted a TFO option to be added, but it was unable
18187                  * to fit, ensure no data is sent.
18188                  */
18189                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
18190                     !(to.to_flags & TOF_FASTOPEN))
18191                         len = 0;
18192         }
18193         if (tp->t_port) {
18194                 if (V_tcp_udp_tunneling_port == 0) {
18195                         /* The port was removed?? */
18196                         SOCKBUF_UNLOCK(&so->so_snd);
18197 #ifdef TCP_ACCOUNTING
18198                         crtsc = get_cyclecount();
18199                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18200                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
18201                         }
18202                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
18203                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18204                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
18205                         }
18206                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
18207                         sched_unpin();
18208 #endif
18209                         return (EHOSTUNREACH);
18210                 }
18211                 hdrlen += sizeof(struct udphdr);
18212         }
18213 #ifdef INET6
18214         if (isipv6)
18215                 ipoptlen = ip6_optlen(tp->t_inpcb);
18216         else
18217 #endif
18218                 if (tp->t_inpcb->inp_options)
18219                         ipoptlen = tp->t_inpcb->inp_options->m_len -
18220                                 offsetof(struct ipoption, ipopt_list);
18221                 else
18222                         ipoptlen = 0;
18223 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18224         ipoptlen += ipsec_optlen;
18225 #endif
18226
18227         /*
18228          * Adjust data length if insertion of options will bump the packet
18229          * length beyond the t_maxseg length. Clear the FIN bit because we
18230          * cut off the tail of the segment.
18231          */
18232         if (len + optlen + ipoptlen > tp->t_maxseg) {
18233                 if (tso) {
18234                         uint32_t if_hw_tsomax;
18235                         uint32_t moff;
18236                         int32_t max_len;
18237
18238                         /* extract TSO information */
18239                         if_hw_tsomax = tp->t_tsomax;
18240                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18241                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18242                         KASSERT(ipoptlen == 0,
18243                                 ("%s: TSO can't do IP options", __func__));
18244
18245                         /*
18246                          * Check if we should limit by maximum payload
18247                          * length:
18248                          */
18249                         if (if_hw_tsomax != 0) {
18250                                 /* compute maximum TSO length */
18251                                 max_len = (if_hw_tsomax - hdrlen -
18252                                            max_linkhdr);
18253                                 if (max_len <= 0) {
18254                                         len = 0;
18255                                 } else if (len > max_len) {
18256                                         sendalot = 1;
18257                                         len = max_len;
18258                                         mark = 2;
18259                                 }
18260                         }
18261                         /*
18262                          * Prevent the last segment from being fractional
18263                          * unless the send sockbuf can be emptied:
18264                          */
18265                         max_len = (tp->t_maxseg - optlen);
18266                         if ((sb_offset + len) < sbavail(sb)) {
18267                                 moff = len % (u_int)max_len;
18268                                 if (moff != 0) {
18269                                         mark = 3;
18270                                         len -= moff;
18271                                 }
18272                         }
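                              /*
                               * e.g. with max_len = 1448 and len = 10000 bytes,
                               * moff = 10000 % 1448 = 1312, so len is trimmed to
                               * 8688 (six full segments) and the 1312 byte tail
                               * waits for a later send.
                               */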
18273                         /*
18274                          * In case there are too many small fragments don't
18275                          * use TSO:
18276                          */
18277                         if (len <= segsiz) {
18278                                 mark = 4;
18279                                 tso = 0;
18280                         }
18281                         /*
18282                          * Send the FIN in a separate segment after the bulk
18283                          * sending is done. We don't trust the TSO
18284                          * implementations to clear the FIN flag on all but
18285                          * the last segment.
18286                          */
18287                         if (tp->t_flags & TF_NEEDFIN) {
18288                                 sendalot = 4;
18289                         }
18290                 } else {
18291                         mark = 5;
18292                         if (optlen + ipoptlen >= tp->t_maxseg) {
18293                                 /*
18294                                  * Since we don't have enough space to put
18295                                  * the IP header chain and the TCP header in
18296                                  * one packet as required by RFC 7112, don't
18297                                  * send it. Also ensure that at least one
18298                                  * byte of the payload can be put into the
18299                                  * TCP segment.
18300                                  */
18301                                 SOCKBUF_UNLOCK(&so->so_snd);
18302                                 error = EMSGSIZE;
18303                                 sack_rxmit = 0;
18304                                 goto out;
18305                         }
18306                         len = tp->t_maxseg - optlen - ipoptlen;
18307                         sendalot = 5;
18308                 }
18309         } else {
18310                 tso = 0;
18311                 mark = 6;
18312         }
18313         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
18314                 ("%s: len > IP_MAXPACKET", __func__));
18315 #ifdef DIAGNOSTIC
18316 #ifdef INET6
18317         if (max_linkhdr + hdrlen > MCLBYTES)
18318 #else
18319         if (max_linkhdr + hdrlen > MHLEN)
18320 #endif
18321                 panic("tcphdr too big");
18322 #endif
18323
18324         /*
18325          * This KASSERT is here to catch edge cases at a well defined place.
18326          * Before, those had triggered (random) panic conditions further
18327          * down.
18328          */
18329         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
18330         if ((len == 0) &&
18331             (flags & TH_FIN) &&
18332             (sbused(sb))) {
18333                 /*
18334                  * We have outstanding data; don't send a FIN by itself!
18335                  */
18336                 goto just_return;
18337         }
18338         /*
18339          * Grab a header mbuf, attaching a copy of data to be transmitted,
18340          * and initialize the header from the template for sends on this
18341          * connection.
18342          */
18343         hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
18344         if (len) {
18345                 uint32_t max_val;
18346                 uint32_t moff;
18347
18348                 if (rack->r_ctl.rc_pace_max_segs)
18349                         max_val = rack->r_ctl.rc_pace_max_segs;
18350                 else if (rack->rc_user_set_max_segs)
18351                         max_val = rack->rc_user_set_max_segs * segsiz;
18352                 else
18353                         max_val = len;
18354                 /*
18355                  * We allow a limit on how much is sent per hptsi (pacing) slot.
18356                  */
18357                 if (len > max_val) {
18358                         mark = 7;
18359                         len = max_val;
18360                 }
18361 #ifdef INET6
18362                 if (MHLEN < hdrlen + max_linkhdr)
18363                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18364                 else
18365 #endif
18366                         m = m_gethdr(M_NOWAIT, MT_DATA);
18367
18368                 if (m == NULL) {
18369                         SOCKBUF_UNLOCK(sb);
18370                         error = ENOBUFS;
18371                         sack_rxmit = 0;
18372                         goto out;
18373                 }
18374                 m->m_data += max_linkhdr;
18375                 m->m_len = hdrlen;
18376
18377                 /*
18378                  * Start the m_copy functions from the closest mbuf to the
18379                  * sb_offset in the socket buffer chain.
18380                  */
18381                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
18382                 s_mb = mb;
18383                 s_moff = moff;
18384                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
18385                         m_copydata(mb, moff, (int)len,
18386                                    mtod(m, caddr_t)+hdrlen);
18387                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18388                                 sbsndptr_adv(sb, mb, len);
18389                         m->m_len += len;
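                              /*
                               * Small sends were copied straight into the header
                               * mbuf above, avoiding the allocation of a separate
                               * mbuf chain for the payload.
                               */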
18390                 } else {
18391                         struct sockbuf *msb;
18392
18393                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18394                                 msb = NULL;
18395                         else
18396                                 msb = sb;
18397                         m->m_next = tcp_m_copym(
18398                                 mb, moff, &len,
18399                                 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
18400                                 ((rsm == NULL) ? hw_tls : 0)
18401 #ifdef NETFLIX_COPY_ARGS
18402                                 , &filled_all
18403 #endif
18404                                 );
18405                         if (len <= (tp->t_maxseg - optlen)) {
18406                                 /*
18407                                  * Must have run out of mbufs for the copy;
18408                                  * shorten it to no longer need TSO. Let's
18409                                  * not set sendalot since we are low on
18410                                  * mbufs.
18411                                  */
18412                                 tso = 0;
18413                         }
18414                         if (m->m_next == NULL) {
18415                                 SOCKBUF_UNLOCK(sb);
18416                                 (void)m_free(m);
18417                                 error = ENOBUFS;
18418                                 sack_rxmit = 0;
18419                                 goto out;
18420                         }
18421                 }
18422                 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
18423                         if (rsm && (rsm->r_flags & RACK_TLP)) {
18424                                 /*
18425                                  * TLP should not count in retran count, but
18426                                  * in its own bin
18427                                  */
18428                                 counter_u64_add(rack_tlp_retran, 1);
18429                                 counter_u64_add(rack_tlp_retran_bytes, len);
18430                         } else {
18431                                 tp->t_sndrexmitpack++;
18432                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18433                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18434                         }
18435 #ifdef STATS
18436                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18437                                                  len);
18438 #endif
18439                 } else {
18440                         KMOD_TCPSTAT_INC(tcps_sndpack);
18441                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
18442 #ifdef STATS
18443                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
18444                                                  len);
18445 #endif
18446                 }
18447                 /*
18448                  * If we're sending everything we've got, set PUSH. (This
18449                  * will keep happy those implementations which only give
18450                  * data to the user when a buffer fills or a PUSH comes in.)
18451                  */
18452                 if (sb_offset + len == sbused(sb) &&
18453                     sbused(sb) &&
18454                     !(flags & TH_SYN)) {
18455                         flags |= TH_PUSH;
18456                         add_flag |= RACK_HAD_PUSH;
18457                 }
18458
18459                 SOCKBUF_UNLOCK(sb);
18460         } else {
18461                 SOCKBUF_UNLOCK(sb);
18462                 if (tp->t_flags & TF_ACKNOW)
18463                         KMOD_TCPSTAT_INC(tcps_sndacks);
18464                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
18465                         KMOD_TCPSTAT_INC(tcps_sndctrl);
18466                 else
18467                         KMOD_TCPSTAT_INC(tcps_sndwinup);
18468
18469                 m = m_gethdr(M_NOWAIT, MT_DATA);
18470                 if (m == NULL) {
18471                         error = ENOBUFS;
18472                         sack_rxmit = 0;
18473                         goto out;
18474                 }
18475 #ifdef INET6
18476                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
18477                     MHLEN >= hdrlen) {
18478                         M_ALIGN(m, hdrlen);
18479                 } else
18480 #endif
18481                         m->m_data += max_linkhdr;
18482                 m->m_len = hdrlen;
18483         }
18484         SOCKBUF_UNLOCK_ASSERT(sb);
18485         m->m_pkthdr.rcvif = (struct ifnet *)0;
18486 #ifdef MAC
18487         mac_inpcb_create_mbuf(inp, m);
18488 #endif
18489         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) &&  rack->r_fsb_inited) {
18490 #ifdef INET6
18491                 if (isipv6)
18492                         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18493                 else
18494 #endif                          /* INET6 */
18495                         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18496                 th = rack->r_ctl.fsb.th;
18497                 udp = rack->r_ctl.fsb.udp;
18498                 if (udp) {
18499 #ifdef INET6
18500                         if (isipv6)
18501                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18502                         else
18503 #endif                          /* INET6 */
18504                                 ulen = hdrlen + len - sizeof(struct ip);
18505                         udp->uh_ulen = htons(ulen);
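                              /*
                               * ulen is the UDP length field: udp header, tcp
                               * header, options and payload. e.g. for IPv4 with no
                               * tcp options, hdrlen = 48 and len = 1000 give
                               * ulen = 48 + 1000 - 20 = 1028.
                               */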
18506                 }
18507         } else {
18508 #ifdef INET6
18509                 if (isipv6) {
18510                         ip6 = mtod(m, struct ip6_hdr *);
18511                         if (tp->t_port) {
18512                                 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
18513                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18514                                 udp->uh_dport = tp->t_port;
18515                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18516                                 udp->uh_ulen = htons(ulen);
18517                                 th = (struct tcphdr *)(udp + 1);
18518                         } else
18519                                 th = (struct tcphdr *)(ip6 + 1);
18520                         tcpip_fillheaders(inp, tp->t_port, ip6, th);
18521                 } else
18522 #endif                          /* INET6 */
18523                 {
18524                         ip = mtod(m, struct ip *);
18525 #ifdef TCPDEBUG
18526                         ipov = (struct ipovly *)ip;
18527 #endif
18528                         if (tp->t_port) {
18529                                 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
18530                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18531                                 udp->uh_dport = tp->t_port;
18532                                 ulen = hdrlen + len - sizeof(struct ip);
18533                                 udp->uh_ulen = htons(ulen);
18534                                 th = (struct tcphdr *)(udp + 1);
18535                         } else
18536                                 th = (struct tcphdr *)(ip + 1);
18537                         tcpip_fillheaders(inp, tp->t_port, ip, th);
18538                 }
18539         }
18540         /*
18541          * Fill in fields, remembering maximum advertised window for use in
18542          * delaying messages about window sizes. If resending a FIN, be sure
18543          * not to use a new sequence number.
18544          */
18545         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
18546             tp->snd_nxt == tp->snd_max)
18547                 tp->snd_nxt--;
18548         /*
18549          * If we are starting a connection, send ECN setup SYN packet. If we
18550          * are on a retransmit, we may resend those bits a number of times
18551          * as per RFC 3168.
18552          */
18553         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
18554                 if (tp->t_rxtshift >= 1) {
18555                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
18556                                 flags |= TH_ECE | TH_CWR;
18557                 } else
18558                         flags |= TH_ECE | TH_CWR;
18559         }
18560         /* Handle parallel SYN for ECN */
18561         if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18562             (tp->t_flags2 & TF2_ECN_SND_ECE)) {
18563                 flags |= TH_ECE;
18564                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18565         }
18566         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18567             (tp->t_flags2 & TF2_ECN_PERMIT)) {
18568                 /*
18569                  * If the peer has ECN, mark data packets with ECN capable
18570                  * transmission (ECT). Ignore pure ack packets,
18571                  * retransmissions.
18572                  */
18573                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
18574                     (sack_rxmit == 0)) {
18575 #ifdef INET6
18576                         if (isipv6)
18577                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
18578                         else
18579 #endif
18580                                 ip->ip_tos |= IPTOS_ECN_ECT0;
18581                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
18582                         /*
18583                          * Reply with proper ECN notifications.
18584                          * Only set CWR on new data segments.
18585                          */
18586                         if (tp->t_flags2 & TF2_ECN_SND_CWR) {
18587                                 flags |= TH_CWR;
18588                                 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
18589                         }
18590                 }
18591                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
18592                         flags |= TH_ECE;
18593         }
18594         /*
18595          * If we are doing retransmissions, then snd_nxt will not reflect
18596          * the first unsent octet.  For ACK only packets, we do not want the
18597          * sequence number of the retransmitted packet, we want the sequence
18598          * number of the next unsent octet.  So, if there is no data (and no
18599          * SYN or FIN), use snd_max instead of snd_nxt when filling in
18600          * ti_seq.  But if we are in persist state, snd_max might reflect
18601          * one byte beyond the right edge of the window, so use snd_nxt in
18602          * that case, since we know we aren't doing a retransmission.
18603          * (retransmit and persist are mutually exclusive...)
18604          */
18605         if (sack_rxmit == 0) {
18606                 if (len || (flags & (TH_SYN | TH_FIN))) {
18607                         th->th_seq = htonl(tp->snd_nxt);
18608                         rack_seq = tp->snd_nxt;
18609                 } else {
18610                         th->th_seq = htonl(tp->snd_max);
18611                         rack_seq = tp->snd_max;
18612                 }
18613         } else {
18614                 th->th_seq = htonl(rsm->r_start);
18615                 rack_seq = rsm->r_start;
18616         }
18617         th->th_ack = htonl(tp->rcv_nxt);
18618         th->th_flags = flags;
18619         /*
18620          * Calculate receive window.  Don't shrink window, but avoid silly
18621          * window syndrome.
18622          * If a RST segment is sent, advertise a window of zero.
18623          */
18624         if (flags & TH_RST) {
18625                 recwin = 0;
18626         } else {
18627                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
18628                     recwin < (long)segsiz) {
18629                         recwin = 0;
18630                 }
18631                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
18632                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
18633                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
18634         }
18635
18636         /*
18637          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
18638          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
18639          * handled in syncache.
18640          */
18641         if (flags & TH_SYN)
18642                 th->th_win = htons((u_short)
18643                                    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
18644         else {
18645                 /* Avoid shrinking window with window scaling. */
18646                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
18647                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
18648         }
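              /*
               * e.g. with rcv_scale = 7, a recwin of 100000 bytes rounds up to
               * 100096 (a multiple of 128) and is advertised as 100096 >> 7 = 782;
               * the peer scales back to 100096, so the truncation never shrinks
               * the window we offer.
               */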
18649         /*
18650          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
18651          * window.  This may cause the remote transmitter to stall.  This
18652          * flag tells soreceive() to disable delayed acknowledgements when
18653          * draining the buffer.  This can occur if the receiver is
18654          * attempting to read more data than can be buffered prior to
18655          * transmitting on the connection.
18656          */
18657         if (th->th_win == 0) {
18658                 tp->t_sndzerowin++;
18659                 tp->t_flags |= TF_RXWIN0SENT;
18660         } else
18661                 tp->t_flags &= ~TF_RXWIN0SENT;
18662         tp->snd_up = tp->snd_una;       /* drag it along, its deprecated */
18663         /* Now are we using fsb?, if so copy the template data to the mbuf */
18664         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
18665                 uint8_t *cpto;
18666
18667                 cpto = mtod(m, uint8_t *);
18668                 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18669                 /*
18670                  * We have just copied in:
18671                  * IP/IP6
18672                  * <optional udphdr>
18673                  * tcphdr (no options)
18674                  *
18675                  * We need to grab the correct pointers into the mbuf
18676                  * for both the tcp header, and possibly the udp header (if tunneling).
18677                  * We do this by using the offset in the copy buffer and adding it
18678                  * to the mbuf base pointer (cpto).
18679                  */
18680 #ifdef INET6
18681                 if (isipv6)
18682                         ip6 = mtod(m, struct ip6_hdr *);
18683                 else
18684 #endif                          /* INET6 */
18685                         ip = mtod(m, struct ip *);
18686                 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18687                 /* If we have a udp header lets set it into the mbuf as well */
18688                 if (udp)
18689                         udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
18690         }
18691 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18692         if (to.to_flags & TOF_SIGNATURE) {
18693                 /*
18694                  * Calculate MD5 signature and put it into the place
18695                  * determined before.
18696                  * NOTE: since TCP options buffer doesn't point into
18697                  * mbuf's data, calculate offset and use it.
18698                  */
18699                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
18700                                                        (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
18701                         /*
18702                          * Do not send segment if the calculation of MD5
18703                          * digest has failed.
18704                          */
18705                         goto out;
18706                 }
18707         }
18708 #endif
18709         if (optlen) {
18710                 bcopy(opt, th + 1, optlen);
18711                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18712         }
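              /*
               * th_off is in 32-bit words; tcp_addoptions() pads optlen to a
               * multiple of 4, so e.g. 12 bytes of options give
               * th_off = (20 + 12) >> 2 = 8.
               */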
18713         /*
18714          * Put TCP length in extended header, and then checksum extended
18715          * header and data.
18716          */
18717         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18718 #ifdef INET6
18719         if (isipv6) {
18720                 /*
18721                  * ip6_plen does not need to be filled in now; it will be
18722                  * filled in by ip6_output.
18723                  */
18724                 if (tp->t_port) {
18725                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18726                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18727                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18728                         th->th_sum = htons(0);
18729                         UDPSTAT_INC(udps_opackets);
18730                 } else {
18731                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18732                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18733                         th->th_sum = in6_cksum_pseudo(ip6,
18734                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18735                                                       0);
18736                 }
18737         }
18738 #endif
18739 #if defined(INET6) && defined(INET)
18740         else
18741 #endif
18742 #ifdef INET
18743         {
18744                 if (tp->t_port) {
18745                         m->m_pkthdr.csum_flags = CSUM_UDP;
18746                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18747                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18748                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18749                         th->th_sum = htons(0);
18750                         UDPSTAT_INC(udps_opackets);
18751                 } else {
18752                         m->m_pkthdr.csum_flags = CSUM_TCP;
18753                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18754                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
18755                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18756                                                                         IPPROTO_TCP + len + optlen));
18757                 }
18758                 /* IP version must be set here for ipv4/ipv6 checking later */
18759                 KASSERT(ip->ip_v == IPVERSION,
18760                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18761         }
18762 #endif
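              /*
               * Note that only the pseudo-header sum is seeded into th_sum (or
               * uh_sum) here; the csum_flags ask the NIC, or the software
               * fallback on the output path, to fold in the header and payload
               * bytes later.
               */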
18763         /*
18764          * Enable TSO and specify the size of the segments. The TCP pseudo
18765          * header checksum is always provided. XXX: Fixme: This is currently
18766          * not the case for IPv6.
18767          */
18768         if (tso) {
18769                 KASSERT(len > tp->t_maxseg - optlen,
18770                         ("%s: len <= tso_segsz", __func__));
18771                 m->m_pkthdr.csum_flags |= CSUM_TSO;
18772                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
18773         }
18774         KASSERT(len + hdrlen == m_length(m, NULL),
18775                 ("%s: mbuf chain different than expected: %d + %u != %u",
18776                  __func__, len, hdrlen, m_length(m, NULL)));
18777
18778 #ifdef TCP_HHOOK
18779         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
18780         hhook_run_tcp_est_out(tp, th, &to, len, tso);
18781 #endif
18782         /* We're getting ready to send; log now. */
18783         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
18784                 union tcp_log_stackspecific log;
18785
18786                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18787                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
18788                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
18789                 if (rack->rack_no_prr)
18790                         log.u_bbr.flex1 = 0;
18791                 else
18792                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18793                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18794                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18795                 log.u_bbr.flex4 = orig_len;
18796                 if (filled_all)
18797                         log.u_bbr.flex5 = 0x80000000;
18798                 else
18799                         log.u_bbr.flex5 = 0;
18800                 /* Save off the early/late values */
18801                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18802                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18803                 log.u_bbr.bw_inuse = rack_get_bw(rack);
18804                 if (rsm || sack_rxmit) {
18805                         if (doing_tlp)
18806                                 log.u_bbr.flex8 = 2;
18807                         else
18808                                 log.u_bbr.flex8 = 1;
18809                 } else {
18810                         if (doing_tlp)
18811                                 log.u_bbr.flex8 = 3;
18812                         else
18813                                 log.u_bbr.flex8 = 0;
18814                 }
18815                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
18816                 log.u_bbr.flex7 = mark;
18817                 log.u_bbr.flex7 <<= 8;
18818                 log.u_bbr.flex7 |= pass;
18819                 log.u_bbr.pkts_out = tp->t_maxseg;
18820                 log.u_bbr.timeStamp = cts;
18821                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18822                 log.u_bbr.lt_epoch = cwnd_to_use;
18823                 log.u_bbr.delivered = sendalot;
18824                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
18825                                      len, &log, false, NULL, NULL, 0, &tv);
18826         } else
18827                 lgb = NULL;
18828
18829         /*
18830          * Fill in IP length and desired time to live and send to IP level.
18831          * There should be a better way to handle ttl and tos; we could keep
18832          * them in the template, but need a way to checksum without them.
18833          */
18834         /*
18835          * m->m_pkthdr.len should have been set before checksum calculation,
18836          * because in6_cksum() needs it.
18837          */
18838 #ifdef INET6
18839         if (isipv6) {
18840                 /*
18841                  * we separately set hoplimit for every segment, since the
18842                  * user might want to change the value via setsockopt. Also,
18843                  * desired default hop limit might be changed via Neighbor
18844                  * Discovery.
18845                  */
18846                 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
18847
18848                 /*
18849                  * Set the packet size here for the benefit of DTrace
18850                  * probes. ip6_output() will set it properly; it's supposed
18851                  * to include the option header lengths as well.
18852                  */
18853                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18854
18855                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18856                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18857                 else
18858                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18859
18860                 if (tp->t_state == TCPS_SYN_SENT)
18861                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
18862
18863                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
18864                 /* TODO: IPv6 IP6TOS_ECT bit on */
18865                 error = ip6_output(m,
18866 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18867                                    inp->in6p_outputopts,
18868 #else
18869                                    NULL,
18870 #endif
18871                                    &inp->inp_route6,
18872                                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
18873                                    NULL, NULL, inp);
18874
18875                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
18876                         mtu = inp->inp_route6.ro_nh->nh_mtu;
18877         }
18878 #endif                          /* INET6 */
18879 #if defined(INET) && defined(INET6)
18880         else
18881 #endif
18882 #ifdef INET
18883         {
18884                 ip->ip_len = htons(m->m_pkthdr.len);
18885 #ifdef INET6
18886                 if (inp->inp_vflag & INP_IPV6PROTO)
18887                         ip->ip_ttl = in6_selecthlim(inp, NULL);
18888 #endif                          /* INET6 */
18889                 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
18890                 /*
18891                  * If we do path MTU discovery, then we set DF on every
18892                  * packet. This might not be the best thing to do according
18893                  * to RFC3390 Section 2. However the tcp hostcache mitigates
18894                  * the problem so it affects only the first tcp connection
18895                  * with a host.
18896                  *
18897                  * NB: Don't set DF on small MTU/MSS to have a safe
18898                  * fallback.
18899                  */
18900                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18901                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18902                         if (tp->t_port == 0 || len < V_tcp_minmss) {
18903                                 ip->ip_off |= htons(IP_DF);
18904                         }
18905                 } else {
18906                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18907                 }
18908
18909                 if (tp->t_state == TCPS_SYN_SENT)
18910                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
18911
18912                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
18913
18914                 error = ip_output(m,
18915 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18916                                   inp->inp_options,
18917 #else
18918                                   NULL,
18919 #endif
18920                                   &inp->inp_route,
18921                                   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
18922                                   inp);
18923                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
18924                         mtu = inp->inp_route.ro_nh->nh_mtu;
18925         }
18926 #endif                          /* INET */
18927
18928 out:
18929         if (lgb) {
18930                 lgb->tlb_errno = error;
18931                 lgb = NULL;
18932         }
18933         /*
18934          * In transmit state, time the transmission and arrange for the
18935          * retransmit.  In persist state, just set snd_max.
18936          */
18937         if (error == 0) {
18938                 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
18939                 if (rsm && doing_tlp) {
18940                         rack->rc_last_sent_tlp_past_cumack = 0;
18941                         rack->rc_last_sent_tlp_seq_valid = 1;
18942                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18943                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18944                 }
18945                 rack->forced_ack = 0;   /* If we send something zap the FA flag */
18946                 if (rsm && (doing_tlp == 0)) {
18947                         /* Set we retransmitted */
18948                         rack->rc_gp_saw_rec = 1;
18949                 } else {
18950                         if (cwnd_to_use > tp->snd_ssthresh) {
18951                                 /* Set we sent in CA */
18952                                 rack->rc_gp_saw_ca = 1;
18953                         } else {
18954                                 /* Set we sent in SS */
18955                                 rack->rc_gp_saw_ss = 1;
18956                         }
18957                 }
18958                 if (doing_tlp && (rsm == NULL)) {
18959                         /* Make sure new data TLP cnt is clear */
18960                         rack->r_ctl.rc_tlp_new_data = 0;
18961                 }
18962                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18963                     (tp->t_flags & TF_SACK_PERMIT) &&
18964                     tp->rcv_numsacks > 0)
18965                         tcp_clean_dsack_blocks(tp);
18966                 tot_len_this_send += len;
18967                 if (len == 0)
18968                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
18969                 else if (len == 1) {
18970                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
18971                 } else if (len > 1) {
18972                         int idx;
18973
18974                         idx = (len / segsiz) + 3;
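                              /*
                               * Bucket the send by segment count: e.g. len = 4344
                               * with segsiz = 1448 gives idx = 3 + 3 = 6; anything
                               * at or past TCP_MSS_ACCT_ATIMER is lumped into the
                               * last bin below.
                               */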
18975                         if (idx >= TCP_MSS_ACCT_ATIMER)
18976                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18977                         else
18978                                 counter_u64_add(rack_out_size[idx], 1);
18979                 }
18980         }
18981         if ((rack->rack_no_prr == 0) &&
18982             sub_from_prr &&
18983             (error == 0)) {
18984                 if (rack->r_ctl.rc_prr_sndcnt >= len)
18985                         rack->r_ctl.rc_prr_sndcnt -= len;
18986                 else
18987                         rack->r_ctl.rc_prr_sndcnt = 0;
18988         }
18989         sub_from_prr = 0;
18990         if (doing_tlp) {
18991                 /* Make sure the TLP is added */
18992                 add_flag |= RACK_TLP;
18993         } else if (rsm) {
18994                 /* If its a resend without TLP then it must not have the flag */
18995                 rsm->r_flags &= ~RACK_TLP;
18996         }
18997         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
18998                         rack_to_usec_ts(&tv),
18999                         rsm, add_flag, s_mb, s_moff, hw_tls);
19000
19001
19002         if ((error == 0) &&
19003             (len > 0) &&
19004             (tp->snd_una == tp->snd_max))
19005                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
19006         {
19007                 tcp_seq startseq = tp->snd_nxt;
19008
19009                 /* Track our lost count */
19010                 if (rsm && (doing_tlp == 0))
19011                         rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
19012                 /*
19013                  * Advance snd_nxt over sequence space of this segment.
19014                  */
19015                 if (error)
19016                         /* We don't log or do anything with errors */
19017                         goto nomore;
19018                 if (doing_tlp == 0) {
19019                         if (rsm == NULL) {
19020                                 /*
19021                                  * Not a retransmission of some
19022                                  * sort, new data is going out so
19023                                  * clear our TLP count and flag.
19024                                  */
19025                                 rack->rc_tlp_in_progress = 0;
19026                                 rack->r_ctl.rc_tlp_cnt_out = 0;
19027                         }
19028                 } else {
19029                         /*
19030                          * We have just sent a TLP, mark that it is true
19031                          * and make sure our in progress is set so we
19032                          * continue to check the count.
19033                          */
19034                         rack->rc_tlp_in_progress = 1;
19035                         rack->r_ctl.rc_tlp_cnt_out++;
19036                 }
19037                 if (flags & (TH_SYN | TH_FIN)) {
19038                         if (flags & TH_SYN)
19039                                 tp->snd_nxt++;
19040                         if (flags & TH_FIN) {
19041                                 tp->snd_nxt++;
19042                                 tp->t_flags |= TF_SENTFIN;
19043                         }
19044                 }
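                      /*
                       * SYN and FIN each occupy one unit of sequence space, which
                       * is why snd_nxt advances above even though they carry no
                       * payload bytes.
                       */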
19045                 /* In the ENOBUFS case we do *not* update snd_max */
19046                 if (sack_rxmit)
19047                         goto nomore;
19048
19049                 tp->snd_nxt += len;
19050                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
19051                         if (tp->snd_una == tp->snd_max) {
19052                                 /*
19053                                  * Update the time we just added data since
19054                                  * none was outstanding.
19055                                  */
19056                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
19057                                 tp->t_acktime = ticks;
19058                         }
19059                         tp->snd_max = tp->snd_nxt;
19060                         /*
19061                          * Time this transmission if not a retransmission and
19062                          * not currently timing anything.
19063                          * This is only relevant in case of switching back to
19064                          * the base stack.
19065                          */
19066                         if (tp->t_rtttime == 0) {
19067                                 tp->t_rtttime = ticks;
19068                                 tp->t_rtseq = startseq;
19069                                 KMOD_TCPSTAT_INC(tcps_segstimed);
19070                         }
19071                         if (len &&
19072                             ((tp->t_flags & TF_GPUTINPROG) == 0))
19073                                 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
19074                 }
19075                 /*
19076                  * If we are doing fast output (FO) we need to update the mbuf
19077                  * position and subtract what we just sent from what is left.
19078                  *
19079                  * XXXRRS: This brings to mind a ?: when we send a DSACK block
19080                  * (the peer sent us duplicate information), is TSO turned
19081                  * off? If not then we are going to echo multiple DSACK
19082                  * blocks out (with the TSO), which we should not be doing.
19083                  */
19084                 if (rack->r_fast_output && len) {
19085                         if (rack->r_ctl.fsb.left_to_send > len)
19086                                 rack->r_ctl.fsb.left_to_send -= len;
19087                         else
19088                                 rack->r_ctl.fsb.left_to_send = 0;
19089                         if (rack->r_ctl.fsb.left_to_send < segsiz)
19090                                 rack->r_fast_output = 0;
19091                         if (rack->r_fast_output) {
19092                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19093                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19094                         }
19095                 }
19096         }
19097 nomore:
19098         if (error) {
19099                 rack->r_ctl.rc_agg_delayed = 0;
19100                 rack->r_early = 0;
19101                 rack->r_late = 0;
19102                 rack->r_ctl.rc_agg_early = 0;
19103                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
19104                 /*
19105                  * Failures do not advance the seq counter above. For the
19106                  * case of ENOBUFS we will fall out and retry in 1ms with
19107                  * the hpts. Everything else will just have to retransmit
19108                  * with the timer.
19109                  *
19110                  * In any case, we do not want to loop around for another
19111                  * send without a good reason.
19112                  */
19113                 sendalot = 0;
19114                 switch (error) {
19115                 case EPERM:
19116                         tp->t_softerror = error;
19117 #ifdef TCP_ACCOUNTING
19118                         crtsc = get_cyclecount();
19119                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19120                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19121                         }
19122                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19123                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19124                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19125                         }
19126                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19127                         sched_unpin();
19128 #endif
19129                         return (error);
19130                 case ENOBUFS:
19131                         /*
19132                          * Pace us right away so that we retry in a
19133                          * short time.
19134                          */
19135                         slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
19136                         if (rack->rc_enobuf < 0x7f)
19137                                 rack->rc_enobuf++;
19138                         if (slot < (10 * HPTS_USEC_IN_MSEC))
19139                                 slot = 10 * HPTS_USEC_IN_MSEC;
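                              /*
                               * The retry delay grows one millisecond per
                               * consecutive ENOBUFS (rc_enobuf is capped at 0x7f)
                               * but is floored at 10ms, so the backoff only climbs
                               * past the floor after ten failures in a row.
                               */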
19140                         if (rack->r_ctl.crte != NULL) {
19141                                 counter_u64_add(rack_saw_enobuf_hw, 1);
19142                                 tcp_rl_log_enobuf(rack->r_ctl.crte);
19143                         }
19144                         counter_u64_add(rack_saw_enobuf, 1);
19145                         goto enobufs;
19146                 case EMSGSIZE:
19147                         /*
19148                          * For some reason the interface we used initially
19149                          * to send segments changed to another or lowered
19150                          * its MTU. If TSO was active we either got an
19151                          * interface without TSO capabilities or TSO was
19152                          * turned off. If we obtained mtu from ip_output()
19153                          * then update it and try again.
19154                          */
19155                         if (tso)
19156                                 tp->t_flags &= ~TF_TSO;
19157                         if (mtu != 0) {
19158                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
19159                                 goto again;
19160                         }
19161                         slot = 10 * HPTS_USEC_IN_MSEC;
19162                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
19163 #ifdef TCP_ACCOUNTING
19164                         crtsc = get_cyclecount();
19165                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19166                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19167                         }
19168                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19169                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19170                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19171                         }
19172                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19173                         sched_unpin();
19174 #endif
19175                         return (error);
19176                 case ENETUNREACH:
19177                         counter_u64_add(rack_saw_enetunreach, 1);
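                        /* FALLTHROUGH */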
19178                 case EHOSTDOWN:
19179                 case EHOSTUNREACH:
19180                 case ENETDOWN:
19181                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
19182                                 tp->t_softerror = error;
19183                         }
19184                         /* FALLTHROUGH */
19185                 default:
19186                         slot = 10 * HPTS_USEC_IN_MSEC;
19187                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
19188 #ifdef TCP_ACCOUNTING
19189                         crtsc = get_cyclecount();
19190                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19191                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19192                         }
19193                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19194                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19195                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19196                         }
19197                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19198                         sched_unpin();
19199 #endif
19200                         return (error);
19201                 }
19202         } else {
19203                 rack->rc_enobuf = 0;
19204                 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
19205                         rack->r_ctl.retran_during_recovery += len;
19206         }
19207         KMOD_TCPSTAT_INC(tcps_sndtotal);
19208
19209         /*
19210          * Data sent (as far as we can tell). If this advertises a larger
19211          * window than any other segment, then remember the size of the
19212          * advertised window. Any pending ACK has now been sent.
19213          */
19214         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
19215                 tp->rcv_adv = tp->rcv_nxt + recwin;
19216
19217         tp->last_ack_sent = tp->rcv_nxt;
19218         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
19219 enobufs:
19220         if (sendalot) {
19221                 /* Do we need to turn off sendalot? */
19222                 if (rack->r_ctl.rc_pace_max_segs &&
19223                     (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
19224                         /* We hit our max. */
19225                         sendalot = 0;
19226                 } else if ((rack->rc_user_set_max_segs) &&
19227                            (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
19228                         /* We hit the user defined max */
19229                         sendalot = 0;
19230                 }
19231         }
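        /*
         * Worked example (illustrative values): with segsiz = 1460 and
         * rc_user_set_max_segs = 40, sendalot is cleared once
         * tot_len_this_send reaches 40 * 1460 = 58400 bytes, bounding how
         * much a single rack_output() pass can emit before pacing resumes.
         */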
19232         if ((error == 0) && (flags & TH_FIN))
19233                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
19234         if (flags & TH_RST) {
19235                 /*
19236                  * We don't send again after sending a RST.
19237                  */
19238                 slot = 0;
19239                 sendalot = 0;
19240                 if (error == 0)
19241                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
19242         } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
19243                 /*
19244                  * Get our pacing rate; if an error
19245                  * occurred in sending (ENOBUFS) we would
19246                  * hit the else-if with slot preset. Other
19247                  * errors return.
19248                  */
19249                 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
19250         }
19251         if (rsm &&
19252             (rsm->r_flags & RACK_HAS_SYN) == 0 &&
19253             rack->use_rack_rr) {
19254                 /* It's a retransmit and we use the rack cheat? */
19255                 if ((slot == 0) ||
19256                     (rack->rc_always_pace == 0) ||
19257                     (rack->r_rr_config == 1)) {
19258                         /*
19259                          * We have no pacing set or we
19260                          * are using old-style rack or
19261                          * we are overridden to use the old 1ms pacing.
19262                          */
19263                         slot = rack->r_ctl.rc_min_to;
19264                 }
19265         }
19266         /* We have sent, clear the flag */
19267         rack->r_ent_rec_ns = 0;
19268         if (rack->r_must_retran) {
19269                 if (rsm) {
19270                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
19271                         if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
19272                                 /*
19273                                  * We have retransmitted all.
19274                                  */
19275                                 rack->r_must_retran = 0;
19276                                 rack->r_ctl.rc_out_at_rto = 0;
19277                         }
19278                 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19279                         /*
19280                          * Sending new data will also kill
19281                          * the loop.
19282                          */
19283                         rack->r_must_retran = 0;
19284                         rack->r_ctl.rc_out_at_rto = 0;
19285                 }
19286         }
19287         rack->r_ctl.fsb.recwin = recwin;
19288         if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
19289             SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19290                 /*
19291                  * We hit an RTO and have now passed snd_max at the RTO;
19292                  * clear all the WAS flags.
19293                  */
19294                 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
19295         }
19296         if (slot) {
19297                 /* set the rack tcb into the slot N */
19298                 counter_u64_add(rack_paced_segments, 1);
19299                 if ((error == 0) &&
19300                     rack_use_rfo &&
19301                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19302                     (rsm == NULL) &&
19303                     (tp->snd_nxt == tp->snd_max) &&
19304                     (ipoptlen == 0) &&
19305                     (tp->rcv_numsacks == 0) &&
19306                     rack->r_fsb_inited &&
19307                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19308                     (rack->r_must_retran == 0) &&
19309                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19310                     (len > 0) && (orig_len > 0) &&
19311                     (orig_len > len) &&
19312                     ((orig_len - len) >= segsiz) &&
19313                     ((optlen == 0) ||
19314                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19315                         /* We can send at least one more MSS using our fsb */
19316
19317                         rack->r_fast_output = 1;
19318                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19319                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19320                         rack->r_ctl.fsb.tcp_flags = flags;
19321                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19322                         if (hw_tls)
19323                                 rack->r_ctl.fsb.hw_tls = 1;
19324                         else
19325                                 rack->r_ctl.fsb.hw_tls = 0;
19326                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19327                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19328                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19329                                  (tp->snd_max - tp->snd_una)));
19330                         if (rack->r_ctl.fsb.left_to_send < segsiz)
19331                                 rack->r_fast_output = 0;
19332                         else {
19333                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19334                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19335                                 else
19336                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19337                         }
19338                 } else
19339                         rack->r_fast_output = 0;
19340                 rack_log_fsb(rack, tp, so, flags,
19341                              ipoptlen, orig_len, len, error,
19342                              (rsm == NULL), optlen, __LINE__, 2);
19343         } else if (sendalot) {
19344                 int ret;
19345
19346                 if (len)
19347                         counter_u64_add(rack_unpaced_segments, 1);
19348                 sack_rxmit = 0;
19349                 if ((error == 0) &&
19350                     rack_use_rfo &&
19351                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19352                     (rsm == NULL) &&
19353                     (ipoptlen == 0) &&
19354                     (tp->rcv_numsacks == 0) &&
19355                     (tp->snd_nxt == tp->snd_max) &&
19356                     (rack->r_must_retran == 0) &&
19357                     rack->r_fsb_inited &&
19358                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19359                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19360                     (len > 0) && (orig_len > 0) &&
19361                     (orig_len > len) &&
19362                     ((orig_len - len) >= segsiz) &&
19363                     ((optlen == 0) ||
19364                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19365                         /* we can use fast_output for more */
19366
19367                         rack->r_fast_output = 1;
19368                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19369                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19370                         rack->r_ctl.fsb.tcp_flags = flags;
19371                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19372                         if (hw_tls)
19373                                 rack->r_ctl.fsb.hw_tls = 1;
19374                         else
19375                                 rack->r_ctl.fsb.hw_tls = 0;
19376                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19377                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19378                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19379                                  (tp->snd_max - tp->snd_una)));
19380                         if (rack->r_ctl.fsb.left_to_send < segsiz) {
19381                                 rack->r_fast_output = 0;
19382                         }
19383                         if (rack->r_fast_output) {
19384                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19385                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19386                                 else
19387                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19388                                 rack_log_fsb(rack, tp, so, flags,
19389                                              ipoptlen, orig_len, len, error,
19390                                              (rsm == NULL), optlen, __LINE__, 3);
19391                                 error = 0;
19392                                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
19393                                 if (ret >= 0)
19394                                         return (ret);
19395                                 else if (error)
19396                                         goto nomore;
19397
19398                         }
19399                 }
19400                 goto again;
19401         } else if (len) {
19402                 counter_u64_add(rack_unpaced_segments, 1);
19403         }
19404         /* Assure when we leave that snd_nxt will point to top */
19405         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
19406                 tp->snd_nxt = tp->snd_max;
19407         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
19408 #ifdef TCP_ACCOUNTING
19409         crtsc = get_cyclecount() - ts_val;
19410         if (tot_len_this_send) {
19411                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19412                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
19413                 }
19414                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
19415                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19416                         tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
19417                 }
19418                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc);
19419                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19420                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
19421                 }
19422                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz));
19423         } else {
19424                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19425                         tp->tcp_cnt_counters[SND_OUT_ACK]++;
19426                 }
19427                 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1);
19428                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19429                         tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
19430                 }
19431                 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc);
19432         }
19433         sched_unpin();
19434 #endif
19435         if (error == ENOBUFS)
19436                 error = 0;
19437         return (error);
19438 }
19439
19440 static void
19441 rack_update_seg(struct tcp_rack *rack)
19442 {
19443         uint32_t orig_val;
19444
19445         orig_val = rack->r_ctl.rc_pace_max_segs;
19446         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
19447         if (orig_val != rack->r_ctl.rc_pace_max_segs)
19448                 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
19449 }
19450
19451 static void
19452 rack_mtu_change(struct tcpcb *tp)
19453 {
19454         /*
19455          * The MSS may have changed
19456          */
19457         struct tcp_rack *rack;
19458         struct rack_sendmap *rsm;
19459
19460         rack = (struct tcp_rack *)tp->t_fb_ptr;
19461         if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
19462                 /*
19463                  * The MTU has changed; we need to resend everything
19464                  * since all we have sent is lost. We first fix
19465                  * up the MTU though.
19466                  */
19467                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
19468                 /* We treat this like a full retransmit timeout without the cwnd adjustment */
19469                 rack_remxt_tmr(tp);
19470                 rack->r_fast_output = 0;
19471                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
19472                                                 rack->r_ctl.rc_sacked);
19473                 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
19474                 rack->r_must_retran = 1;
19475                 /* Mark all inflight as needing to be rxt'd */
19476                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
19477                         rsm->r_flags |= RACK_MUST_RXT;
19478                 }
19479         }
19480         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
19481         /* We don't use snd_nxt to retransmit */
19482         tp->snd_nxt = tp->snd_max;
19483 }
19484
19485 static int
19486 rack_set_profile(struct tcp_rack *rack, int prof)
19487 {
19488         int err = EINVAL;
19489         if (prof == 1) {
19490                 /* pace_always=1 */
19491                 if (rack->rc_always_pace == 0) {
19492                         if (tcp_can_enable_pacing() == 0)
19493                                 return (EBUSY);
19494                 }
19495                 rack->rc_always_pace = 1;
19496                 if (rack->use_fixed_rate || rack->gp_ready)
19497                         rack_set_cc_pacing(rack);
19498                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19499                 rack->rack_attempt_hdwr_pace = 0;
19500                 /* cmpack=1 */
19501                 if (rack_use_cmp_acks)
19502                         rack->r_use_cmp_ack = 1;
19503                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19504                     rack->r_use_cmp_ack)
19505                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19506                 /* scwnd=1 */
19507                 rack->rack_enable_scwnd = 1;
19508                 /* dynamic=100 */
19509                 rack->rc_gp_dyn_mul = 1;
19510                 /* gp_inc_ca */
19511                 rack->r_ctl.rack_per_of_gp_ca = 100;
19512                 /* rrr_conf=3 */
19513                 rack->r_rr_config = 3;
19514                 /* npush=2 */
19515                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19516                 /* fillcw=1 */
19517                 rack->rc_pace_to_cwnd = 1;
19518                 rack->rc_pace_fill_if_rttin_range = 0;
19519                 rack->rtt_limit_mul = 0;
19520                 /* noprr=1 */
19521                 rack->rack_no_prr = 1;
19522                 /* lscwnd=1 */
19523                 rack->r_limit_scw = 1;
19524                 /* gp_inc_rec */
19525                 rack->r_ctl.rack_per_of_gp_rec = 90;
19526                 err = 0;
19527
19528         } else if (prof == 3) {
19529                 /* Same as profile one except fill_cw becomes 2 (less aggressive set) */
19530                 /* pace_always=1 */
19531                 if (rack->rc_always_pace == 0) {
19532                         if (tcp_can_enable_pacing() == 0)
19533                                 return (EBUSY);
19534                 }
19535                 rack->rc_always_pace = 1;
19536                 if (rack->use_fixed_rate || rack->gp_ready)
19537                         rack_set_cc_pacing(rack);
19538                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19539                 rack->rack_attempt_hdwr_pace = 0;
19540                 /* cmpack=1 */
19541                 if (rack_use_cmp_acks)
19542                         rack->r_use_cmp_ack = 1;
19543                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19544                     rack->r_use_cmp_ack)
19545                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19546                 /* scwnd=1 */
19547                 rack->rack_enable_scwnd = 1;
19548                 /* dynamic=100 */
19549                 rack->rc_gp_dyn_mul = 1;
19550                 /* gp_inc_ca */
19551                 rack->r_ctl.rack_per_of_gp_ca = 100;
19552                 /* rrr_conf=3 */
19553                 rack->r_rr_config = 3;
19554                 /* npush=2 */
19555                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19556                 /* fillcw=2 */
19557                 rack->rc_pace_to_cwnd = 1;
19558                 rack->r_fill_less_agg = 1;
19559                 rack->rc_pace_fill_if_rttin_range = 0;
19560                 rack->rtt_limit_mul = 0;
19561                 /* noprr=1 */
19562                 rack->rack_no_prr = 1;
19563                 /* lscwnd=1 */
19564                 rack->r_limit_scw = 1;
19565                 /* gp_inc_rec */
19566                 rack->r_ctl.rack_per_of_gp_rec = 90;
19567                 err = 0;
19568
19569
19570         } else if (prof == 2) {
19571                 /* cmpack=1 */
19572                 if (rack->rc_always_pace == 0) {
19573                         if (tcp_can_enable_pacing() == 0)
19574                                 return (EBUSY);
19575                 }
19576                 rack->rc_always_pace = 1;
19577                 if (rack->use_fixed_rate || rack->gp_ready)
19578                         rack_set_cc_pacing(rack);
19579                 rack->r_use_cmp_ack = 1;
19580                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19581                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19582                 /* pace_always=1 */
19583                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19584                 /* scwnd=1 */
19585                 rack->rack_enable_scwnd = 1;
19586                 /* dynamic=100 */
19587                 rack->rc_gp_dyn_mul = 1;
19588                 rack->r_ctl.rack_per_of_gp_ca = 100;
19589                 /* rrr_conf=3 */
19590                 rack->r_rr_config = 3;
19591                 /* npush=2 */
19592                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19593                 /* fillcw=1 */
19594                 rack->rc_pace_to_cwnd = 1;
19595                 rack->rc_pace_fill_if_rttin_range = 0;
19596                 rack->rtt_limit_mul = 0;
19597                 /* noprr=1 */
19598                 rack->rack_no_prr = 1;
19599                 /* lscwnd=0 */
19600                 rack->r_limit_scw = 0;
19601                 err = 0;
19602         } else if (prof == 0) {
19603                 /* This changes things back to the default settings */
19604                 err = 0;
19605                 if (rack->rc_always_pace) {
19606                         tcp_decrement_paced_conn();
19607                         rack_undo_cc_pacing(rack);
19608                         rack->rc_always_pace = 0;
19609                 }
19610                 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
19611                         rack->rc_always_pace = 1;
19612                         if (rack->use_fixed_rate || rack->gp_ready)
19613                                 rack_set_cc_pacing(rack);
19614                 } else
19615                         rack->rc_always_pace = 0;
19616                 if (rack_dsack_std_based & 0x1) {
19617                         /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
19618                         rack->rc_rack_tmr_std_based = 1;
19619                 }
19620                 if (rack_dsack_std_based & 0x2) {
19621                         /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
19622                         rack->rc_rack_use_dsack = 1;
19623                 }
19624                 if (rack_use_cmp_acks)
19625                         rack->r_use_cmp_ack = 1;
19626                 else
19627                         rack->r_use_cmp_ack = 0;
19628                 if (rack_disable_prr)
19629                         rack->rack_no_prr = 1;
19630                 else
19631                         rack->rack_no_prr = 0;
19632                 if (rack_gp_no_rec_chg)
19633                         rack->rc_gp_no_rec_chg = 1;
19634                 else
19635                         rack->rc_gp_no_rec_chg = 0;
19636                 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
19637                         rack->r_mbuf_queue = 1;
19638                         if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19639                                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19640                         rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19641                 } else {
19642                         rack->r_mbuf_queue = 0;
19643                         rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19644                 }
19645                 if (rack_enable_shared_cwnd)
19646                         rack->rack_enable_scwnd = 1;
19647                 else
19648                         rack->rack_enable_scwnd = 0;
19649                 if (rack_do_dyn_mul) {
19650                         /* When dynamic adjustment is on CA needs to start at 100% */
19651                         rack->rc_gp_dyn_mul = 1;
19652                         if (rack_do_dyn_mul >= 100)
19653                                 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
19654                 } else {
19655                         rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
19656                         rack->rc_gp_dyn_mul = 0;
19657                 }
19658                 rack->r_rr_config = 0;
19659                 rack->r_ctl.rc_no_push_at_mrtt = 0;
19660                 rack->rc_pace_to_cwnd = 0;
19661                 rack->rc_pace_fill_if_rttin_range = 0;
19662                 rack->rtt_limit_mul = 0;
19663
19664                 if (rack_enable_hw_pacing)
19665                         rack->rack_hdw_pace_ena = 1;
19666                 else
19667                         rack->rack_hdw_pace_ena = 0;
19668                 if (rack_disable_prr)
19669                         rack->rack_no_prr = 1;
19670                 else
19671                         rack->rack_no_prr = 0;
19672                 if (rack_limits_scwnd)
19673                         rack->r_limit_scw  = 1;
19674                 else
19675                         rack->r_limit_scw  = 0;
19676                 err = 0;
19677         }
19678         return (err);
19679 }
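/*
 * Usage sketch (userland, illustrative only): a profile is selected per
 * connection with the TCP_RACK_PROFILE socket option, for example:
 *
 *      int prof = 1;
 *
 *      if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
 *          &prof, sizeof(prof)) == -1)
 *              err(1, "TCP_RACK_PROFILE");
 *
 * This assumes the rack stack is already attached to the socket; profiles
 * other than 0-3 fail with EINVAL, and EBUSY means pacing could not be
 * enabled.
 */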
19680
19681 static int
19682 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
19683 {
19684         struct deferred_opt_list *dol;
19685
19686         dol = malloc(sizeof(struct deferred_opt_list),
19687                      M_TCPFSB, M_NOWAIT|M_ZERO);
19688         if (dol == NULL) {
19689                 /*
19690                  * No space, yikes -- fail out.
19691                  */
19692                 return (0);
19693         }
19694         dol->optname = sopt_name;
19695         dol->optval = loptval;
19696         TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
19697         return (1);
19698 }
19699
19700 static int
19701 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
19702                     uint32_t optval, uint64_t loptval)
19703 {
19704         struct epoch_tracker et;
19705         struct sockopt sopt;
19706         struct cc_newreno_opts opt;
19707         uint64_t val;
19708         int error = 0;
19709         uint16_t ca, ss;
19710
19711         switch (sopt_name) {
19712
19713         case TCP_RACK_DSACK_OPT:
19714                 RACK_OPTS_INC(tcp_rack_dsack_opt);
19715                 if (optval & 0x1) {
19716                         rack->rc_rack_tmr_std_based = 1;
19717                 } else {
19718                         rack->rc_rack_tmr_std_based = 0;
19719                 }
19720                 if (optval & 0x2) {
19721                         rack->rc_rack_use_dsack = 1;
19722                 } else {
19723                         rack->rc_rack_use_dsack = 0;
19724                 }
19725                 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
19726                 break;
19727         case TCP_RACK_PACING_BETA:
19728                 RACK_OPTS_INC(tcp_rack_beta);
19729                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19730                         /* This only works for newreno. */
19731                         error = EINVAL;
19732                         break;
19733                 }
19734                 if (rack->rc_pacing_cc_set) {
19735                         /*
19736                          * Set them into the real CC module;
19737                          * what's in the rack pcb are the old values
19738                          * to be used on restoral.
19739                          */
19740                         sopt.sopt_dir = SOPT_SET;
19741                         opt.name = CC_NEWRENO_BETA;
19742                         opt.val = optval;
19743                         if (CC_ALGO(tp)->ctl_output != NULL)
19744                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19745                         else {
19746                                 error = ENOENT;
19747                                 break;
19748                         }
19749                 } else {
19750                         /*
19751                          * Not pacing yet so set it into our local
19752                          * rack pcb storage.
19753                          */
19754                         rack->r_ctl.rc_saved_beta.beta = optval;
19755                 }
19756                 break;
19757         case TCP_RACK_TIMER_SLOP:
19758                 RACK_OPTS_INC(tcp_rack_timer_slop);
19759                 rack->r_ctl.timer_slop = optval;
19760                 if (rack->rc_tp->t_srtt) {
19761                         /*
19762                          * If we have an SRTT lets update t_rxtcur
19763                          * to have the new slop.
19764                          */
19765                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
19766                                            rack_rto_min, rack_rto_max,
19767                                            rack->r_ctl.timer_slop);
19768                 }
19769                 break;
19770         case TCP_RACK_PACING_BETA_ECN:
19771                 RACK_OPTS_INC(tcp_rack_beta_ecn);
19772                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19773                         /* This only works for newreno. */
19774                         error = EINVAL;
19775                         break;
19776                 }
19777                 if (rack->rc_pacing_cc_set) {
19778                         /*
19779                          * Set them into the real CC module;
19780                          * what's in the rack pcb are the old values
19781                          * to be used on restoral.
19782                          */
19783                         sopt.sopt_dir = SOPT_SET;
19784                         opt.name = CC_NEWRENO_BETA_ECN;
19785                         opt.val = optval;
19786                         if (CC_ALGO(tp)->ctl_output != NULL)
19787                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19788                         else
19789                                 error = ENOENT;
19790                 } else {
19791                         /*
19792                          * Not pacing yet so set it into our local
19793                          * rack pcb storage.
19794                          */
19795                         rack->r_ctl.rc_saved_beta.beta_ecn = optval;
19796                         rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED;
19797                 }
19798                 break;
19799         case TCP_DEFER_OPTIONS:
19800                 RACK_OPTS_INC(tcp_defer_opt);
19801                 if (optval) {
19802                         if (rack->gp_ready) {
19803                                 /* Too late */
19804                                 error = EINVAL;
19805                                 break;
19806                         }
19807                         rack->defer_options = 1;
19808                 } else
19809                         rack->defer_options = 0;
19810                 break;
19811         case TCP_RACK_MEASURE_CNT:
19812                 RACK_OPTS_INC(tcp_rack_measure_cnt);
19813                 if (optval && (optval <= 0xff)) {
19814                         rack->r_ctl.req_measurements = optval;
19815                 } else
19816                         error = EINVAL;
19817                 break;
19818         case TCP_REC_ABC_VAL:
19819                 RACK_OPTS_INC(tcp_rec_abc_val);
19820                 if (optval > 0)
19821                         rack->r_use_labc_for_rec = 1;
19822                 else
19823                         rack->r_use_labc_for_rec = 0;
19824                 break;
19825         case TCP_RACK_ABC_VAL:
19826                 RACK_OPTS_INC(tcp_rack_abc_val);
19827                 if ((optval > 0) && (optval < 255))
19828                         rack->rc_labc = optval;
19829                 else
19830                         error = EINVAL;
19831                 break;
19832         case TCP_HDWR_UP_ONLY:
19833                 RACK_OPTS_INC(tcp_pacing_up_only);
19834                 if (optval)
19835                         rack->r_up_only = 1;
19836                 else
19837                         rack->r_up_only = 0;
19838                 break;
19839         case TCP_PACING_RATE_CAP:
19840                 RACK_OPTS_INC(tcp_pacing_rate_cap);
19841                 rack->r_ctl.bw_rate_cap = loptval;
19842                 break;
19843         case TCP_RACK_PROFILE:
19844                 RACK_OPTS_INC(tcp_profile);
19845                 error = rack_set_profile(rack, optval);
19846                 break;
19847         case TCP_USE_CMP_ACKS:
19848                 RACK_OPTS_INC(tcp_use_cmp_acks);
19849                 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
19850                         /* You can't turn it off once it's on! */
19851                         error = EINVAL;
19852                 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
19853                         rack->r_use_cmp_ack = 1;
19854                         rack->r_mbuf_queue = 1;
19855                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19856                 }
19857                 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
19858                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19859                 break;
19860         case TCP_SHARED_CWND_TIME_LIMIT:
19861                 RACK_OPTS_INC(tcp_lscwnd);
19862                 if (optval)
19863                         rack->r_limit_scw = 1;
19864                 else
19865                         rack->r_limit_scw = 0;
19866                 break;
19867         case TCP_RACK_PACE_TO_FILL:
19868                 RACK_OPTS_INC(tcp_fillcw);
19869                 if (optval == 0)
19870                         rack->rc_pace_to_cwnd = 0;
19871                 else {
19872                         rack->rc_pace_to_cwnd = 1;
19873                         if (optval > 1)
19874                                 rack->r_fill_less_agg = 1;
19875                 }
19876                 if ((optval >= rack_gp_rtt_maxmul) &&
19877                     rack_gp_rtt_maxmul &&
19878                     (optval < 0xf)) {
19879                         rack->rc_pace_fill_if_rttin_range = 1;
19880                         rack->rtt_limit_mul = optval;
19881                 } else {
19882                         rack->rc_pace_fill_if_rttin_range = 0;
19883                         rack->rtt_limit_mul = 0;
19884                 }
19885                 break;
19886         case TCP_RACK_NO_PUSH_AT_MAX:
19887                 RACK_OPTS_INC(tcp_npush);
19888                 if (optval == 0)
19889                         rack->r_ctl.rc_no_push_at_mrtt = 0;
19890                 else if (optval < 0xff)
19891                         rack->r_ctl.rc_no_push_at_mrtt = optval;
19892                 else
19893                         error = EINVAL;
19894                 break;
19895         case TCP_SHARED_CWND_ENABLE:
19896                 RACK_OPTS_INC(tcp_rack_scwnd);
19897                 if (optval == 0)
19898                         rack->rack_enable_scwnd = 0;
19899                 else
19900                         rack->rack_enable_scwnd = 1;
19901                 break;
19902         case TCP_RACK_MBUF_QUEUE:
19903                 /* Now do we use the LRO mbuf-queue feature */
19904                 RACK_OPTS_INC(tcp_rack_mbufq);
19905                 if (optval || rack->r_use_cmp_ack)
19906                         rack->r_mbuf_queue = 1;
19907                 else
19908                         rack->r_mbuf_queue = 0;
19909                 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19910                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19911                 else
19912                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19913                 break;
19914         case TCP_RACK_NONRXT_CFG_RATE:
19915                 RACK_OPTS_INC(tcp_rack_cfg_rate);
19916                 if (optval == 0)
19917                         rack->rack_rec_nonrxt_use_cr = 0;
19918                 else
19919                         rack->rack_rec_nonrxt_use_cr = 1;
19920                 break;
19921         case TCP_NO_PRR:
19922                 RACK_OPTS_INC(tcp_rack_noprr);
19923                 if (optval == 0)
19924                         rack->rack_no_prr = 0;
19925                 else if (optval == 1)
19926                         rack->rack_no_prr = 1;
19927                 else if (optval == 2)
19928                         rack->no_prr_addback = 1;
19929                 else
19930                         error = EINVAL;
19931                 break;
19932         case TCP_TIMELY_DYN_ADJ:
19933                 RACK_OPTS_INC(tcp_timely_dyn);
19934                 if (optval == 0)
19935                         rack->rc_gp_dyn_mul = 0;
19936                 else {
19937                         rack->rc_gp_dyn_mul = 1;
19938                         if (optval >= 100) {
19939                                 /*
19940                                  * If the user sets something 100 or more
19941                                  * it's the gp_ca value.
19942                                  */
19943                                 rack->r_ctl.rack_per_of_gp_ca  = optval;
19944                         }
19945                 }
19946                 break;
19947         case TCP_RACK_DO_DETECTION:
19948                 RACK_OPTS_INC(tcp_rack_do_detection);
19949                 if (optval == 0)
19950                         rack->do_detection = 0;
19951                 else
19952                         rack->do_detection = 1;
19953                 break;
19954         case TCP_RACK_TLP_USE:
19955                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
19956                         error = EINVAL;
19957                         break;
19958                 }
19959                 RACK_OPTS_INC(tcp_tlp_use);
19960                 rack->rack_tlp_threshold_use = optval;
19961                 break;
19962         case TCP_RACK_TLP_REDUCE:
19963                 /* RACK TLP cwnd reduction (bool) */
19964                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
19965                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
19966                 break;
19967         /*  Pacing related ones */
19968         case TCP_RACK_PACE_ALWAYS:
19969                 /*
19970                  * zero is old rack method, 1 is new
19971                  * method using a pacing rate.
19972                  */
19973                 RACK_OPTS_INC(tcp_rack_pace_always);
19974                 if (optval > 0) {
19975                         if (rack->rc_always_pace) {
19976                                 error = EALREADY;
19977                                 break;
19978                         } else if (tcp_can_enable_pacing()) {
19979                                 rack->rc_always_pace = 1;
19980                                 if (rack->use_fixed_rate || rack->gp_ready)
19981                                         rack_set_cc_pacing(rack);
19982                         }
19983                         else {
19984                                 error = ENOSPC;
19985                                 break;
19986                         }
19987                 } else {
19988                         if (rack->rc_always_pace) {
19989                                 tcp_decrement_paced_conn();
19990                                 rack->rc_always_pace = 0;
19991                                 rack_undo_cc_pacing(rack);
19992                         }
19993                 }
19994                 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19995                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19996                 else
19997                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19998                 /* A rate may be set (irate or other); if so, set the seg size */
19999                 rack_update_seg(rack);
20000                 break;
20001         case TCP_BBR_RACK_INIT_RATE:
20002                 RACK_OPTS_INC(tcp_initial_rate);
20003                 val = optval;
20004                 /* Change from kbits per second to bytes per second */
20005                 val *= 1000;
20006                 val /= 8;
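                /*
                 * e.g. optval = 5000 (kbits/sec) -> 5,000,000 bits/sec
                 * -> init_rate = 625,000 bytes/sec.
                 */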
20007                 rack->r_ctl.init_rate = val;
20008                 if (rack->rc_init_win != rack_default_init_window) {
20009                         uint32_t win, snt;
20010
20011                         /*
20012                          * Options don't always get applied
20013                          * in the order you think, so to ensure
20014                          * we update the cwnd we need to check
20015                          * whether we are still
20016                          * where we should raise the cwnd.
20017                          */
20018                         win = rc_init_window(rack);
20019                         if (SEQ_GT(tp->snd_max, tp->iss))
20020                                 snt = tp->snd_max - tp->iss;
20021                         else
20022                                 snt = 0;
20023                         if ((snt < win) &&
20024                             (tp->snd_cwnd < win))
20025                                 tp->snd_cwnd = win;
20026                 }
20027                 if (rack->rc_always_pace)
20028                         rack_update_seg(rack);
20029                 break;
20030         case TCP_BBR_IWINTSO:
20031                 RACK_OPTS_INC(tcp_initial_win);
20032                 if (optval && (optval <= 0xff)) {
20033                         uint32_t win, snt;
20034
20035                         rack->rc_init_win = optval;
20036                         win = rc_init_window(rack);
20037                         if (SEQ_GT(tp->snd_max, tp->iss))
20038                                 snt = tp->snd_max - tp->iss;
20039                         else
20040                                 snt = 0;
20041                         if ((snt < win) &&
20042                             (tp->t_srtt |
20043 #ifdef NETFLIX_PEAKRATE
20044                              tp->t_maxpeakrate |
20045 #endif
20046                              rack->r_ctl.init_rate)) {
20047                                 /*
20048                                  * We are not past the initial window
20049                                  * and we have some basis for pacing,
20050                                  * so we need to possibly adjust up
20051                                  * the cwnd. Note even if we don't set
20052                                  * the cwnd, it's still OK to raise rc_init_win
20053                                  * which can be used coming out of idle when we
20054                                  * would have a rate.
20055                                  */
20056                                 if (tp->snd_cwnd < win)
20057                                         tp->snd_cwnd = win;
20058                         }
20059                         if (rack->rc_always_pace)
20060                                 rack_update_seg(rack);
20061                 } else
20062                         error = EINVAL;
20063                 break;
20064         case TCP_RACK_FORCE_MSEG:
20065                 RACK_OPTS_INC(tcp_rack_force_max_seg);
20066                 if (optval)
20067                         rack->rc_force_max_seg = 1;
20068                 else
20069                         rack->rc_force_max_seg = 0;
20070                 break;
20071         case TCP_RACK_PACE_MAX_SEG:
20072                 /* Max segments size in a pace in bytes */
20073                 RACK_OPTS_INC(tcp_rack_max_seg);
20074                 rack->rc_user_set_max_segs = optval;
20075                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
20076                 break;
20077         case TCP_RACK_PACE_RATE_REC:
20078                 /* Set the fixed pacing rate in bytes per second for recovery */
20079                 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
20080                 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
20081                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
20082                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
20083                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
20084                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
20085                 rack->use_fixed_rate = 1;
20086                 if (rack->rc_always_pace)
20087                         rack_set_cc_pacing(rack);
20088                 rack_log_pacing_delay_calc(rack,
20089                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
20090                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
20091                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
20092                                            __LINE__, NULL,0);
20093                 break;
20094
20095         case TCP_RACK_PACE_RATE_SS:
20096                 /* Set the fixed pacing rate in bytes per second for slow start */
20097                 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
20098                 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
20099                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
20100                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
20101                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
20102                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
20103                 rack->use_fixed_rate = 1;
20104                 if (rack->rc_always_pace)
20105                         rack_set_cc_pacing(rack);
20106                 rack_log_pacing_delay_calc(rack,
20107                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
20108                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
20109                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
20110                                            __LINE__, NULL, 0);
20111                 break;
20112
20113         case TCP_RACK_PACE_RATE_CA:
20114                 /* Set the fixed pacing rate in bytes per second for congestion avoidance */
20115                 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
20116                 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
20117                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
20118                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
20119                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
20120                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
20121                 rack->use_fixed_rate = 1;
20122                 if (rack->rc_always_pace)
20123                         rack_set_cc_pacing(rack);
20124                 rack_log_pacing_delay_calc(rack,
20125                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
20126                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
20127                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
20128                                            __LINE__, NULL, 0);
20129                 break;
20130         case TCP_RACK_GP_INCREASE_REC:
20131                 RACK_OPTS_INC(tcp_gp_inc_rec);
20132                 rack->r_ctl.rack_per_of_gp_rec = optval;
20133                 rack_log_pacing_delay_calc(rack,
20134                                            rack->r_ctl.rack_per_of_gp_ss,
20135                                            rack->r_ctl.rack_per_of_gp_ca,
20136                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20137                                            __LINE__, NULL, 0);
20138                 break;
20139         case TCP_RACK_GP_INCREASE_CA:
20140                 RACK_OPTS_INC(tcp_gp_inc_ca);
20141                 ca = optval;
20142                 if (ca < 100) {
20143                         /*
20144                          * We don't allow any reduction
20145                          * over the GP b/w.
20146                          */
20147                         error = EINVAL;
20148                         break;
20149                 }
20150                 rack->r_ctl.rack_per_of_gp_ca = ca;
20151                 rack_log_pacing_delay_calc(rack,
20152                                            rack->r_ctl.rack_per_of_gp_ss,
20153                                            rack->r_ctl.rack_per_of_gp_ca,
20154                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20155                                            __LINE__, NULL, 0);
20156                 break;
20157         case TCP_RACK_GP_INCREASE_SS:
20158                 RACK_OPTS_INC(tcp_gp_inc_ss);
20159                 ss = optval;
20160                 if (ss < 100) {
20161                         /*
20162                          * We don't allow any reduction
20163                          * over the GP b/w.
20164                          */
20165                         error = EINVAL;
20166                         break;
20167                 }
20168                 rack->r_ctl.rack_per_of_gp_ss = ss;
20169                 rack_log_pacing_delay_calc(rack,
20170                                            rack->r_ctl.rack_per_of_gp_ss,
20171                                            rack->r_ctl.rack_per_of_gp_ca,
20172                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20173                                            __LINE__, NULL, 0);
20174                 break;
20175         case TCP_RACK_RR_CONF:
20176                 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
20177                 if (optval && optval <= 3)
20178                         rack->r_rr_config = optval;
20179                 else
20180                         rack->r_rr_config = 0;
20181                 break;
20182         case TCP_HDWR_RATE_CAP:
20183                 RACK_OPTS_INC(tcp_hdwr_rate_cap);
20184                 if (optval) {
20185                         if (rack->r_rack_hw_rate_caps == 0)
20186                                 rack->r_rack_hw_rate_caps = 1;
20187                         else
20188                                 error = EALREADY;
20189                 } else {
20190                         rack->r_rack_hw_rate_caps = 0;
20191                 }
20192                 break;
20193         case TCP_BBR_HDWR_PACE:
20194                 RACK_OPTS_INC(tcp_hdwr_pacing);
20195                 if (optval) {
20196                         if (rack->rack_hdrw_pacing == 0) {
20197                                 rack->rack_hdw_pace_ena = 1;
20198                                 rack->rack_attempt_hdwr_pace = 0;
20199                         } else
20200                                 error = EALREADY;
20201                 } else {
20202                         rack->rack_hdw_pace_ena = 0;
20203 #ifdef RATELIMIT
20204                         if (rack->r_ctl.crte != NULL) {
20205                                 rack->rack_hdrw_pacing = 0;
20206                                 rack->rack_attempt_hdwr_pace = 0;
20207                                 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
20208                                 rack->r_ctl.crte = NULL;
20209                         }
20210 #endif
20211                 }
20212                 break;
20213         /*  End Pacing related ones */
20214         case TCP_RACK_PRR_SENDALOT:
20215                 /* Allow PRR to send more than one seg */
20216                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
20217                 rack->r_ctl.rc_prr_sendalot = optval;
20218                 break;
20219         case TCP_RACK_MIN_TO:
20220                 /* Minimum time between rack t-o's in ms */
20221                 RACK_OPTS_INC(tcp_rack_min_to);
20222                 rack->r_ctl.rc_min_to = optval;
20223                 break;
20224         case TCP_RACK_EARLY_SEG:
20225                 /* If early recovery max segments */
20226                 RACK_OPTS_INC(tcp_rack_early_seg);
20227                 rack->r_ctl.rc_early_recovery_segs = optval;
20228                 break;
20229         case TCP_RACK_ENABLE_HYSTART:
20230         {
20231                 struct sockopt sopt;
20232                 struct cc_newreno_opts opt;
20233
20234                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
20235                 sopt.sopt_dir = SOPT_SET;
20236                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
20237                 opt.val = optval;
20238                 if (CC_ALGO(tp)->ctl_output != NULL)
20239                         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
20240                 else
20241                         error = EINVAL;
20242         }
20243         break;
20244         case TCP_RACK_REORD_THRESH:
20245                 /* RACK reorder threshold (shift amount) */
20246                 RACK_OPTS_INC(tcp_rack_reord_thresh);
20247                 if ((optval > 0) && (optval < 31))
20248                         rack->r_ctl.rc_reorder_shift = optval;
20249                 else
20250                         error = EINVAL;
20251                 break;
20252         case TCP_RACK_REORD_FADE:
20253                 /* Does reordering fade after ms time */
20254                 RACK_OPTS_INC(tcp_rack_reord_fade);
20255                 rack->r_ctl.rc_reorder_fade = optval;
20256                 break;
20257         case TCP_RACK_TLP_THRESH:
20258                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
20259                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
20260                 if (optval)
20261                         rack->r_ctl.rc_tlp_threshold = optval;
20262                 else
20263                         error = EINVAL;
20264                 break;
20265         case TCP_BBR_USE_RACK_RR:
20266                 RACK_OPTS_INC(tcp_rack_rr);
20267                 if (optval)
20268                         rack->use_rack_rr = 1;
20269                 else
20270                         rack->use_rack_rr = 0;
20271                 break;
20272         case TCP_FAST_RSM_HACK:
20273                 RACK_OPTS_INC(tcp_rack_fastrsm_hack);
20274                 if (optval)
20275                         rack->fast_rsm_hack = 1;
20276                 else
20277                         rack->fast_rsm_hack = 0;
20278                 break;
20279         case TCP_RACK_PKT_DELAY:
20280                 /* RACK added ms i.e. rack-rtt + reord + N */
20281                 RACK_OPTS_INC(tcp_rack_pkt_delay);
20282                 rack->r_ctl.rc_pkt_delay = optval;
20283                 break;
20284         case TCP_DELACK:
20285                 RACK_OPTS_INC(tcp_rack_delayed_ack);
20286                 if (optval == 0)
20287                         tp->t_delayed_ack = 0;
20288                 else
20289                         tp->t_delayed_ack = 1;
20290                 if (tp->t_flags & TF_DELACK) {
20291                         tp->t_flags &= ~TF_DELACK;
20292                         tp->t_flags |= TF_ACKNOW;
20293                         NET_EPOCH_ENTER(et);
20294                         rack_output(tp);
20295                         NET_EPOCH_EXIT(et);
20296                 }
20297                 break;
20298
20299         case TCP_BBR_RACK_RTT_USE:
20300                 RACK_OPTS_INC(tcp_rack_rtt_use);
20301                 if ((optval != USE_RTT_HIGH) &&
20302                     (optval != USE_RTT_LOW) &&
20303                     (optval != USE_RTT_AVG))
20304                         error = EINVAL;
20305                 else
20306                         rack->r_ctl.rc_rate_sample_method = optval;
20307                 break;
20308         case TCP_DATA_AFTER_CLOSE:
20309                 RACK_OPTS_INC(tcp_data_after_close);
20310                 if (optval)
20311                         rack->rc_allow_data_af_clo = 1;
20312                 else
20313                         rack->rc_allow_data_af_clo = 0;
20314                 break;
20315         default:
20316                 break;
20317         }
20318 #ifdef NETFLIX_STATS
20319         tcp_log_socket_option(tp, sopt_name, optval, error);
20320 #endif
20321         return (error);
20322 }
20323
20324
20325 static void
20326 rack_apply_deferred_options(struct tcp_rack *rack)
20327 {
20328         struct deferred_opt_list *dol, *sdol;
20329         uint32_t s_optval;
20330
20331         TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
20332                 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
20333                 /* The disadvantage of deferral is you lose the error return */
20334                 s_optval = (uint32_t)dol->optval;
20335                 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
20336                 free(dol, M_TCPDO);
20337         }
20338 }
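
/*
 * Editorial sketch (hypothetical application code, not part of this
 * file): how deferral is driven from userland.  With TCP_DEFER_OPTIONS
 * enabled, most RACK options set before the stack is goodput-ready are
 * queued and only applied later by rack_apply_deferred_options(), so
 * any per-option error is lost, as the comment above notes.
 *
 *	int one = 1;
 *
 *	(void)setsockopt(s, IPPROTO_TCP, TCP_DEFER_OPTIONS, &one, sizeof(one));
 *	(void)setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &one, sizeof(one));
 */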
20339
20340 static void
20341 rack_hw_tls_change(struct tcpcb *tp, int chg)
20342 {
20343         /*
20344          * The HW TLS state has changed; fix all
20345          * rsm's in flight.
20346          */
20347         struct tcp_rack *rack;
20348         struct rack_sendmap *rsm;
20349
20350         rack = (struct tcp_rack *)tp->t_fb_ptr;
20351         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
20352                 if (chg)
20353                         rsm->r_hw_tls = 1;
20354                 else
20355                         rsm->r_hw_tls = 0;
20356         }
20357         if (chg)
20358                 rack->r_ctl.fsb.hw_tls = 1;
20359         else
20360                 rack->r_ctl.fsb.hw_tls = 0;
20361 }
20362
20363 static int
20364 rack_pru_options(struct tcpcb *tp, int flags)
20365 {
20366         if (flags & PRUS_OOB)
20367                 return (EOPNOTSUPP);
20368         return (0);
20369 }
20370
20371 static struct tcp_function_block __tcp_rack = {
20372         .tfb_tcp_block_name = __XSTRING(STACKNAME),
20373         .tfb_tcp_output = rack_output,
20374         .tfb_do_queued_segments = ctf_do_queued_segments,
20375         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
20376         .tfb_tcp_do_segment = rack_do_segment,
20377         .tfb_tcp_ctloutput = rack_ctloutput,
20378         .tfb_tcp_fb_init = rack_init,
20379         .tfb_tcp_fb_fini = rack_fini,
20380         .tfb_tcp_timer_stop_all = rack_stopall,
20381         .tfb_tcp_timer_activate = rack_timer_activate,
20382         .tfb_tcp_timer_active = rack_timer_active,
20383         .tfb_tcp_timer_stop = rack_timer_stop,
20384         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
20385         .tfb_tcp_handoff_ok = rack_handoff_ok,
20386         .tfb_tcp_mtu_chg = rack_mtu_change,
20387         .tfb_pru_options = rack_pru_options,
20388         .tfb_hwtls_change = rack_hw_tls_change,
20389 };
20390
20391 /*
20392  * rack_ctloutput() must drop the inpcb lock before performing copyin on
20393  * socket option arguments.  When it re-acquires the lock after the copy, it
20394  * has to revalidate that the connection is still valid for the socket
20395  * option.
20396  */
20397 static int
20398 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
20399     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
20400 {
20401 #ifdef INET6
20402         struct ip6_hdr *ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
20403 #endif
20404 #ifdef INET
20405         struct ip *ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
20406 #endif
20407         uint64_t loptval;
20408         int32_t error = 0, optval;
20409
20410         switch (sopt->sopt_level) {
20411 #ifdef INET6
20412         case IPPROTO_IPV6:
20413                 MPASS(inp->inp_vflag & INP_IPV6PROTO);
20414                 switch (sopt->sopt_name) {
20415                 case IPV6_USE_MIN_MTU:
20416                         tcp6_use_min_mtu(tp);
20417                         break;
20418                 case IPV6_TCLASS:
20419                         /*
20420                          * The DSCP codepoint has changed, update the fsb.
20421                          */
20422                         ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
20423                             (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
20424                         break;
20425                 }
20426                 INP_WUNLOCK(inp);
20427                 return (0);
20428 #endif
20429 #ifdef INET
20430         case IPPROTO_IP:
20431                 switch (sopt->sopt_name) {
20432                 case IP_TOS:
20433                         /*
20434                          * The DSCP codepoint has changed, update the fsb.
20435                          */
20436                         ip->ip_tos = rack->rc_inp->inp_ip_tos;
20437                         break;
20438                 case IP_TTL:
20439                         /*
20440                          * The TTL has changed, update the fsb.
20441                          */
20442                         ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
20443                         break;
20444                 }
20445                 INP_WUNLOCK(inp);
20446                 return (0);
20447 #endif
20448         }
20449
20450         switch (sopt->sopt_name) {
20451         case TCP_RACK_TLP_REDUCE:               /*  URL:tlp_reduce */
20452         /*  Pacing related ones */
20453         case TCP_RACK_PACE_ALWAYS:              /*  URL:pace_always */
20454         case TCP_BBR_RACK_INIT_RATE:            /*  URL:irate */
20455         case TCP_BBR_IWINTSO:                   /*  URL:tso_iwin */
20456         case TCP_RACK_PACE_MAX_SEG:             /*  URL:pace_max_seg */
20457         case TCP_RACK_FORCE_MSEG:               /*  URL:force_max_seg */
20458         case TCP_RACK_PACE_RATE_CA:             /*  URL:pr_ca */
20459         case TCP_RACK_PACE_RATE_SS:             /*  URL:pr_ss */
20460         case TCP_RACK_PACE_RATE_REC:            /*  URL:pr_rec */
20461         case TCP_RACK_GP_INCREASE_CA:           /*  URL:gp_inc_ca */
20462         case TCP_RACK_GP_INCREASE_SS:           /*  URL:gp_inc_ss */
20463         case TCP_RACK_GP_INCREASE_REC:          /*  URL:gp_inc_rec */
20464         case TCP_RACK_RR_CONF:                  /*  URL:rrr_conf */
20465         case TCP_BBR_HDWR_PACE:                 /*  URL:hdwrpace */
20466         case TCP_HDWR_RATE_CAP:                 /*  URL:hdwrcap boolean */
20467         case TCP_PACING_RATE_CAP:               /*  URL:cap  -- used by side-channel */
20468         case TCP_HDWR_UP_ONLY:                  /*  URL:uponly -- hardware pacing  boolean */
20469        /* End pacing related */
20470         case TCP_FAST_RSM_HACK:                 /*  URL:frsm_hack */
20471         case TCP_DELACK:                        /*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
20472         case TCP_RACK_PRR_SENDALOT:             /*  URL:prr_sendalot */
20473         case TCP_RACK_MIN_TO:                   /*  URL:min_to */
20474         case TCP_RACK_EARLY_SEG:                /*  URL:early_seg */
20475         case TCP_RACK_REORD_THRESH:             /*  URL:reord_thresh */
20476         case TCP_RACK_REORD_FADE:               /*  URL:reord_fade */
20477         case TCP_RACK_TLP_THRESH:               /*  URL:tlp_thresh */
20478         case TCP_RACK_PKT_DELAY:                /*  URL:pkt_delay */
20479         case TCP_RACK_TLP_USE:                  /*  URL:tlp_use */
20480         case TCP_BBR_RACK_RTT_USE:              /*  URL:rttuse */
20481         case TCP_BBR_USE_RACK_RR:               /*  URL:rackrr */
20482         case TCP_RACK_DO_DETECTION:             /*  URL:detect */
20483         case TCP_NO_PRR:                        /*  URL:noprr */
20484         case TCP_TIMELY_DYN_ADJ:                /*  URL:dynamic */
20485         case TCP_DATA_AFTER_CLOSE:              /*  no URL */
20486         case TCP_RACK_NONRXT_CFG_RATE:          /*  URL:nonrxtcr */
20487         case TCP_SHARED_CWND_ENABLE:            /*  URL:scwnd */
20488         case TCP_RACK_MBUF_QUEUE:               /*  URL:mqueue */
20489         case TCP_RACK_NO_PUSH_AT_MAX:           /*  URL:npush */
20490         case TCP_RACK_PACE_TO_FILL:             /*  URL:fillcw */
20491         case TCP_SHARED_CWND_TIME_LIMIT:        /*  URL:lscwnd */
20492         case TCP_RACK_PROFILE:                  /*  URL:profile */
20493         case TCP_USE_CMP_ACKS:                  /*  URL:cmpack */
20494         case TCP_RACK_ABC_VAL:                  /*  URL:labc */
20495         case TCP_REC_ABC_VAL:                   /*  URL:reclabc */
20496         case TCP_RACK_MEASURE_CNT:              /*  URL:measurecnt */
20497         case TCP_DEFER_OPTIONS:                 /*  URL:defer */
20498         case TCP_RACK_DSACK_OPT:                /*  URL:dsack */
20499         case TCP_RACK_PACING_BETA:              /*  URL:pacing_beta */
20500         case TCP_RACK_PACING_BETA_ECN:          /*  URL:pacing_beta_ecn */
20501         case TCP_RACK_TIMER_SLOP:               /*  URL:timer_slop */
20502         case TCP_RACK_ENABLE_HYSTART:           /*  URL:hystart */
20503                 break;
20504         default:
20505                 /* Filter off all unknown options to the base stack */
20506                 return (tcp_default_ctloutput(so, sopt, inp, tp));
20507                 break;
20508         }
20509         INP_WUNLOCK(inp);
20510         if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
20511                 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
20512                 /*
20513                  * We truncate it down to 32 bits for the socket-option trace;
20514                  * this means rates > 34Gbps won't show right, but that's probably ok.
20515                  */
20516                 optval = (uint32_t)loptval;
20517         } else {
20518                 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
20519                 /* Save it in 64 bit form too */
20520                 loptval = optval;
20521         }
20522         if (error)
20523                 return (error);
20524         INP_WLOCK(inp);
20525         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
20526                 INP_WUNLOCK(inp);
20527                 return (ECONNRESET);
20528         }
20529         if (tp->t_fb != &__tcp_rack) {
20530                 INP_WUNLOCK(inp);
20531                 return (ENOPROTOOPT);
20532         }
20533         if (rack->defer_options && (rack->gp_ready == 0) &&
20534             (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
20535             (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
20536             (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
20537             (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
20538                 /* Options are being deferred */
20539                 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
20540                         INP_WUNLOCK(inp);
20541                         return (0);
20542                 } else {
20543                         /* No memory to defer, fail */
20544                         INP_WUNLOCK(inp);
20545                         return (ENOMEM);
20546                 }
20547         }
20548         error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
20549         INP_WUNLOCK(inp);
20550         return (error);
20551 }
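
/*
 * Editorial example (hypothetical application code): exercising one of
 * the options handled above.  The connection must first be moved onto
 * this stack, e.g. with the standard TCP_FUNCTION_BLK option and a
 * struct tcp_function_set naming "rack".
 *
 *	struct tcp_function_set tfs;
 *	int on = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs)) == -1)
 *		err(1, "TCP_FUNCTION_BLK");
 *	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &on, sizeof(on)) == -1)
 *		err(1, "TCP_RACK_PACE_ALWAYS");
 */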
20552
20553 static void
20554 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
20555 {
20556
20557         INP_WLOCK_ASSERT(tp->t_inpcb);
20558         bzero(ti, sizeof(*ti));
20559
20560         ti->tcpi_state = tp->t_state;
20561         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
20562                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
20563         if (tp->t_flags & TF_SACK_PERMIT)
20564                 ti->tcpi_options |= TCPI_OPT_SACK;
20565         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
20566                 ti->tcpi_options |= TCPI_OPT_WSCALE;
20567                 ti->tcpi_snd_wscale = tp->snd_scale;
20568                 ti->tcpi_rcv_wscale = tp->rcv_scale;
20569         }
20570         if (tp->t_flags2 & TF2_ECN_PERMIT)
20571                 ti->tcpi_options |= TCPI_OPT_ECN;
20572         if (tp->t_flags & TF_FASTOPEN)
20573                 ti->tcpi_options |= TCPI_OPT_TFO;
20574         /* t_rcvtime is still kept in ticks */
20575         ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
20576         /* Since we hold everything in precise useconds this is easy */
20577         ti->tcpi_rtt = tp->t_srtt;
20578         ti->tcpi_rttvar = tp->t_rttvar;
20579         ti->tcpi_rto = tp->t_rxtcur;
20580         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
20581         ti->tcpi_snd_cwnd = tp->snd_cwnd;
20582         /*
20583          * FreeBSD-specific extension fields for tcp_info.
20584          */
20585         ti->tcpi_rcv_space = tp->rcv_wnd;
20586         ti->tcpi_rcv_nxt = tp->rcv_nxt;
20587         ti->tcpi_snd_wnd = tp->snd_wnd;
20588         ti->tcpi_snd_bwnd = 0;          /* Unused, kept for compat. */
20589         ti->tcpi_snd_nxt = tp->snd_nxt;
20590         ti->tcpi_snd_mss = tp->t_maxseg;
20591         ti->tcpi_rcv_mss = tp->t_maxseg;
20592         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
20593         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
20594         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
20595 #ifdef NETFLIX_STATS
20596         ti->tcpi_total_tlp = tp->t_sndtlppack;
20597         ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
20598         memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
20599 #endif
20600 #ifdef TCP_OFFLOAD
20601         if (tp->t_flags & TF_TOE) {
20602                 ti->tcpi_options |= TCPI_OPT_TOE;
20603                 tcp_offload_tcp_info(tp, ti);
20604         }
20605 #endif
20606 }
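
/*
 * Editorial example (hypothetical application code): the structure
 * filled above is fetched with the standard TCP_INFO getsockopt.  Per
 * the comments above, this stack reports tcpi_rtt/tcpi_rttvar/tcpi_rto
 * in microseconds rather than ticks.
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("srtt %u us cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */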
20607
20608 static int
20609 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
20610     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
20611 {
20612         int32_t error, optval;
20613         uint64_t val, loptval;
20614         struct  tcp_info ti;
20615         /*
20616          * Because all our options are either boolean or an int, we can just
20617          * pull everything into optval and then unlock and copy. If we ever
20618          * add an option that is not an int, then this will have quite an
20619          * impact on this routine.
20620          */
20621         error = 0;
20622         switch (sopt->sopt_name) {
20623         case TCP_INFO:
20624                 /* First get the info filled */
20625                 rack_fill_info(tp, &ti);
20626                 /* Fix up the rtt related fields if needed */
20627                 INP_WUNLOCK(inp);
20628                 error = sooptcopyout(sopt, &ti, sizeof ti);
20629                 return (error);
20630         /*
20631          * Beta is the congestion control value for NewReno that influences how
20632          * much of a backoff happens when loss is detected. It is normally set
20633          * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
20634          * when you exit recovery.
20635          */
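        /*
         * Editorial worked example: with beta = 50 and a cwnd of 100 segments
         * at the time loss is detected, the post-recovery cwnd is
         * cwnd * beta / 100 = 100 * 50 / 100 = 50 segments.
         */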
20636         case TCP_RACK_PACING_BETA:
20637                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20638                         error = EINVAL;
20639                 else if (rack->rc_pacing_cc_set == 0)
20640                         optval = rack->r_ctl.rc_saved_beta.beta;
20641                 else {
20642                         /*
20643                          * Reach out into the CC data and report back what
20644                          * I have previously set. Yeah it looks hackish but
20645                          * we don't want to report the saved values.
20646                          */
20647                         if (tp->ccv->cc_data)
20648                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta;
20649                         else
20650                                 error = EINVAL;
20651                 }
20652                 break;
20653                 /*
20654                  * Beta_ecn is the congestion control value for NewReno that influences how
20655                  * much of a backoff happens when an ECN mark is detected. It is normally set
20656                  * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
20657                  * you exit recovery. Note that classic ECN has a beta of 50; it is only
20658                  * ABE ECN that uses this "lesser" value, but we do too with pacing :)
20659                  */
20660
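        /*
         * Editorial worked example: with beta_ecn = 80 and a cwnd of 100
         * segments when the ECN mark is processed, the post-recovery cwnd is
         * cwnd * beta_ecn / 100 = 100 * 80 / 100 = 80 segments, i.e. only a
         * 20% reduction instead of the 50% taken for loss.
         */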
20661         case TCP_RACK_PACING_BETA_ECN:
20662                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20663                         error = EINVAL;
20664                 else if (rack->rc_pacing_cc_set == 0)
20665                         optval = rack->r_ctl.rc_saved_beta.beta_ecn;
20666                 else {
20667                         /*
20668                          * Reach out into the CC data and report back what
20669                          * I have previously set. Yeah it looks hackish but
20670                          * we don't want to report the saved values.
20671                          */
20672                         if (tp->ccv->cc_data)
20673                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
20674                         else
20675                                 error = EINVAL;
20676                 }
20677                 break;
20678         case TCP_RACK_DSACK_OPT:
20679                 optval = 0;
20680                 if (rack->rc_rack_tmr_std_based) {
20681                         optval |= 1;
20682                 }
20683                 if (rack->rc_rack_use_dsack) {
20684                         optval |= 2;
20685                 }
20686                 break;
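        /*
         * Editorial note: the value returned above is a small flag word; a
         * hypothetical caller decodes it as
         *
         *	std_based = (optval & 1) != 0;	(standards-based RACK timer)
         *	use_dsack = (optval & 2) != 0;	(DSACK-driven reorder window)
         */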
20687         case TCP_RACK_ENABLE_HYSTART:
20688         {
20689                 struct sockopt sopt;
20690                 struct cc_newreno_opts opt;
20691
20692                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
20693                 sopt.sopt_dir = SOPT_GET;
20694                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
20695                 if (CC_ALGO(tp)->ctl_output != NULL)
20696                         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
20697                 else
20698                         error = EINVAL;
20699                 optval = opt.val;
20700         }
20701         break;
20702         case TCP_FAST_RSM_HACK:
20703                 optval = rack->fast_rsm_hack;
20704                 break;
20705         case TCP_DEFER_OPTIONS:
20706                 optval = rack->defer_options;
20707                 break;
20708         case TCP_RACK_MEASURE_CNT:
20709                 optval = rack->r_ctl.req_measurements;
20710                 break;
20711         case TCP_REC_ABC_VAL:
20712                 optval = rack->r_use_labc_for_rec;
20713                 break;
20714         case TCP_RACK_ABC_VAL:
20715                 optval = rack->rc_labc;
20716                 break;
20717         case TCP_HDWR_UP_ONLY:
20718                 optval = rack->r_up_only;
20719                 break;
20720         case TCP_PACING_RATE_CAP:
20721                 loptval = rack->r_ctl.bw_rate_cap;
20722                 break;
20723         case TCP_RACK_PROFILE:
20724                 /* You cannot retrieve a profile, it is write-only */
20725                 error = EINVAL;
20726                 break;
20727         case TCP_USE_CMP_ACKS:
20728                 optval = rack->r_use_cmp_ack;
20729                 break;
20730         case TCP_RACK_PACE_TO_FILL:
20731                 optval = rack->rc_pace_to_cwnd;
20732                 if (optval && rack->r_fill_less_agg)
20733                         optval++;
20734                 break;
20735         case TCP_RACK_NO_PUSH_AT_MAX:
20736                 optval = rack->r_ctl.rc_no_push_at_mrtt;
20737                 break;
20738         case TCP_SHARED_CWND_ENABLE:
20739                 optval = rack->rack_enable_scwnd;
20740                 break;
20741         case TCP_RACK_NONRXT_CFG_RATE:
20742                 optval = rack->rack_rec_nonrxt_use_cr;
20743                 break;
20744         case TCP_NO_PRR:
20745                 if (rack->rack_no_prr  == 1)
20746                         optval = 1;
20747                 else if (rack->no_prr_addback == 1)
20748                         optval = 2;
20749                 else
20750                         optval = 0;
20751                 break;
20752         case TCP_RACK_DO_DETECTION:
20753                 optval = rack->do_detection;
20754                 break;
20755         case TCP_RACK_MBUF_QUEUE:
20756                 /* Do we use the LRO mbuf-queue feature */
20757                 optval = rack->r_mbuf_queue;
20758                 break;
20759         case TCP_TIMELY_DYN_ADJ:
20760                 optval = rack->rc_gp_dyn_mul;
20761                 break;
20762         case TCP_BBR_IWINTSO:
20763                 optval = rack->rc_init_win;
20764                 break;
20765         case TCP_RACK_TLP_REDUCE:
20766                 /* RACK TLP cwnd reduction (bool) */
20767                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
20768                 break;
20769         case TCP_BBR_RACK_INIT_RATE:
20770                 val = rack->r_ctl.init_rate;
20771                 /* convert to kbits per sec */
20772                 val *= 8;
20773                 val /= 1000;
20774                 optval = (uint32_t)val;
20775                 break;
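        /*
         * Editorial worked example of the conversion above: an init_rate of
         * 1250000 bytes/sec is reported as 1250000 * 8 / 1000 = 10000
         * kbits/sec.
         */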
20776         case TCP_RACK_FORCE_MSEG:
20777                 optval = rack->rc_force_max_seg;
20778                 break;
20779         case TCP_RACK_PACE_MAX_SEG:
20780                 /* Max segments in a pace */
20781                 optval = rack->rc_user_set_max_segs;
20782                 break;
20783         case TCP_RACK_PACE_ALWAYS:
20784                 /* Use the always pace method */
20785                 optval = rack->rc_always_pace;
20786                 break;
20787         case TCP_RACK_PRR_SENDALOT:
20788                 /* Allow PRR to send more than one seg */
20789                 optval = rack->r_ctl.rc_prr_sendalot;
20790                 break;
20791         case TCP_RACK_MIN_TO:
20792                 /* Minimum time between rack t-o's in ms */
20793                 optval = rack->r_ctl.rc_min_to;
20794                 break;
20795         case TCP_RACK_EARLY_SEG:
20796                 /* If early recovery max segments */
20797                 optval = rack->r_ctl.rc_early_recovery_segs;
20798                 break;
20799         case TCP_RACK_REORD_THRESH:
20800                 /* RACK reorder threshold (shift amount) */
20801                 optval = rack->r_ctl.rc_reorder_shift;
20802                 break;
20803         case TCP_RACK_REORD_FADE:
20804                 /* Does reordering fade after ms time */
20805                 optval = rack->r_ctl.rc_reorder_fade;
20806                 break;
20807         case TCP_BBR_USE_RACK_RR:
20808                 /* Do we use the rack cheat for rxt */
20809                 optval = rack->use_rack_rr;
20810                 break;
20811         case TCP_RACK_RR_CONF:
20812                 optval = rack->r_rr_config;
20813                 break;
20814         case TCP_HDWR_RATE_CAP:
20815                 optval = rack->r_rack_hw_rate_caps;
20816                 break;
20817         case TCP_BBR_HDWR_PACE:
20818                 optval = rack->rack_hdw_pace_ena;
20819                 break;
20820         case TCP_RACK_TLP_THRESH:
20821                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
20822                 optval = rack->r_ctl.rc_tlp_threshold;
20823                 break;
20824         case TCP_RACK_PKT_DELAY:
20825                 /* RACK added ms i.e. rack-rtt + reord + N */
20826                 optval = rack->r_ctl.rc_pkt_delay;
20827                 break;
20828         case TCP_RACK_TLP_USE:
20829                 optval = rack->rack_tlp_threshold_use;
20830                 break;
20831         case TCP_RACK_PACE_RATE_CA:
20832                 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
20833                 break;
20834         case TCP_RACK_PACE_RATE_SS:
20835                 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
20836                 break;
20837         case TCP_RACK_PACE_RATE_REC:
20838                 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
20839                 break;
20840         case TCP_RACK_GP_INCREASE_SS:
20841                 optval = rack->r_ctl.rack_per_of_gp_ss;
20842                 break;
20843         case TCP_RACK_GP_INCREASE_CA:
20844                 optval = rack->r_ctl.rack_per_of_gp_ca;
20845                 break;
20846         case TCP_BBR_RACK_RTT_USE:
20847                 optval = rack->r_ctl.rc_rate_sample_method;
20848                 break;
20849         case TCP_DELACK:
20850                 optval = tp->t_delayed_ack;
20851                 break;
20852         case TCP_DATA_AFTER_CLOSE:
20853                 optval = rack->rc_allow_data_af_clo;
20854                 break;
20855         case TCP_SHARED_CWND_TIME_LIMIT:
20856                 optval = rack->r_limit_scw;
20857                 break;
20858         case TCP_RACK_TIMER_SLOP:
20859                 optval = rack->r_ctl.timer_slop;
20860                 break;
20861         default:
20862                 return (tcp_default_ctloutput(so, sopt, inp, tp));
20863                 break;
20864         }
20865         INP_WUNLOCK(inp);
20866         if (error == 0) {
20867                 if (sopt->sopt_name == TCP_PACING_RATE_CAP)
20868                         error = sooptcopyout(sopt, &loptval, sizeof loptval);
20869                 else
20870                         error = sooptcopyout(sopt, &optval, sizeof optval);
20871         }
20872         return (error);
20873 }
20874
20875 static int
20876 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
20877 {
20878         int32_t error = EINVAL;
20879         struct tcp_rack *rack;
20880
20881         rack = (struct tcp_rack *)tp->t_fb_ptr;
20882         if (rack == NULL) {
20883                 /* Huh? */
20884                 goto out;
20885         }
20886         if (sopt->sopt_dir == SOPT_SET) {
20887                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
20888         } else if (sopt->sopt_dir == SOPT_GET) {
20889                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
20890         }
20891 out:
20892         INP_WUNLOCK(inp);
20893         return (error);
20894 }
20895
20896 static const char *rack_stack_names[] = {
20897         __XSTRING(STACKNAME),
20898 #ifdef STACKALIAS
20899         __XSTRING(STACKALIAS),
20900 #endif
20901 };
20902
20903 static int
20904 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
20905 {
20906         memset(mem, 0, size);
20907         return (0);
20908 }
20909
20910 static void
20911 rack_dtor(void *mem, int32_t size, void *arg)
20912 {
20913
20914 }
20915
20916 static bool rack_mod_inited = false;
20917
20918 static int
20919 tcp_addrack(module_t mod, int32_t type, void *data)
20920 {
20921         int32_t err = 0;
20922         int num_stacks;
20923
20924         switch (type) {
20925         case MOD_LOAD:
20926                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
20927                     sizeof(struct rack_sendmap),
20928                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
20929
20930                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
20931                     sizeof(struct tcp_rack),
20932                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
20933
20934                 sysctl_ctx_init(&rack_sysctl_ctx);
20935                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
20936                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
20937                     OID_AUTO,
20938 #ifdef STACKALIAS
20939                     __XSTRING(STACKALIAS),
20940 #else
20941                     __XSTRING(STACKNAME),
20942 #endif
20943                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
20944                     "");
20945                 if (rack_sysctl_root == NULL) {
20946                         printf("Failed to add sysctl node\n");
20947                         err = EFAULT;
20948                         goto free_uma;
20949                 }
20950                 rack_init_sysctls();
20951                 num_stacks = nitems(rack_stack_names);
20952                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
20953                     rack_stack_names, &num_stacks);
20954                 if (err) {
20955                         printf("Failed to register %s stack name for "
20956                             "%s module\n", rack_stack_names[num_stacks],
20957                             __XSTRING(MODNAME));
20958                         sysctl_ctx_free(&rack_sysctl_ctx);
20959 free_uma:
20960                         uma_zdestroy(rack_zone);
20961                         uma_zdestroy(rack_pcb_zone);
20962                         rack_counter_destroy();
20963                         printf("Failed to register rack module -- err:%d\n", err);
20964                         return (err);
20965                 }
20966                 tcp_lro_reg_mbufq();
20967                 rack_mod_inited = true;
20968                 break;
20969         case MOD_QUIESCE:
20970                 err = deregister_tcp_functions(&__tcp_rack, true, false);
20971                 break;
20972         case MOD_UNLOAD:
20973                 err = deregister_tcp_functions(&__tcp_rack, false, true);
20974                 if (err == EBUSY)
20975                         break;
20976                 if (rack_mod_inited) {
20977                         uma_zdestroy(rack_zone);
20978                         uma_zdestroy(rack_pcb_zone);
20979                         sysctl_ctx_free(&rack_sysctl_ctx);
20980                         rack_counter_destroy();
20981                         rack_mod_inited = false;
20982                 }
20983                 tcp_lro_dereg_mbufq();
20984                 err = 0;
20985                 break;
20986         default:
20987                 return (EOPNOTSUPP);
20988         }
20989         return (err);
20990 }
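
/*
 * Editorial example (standard FreeBSD stack-selection knobs assumed):
 * once MOD_LOAD has completed above, the stack can be selected either
 * system-wide or per connection:
 *
 *	kldload tcp_rack
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * or per socket via TCP_FUNCTION_BLK, as sketched after
 * rack_set_sockopt() above.
 */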
20991
20992 static moduledata_t tcp_rack = {
20993         .name = __XSTRING(MODNAME),
20994         .evhand = tcp_addrack,
20995         .priv = 0
20996 };
20997
20998 MODULE_VERSION(MODNAME, 1);
20999 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
21000 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);