sys/netinet/tcp_stacks/rack.c

   1 /*-
   2  * Copyright (c) 2016-2020 Netflix, Inc.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  23  * SUCH DAMAGE.
  24  *
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include "opt_inet.h"
  31 #include "opt_inet6.h"
  32 #include "opt_ipsec.h"
  33 #include "opt_tcpdebug.h"
  34 #include "opt_ratelimit.h"
  35 #include "opt_kern_tls.h"
  36 #include <sys/param.h>
  37 #include <sys/arb.h>
  38 #include <sys/module.h>
  39 #include <sys/kernel.h>
  40 #ifdef TCP_HHOOK
  41 #include <sys/hhook.h>
  42 #endif
  43 #include <sys/lock.h>
  44 #include <sys/malloc.h>
  45 #include <sys/lock.h>
  46 #include <sys/mutex.h>
  47 #include <sys/mbuf.h>
  48 #include <sys/proc.h>           /* for proc0 declaration */
  49 #include <sys/socket.h>
  50 #include <sys/socketvar.h>
  51 #ifdef KERN_TLS
  52 #include <sys/ktls.h>
  53 #endif
  54 #include <sys/sysctl.h>
  55 #include <sys/systm.h>
  56 #ifdef STATS
  57 #include <sys/qmath.h>
  58 #include <sys/tree.h>
  59 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
  60 #else
  61 #include <sys/tree.h>
  62 #endif
  63 #include <sys/refcount.h>
  64 #include <sys/queue.h>
  65 #include <sys/tim_filter.h>
  66 #include <sys/smp.h>
  67 #include <sys/kthread.h>
  68 #include <sys/kern_prefetch.h>
  69 #include <sys/protosw.h>
  70
  71 #include <vm/uma.h>
  72
  73 #include <net/route.h>
  74 #include <net/route/nhop.h>
  75 #include <net/vnet.h>
  76
  77 #define TCPSTATES               /* for logging */
  78
  79 #include <netinet/in.h>
  80 #include <netinet/in_kdtrace.h>
  81 #include <netinet/in_pcb.h>
  82 #include <netinet/ip.h>
  83 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
  84 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
  85 #include <netinet/ip_var.h>
  86 #include <netinet/ip6.h>
  87 #include <netinet6/in6_pcb.h>
  88 #include <netinet6/ip6_var.h>
  89 #include <netinet/tcp.h>
  90 #define TCPOUTFLAGS
  91 #include <netinet/tcp_fsm.h>
  92 #include <netinet/tcp_log_buf.h>
  93 #include <netinet/tcp_seq.h>
  94 #include <netinet/tcp_timer.h>
  95 #include <netinet/tcp_var.h>
  96 #include <netinet/tcp_hpts.h>
  97 #include <netinet/tcp_ratelimit.h>
  98 #include <netinet/tcpip.h>
  99 #include <netinet/cc/cc.h>
 100 #include <netinet/tcp_fastopen.h>
 101 #include <netinet/tcp_lro.h>
 102 #ifdef NETFLIX_SHARED_CWND
 103 #include <netinet/tcp_shared_cwnd.h>
 104 #endif
 105 #ifdef TCPDEBUG
 106 #include <netinet/tcp_debug.h>
 107 #endif                          /* TCPDEBUG */
 108 #ifdef TCP_OFFLOAD
 109 #include <netinet/tcp_offload.h>
 110 #endif
 111 #ifdef INET6
 112 #include <netinet6/tcp6_var.h>
 113 #endif
 114
 115 #include <netipsec/ipsec_support.h>
 116
 117 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 118 #include <netipsec/ipsec.h>
 119 #include <netipsec/ipsec6.h>
 120 #endif                          /* IPSEC */
 121
 122 #include <netinet/udp.h>
 123 #include <netinet/udp_var.h>
 124 #include <machine/in_cksum.h>
 125
 126 #ifdef MAC
 127 #include <security/mac/mac_framework.h>
 128 #endif
 129 #include "sack_filter.h"
 130 #include "tcp_rack.h"
 131 #include "rack_bbr_common.h"
 132
 133 uma_zone_t rack_zone;
 134 uma_zone_t rack_pcb_zone;
 135
 136 #ifndef TICKS2SBT
 137 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
 138 #endif
 139
 140 struct sysctl_ctx_list rack_sysctl_ctx;
 141 struct sysctl_oid *rack_sysctl_root;
 142
 143 #define CUM_ACKED 1
 144 #define SACKED 2
 145
 146 /*
 147  * The RACK module incorporates a number of
 148  * TCP ideas that have been put out into the IETF
 149  * over the last few years:
 150  * - Matt Mathis's Rate Halving which slowly drops
 151  *    the congestion window so that the ack clock can
 152  *    be maintained during a recovery.
 153  * - Yuchung Cheng's RACK TCP (for which it is named) that
 154  *    will stop us using the number of dup acks and instead
 155  *    use time as the gauge of when we retransmit.
 156  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 157  *    of Dukkipati et.al.
 158  * RACK depends on SACK, so if an endpoint arrives that
 159  * cannot do SACK the state machine below will shuttle the
 160  * connection back to using the "default" TCP stack that is
 161  * in FreeBSD.
 162  *
 163  * To implement RACK the original TCP stack was first decomposed
 164  * into a functional state machine with individual states
 165  * for each of the possible TCP connection states. The do_segment
 166  * function's role in life is to mandate the connection supports SACK
 167  * initially and then assure that the RACK state matches the connection
 168  * state before calling the states do_segment function. Each
 169  * state is simplified due to the fact that the original do_segment
 170  * has been decomposed and we *know* what state we are in (no
 171  * switches on the state) and all tests for SACK are gone. This
 172  * greatly simplifies what each state does.
 173  *
 174  * TCP output is also over-written with a new version since it
 175  * must maintain the new rack scoreboard.
 176  *
 177  */
 178 static int32_t rack_tlp_thresh = 1;
 179 static int32_t rack_tlp_limit = 2;      /* No more than 2 TLPs w-out new data */
 180 static int32_t rack_tlp_use_greater = 1;
 181 static int32_t rack_reorder_thresh = 2;
 182 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
 183                                                  * - 60 seconds */
 184 /* Attack threshold detections */
 185 static uint32_t rack_highest_sack_thresh_seen = 0;
 186 static uint32_t rack_highest_move_thresh_seen = 0;
 187
 188 static int32_t rack_pkt_delay = 1;
 189 static int32_t rack_early_recovery = 1;
 190 static int32_t rack_send_a_lot_in_prr = 1;
 191 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
 192 static int32_t rack_verbose_logging = 0;
 193 static int32_t rack_ignore_data_after_close = 1;
 194 static int32_t rack_enable_shared_cwnd = 0;
 195 static int32_t rack_limits_scwnd = 1;
 196 static int32_t rack_enable_mqueue_for_nonpaced = 0;
 197 static int32_t rack_disable_prr = 0;
 198 static int32_t use_rack_rr = 1;
 199 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
 200 static int32_t rack_persist_min = 250;  /* 250ms */
 201 static int32_t rack_persist_max = 2000; /* 2 Second */
 202 static int32_t rack_sack_not_required = 0;      /* set to one to allow non-sack to use rack */
 203 static int32_t rack_hw_tls_max_seg = 3; /* 3 means use hw-tls single segment */
 204 static int32_t rack_default_init_window = 0;    /* Use system default */
 205 static int32_t rack_limit_time_with_srtt = 0;
 206 static int32_t rack_hw_pace_adjust = 0;
 207 /*
 208  * Currently regular tcp has a rto_min of 30ms
 209  * the backoff goes 12 times so that ends up
 210  * being a total of 122.850 seconds before a
 211  * connection is killed.
 212  */
 213 static uint32_t rack_def_data_window = 20;
 214 static uint32_t rack_goal_bdp = 2;
 215 static uint32_t rack_min_srtts = 1;
 216 static uint32_t rack_min_measure_usec = 0;
 217 static int32_t rack_tlp_min = 10;
 218 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
 219 static int32_t rack_rto_max = 4000;     /* 4 seconds */
 220 static const int32_t rack_free_cache = 2;
 221 static int32_t rack_hptsi_segments = 40;
 222 static int32_t rack_rate_sample_method = USE_RTT_LOW;
 223 static int32_t rack_pace_every_seg = 0;
 224 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
 225 static int32_t rack_slot_reduction = 4;
 226 static int32_t rack_wma_divisor = 8;            /* For WMA calculation */
 227 static int32_t rack_cwnd_block_ends_measure = 0;
 228 static int32_t rack_rwnd_block_ends_measure = 0;
 229
 230 static int32_t rack_lower_cwnd_at_tlp = 0;
 231 static int32_t rack_use_proportional_reduce = 0;
 232 static int32_t rack_proportional_rate = 10;
 233 static int32_t rack_tlp_max_resend = 2;
 234 static int32_t rack_limited_retran = 0;
 235 static int32_t rack_always_send_oldest = 0;
 236 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
 237
 238 static uint16_t rack_per_of_gp_ss = 250;        /* 250 % slow-start */
 239 static uint16_t rack_per_of_gp_ca = 200;        /* 200 % congestion-avoidance */
 240 static uint16_t rack_per_of_gp_rec = 200;       /* 200 % of bw */
 241
 242 /* Probertt */
 243 static uint16_t rack_per_of_gp_probertt = 60;   /* 60% of bw */
 244 static uint16_t rack_per_of_gp_lowthresh = 40;  /* 40% is bottom */
 245 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
 246 static uint16_t rack_atexit_prtt_hbp = 130;     /* Clamp to 130% on exit prtt if highly buffered path */
 247 static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */
 248
 249 static uint32_t rack_max_drain_wait = 2;        /* How many gp srtt's before we give up draining */
 250 static uint32_t rack_must_drain = 1;            /* How many GP srtt's we *must* wait */
 251 static uint32_t rack_probertt_use_min_rtt_entry = 1;    /* Use the min to calculate the goal else gp_srtt */
 252 static uint32_t rack_probertt_use_min_rtt_exit = 0;     /* How to set cwnd at exit: 0 dynamic, 1 min-rtt, 2 curgprtt, 3 entry gp-rtt */
 253 static uint32_t rack_probe_rtt_sets_cwnd = 0;   /* Do we set the cwnd too (if always_lower is on) */
 254 static uint32_t rack_probe_rtt_safety_val = 2000000;    /* No more than 2 sec in probe-rtt */
 255 static uint32_t rack_time_between_probertt = 9600000;   /* 9.6 sec in us */
 256 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;       /* How many srtt periods does probe-rtt last top fraction */
 257 static uint32_t rack_probertt_gpsrtt_cnt_div = 0;       /* How many srtt periods does probe-rtt last bottom fraction  */
 258 static uint32_t rack_min_probertt_hold = 200000;        /* Equal to delayed ack time */
 259 static uint32_t rack_probertt_filter_life = 10000000;
 260 static uint32_t rack_probertt_lower_within = 10;
 261 static uint32_t rack_min_rtt_movement = 250;    /* Must move at least 250 useconds to count as a lowering */
 262 static int32_t rack_pace_one_seg = 0;           /* Shall we pace for less than 1.4Meg 1MSS at a time */
 263 static int32_t rack_probertt_clear_is = 1;
 264 static int32_t rack_max_drain_hbp = 1;          /* Extra drain times gpsrtt for highly buffered paths */
 265 static int32_t rack_hbp_thresh = 3;             /* what is the divisor max_rtt/min_rtt to decide a hbp */
 266
 267
 268 /* Part of pacing */
 269 static int32_t rack_max_per_above = 30;         /* When we go to increment stop if above 100+this% */
 270
 271 /* Timely information */
 272 /* Combine these two gives the range of 'no change' to bw */
 273 /* ie the up/down provide the upper and lower bound  */
 274 static int32_t rack_gp_per_bw_mul_up = 2;       /* 2% */
 275 static int32_t rack_gp_per_bw_mul_down = 4;     /* 4% */
 276 static int32_t rack_gp_rtt_maxmul = 3;          /* 3 x maxmin */
 277 static int32_t rack_gp_rtt_minmul = 1;          /* minrtt + (minrtt/mindiv) is lower rtt */
 278 static int32_t rack_gp_rtt_mindiv = 4;          /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
 279 static int32_t rack_gp_decrease_per = 20;       /* 20% decrease in multiplier */
 280 static int32_t rack_gp_increase_per = 2;        /* 2% increase in multiplier */
 281 static int32_t rack_per_lower_bound = 50;       /* Don't allow to drop below this multiplier */
 282 static int32_t rack_per_upper_bound_ss = 0;     /* Don't allow SS to grow above this */
 283 static int32_t rack_per_upper_bound_ca = 0;     /* Don't allow CA to grow above this */
 284 static int32_t rack_do_dyn_mul = 0;             /* Are the rack gp multipliers dynamic */
 285 static int32_t rack_gp_no_rec_chg = 1;          /* Prohibit recovery from reducing its multiplier */
 286 static int32_t rack_timely_dec_clear = 6;       /* Do we clear decrement count at a value (6)? */
 287 static int32_t rack_timely_max_push_rise = 3;   /* One round of pushing */
 288 static int32_t rack_timely_max_push_drop = 3;   /* Three round of pushing */
 289 static int32_t rack_timely_min_segs = 4;        /* 4 segment minimum */
 290 static int32_t rack_use_max_for_nobackoff = 0;
 291 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
 292 static int32_t rack_timely_no_stopping = 0;
 293 static int32_t rack_down_raise_thresh = 100;
 294 static int32_t rack_req_segs = 1;
 295
 296 /* Weird delayed ack mode */
 297 static int32_t rack_use_imac_dack = 0;
 298 /* Rack specific counters */
 299 counter_u64_t rack_badfr;
 300 counter_u64_t rack_badfr_bytes;
 301 counter_u64_t rack_rtm_prr_retran;
 302 counter_u64_t rack_rtm_prr_newdata;
 303 counter_u64_t rack_timestamp_mismatch;
 304 counter_u64_t rack_reorder_seen;
 305 counter_u64_t rack_paced_segments;
 306 counter_u64_t rack_unpaced_segments;
 307 counter_u64_t rack_calc_zero;
 308 counter_u64_t rack_calc_nonzero;
 309 counter_u64_t rack_saw_enobuf;
 310 counter_u64_t rack_saw_enetunreach;
 311 counter_u64_t rack_per_timer_hole;
 312
 313 /* Tail loss probe counters */
 314 counter_u64_t rack_tlp_tot;
 315 counter_u64_t rack_tlp_newdata;
 316 counter_u64_t rack_tlp_retran;
 317 counter_u64_t rack_tlp_retran_bytes;
 318 counter_u64_t rack_tlp_retran_fail;
 319 counter_u64_t rack_to_tot;
 320 counter_u64_t rack_to_arm_rack;
 321 counter_u64_t rack_to_arm_tlp;
 322 counter_u64_t rack_to_alloc;
 323 counter_u64_t rack_to_alloc_hard;
 324 counter_u64_t rack_to_alloc_emerg;
 325 counter_u64_t rack_to_alloc_limited;
 326 counter_u64_t rack_alloc_limited_conns;
 327 counter_u64_t rack_split_limited;
 328
 329 counter_u64_t rack_sack_proc_all;
 330 counter_u64_t rack_sack_proc_short;
 331 counter_u64_t rack_sack_proc_restart;
 332 counter_u64_t rack_sack_attacks_detected;
 333 counter_u64_t rack_sack_attacks_reversed;
 334 counter_u64_t rack_sack_used_next_merge;
 335 counter_u64_t rack_sack_splits;
 336 counter_u64_t rack_sack_used_prev_merge;
 337 counter_u64_t rack_sack_skipped_acked;
 338 counter_u64_t rack_ack_total;
 339 counter_u64_t rack_express_sack;
 340 counter_u64_t rack_sack_total;
 341 counter_u64_t rack_move_none;
 342 counter_u64_t rack_move_some;
 343
 344 counter_u64_t rack_used_tlpmethod;
 345 counter_u64_t rack_used_tlpmethod2;
 346 counter_u64_t rack_enter_tlp_calc;
 347 counter_u64_t rack_input_idle_reduces;
 348 counter_u64_t rack_collapsed_win;
 349 counter_u64_t rack_tlp_does_nada;
 350 counter_u64_t rack_try_scwnd;
 351
 352 /* Counters for HW TLS */
 353 counter_u64_t rack_tls_rwnd;
 354 counter_u64_t rack_tls_cwnd;
 355 counter_u64_t rack_tls_app;
 356 counter_u64_t rack_tls_other;
 357 counter_u64_t rack_tls_filled;
 358 counter_u64_t rack_tls_rxt;
 359 counter_u64_t rack_tls_tlp;
 360
 361 /* Temp CPU counters */
 362 counter_u64_t rack_find_high;
 363
 364 counter_u64_t rack_progress_drops;
 365 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
 366 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
 367
 368 static void
 369 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
 370
 371 static int
 372 rack_process_ack(struct mbuf *m, struct tcphdr *th,
 373     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
 374     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
 375 static int
 376 rack_process_data(struct mbuf *m, struct tcphdr *th,
 377     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
 378     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 379 static void
 380 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
 381     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
 382 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
 383 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
 384     uint8_t limit_type);
 385 static struct rack_sendmap *
 386 rack_check_recovery_mode(struct tcpcb *tp,
 387     uint32_t tsused);
 388 static void
 389 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
 390     uint32_t type);
 391 static void rack_counter_destroy(void);
 392 static int
 393 rack_ctloutput(struct socket *so, struct sockopt *sopt,
 394     struct inpcb *inp, struct tcpcb *tp);
 395 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 396 static void
 397 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
 398 static void
 399 rack_do_segment(struct mbuf *m, struct tcphdr *th,
 400     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
 401     uint8_t iptos);
 402 static void rack_dtor(void *mem, int32_t size, void *arg);
 403 static void
 404 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
 405     uint32_t t, uint32_t cts);
 406 static void
 407 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
 408     uint32_t flex1, uint32_t flex2,
 409     uint32_t flex3, uint32_t flex4,
 410     uint32_t flex5, uint32_t flex6,
 411     uint16_t flex7, uint8_t mod);
 412 static void
 413 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
 414    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
 415 static struct rack_sendmap *
 416 rack_find_high_nonack(struct tcp_rack *rack,
 417     struct rack_sendmap *rsm);
 418 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
 419 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
 420 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
 421 static int
 422 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
 423     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 424 static void
 425 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
 426                             tcp_seq th_ack, int line);
 427 static uint32_t
 428 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
 429 static int32_t rack_handoff_ok(struct tcpcb *tp);
 430 static int32_t rack_init(struct tcpcb *tp);
 431 static void rack_init_sysctls(void);
 432 static void
 433 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
 434     struct tcphdr *th);
 435 static void
 436 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
 437     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
 438     uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
 439 static void
 440 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
 441     struct rack_sendmap *rsm);
 442 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
 443 static int32_t rack_output(struct tcpcb *tp);
 444
 445 static uint32_t
 446 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
 447     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
 448     uint32_t cts, int *moved_two);
 449 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 450 static void rack_remxt_tmr(struct tcpcb *tp);
 451 static int
 452 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
 453     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 454 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 455 static int32_t rack_stopall(struct tcpcb *tp);
 456 static void
 457 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
 458     uint32_t delta);
 459 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 460 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 461 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 462 static uint32_t
 463 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
 464     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
 465 static void
 466 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
 467     struct rack_sendmap *rsm, uint32_t ts);
 468 static int
 469 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
 470     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
 471 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 472 static int
 473 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
 474     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 475     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 476 static int
 477 rack_do_closing(struct mbuf *m, struct tcphdr *th,
 478     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 479     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 480 static int
 481 rack_do_established(struct mbuf *m, struct tcphdr *th,
 482     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 483     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 484 static int
 485 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
 486     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 487     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
 488 static int
 489 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
 490     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 491     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 492 static int
 493 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
 494     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 495     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 496 static int
 497 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
 498     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 499     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 500 static int
 501 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
 502     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 503     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 504 static int
 505 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
 506     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 507     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 508 struct rack_sendmap *
 509 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
 510     uint32_t tsused);
 511 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
 512     uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
 513 static void
 514      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
 515
 516 int32_t rack_clear_counter=0;
 517
 518
 519 static int
 520 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 521 {
 522         uint32_t stat;
 523         int32_t error;
 524
 525         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
 526         if (error || req->newptr == NULL)
 527                 return error;
 528
 529         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
 530         if (error)
 531                 return (error);
 532         if (stat == 1) {
 533 #ifdef INVARIANTS
 534                 printf("Clearing RACK counters\n");
 535 #endif
 536                 counter_u64_zero(rack_badfr);
 537                 counter_u64_zero(rack_badfr_bytes);
 538                 counter_u64_zero(rack_rtm_prr_retran);
 539                 counter_u64_zero(rack_rtm_prr_newdata);
 540                 counter_u64_zero(rack_timestamp_mismatch);
 541                 counter_u64_zero(rack_reorder_seen);
 542                 counter_u64_zero(rack_tlp_tot);
 543                 counter_u64_zero(rack_tlp_newdata);
 544                 counter_u64_zero(rack_tlp_retran);
 545                 counter_u64_zero(rack_tlp_retran_bytes);
 546                 counter_u64_zero(rack_tlp_retran_fail);
 547                 counter_u64_zero(rack_to_tot);
 548                 counter_u64_zero(rack_to_arm_rack);
 549                 counter_u64_zero(rack_to_arm_tlp);
 550                 counter_u64_zero(rack_paced_segments);
 551                 counter_u64_zero(rack_calc_zero);
 552                 counter_u64_zero(rack_calc_nonzero);
 553                 counter_u64_zero(rack_unpaced_segments);
 554                 counter_u64_zero(rack_saw_enobuf);
 555                 counter_u64_zero(rack_saw_enetunreach);
 556                 counter_u64_zero(rack_per_timer_hole);
 557                 counter_u64_zero(rack_to_alloc_hard);
 558                 counter_u64_zero(rack_to_alloc_emerg);
 559                 counter_u64_zero(rack_sack_proc_all);
 560                 counter_u64_zero(rack_sack_proc_short);
 561                 counter_u64_zero(rack_sack_proc_restart);
 562                 counter_u64_zero(rack_to_alloc);
 563                 counter_u64_zero(rack_to_alloc_limited);
 564                 counter_u64_zero(rack_alloc_limited_conns);
 565                 counter_u64_zero(rack_split_limited);
 566                 counter_u64_zero(rack_find_high);
 567                 counter_u64_zero(rack_tls_rwnd);
 568                 counter_u64_zero(rack_tls_cwnd);
 569                 counter_u64_zero(rack_tls_app);
 570                 counter_u64_zero(rack_tls_other);
 571                 counter_u64_zero(rack_tls_filled);
 572                 counter_u64_zero(rack_tls_rxt);
 573                 counter_u64_zero(rack_tls_tlp);
 574                 counter_u64_zero(rack_sack_attacks_detected);
 575                 counter_u64_zero(rack_sack_attacks_reversed);
 576                 counter_u64_zero(rack_sack_used_next_merge);
 577                 counter_u64_zero(rack_sack_used_prev_merge);
 578                 counter_u64_zero(rack_sack_splits);
 579                 counter_u64_zero(rack_sack_skipped_acked);
 580                 counter_u64_zero(rack_ack_total);
 581                 counter_u64_zero(rack_express_sack);
 582                 counter_u64_zero(rack_sack_total);
 583                 counter_u64_zero(rack_move_none);
 584                 counter_u64_zero(rack_move_some);
 585                 counter_u64_zero(rack_used_tlpmethod);
 586                 counter_u64_zero(rack_used_tlpmethod2);
 587                 counter_u64_zero(rack_enter_tlp_calc);
 588                 counter_u64_zero(rack_progress_drops);
 589                 counter_u64_zero(rack_tlp_does_nada);
 590                 counter_u64_zero(rack_try_scwnd);
 591                 counter_u64_zero(rack_collapsed_win);
 592
 593         }
 594         rack_clear_counter = 0;
 595         return (0);
 596 }
 597
 598
 599
 600 static void
 601 rack_init_sysctls(void)
 602 {
 603         struct sysctl_oid *rack_counters;
 604         struct sysctl_oid *rack_attack;
 605         struct sysctl_oid *rack_pacing;
 606         struct sysctl_oid *rack_timely;
 607         struct sysctl_oid *rack_timers;
 608         struct sysctl_oid *rack_tlp;
 609         struct sysctl_oid *rack_misc;
 610         struct sysctl_oid *rack_measure;
 611         struct sysctl_oid *rack_probertt;
 612
 613         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 614             SYSCTL_CHILDREN(rack_sysctl_root),
 615             OID_AUTO,
 616             "sack_attack",
 617             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 618             "Rack Sack Attack Counters and Controls");
 619         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 620             SYSCTL_CHILDREN(rack_sysctl_root),
 621             OID_AUTO,
 622             "stats",
 623             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 624             "Rack Counters");
 625         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 626             SYSCTL_CHILDREN(rack_sysctl_root),
 627             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
 628             &rack_rate_sample_method , USE_RTT_LOW,
 629             "What method should we use for rate sampling 0=high, 1=low ");
 630         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 631             SYSCTL_CHILDREN(rack_sysctl_root),
 632             OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
 633             &rack_hw_tls_max_seg , 3,
 634             "What is the maximum number of full TLS records that will be sent at once");
 635         /* Probe rtt related controls */
 636         rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 637             SYSCTL_CHILDREN(rack_sysctl_root),
 638             OID_AUTO,
 639             "probertt",
 640             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 641             "ProbeRTT related Controls");
 642         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 643             SYSCTL_CHILDREN(rack_probertt),
 644             OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
 645             &rack_atexit_prtt_hbp, 130,
 646             "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
 647         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 648             SYSCTL_CHILDREN(rack_probertt),
 649             OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
 650             &rack_atexit_prtt, 130,
 651             "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
 652         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 653             SYSCTL_CHILDREN(rack_probertt),
 654             OID_AUTO, "gp_per_mul", CTLFLAG_RW,
 655             &rack_per_of_gp_probertt, 60,
 656             "What percentage of goodput do we pace at in probertt");
 657         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 658             SYSCTL_CHILDREN(rack_probertt),
 659             OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
 660             &rack_per_of_gp_probertt_reduce, 10,
 661             "What percentage of goodput do we reduce every gp_srtt");
 662         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 663             SYSCTL_CHILDREN(rack_probertt),
 664             OID_AUTO, "gp_per_low", CTLFLAG_RW,
 665             &rack_per_of_gp_lowthresh, 40,
 666             "What percentage of goodput do we allow the multiplier to fall to");
 667         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 668             SYSCTL_CHILDREN(rack_probertt),
 669             OID_AUTO, "time_between", CTLFLAG_RW,
 670             & rack_time_between_probertt, 96000000,
 671             "How many useconds between the lowest rtt falling must past before we enter probertt");
 672         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 673             SYSCTL_CHILDREN(rack_probertt),
 674             OID_AUTO, "safety", CTLFLAG_RW,
 675             &rack_probe_rtt_safety_val, 2000000,
 676             "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
 677         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 678             SYSCTL_CHILDREN(rack_probertt),
 679             OID_AUTO, "sets_cwnd", CTLFLAG_RW,
 680             &rack_probe_rtt_sets_cwnd, 0,
 681             "Do we set the cwnd too (if always_lower is on)");
 682         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 683             SYSCTL_CHILDREN(rack_probertt),
 684             OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
 685             &rack_max_drain_wait, 2,
 686             "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
 687         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 688             SYSCTL_CHILDREN(rack_probertt),
 689             OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
 690             &rack_must_drain, 1,
 691             "We must drain this many gp_srtt's waiting for flight to reach goal");
 692         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 693             SYSCTL_CHILDREN(rack_probertt),
 694             OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
 695             &rack_probertt_use_min_rtt_entry, 1,
 696             "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
 697         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 698             SYSCTL_CHILDREN(rack_probertt),
 699             OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
 700             &rack_probertt_use_min_rtt_exit, 0,
 701             "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
 702         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 703             SYSCTL_CHILDREN(rack_probertt),
 704             OID_AUTO, "length_div", CTLFLAG_RW,
 705             &rack_probertt_gpsrtt_cnt_div, 0,
 706             "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
 707         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 708             SYSCTL_CHILDREN(rack_probertt),
 709             OID_AUTO, "length_mul", CTLFLAG_RW,
 710             &rack_probertt_gpsrtt_cnt_mul, 0,
 711             "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
 712         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 713             SYSCTL_CHILDREN(rack_probertt),
 714             OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
 715             &rack_min_probertt_hold, 200000,
 716             "What is the minimum time we hold probertt at target");
 717         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 718             SYSCTL_CHILDREN(rack_probertt),
 719             OID_AUTO, "filter_life", CTLFLAG_RW,
 720             &rack_probertt_filter_life, 10000000,
 721             "What is the time for the filters life in useconds");
 722         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 723             SYSCTL_CHILDREN(rack_probertt),
 724             OID_AUTO, "lower_within", CTLFLAG_RW,
 725             &rack_probertt_lower_within, 10,
 726             "If the rtt goes lower within this percentage of the time, go into probe-rtt");
 727         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 728             SYSCTL_CHILDREN(rack_probertt),
 729             OID_AUTO, "must_move", CTLFLAG_RW,
 730             &rack_min_rtt_movement, 250,
 731             "How much is the minimum movement in rtt to count as a drop for probertt purposes");
 732         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 733             SYSCTL_CHILDREN(rack_probertt),
 734             OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
 735             &rack_probertt_clear_is, 1,
 736             "Do we clear I/S counts on exiting probe-rtt");
 737         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 738             SYSCTL_CHILDREN(rack_probertt),
 739             OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
 740             &rack_max_drain_hbp, 1,
 741             "How many extra drain gpsrtt's do we get in highly buffered paths");
 742         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 743             SYSCTL_CHILDREN(rack_probertt),
 744             OID_AUTO, "hbp_threshold", CTLFLAG_RW,
 745             &rack_hbp_thresh, 3,
 746             "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
 747         /* Pacing related sysctls */
 748         rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 749             SYSCTL_CHILDREN(rack_sysctl_root),
 750             OID_AUTO,
 751             "pacing",
 752             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 753             "Pacing related Controls");
 754         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 755             SYSCTL_CHILDREN(rack_pacing),
 756             OID_AUTO, "max_pace_over", CTLFLAG_RW,
 757             &rack_max_per_above, 30,
 758             "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
 759         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 760             SYSCTL_CHILDREN(rack_pacing),
 761             OID_AUTO, "pace_to_one", CTLFLAG_RW,
 762             &rack_pace_one_seg, 0,
 763             "Do we allow low b/w pacing of 1MSS instead of two");
 764         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 765             SYSCTL_CHILDREN(rack_pacing),
 766             OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
 767             &rack_limit_time_with_srtt, 0,
 768             "Do we limit pacing time based on srtt");
 769         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 770             SYSCTL_CHILDREN(rack_pacing),
 771             OID_AUTO, "init_win", CTLFLAG_RW,
 772             &rack_default_init_window, 0,
 773             "Do we have a rack initial window 0 = system default");
 774         SYSCTL_ADD_U32(&rack_sysctl_ctx,
 775             SYSCTL_CHILDREN(rack_pacing),
 776             OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW,
 777             &rack_hw_pace_adjust, 0,
 778             "What percentage do we raise the MSS by (11 = 1.1%)");
 779         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 780             SYSCTL_CHILDREN(rack_pacing),
 781             OID_AUTO, "gp_per_ss", CTLFLAG_RW,
 782             &rack_per_of_gp_ss, 250,
 783             "If non zero, what percentage of goodput to pace at in slow start");
 784         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 785             SYSCTL_CHILDREN(rack_pacing),
 786             OID_AUTO, "gp_per_ca", CTLFLAG_RW,
 787             &rack_per_of_gp_ca, 150,
 788             "If non zero, what percentage of goodput to pace at in congestion avoidance");
 789         SYSCTL_ADD_U16(&rack_sysctl_ctx,
 790             SYSCTL_CHILDREN(rack_pacing),
 791             OID_AUTO, "gp_per_rec", CTLFLAG_RW,
 792             &rack_per_of_gp_rec, 200,
 793             "If non zero, what percentage of goodput to pace at in recovery");
 794         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 795             SYSCTL_CHILDREN(rack_pacing),
 796             OID_AUTO, "pace_max_seg", CTLFLAG_RW,
 797             &rack_hptsi_segments, 40,
 798             "What size is the max for TSO segments in pacing and burst mitigation");
 799         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 800             SYSCTL_CHILDREN(rack_pacing),
 801             OID_AUTO, "burst_reduces", CTLFLAG_RW,
 802             &rack_slot_reduction, 4,
 803             "When doing only burst mitigation what is the reduce divisor");
 804         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 805             SYSCTL_CHILDREN(rack_sysctl_root),
 806             OID_AUTO, "use_pacing", CTLFLAG_RW,
 807             &rack_pace_every_seg, 0,
 808             "If set we use pacing, if clear we use only the original burst mitigation");
 809
 810         rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 811             SYSCTL_CHILDREN(rack_sysctl_root),
 812             OID_AUTO,
 813             "timely",
 814             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 815             "Rack Timely RTT Controls");
 816         /* Timely based GP dynamics */
 817         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 818             SYSCTL_CHILDREN(rack_timely),
 819             OID_AUTO, "upper", CTLFLAG_RW,
 820             &rack_gp_per_bw_mul_up, 2,
 821             "Rack timely upper range for equal b/w (in percentage)");
 822         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 823             SYSCTL_CHILDREN(rack_timely),
 824             OID_AUTO, "lower", CTLFLAG_RW,
 825             &rack_gp_per_bw_mul_down, 4,
 826             "Rack timely lower range for equal b/w (in percentage)");
 827         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 828             SYSCTL_CHILDREN(rack_timely),
 829             OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
 830             &rack_gp_rtt_maxmul, 3,
 831             "Rack timely multipler of lowest rtt for rtt_max");
 832         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 833             SYSCTL_CHILDREN(rack_timely),
 834             OID_AUTO, "rtt_min_div", CTLFLAG_RW,
 835             &rack_gp_rtt_mindiv, 4,
 836             "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
 837         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 838             SYSCTL_CHILDREN(rack_timely),
 839             OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
 840             &rack_gp_rtt_minmul, 1,
 841             "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
 842         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 843             SYSCTL_CHILDREN(rack_timely),
 844             OID_AUTO, "decrease", CTLFLAG_RW,
 845             &rack_gp_decrease_per, 20,
 846             "Rack timely decrease percentage of our GP multiplication factor");
 847         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 848             SYSCTL_CHILDREN(rack_timely),
 849             OID_AUTO, "increase", CTLFLAG_RW,
 850             &rack_gp_increase_per, 2,
 851             "Rack timely increase perentage of our GP multiplication factor");
 852         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 853             SYSCTL_CHILDREN(rack_timely),
 854             OID_AUTO, "lowerbound", CTLFLAG_RW,
 855             &rack_per_lower_bound, 50,
 856             "Rack timely lowest percentage we allow GP multiplier to fall to");
 857         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 858             SYSCTL_CHILDREN(rack_timely),
 859             OID_AUTO, "upperboundss", CTLFLAG_RW,
 860             &rack_per_upper_bound_ss, 0,
 861             "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
 862         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 863             SYSCTL_CHILDREN(rack_timely),
 864             OID_AUTO, "upperboundca", CTLFLAG_RW,
 865             &rack_per_upper_bound_ca, 0,
 866             "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
 867         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 868             SYSCTL_CHILDREN(rack_timely),
 869             OID_AUTO, "dynamicgp", CTLFLAG_RW,
 870             &rack_do_dyn_mul, 0,
 871             "Rack timely do we enable dynmaic timely goodput by default");
 872         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 873             SYSCTL_CHILDREN(rack_timely),
 874             OID_AUTO, "no_rec_red", CTLFLAG_RW,
 875             &rack_gp_no_rec_chg, 1,
 876             "Rack timely do we prohibit the recovery multiplier from being lowered");
 877         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 878             SYSCTL_CHILDREN(rack_timely),
 879             OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
 880             &rack_timely_dec_clear, 6,
 881             "Rack timely what threshold do we count to before another boost during b/w decent");
 882         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 883             SYSCTL_CHILDREN(rack_timely),
 884             OID_AUTO, "max_push_rise", CTLFLAG_RW,
 885             &rack_timely_max_push_rise, 3,
 886             "Rack timely how many times do we push up with b/w increase");
 887         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 888             SYSCTL_CHILDREN(rack_timely),
 889             OID_AUTO, "max_push_drop", CTLFLAG_RW,
 890             &rack_timely_max_push_drop, 3,
 891             "Rack timely how many times do we push back on b/w decent");
 892         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 893             SYSCTL_CHILDREN(rack_timely),
 894             OID_AUTO, "min_segs", CTLFLAG_RW,
 895             &rack_timely_min_segs, 4,
 896             "Rack timely when setting the cwnd what is the min num segments");
 897         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 898             SYSCTL_CHILDREN(rack_timely),
 899             OID_AUTO, "noback_max", CTLFLAG_RW,
 900             &rack_use_max_for_nobackoff, 0,
 901             "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
 902         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 903             SYSCTL_CHILDREN(rack_timely),
 904             OID_AUTO, "interim_timely_only", CTLFLAG_RW,
 905             &rack_timely_int_timely_only, 0,
 906             "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
 907         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 908             SYSCTL_CHILDREN(rack_timely),
 909             OID_AUTO, "nonstop", CTLFLAG_RW,
 910             &rack_timely_no_stopping, 0,
 911             "Rack timely don't stop increase");
 912         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 913             SYSCTL_CHILDREN(rack_timely),
 914             OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
 915             &rack_down_raise_thresh, 100,
 916             "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
 917         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 918             SYSCTL_CHILDREN(rack_timely),
 919             OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
 920             &rack_req_segs, 1,
 921             "Bottom dragging if not these many segments outstanding and room");
 922
 923         /* TLP and Rack related parameters */
 924         rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 925             SYSCTL_CHILDREN(rack_sysctl_root),
 926             OID_AUTO,
 927             "tlp",
 928             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 929             "TLP and Rack related Controls");
 930         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 931             SYSCTL_CHILDREN(rack_tlp),
 932             OID_AUTO, "use_rrr", CTLFLAG_RW,
 933             &use_rack_rr, 1,
 934             "Do we use Rack Rapid Recovery");
 935         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 936             SYSCTL_CHILDREN(rack_tlp),
 937             OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
 938             &rack_non_rxt_use_cr, 0,
 939             "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
 940         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 941             SYSCTL_CHILDREN(rack_tlp),
 942             OID_AUTO, "tlpmethod", CTLFLAG_RW,
 943             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
 944             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
 945         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 946             SYSCTL_CHILDREN(rack_tlp),
 947             OID_AUTO, "limit", CTLFLAG_RW,
 948             &rack_tlp_limit, 2,
 949             "How many TLP's can be sent without sending new data");
 950         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 951             SYSCTL_CHILDREN(rack_tlp),
 952             OID_AUTO, "use_greater", CTLFLAG_RW,
 953             &rack_tlp_use_greater, 1,
 954             "Should we use the rack_rtt time if its greater than srtt");
 955         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 956             SYSCTL_CHILDREN(rack_tlp),
 957             OID_AUTO, "tlpminto", CTLFLAG_RW,
 958             &rack_tlp_min, 10,
 959             "TLP minimum timeout per the specification (10ms)");
 960         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 961             SYSCTL_CHILDREN(rack_tlp),
 962             OID_AUTO, "send_oldest", CTLFLAG_RW,
 963             &rack_always_send_oldest, 0,
 964             "Should we always send the oldest TLP and RACK-TLP");
 965         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 966             SYSCTL_CHILDREN(rack_tlp),
 967             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
 968             &rack_limited_retran, 0,
 969             "How many times can a rack timeout drive out sends");
 970         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 971             SYSCTL_CHILDREN(rack_tlp),
 972             OID_AUTO, "tlp_retry", CTLFLAG_RW,
 973             &rack_tlp_max_resend, 2,
 974             "How many times does TLP retry a single segment or multiple with no ACK");
 975         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 976             SYSCTL_CHILDREN(rack_tlp),
 977             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
 978             &rack_lower_cwnd_at_tlp, 0,
 979             "When a TLP completes a retran should we enter recovery");
 980         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 981             SYSCTL_CHILDREN(rack_tlp),
 982             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 983             &rack_reorder_thresh, 2,
 984             "What factor for rack will be added when seeing reordering (shift right)");
 985         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 986             SYSCTL_CHILDREN(rack_tlp),
 987             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 988             &rack_tlp_thresh, 1,
 989             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 990         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 991             SYSCTL_CHILDREN(rack_tlp),
 992             OID_AUTO, "reorder_fade", CTLFLAG_RW,
 993             &rack_reorder_fade, 0,
 994             "Does reorder detection fade, if so how many ms (0 means never)");
 995         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 996             SYSCTL_CHILDREN(rack_tlp),
 997             OID_AUTO, "pktdelay", CTLFLAG_RW,
 998             &rack_pkt_delay, 1,
 999             "Extra RACK time (in ms) besides reordering thresh");
1000
1001         /* Timer related controls */
1002         rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1003             SYSCTL_CHILDREN(rack_sysctl_root),
1004             OID_AUTO,
1005             "timers",
1006             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1007             "Timer related controls");
1008         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1009             SYSCTL_CHILDREN(rack_timers),
1010             OID_AUTO, "persmin", CTLFLAG_RW,
1011             &rack_persist_min, 250,
1012             "What is the minimum time in milliseconds between persists");
1013         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1014             SYSCTL_CHILDREN(rack_timers),
1015             OID_AUTO, "persmax", CTLFLAG_RW,
1016             &rack_persist_max, 2000,
1017             "What is the largest delay in milliseconds between persists");
1018         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1019             SYSCTL_CHILDREN(rack_timers),
1020             OID_AUTO, "delayed_ack", CTLFLAG_RW,
1021             &rack_delayed_ack_time, 200,
1022             "Delayed ack time (200ms)");
1023         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1024             SYSCTL_CHILDREN(rack_timers),
1025             OID_AUTO, "minrto", CTLFLAG_RW,
1026             &rack_rto_min, 0,
1027             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
1028         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1029             SYSCTL_CHILDREN(rack_timers),
1030             OID_AUTO, "maxrto", CTLFLAG_RW,
1031             &rack_rto_max, 0,
1032             "Maxiumum RTO in ms -- should be at least as large as min_rto");
1033         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1034             SYSCTL_CHILDREN(rack_timers),
1035             OID_AUTO, "minto", CTLFLAG_RW,
1036             &rack_min_to, 1,
1037             "Minimum rack timeout in milliseconds");
1038         /* Measure controls */
1039         rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1040             SYSCTL_CHILDREN(rack_sysctl_root),
1041             OID_AUTO,
1042             "measure",
1043             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1044             "Measure related controls");
1045         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1046             SYSCTL_CHILDREN(rack_measure),
1047             OID_AUTO, "wma_divisor", CTLFLAG_RW,
1048             &rack_wma_divisor, 8,
1049             "When doing b/w calculation what is the  divisor for the WMA");
1050         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1051             SYSCTL_CHILDREN(rack_measure),
1052             OID_AUTO, "end_cwnd", CTLFLAG_RW,
1053             &rack_cwnd_block_ends_measure, 0,
1054             "Does a cwnd just-return end the measurement window (app limited)");
1055         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1056             SYSCTL_CHILDREN(rack_measure),
1057             OID_AUTO, "end_rwnd", CTLFLAG_RW,
1058             &rack_rwnd_block_ends_measure, 0,
1059             "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1060         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1061             SYSCTL_CHILDREN(rack_measure),
1062             OID_AUTO, "min_target", CTLFLAG_RW,
1063             &rack_def_data_window, 20,
1064             "What is the minimum target window (in mss) for a GP measurements");
1065         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1066             SYSCTL_CHILDREN(rack_measure),
1067             OID_AUTO, "goal_bdp", CTLFLAG_RW,
1068             &rack_goal_bdp, 2,
1069             "What is the goal BDP to measure");
1070         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1071             SYSCTL_CHILDREN(rack_measure),
1072             OID_AUTO, "min_srtts", CTLFLAG_RW,
1073             &rack_min_srtts, 1,
1074             "What is the goal BDP to measure");
1075         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1076             SYSCTL_CHILDREN(rack_measure),
1077             OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1078             &rack_min_measure_usec, 0,
1079             "What is the Minimum time time for a measurement if 0, this is off");
1080         /* Misc rack controls */
1081         rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1082             SYSCTL_CHILDREN(rack_sysctl_root),
1083             OID_AUTO,
1084             "misc",
1085             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1086             "Misc related controls");
1087         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1088             SYSCTL_CHILDREN(rack_misc),
1089             OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1090             &rack_enable_shared_cwnd, 0,
1091             "Should RACK try to use the shared cwnd on connections where allowed");
1092         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1093             SYSCTL_CHILDREN(rack_misc),
1094             OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1095             &rack_limits_scwnd, 1,
1096             "Should RACK place low end time limits on the shared cwnd feature");
1097         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1098             SYSCTL_CHILDREN(rack_misc),
1099             OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1100             &rack_enable_mqueue_for_nonpaced, 0,
1101             "Should RACK use mbuf queuing for non-paced connections");
1102         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1103             SYSCTL_CHILDREN(rack_misc),
1104             OID_AUTO, "iMac_dack", CTLFLAG_RW,
1105             &rack_use_imac_dack, 0,
1106             "Should RACK try to emulate iMac delayed ack");
1107         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1108             SYSCTL_CHILDREN(rack_misc),
1109             OID_AUTO, "no_prr", CTLFLAG_RW,
1110             &rack_disable_prr, 0,
1111             "Should RACK not use prr and only pace (must have pacing on)");
1112         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1113             SYSCTL_CHILDREN(rack_misc),
1114             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1115             &rack_verbose_logging, 0,
1116             "Should RACK black box logging be verbose");
1117         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1118             SYSCTL_CHILDREN(rack_misc),
1119             OID_AUTO, "data_after_close", CTLFLAG_RW,
1120             &rack_ignore_data_after_close, 1,
1121             "Do we hold off sending a RST until all pending data is ack'd");
1122         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1123             SYSCTL_CHILDREN(rack_misc),
1124             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1125             &rack_sack_not_required, 0,
1126             "Do we allow rack to run on connections not supporting SACK");
1127         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1128             SYSCTL_CHILDREN(rack_misc),
1129             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
1130             &rack_use_proportional_reduce, 0,
1131             "Should we proportionaly reduce cwnd based on the number of losses ");
1132         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1133             SYSCTL_CHILDREN(rack_misc),
1134             OID_AUTO, "recovery_prop", CTLFLAG_RW,
1135             &rack_proportional_rate, 10,
1136             "What percent reduction per loss");
1137         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1138             SYSCTL_CHILDREN(rack_misc),
1139             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1140             &rack_send_a_lot_in_prr, 1,
1141             "Send a lot in prr");
1142         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1143             SYSCTL_CHILDREN(rack_misc),
1144             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
1145             &rack_early_recovery, 1,
1146             "Do we do early recovery with rack");
1147         /* Sack Attacker detection stuff */
1148         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1149             SYSCTL_CHILDREN(rack_attack),
1150             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
1151             &rack_highest_sack_thresh_seen, 0,
1152             "Highest sack to ack ratio seen");
1153         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1154             SYSCTL_CHILDREN(rack_attack),
1155             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
1156             &rack_highest_move_thresh_seen, 0,
1157             "Highest move to non-move ratio seen");
1158         rack_ack_total = counter_u64_alloc(M_WAITOK);
1159         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1160             SYSCTL_CHILDREN(rack_attack),
1161             OID_AUTO, "acktotal", CTLFLAG_RD,
1162             &rack_ack_total,
1163             "Total number of Ack's");
1164         rack_express_sack = counter_u64_alloc(M_WAITOK);
1165         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1166             SYSCTL_CHILDREN(rack_attack),
1167             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
1168             &rack_express_sack,
1169             "Total expresss number of Sack's");
1170         rack_sack_total = counter_u64_alloc(M_WAITOK);
1171         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1172             SYSCTL_CHILDREN(rack_attack),
1173             OID_AUTO, "sacktotal", CTLFLAG_RD,
1174             &rack_sack_total,
1175             "Total number of SACKs");
1176         rack_move_none = counter_u64_alloc(M_WAITOK);
1177         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1178             SYSCTL_CHILDREN(rack_attack),
1179             OID_AUTO, "move_none", CTLFLAG_RD,
1180             &rack_move_none,
1181             "Total number of SACK index reuse of postions under threshold");
1182         rack_move_some = counter_u64_alloc(M_WAITOK);
1183         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1184             SYSCTL_CHILDREN(rack_attack),
1185             OID_AUTO, "move_some", CTLFLAG_RD,
1186             &rack_move_some,
1187             "Total number of SACK index reuse of postions over threshold");
1188         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
1189         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1190             SYSCTL_CHILDREN(rack_attack),
1191             OID_AUTO, "attacks", CTLFLAG_RD,
1192             &rack_sack_attacks_detected,
1193             "Total number of SACK attackers that had sack disabled");
1194         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
1195         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1196             SYSCTL_CHILDREN(rack_attack),
1197             OID_AUTO, "reversed", CTLFLAG_RD,
1198             &rack_sack_attacks_reversed,
1199             "Total number of SACK attackers that were later determined false positive");
1200         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
1201         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1202             SYSCTL_CHILDREN(rack_attack),
1203             OID_AUTO, "nextmerge", CTLFLAG_RD,
1204             &rack_sack_used_next_merge,
1205             "Total number of times we used the next merge");
1206         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
1207         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1208             SYSCTL_CHILDREN(rack_attack),
1209             OID_AUTO, "prevmerge", CTLFLAG_RD,
1210             &rack_sack_used_prev_merge,
1211             "Total number of times we used the prev merge");
1212         /* Counters */
1213         rack_badfr = counter_u64_alloc(M_WAITOK);
1214         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1215             SYSCTL_CHILDREN(rack_counters),
1216             OID_AUTO, "badfr", CTLFLAG_RD,
1217             &rack_badfr, "Total number of bad FRs");
1218         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
1219         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1220             SYSCTL_CHILDREN(rack_counters),
1221             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
1222             &rack_badfr_bytes, "Total number of bad FRs");
1223         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
1224         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1225             SYSCTL_CHILDREN(rack_counters),
1226             OID_AUTO, "prrsndret", CTLFLAG_RD,
1227             &rack_rtm_prr_retran,
1228             "Total number of prr based retransmits");
1229         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
1230         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1231             SYSCTL_CHILDREN(rack_counters),
1232             OID_AUTO, "prrsndnew", CTLFLAG_RD,
1233             &rack_rtm_prr_newdata,
1234             "Total number of prr based new transmits");
1235         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
1236         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1237             SYSCTL_CHILDREN(rack_counters),
1238             OID_AUTO, "tsnf", CTLFLAG_RD,
1239             &rack_timestamp_mismatch,
1240             "Total number of timestamps that we could not find the reported ts");
1241         rack_find_high = counter_u64_alloc(M_WAITOK);
1242         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1243             SYSCTL_CHILDREN(rack_counters),
1244             OID_AUTO, "findhigh", CTLFLAG_RD,
1245             &rack_find_high,
1246             "Total number of FIN causing find-high");
1247         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
1248         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1249             SYSCTL_CHILDREN(rack_counters),
1250             OID_AUTO, "reordering", CTLFLAG_RD,
1251             &rack_reorder_seen,
1252             "Total number of times we added delay due to reordering");
1253         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1254         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1255             SYSCTL_CHILDREN(rack_counters),
1256             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1257             &rack_tlp_tot,
1258             "Total number of tail loss probe expirations");
1259         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1260         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1261             SYSCTL_CHILDREN(rack_counters),
1262             OID_AUTO, "tlp_new", CTLFLAG_RD,
1263             &rack_tlp_newdata,
1264             "Total number of tail loss probe sending new data");
1265         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1266         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1267             SYSCTL_CHILDREN(rack_counters),
1268             OID_AUTO, "tlp_retran", CTLFLAG_RD,
1269             &rack_tlp_retran,
1270             "Total number of tail loss probe sending retransmitted data");
1271         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1272         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1273             SYSCTL_CHILDREN(rack_counters),
1274             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1275             &rack_tlp_retran_bytes,
1276             "Total bytes of tail loss probe sending retransmitted data");
1277         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
1278         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1279             SYSCTL_CHILDREN(rack_counters),
1280             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
1281             &rack_tlp_retran_fail,
1282             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
1283         rack_to_tot = counter_u64_alloc(M_WAITOK);
1284         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1285             SYSCTL_CHILDREN(rack_counters),
1286             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1287             &rack_to_tot,
1288             "Total number of times the rack to expired");
1289         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
1290         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1291             SYSCTL_CHILDREN(rack_counters),
1292             OID_AUTO, "arm_rack", CTLFLAG_RD,
1293             &rack_to_arm_rack,
1294             "Total number of times the rack timer armed");
1295         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
1296         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1297             SYSCTL_CHILDREN(rack_counters),
1298             OID_AUTO, "arm_tlp", CTLFLAG_RD,
1299             &rack_to_arm_tlp,
1300             "Total number of times the tlp timer armed");
1301         rack_calc_zero = counter_u64_alloc(M_WAITOK);
1302         rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
1303         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1304             SYSCTL_CHILDREN(rack_counters),
1305             OID_AUTO, "calc_zero", CTLFLAG_RD,
1306             &rack_calc_zero,
1307             "Total number of times pacing time worked out to zero");
1308         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1309             SYSCTL_CHILDREN(rack_counters),
1310             OID_AUTO, "calc_nonzero", CTLFLAG_RD,
1311             &rack_calc_nonzero,
1312             "Total number of times pacing time worked out to non-zero");
1313         rack_paced_segments = counter_u64_alloc(M_WAITOK);
1314         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1315             SYSCTL_CHILDREN(rack_counters),
1316             OID_AUTO, "paced", CTLFLAG_RD,
1317             &rack_paced_segments,
1318             "Total number of times a segment send caused hptsi");
1319         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
1320         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1321             SYSCTL_CHILDREN(rack_counters),
1322             OID_AUTO, "unpaced", CTLFLAG_RD,
1323             &rack_unpaced_segments,
1324             "Total number of times a segment did not cause hptsi");
1325         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1326         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1327             SYSCTL_CHILDREN(rack_counters),
1328             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1329             &rack_saw_enobuf,
1330             "Total number of times a segment did not cause hptsi");
1331         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1332         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1333             SYSCTL_CHILDREN(rack_counters),
1334             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1335             &rack_saw_enetunreach,
1336             "Total number of times a segment did not cause hptsi");
1337         rack_to_alloc = counter_u64_alloc(M_WAITOK);
1338         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1339             SYSCTL_CHILDREN(rack_counters),
1340             OID_AUTO, "allocs", CTLFLAG_RD,
1341             &rack_to_alloc,
1342             "Total allocations of tracking structures");
1343         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1344         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1345             SYSCTL_CHILDREN(rack_counters),
1346             OID_AUTO, "allochard", CTLFLAG_RD,
1347             &rack_to_alloc_hard,
1348             "Total allocations done with sleeping the hard way");
1349         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1350         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1351             SYSCTL_CHILDREN(rack_counters),
1352             OID_AUTO, "allocemerg", CTLFLAG_RD,
1353             &rack_to_alloc_emerg,
1354             "Total allocations done from emergency cache");
1355         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1356         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1357             SYSCTL_CHILDREN(rack_counters),
1358             OID_AUTO, "alloc_limited", CTLFLAG_RD,
1359             &rack_to_alloc_limited,
1360             "Total allocations dropped due to limit");
1361         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1362         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1363             SYSCTL_CHILDREN(rack_counters),
1364             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1365             &rack_alloc_limited_conns,
1366             "Connections with allocations dropped due to limit");
1367         rack_split_limited = counter_u64_alloc(M_WAITOK);
1368         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1369             SYSCTL_CHILDREN(rack_counters),
1370             OID_AUTO, "split_limited", CTLFLAG_RD,
1371             &rack_split_limited,
1372             "Split allocations dropped due to limit");
1373         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1374         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1375             SYSCTL_CHILDREN(rack_counters),
1376             OID_AUTO, "sack_long", CTLFLAG_RD,
1377             &rack_sack_proc_all,
1378             "Total times we had to walk whole list for sack processing");
1379         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1380         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1381             SYSCTL_CHILDREN(rack_counters),
1382             OID_AUTO, "sack_restart", CTLFLAG_RD,
1383             &rack_sack_proc_restart,
1384             "Total times we had to walk whole list due to a restart");
1385         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1386         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1387             SYSCTL_CHILDREN(rack_counters),
1388             OID_AUTO, "sack_short", CTLFLAG_RD,
1389             &rack_sack_proc_short,
1390             "Total times we took shortcut for sack processing");
1391         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
1392         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1393             SYSCTL_CHILDREN(rack_counters),
1394             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
1395             &rack_enter_tlp_calc,
1396             "Total times we called calc-tlp");
1397         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
1398         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1399             SYSCTL_CHILDREN(rack_counters),
1400             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
1401             &rack_used_tlpmethod,
1402             "Total number of runt sacks");
1403         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
1404         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1405             SYSCTL_CHILDREN(rack_counters),
1406             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
1407             &rack_used_tlpmethod2,
1408             "Total number of times we hit TLP method 2");
1409         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
1410         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1411             SYSCTL_CHILDREN(rack_attack),
1412             OID_AUTO, "skipacked", CTLFLAG_RD,
1413             &rack_sack_skipped_acked,
1414             "Total number of times we skipped previously sacked");
1415         rack_sack_splits = counter_u64_alloc(M_WAITOK);
1416         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1417             SYSCTL_CHILDREN(rack_attack),
1418             OID_AUTO, "ofsplit", CTLFLAG_RD,
1419             &rack_sack_splits,
1420             "Total number of times we did the old fashion tree split");
1421         rack_progress_drops = counter_u64_alloc(M_WAITOK);
1422         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1423             SYSCTL_CHILDREN(rack_counters),
1424             OID_AUTO, "prog_drops", CTLFLAG_RD,
1425             &rack_progress_drops,
1426             "Total number of progress drops");
1427         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1428         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1429             SYSCTL_CHILDREN(rack_counters),
1430             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1431             &rack_input_idle_reduces,
1432             "Total number of idle reductions on input");
1433         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1434         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1435             SYSCTL_CHILDREN(rack_counters),
1436             OID_AUTO, "collapsed_win", CTLFLAG_RD,
1437             &rack_collapsed_win,
1438             "Total number of collapsed windows");
1439         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
1440         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1441             SYSCTL_CHILDREN(rack_counters),
1442             OID_AUTO, "tlp_nada", CTLFLAG_RD,
1443             &rack_tlp_does_nada,
1444             "Total number of nada tlp calls");
1445         rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1446         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1447             SYSCTL_CHILDREN(rack_counters),
1448             OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1449             &rack_try_scwnd,
1450             "Total number of scwnd attempts");
1451
1452         rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
1453         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1454             SYSCTL_CHILDREN(rack_counters),
1455             OID_AUTO, "tls_rwnd", CTLFLAG_RD,
1456             &rack_tls_rwnd,
1457             "Total hdwr tls rwnd limited");
1458         rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
1459         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1460             SYSCTL_CHILDREN(rack_counters),
1461             OID_AUTO, "tls_cwnd", CTLFLAG_RD,
1462             &rack_tls_cwnd,
1463             "Total hdwr tls cwnd limited");
1464         rack_tls_app = counter_u64_alloc(M_WAITOK);
1465         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1466             SYSCTL_CHILDREN(rack_counters),
1467             OID_AUTO, "tls_app", CTLFLAG_RD,
1468             &rack_tls_app,
1469             "Total hdwr tls app limited");
1470         rack_tls_other = counter_u64_alloc(M_WAITOK);
1471         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1472             SYSCTL_CHILDREN(rack_counters),
1473             OID_AUTO, "tls_other", CTLFLAG_RD,
1474             &rack_tls_other,
1475             "Total hdwr tls other limited");
1476         rack_tls_filled = counter_u64_alloc(M_WAITOK);
1477         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1478             SYSCTL_CHILDREN(rack_counters),
1479             OID_AUTO, "tls_filled", CTLFLAG_RD,
1480             &rack_tls_filled,
1481             "Total hdwr tls filled");
1482         rack_tls_rxt = counter_u64_alloc(M_WAITOK);
1483         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1484             SYSCTL_CHILDREN(rack_counters),
1485             OID_AUTO, "tls_rxt", CTLFLAG_RD,
1486             &rack_tls_rxt,
1487             "Total hdwr rxt");
1488         rack_tls_tlp = counter_u64_alloc(M_WAITOK);
1489         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1490             SYSCTL_CHILDREN(rack_counters),
1491             OID_AUTO, "tls_tlp", CTLFLAG_RD,
1492             &rack_tls_tlp,
1493             "Total hdwr tls tlp");
1494         rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1495         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1496             SYSCTL_CHILDREN(rack_counters),
1497             OID_AUTO, "timer_hole", CTLFLAG_RD,
1498             &rack_per_timer_hole,
1499             "Total persists start in timer hole");
1500         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1501         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1502             OID_AUTO, "outsize", CTLFLAG_RD,
1503             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1504         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1505         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1506             OID_AUTO, "opts", CTLFLAG_RD,
1507             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1508         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1509             SYSCTL_CHILDREN(rack_sysctl_root),
1510             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1511             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1512 }
1513
1514 static __inline int
1515 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1516 {
1517         if (SEQ_GEQ(b->r_start, a->r_start) &&
1518             SEQ_LT(b->r_start, a->r_end)) {
1519                 /*
1520                  * The entry b is within the
1521                  * block a. i.e.:
1522                  * a --   |-------------|
1523                  * b --   |----|
1524                  * <or>
1525                  * b --       |------|
1526                  * <or>
1527                  * b --       |-----------|
1528                  */
1529                 return (0);
1530         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1531                 /*
1532                  * b falls as either the next
1533                  * sequence block after a so a
1534                  * is said to be smaller than b.
1535                  * i.e:
1536                  * a --   |------|
1537                  * b --          |--------|
1538                  * or
1539                  * b --              |-----|
1540                  */
1541                 return (1);
1542         }
1543         /*
1544          * Whats left is where a is
1545          * larger than b. i.e:
1546          * a --         |-------|
1547          * b --  |---|
1548          * or even possibly
1549          * b --   |--------------|
1550          */
1551         return (-1);
1552 }
1553
1554 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1555 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1556
1557 static uint32_t
1558 rc_init_window(struct tcp_rack *rack)
1559 {
1560         uint32_t win;
1561
1562         if (rack->rc_init_win == 0) {
1563                 /*
1564                  * Nothing set by the user, use the system stack
1565                  * default.
1566                  */
1567                 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1568         }
1569         win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
1570         return(win);
1571 }
1572
1573 static uint64_t
1574 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1575 {
1576         if (IN_RECOVERY(rack->rc_tp->t_flags))
1577                 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1578         else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1579                 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1580         else
1581                 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1582 }
1583
1584 static uint64_t
1585 rack_get_bw(struct tcp_rack *rack)
1586 {
1587         if (rack->use_fixed_rate) {
1588                 /* Return the fixed pacing rate */
1589                 return (rack_get_fixed_pacing_bw(rack));
1590         }
1591         if (rack->r_ctl.gp_bw == 0) {
1592                 /*
1593                  * We have yet no b/w measurement,
1594                  * if we have a user set initial bw
1595                  * return it. If we don't have that and
1596                  * we have an srtt, use the tcp IW (10) to
1597                  * calculate a fictional b/w over the SRTT
1598                  * which is more or less a guess. Note
1599                  * we don't use our IW from rack on purpose
1600                  * so if we have like IW=30, we are not
1601                  * calculating a "huge" b/w.
1602                  */
1603                 uint64_t bw, srtt;
1604                 if (rack->r_ctl.init_rate)
1605                         return (rack->r_ctl.init_rate);
1606
1607                 /* Has the user set a max peak rate? */
1608 #ifdef NETFLIX_PEAKRATE
1609                 if (rack->rc_tp->t_maxpeakrate)
1610                         return (rack->rc_tp->t_maxpeakrate);
1611 #endif
1612                 /* Ok lets come up with the IW guess, if we have a srtt */
1613                 if (rack->rc_tp->t_srtt == 0) {
1614                         /*
1615                          * Go with old pacing method
1616                          * i.e. burst mitigation only.
1617                          */
1618                         return (0);
1619                 }
1620                 /* Ok lets get the initial TCP win (not racks) */
1621                 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
1622                 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
1623                 bw *= (uint64_t)USECS_IN_SECOND;
1624                 bw /= srtt;
1625                 return (bw);
1626         } else {
1627                 uint64_t bw;
1628
1629                 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) {
1630                         /* Averaging is done, we can return the value */
1631                         bw = rack->r_ctl.gp_bw;
1632                 } else {
1633                         /* Still doing initial average must calculate */
1634                         bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg;
1635                 }
1636 #ifdef NETFLIX_PEAKRATE
1637                 if ((rack->rc_tp->t_maxpeakrate) &&
1638                     (bw > rack->rc_tp->t_maxpeakrate)) {
1639                         /* The user has set a peak rate to pace at
1640                          * don't allow us to pace faster than that.
1641                          */
1642                         return (rack->rc_tp->t_maxpeakrate);
1643                 }
1644 #endif
1645                 return (bw);
1646         }
1647 }
1648
1649 static uint16_t
1650 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
1651 {
1652         if (rack->use_fixed_rate) {
1653                 return (100);
1654         } else if (rack->in_probe_rtt && (rsm == NULL))
1655                 return(rack->r_ctl.rack_per_of_gp_probertt);
1656         else if ((IN_RECOVERY(rack->rc_tp->t_flags) &&
1657                   rack->r_ctl.rack_per_of_gp_rec)) {
1658                 if (rsm) {
1659                         /* a retransmission always use the recovery rate */
1660                         return(rack->r_ctl.rack_per_of_gp_rec);
1661                 } else if (rack->rack_rec_nonrxt_use_cr) {
1662                         /* Directed to use the configured rate */
1663                         goto configured_rate;
1664                 } else if (rack->rack_no_prr &&
1665                            (rack->r_ctl.rack_per_of_gp_rec > 100)) {
1666                         /* No PRR, lets just use the b/w estimate only */
1667                         return(100);
1668                 } else {
1669                         /*
1670                          * Here we may have a non-retransmit but we
1671                          * have no overrides, so just use the recovery
1672                          * rate (prr is in effect).
1673                          */
1674                         return(rack->r_ctl.rack_per_of_gp_rec);
1675                 }
1676         }
1677 configured_rate:
1678         /* For the configured rate we look at our cwnd vs the ssthresh */
1679         if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1680                 return (rack->r_ctl.rack_per_of_gp_ss);
1681         else
1682                 return(rack->r_ctl.rack_per_of_gp_ca);
1683 }
1684
1685 static uint64_t
1686 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm)
1687 {
1688         /*
1689          * We allow rack_per_of_gp_xx to dictate our bw rate we want.
1690          */
1691         uint64_t bw_est;
1692         uint64_t gain;
1693
1694         gain = (uint64_t)rack_get_output_gain(rack, rsm);
1695         bw_est = bw * gain;
1696         bw_est /= (uint64_t)100;
1697         /* Never fall below the minimum (def 64kbps) */
1698         if (bw_est < RACK_MIN_BW)
1699                 bw_est = RACK_MIN_BW;
1700         return (bw_est);
1701 }
1702
1703 static void
1704 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
1705 {
1706         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1707                 union tcp_log_stackspecific log;
1708                 struct timeval tv;
1709
1710                 if ((mod != 1) && (rack_verbose_logging == 0)) {
1711                         /*
1712                          * We get 3 values currently for mod
1713                          * 1 - We are retransmitting and this tells the reason.
1714                          * 2 - We are clearing a dup-ack count.
1715                          * 3 - We are incrementing a dup-ack count.
1716                          *
1717                          * The clear/increment are only logged
1718                          * if you have BBverbose on.
1719                          */
1720                         return;
1721                 }
1722                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1723                 log.u_bbr.flex1 = tsused;
1724                 log.u_bbr.flex2 = thresh;
1725                 log.u_bbr.flex3 = rsm->r_flags;
1726                 log.u_bbr.flex4 = rsm->r_dupack;
1727                 log.u_bbr.flex5 = rsm->r_start;
1728                 log.u_bbr.flex6 = rsm->r_end;
1729                 log.u_bbr.flex8 = mod;
1730                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1731                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1732                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1733                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1734                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1735                     &rack->rc_inp->inp_socket->so_rcv,
1736                     &rack->rc_inp->inp_socket->so_snd,
1737                     BBR_LOG_SETTINGS_CHG, 0,
1738                     0, &log, false, &tv);
1739         }
1740 }
1741
1742
1743
1744 static void
1745 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
1746 {
1747         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1748                 union tcp_log_stackspecific log;
1749                 struct timeval tv;
1750
1751                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1752                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
1753                 log.u_bbr.flex2 = to * 1000;
1754                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1755                 log.u_bbr.flex4 = slot;
1756                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
1757                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1758                 log.u_bbr.flex7 = rack->rc_in_persist;
1759                 log.u_bbr.flex8 = which;
1760                 if (rack->rack_no_prr)
1761                         log.u_bbr.pkts_out = 0;
1762                 else
1763                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1764                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1765                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1766                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1767                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1768                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1769                     &rack->rc_inp->inp_socket->so_rcv,
1770                     &rack->rc_inp->inp_socket->so_snd,
1771                     BBR_LOG_TIMERSTAR, 0,
1772                     0, &log, false, &tv);
1773         }
1774 }
1775
1776 static void
1777 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
1778 {
1779         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1780                 union tcp_log_stackspecific log;
1781                 struct timeval tv;
1782
1783                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1784                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1785                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1786                 log.u_bbr.flex8 = to_num;
1787                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
1788                 log.u_bbr.flex2 = rack->rc_rack_rtt;
1789                 if (rsm == NULL)
1790                         log.u_bbr.flex3 = 0;
1791                 else
1792                         log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
1793                 if (rack->rack_no_prr)
1794                         log.u_bbr.flex5 = 0;
1795                 else
1796                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1797                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1798                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1799                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1800                     &rack->rc_inp->inp_socket->so_rcv,
1801                     &rack->rc_inp->inp_socket->so_snd,
1802                     BBR_LOG_RTO, 0,
1803                     0, &log, false, &tv);
1804         }
1805 }
1806
1807 static void
1808 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
1809                  struct rack_sendmap *rsm, int conf)
1810 {
1811         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1812                 union tcp_log_stackspecific log;
1813                 struct timeval tv;
1814                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1815                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1816                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1817                 log.u_bbr.flex1 = t;
1818                 log.u_bbr.flex2 = len;
1819                 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC;
1820                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC;
1821                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC;
1822                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
1823                 log.u_bbr.flex7 = conf;
1824                 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC;
1825                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
1826                 if (rack->rack_no_prr)
1827                         log.u_bbr.pkts_out = 0;
1828                 else
1829                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1830                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1831                 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt;
1832                 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
1833                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1834                 if (rsm) {
1835                         log.u_bbr.pkt_epoch = rsm->r_start;
1836                         log.u_bbr.lost = rsm->r_end;
1837                         log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
1838                 } else {
1839
1840                         /* Its a SYN */
1841                         log.u_bbr.pkt_epoch = rack->rc_tp->iss;
1842                         log.u_bbr.lost = 0;
1843                         log.u_bbr.cwnd_gain = 0;
1844                 }
1845                 /* Write out general bits of interest rrs here */
1846                 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
1847                 log.u_bbr.use_lt_bw <<= 1;
1848                 log.u_bbr.use_lt_bw |= rack->forced_ack;
1849                 log.u_bbr.use_lt_bw <<= 1;
1850                 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
1851                 log.u_bbr.use_lt_bw <<= 1;
1852                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
1853                 log.u_bbr.use_lt_bw <<= 1;
1854                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
1855                 log.u_bbr.use_lt_bw <<= 1;
1856                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
1857                 log.u_bbr.use_lt_bw <<= 1;
1858                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
1859                 log.u_bbr.use_lt_bw <<= 1;
1860                 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
1861                 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
1862                 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
1863                 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
1864                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
1865                 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
1866                 TCP_LOG_EVENTP(tp, NULL,
1867                     &rack->rc_inp->inp_socket->so_rcv,
1868                     &rack->rc_inp->inp_socket->so_snd,
1869                     BBR_LOG_BBRRTT, 0,
1870                     0, &log, false, &tv);
1871         }
1872 }
1873
1874 static void
1875 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
1876 {
1877         /*
1878          * Log the rtt sample we are
1879          * applying to the srtt algorithm in
1880          * useconds.
1881          */
1882         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1883                 union tcp_log_stackspecific log;
1884                 struct timeval tv;
1885
1886                 /* Convert our ms to a microsecond */
1887                 memset(&log, 0, sizeof(log));
1888                 log.u_bbr.flex1 = rtt * 1000;
1889                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1890                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
1891                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1892                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
1893                 log.u_bbr.flex8 = rack->sack_attack_disable;
1894                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1895                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1896                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1897                     &rack->rc_inp->inp_socket->so_rcv,
1898                     &rack->rc_inp->inp_socket->so_snd,
1899                     TCP_LOG_RTT, 0,
1900                     0, &log, false, &tv);
1901         }
1902 }
1903
1904
1905 static inline void
1906 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
1907 {
1908         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
1909                 union tcp_log_stackspecific log;
1910                 struct timeval tv;
1911
1912                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1913                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1914                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1915                 log.u_bbr.flex1 = line;
1916                 log.u_bbr.flex2 = tick;
1917                 log.u_bbr.flex3 = tp->t_maxunacktime;
1918                 log.u_bbr.flex4 = tp->t_acktime;
1919                 log.u_bbr.flex8 = event;
1920                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1921                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1922                 TCP_LOG_EVENTP(tp, NULL,
1923                     &rack->rc_inp->inp_socket->so_rcv,
1924                     &rack->rc_inp->inp_socket->so_snd,
1925                     BBR_LOG_PROGRESS, 0,
1926                     0, &log, false, &tv);
1927         }
1928 }
1929
1930 static void
1931 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
1932 {
1933         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1934                 union tcp_log_stackspecific log;
1935
1936                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1937                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1938                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1939                 log.u_bbr.flex1 = slot;
1940                 if (rack->rack_no_prr)
1941                         log.u_bbr.flex2 = 0;
1942                 else
1943                         log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
1944                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1945                 log.u_bbr.flex8 = rack->rc_in_persist;
1946                 log.u_bbr.timeStamp = cts;
1947                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1948                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1949                     &rack->rc_inp->inp_socket->so_rcv,
1950                     &rack->rc_inp->inp_socket->so_snd,
1951                     BBR_LOG_BBRSND, 0,
1952                     0, &log, false, tv);
1953         }
1954 }
1955
1956 static void
1957 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1958 {
1959         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1960                 union tcp_log_stackspecific log;
1961                 struct timeval tv;
1962
1963                 memset(&log, 0, sizeof(log));
1964                 log.u_bbr.flex1 = did_out;
1965                 log.u_bbr.flex2 = nxt_pkt;
1966                 log.u_bbr.flex3 = way_out;
1967                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1968                 if (rack->rack_no_prr)
1969                         log.u_bbr.flex5 = 0;
1970                 else
1971                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1972                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
1973                 log.u_bbr.flex7 = rack->r_wanted_output;
1974                 log.u_bbr.flex8 = rack->rc_in_persist;
1975                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1976                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1977                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1978                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1979                     &rack->rc_inp->inp_socket->so_rcv,
1980                     &rack->rc_inp->inp_socket->so_snd,
1981                     BBR_LOG_DOSEG_DONE, 0,
1982                     0, &log, false, &tv);
1983         }
1984 }
1985
1986 static void
1987 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
1988 {
1989         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1990                 union tcp_log_stackspecific log;
1991                 struct timeval tv;
1992                 uint32_t cts;
1993
1994                 memset(&log, 0, sizeof(log));
1995                 cts = tcp_get_usecs(&tv);
1996                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
1997                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
1998                 log.u_bbr.flex4 = len;
1999                 log.u_bbr.flex5 = orig_len;
2000                 log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
2001                 log.u_bbr.flex7 = mod;
2002                 log.u_bbr.flex8 = frm;
2003                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2004                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2005                 TCP_LOG_EVENTP(tp, NULL,
2006                     &tp->t_inpcb->inp_socket->so_rcv,
2007                     &tp->t_inpcb->inp_socket->so_snd,
2008                     TCP_HDWR_TLS, 0,
2009                     0, &log, false, &tv);
2010         }
2011 }
2012
2013 static void
2014 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
2015                           uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
2016 {
2017         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2018                 union tcp_log_stackspecific log;
2019                 struct timeval tv;
2020
2021                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2022                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2023                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2024                 log.u_bbr.flex1 = slot;
2025                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
2026                 log.u_bbr.flex4 = reason;
2027                 if (rack->rack_no_prr)
2028                         log.u_bbr.flex5 = 0;
2029                 else
2030                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2031                 log.u_bbr.flex7 = hpts_calling;
2032                 log.u_bbr.flex8 = rack->rc_in_persist;
2033                 log.u_bbr.lt_epoch = cwnd_to_use;
2034                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2035                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2036                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2037                     &rack->rc_inp->inp_socket->so_rcv,
2038                     &rack->rc_inp->inp_socket->so_snd,
2039                     BBR_LOG_JUSTRET, 0,
2040                     tlen, &log, false, &tv);
2041         }
2042 }
2043
2044 static void
2045 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
2046                    struct timeval *tv, uint32_t flags_on_entry)
2047 {
2048         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2049                 union tcp_log_stackspecific log;
2050
2051                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2052                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2053                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2054                 log.u_bbr.flex1 = line;
2055                 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
2056                 log.u_bbr.flex3 = flags_on_entry;
2057                 log.u_bbr.flex4 = us_cts;
2058                 if (rack->rack_no_prr)
2059                         log.u_bbr.flex5 = 0;
2060                 else
2061                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2062                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2063                 log.u_bbr.flex7 = hpts_removed;
2064                 log.u_bbr.flex8 = 1;
2065                 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
2066                 log.u_bbr.timeStamp = us_cts;
2067                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2068                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2069                     &rack->rc_inp->inp_socket->so_rcv,
2070                     &rack->rc_inp->inp_socket->so_snd,
2071                     BBR_LOG_TIMERCANC, 0,
2072                     0, &log, false, tv);
2073         }
2074 }
2075
2076 static void
2077 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
2078                           uint32_t flex1, uint32_t flex2,
2079                           uint32_t flex3, uint32_t flex4,
2080                           uint32_t flex5, uint32_t flex6,
2081                           uint16_t flex7, uint8_t mod)
2082 {
2083         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2084                 union tcp_log_stackspecific log;
2085                 struct timeval tv;
2086
2087                 if (mod == 1) {
2088                         /* No you can't use 1, its for the real to cancel */
2089                         return;
2090                 }
2091                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2092                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2093                 log.u_bbr.flex1 = flex1;
2094                 log.u_bbr.flex2 = flex2;
2095                 log.u_bbr.flex3 = flex3;
2096                 log.u_bbr.flex4 = flex4;
2097                 log.u_bbr.flex5 = flex5;
2098                 log.u_bbr.flex6 = flex6;
2099                 log.u_bbr.flex7 = flex7;
2100                 log.u_bbr.flex8 =  mod;
2101                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2102                     &rack->rc_inp->inp_socket->so_rcv,
2103                     &rack->rc_inp->inp_socket->so_snd,
2104                     BBR_LOG_TIMERCANC, 0,
2105                     0, &log, false, &tv);
2106         }
2107 }
2108
2109 static void
2110 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
2111 {
2112         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2113                 union tcp_log_stackspecific log;
2114                 struct timeval tv;
2115
2116                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2117                 log.u_bbr.flex1 = timers;
2118                 log.u_bbr.flex2 = ret;
2119                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
2120                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2121                 log.u_bbr.flex5 = cts;
2122                 if (rack->rack_no_prr)
2123                         log.u_bbr.flex6 = 0;
2124                 else
2125                         log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
2126                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2127                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2128                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2129                     &rack->rc_inp->inp_socket->so_rcv,
2130                     &rack->rc_inp->inp_socket->so_snd,
2131                     BBR_LOG_TO_PROCESS, 0,
2132                     0, &log, false, &tv);
2133         }
2134 }
2135
2136 static void
2137 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd)
2138 {
2139         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2140                 union tcp_log_stackspecific log;
2141                 struct timeval tv;
2142
2143                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2144                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
2145                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
2146                 if (rack->rack_no_prr)
2147                         log.u_bbr.flex3 = 0;
2148                 else
2149                         log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
2150                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
2151                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
2152                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
2153                 log.u_bbr.flex8 = frm;
2154                 log.u_bbr.pkts_out = orig_cwnd;
2155                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2156                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2157                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2158                     &rack->rc_inp->inp_socket->so_rcv,
2159                     &rack->rc_inp->inp_socket->so_snd,
2160                     BBR_LOG_BBRUPD, 0,
2161                     0, &log, false, &tv);
2162         }
2163 }
2164
#ifdef NETFLIX_EXP_DETECTION
/*
 * Emit a TCP_SAD_DETECTION record (only built when
 * NETFLIX_EXP_DETECTION is defined).
 *
 * Per-connection values:
 *   flex1 - sack_count            flex2 - ack_count
 *   flex3 - sack_moved_extra      flex4 - sack_noextra_move
 *   flex5 - rc_num_maps_alloced   flex7 - sack_attack_disable
 *   flex8 - event
 * Tunables in force:
 *   flex6      - tcp_sack_to_ack_thresh
 *   pkts_out   - tcp_sack_to_move_thresh
 *   applimited - tcp_map_minimum
 *   delivered  - tcp_sad_decay_val
 *   lt_epoch   - (tcp_force_detection << 8) | do_detection
 */
static void
rack_log_sad(struct tcp_rack *rack, int event)
{
        union tcp_log_stackspecific log;
        struct timeval tv;
        struct tcpcb *tp;

        tp = rack->rc_tp;
        if (tp->t_logstate == TCP_LOG_STATE_OFF)
                return;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        /* Per-connection counters and state. */
        log.u_bbr.flex1 = rack->r_ctl.sack_count;
        log.u_bbr.flex2 = rack->r_ctl.ack_count;
        log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
        log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
        log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
        log.u_bbr.flex7 = rack->sack_attack_disable;
        log.u_bbr.flex8 = event;
        /* Tunables in force when the event fired. */
        log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
        log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
        log.u_bbr.applimited = tcp_map_minimum;
        log.u_bbr.lt_epoch = (tcp_force_detection << 8) | rack->do_detection;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
        log.u_bbr.delivered = tcp_sad_decay_val;
        TCP_LOG_EVENTP(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_SAD_DETECTION, 0,
            0, &log, false, &tv);
}
#endif
2197
2198 static void
2199 rack_counter_destroy(void)
2200 {
2201         counter_u64_free(rack_ack_total);
2202         counter_u64_free(rack_express_sack);
2203         counter_u64_free(rack_sack_total);
2204         counter_u64_free(rack_move_none);
2205         counter_u64_free(rack_move_some);
2206         counter_u64_free(rack_sack_attacks_detected);
2207         counter_u64_free(rack_sack_attacks_reversed);
2208         counter_u64_free(rack_sack_used_next_merge);
2209         counter_u64_free(rack_sack_used_prev_merge);
2210         counter_u64_free(rack_badfr);
2211         counter_u64_free(rack_badfr_bytes);
2212         counter_u64_free(rack_rtm_prr_retran);
2213         counter_u64_free(rack_rtm_prr_newdata);
2214         counter_u64_free(rack_timestamp_mismatch);
2215         counter_u64_free(rack_find_high);
2216         counter_u64_free(rack_reorder_seen);
2217         counter_u64_free(rack_tlp_tot);
2218         counter_u64_free(rack_tlp_newdata);
2219         counter_u64_free(rack_tlp_retran);
2220         counter_u64_free(rack_tlp_retran_bytes);
2221         counter_u64_free(rack_tlp_retran_fail);
2222         counter_u64_free(rack_to_tot);
2223         counter_u64_free(rack_to_arm_rack);
2224         counter_u64_free(rack_to_arm_tlp);
2225         counter_u64_free(rack_calc_zero);
2226         counter_u64_free(rack_calc_nonzero);
2227         counter_u64_free(rack_paced_segments);
2228         counter_u64_free(rack_unpaced_segments);
2229         counter_u64_free(rack_saw_enobuf);
2230         counter_u64_free(rack_saw_enetunreach);
2231         counter_u64_free(rack_to_alloc);
2232         counter_u64_free(rack_to_alloc_hard);
2233         counter_u64_free(rack_to_alloc_emerg);
2234         counter_u64_free(rack_to_alloc_limited);
2235         counter_u64_free(rack_alloc_limited_conns);
2236         counter_u64_free(rack_split_limited);
2237         counter_u64_free(rack_sack_proc_all);
2238         counter_u64_free(rack_sack_proc_restart);
2239         counter_u64_free(rack_sack_proc_short);
2240         counter_u64_free(rack_enter_tlp_calc);
2241         counter_u64_free(rack_used_tlpmethod);
2242         counter_u64_free(rack_used_tlpmethod2);
2243         counter_u64_free(rack_sack_skipped_acked);
2244         counter_u64_free(rack_sack_splits);
2245         counter_u64_free(rack_progress_drops);
2246         counter_u64_free(rack_input_idle_reduces);
2247         counter_u64_free(rack_collapsed_win);
2248         counter_u64_free(rack_tlp_does_nada);
2249         counter_u64_free(rack_try_scwnd);
2250         counter_u64_free(rack_tls_rwnd);
2251         counter_u64_free(rack_tls_cwnd);
2252         counter_u64_free(rack_tls_app);
2253         counter_u64_free(rack_tls_other);
2254         counter_u64_free(rack_tls_filled);
2255         counter_u64_free(rack_tls_rxt);
2256         counter_u64_free(rack_tls_tlp);
2257         counter_u64_free(rack_per_timer_hole);
2258         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
2259         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
2260 }
2261
2262 static struct rack_sendmap *
2263 rack_alloc(struct tcp_rack *rack)
2264 {
2265         struct rack_sendmap *rsm;
2266
2267         rsm = uma_zalloc(rack_zone, M_NOWAIT);
2268         if (rsm) {
2269                 rack->r_ctl.rc_num_maps_alloced++;
2270                 counter_u64_add(rack_to_alloc, 1);
2271                 return (rsm);
2272         }
2273         if (rack->rc_free_cnt) {
2274                 counter_u64_add(rack_to_alloc_emerg, 1);
2275                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
2276                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2277                 rack->rc_free_cnt--;
2278                 return (rsm);
2279         }
2280         return (NULL);
2281 }
2282
2283 static struct rack_sendmap *
2284 rack_alloc_full_limit(struct tcp_rack *rack)
2285 {
2286         if ((V_tcp_map_entries_limit > 0) &&
2287             (rack->do_detection == 0) &&
2288             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
2289                 counter_u64_add(rack_to_alloc_limited, 1);
2290                 if (!rack->alloc_limit_reported) {
2291                         rack->alloc_limit_reported = 1;
2292                         counter_u64_add(rack_alloc_limited_conns, 1);
2293                 }
2294                 return (NULL);
2295         }
2296         return (rack_alloc(rack));
2297 }
2298
2299 /* wrapper to allocate a sendmap entry, subject to a specific limit */
2300 static struct rack_sendmap *
2301 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
2302 {
2303         struct rack_sendmap *rsm;
2304
2305         if (limit_type) {
2306                 /* currently there is only one limit type */
2307                 if (V_tcp_map_split_limit > 0 &&
2308                     (rack->do_detection == 0) &&
2309                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
2310                         counter_u64_add(rack_split_limited, 1);
2311                         if (!rack->alloc_limit_reported) {
2312                                 rack->alloc_limit_reported = 1;
2313                                 counter_u64_add(rack_alloc_limited_conns, 1);
2314                         }
2315                         return (NULL);
2316                 }
2317         }
2318
2319         /* allocate and mark in the limit type, if set */
2320         rsm = rack_alloc(rack);
2321         if (rsm != NULL && limit_type) {
2322                 rsm->r_limit_type = limit_type;
2323                 rack->r_ctl.rc_num_split_allocs++;
2324         }
2325         return (rsm);
2326 }
2327
2328 static void
2329 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
2330 {
2331         if (rsm->r_flags & RACK_APP_LIMITED) {
2332                 if (rack->r_ctl.rc_app_limited_cnt > 0) {
2333                         rack->r_ctl.rc_app_limited_cnt--;
2334                 }
2335         }
2336         if (rsm->r_limit_type) {
2337                 /* currently there is only one limit type */
2338                 rack->r_ctl.rc_num_split_allocs--;
2339         }
2340         if (rsm == rack->r_ctl.rc_first_appl) {
2341                 if (rack->r_ctl.rc_app_limited_cnt == 0)
2342                         rack->r_ctl.rc_first_appl = NULL;
2343                 else {
2344                         /* Follow the next one out */
2345                         struct rack_sendmap fe;
2346
2347                         fe.r_start = rsm->r_nseq_appl;
2348                         rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
2349                 }
2350         }
2351         if (rsm == rack->r_ctl.rc_resend)
2352                 rack->r_ctl.rc_resend = NULL;
2353         if (rsm == rack->r_ctl.rc_rsm_at_retran)
2354                 rack->r_ctl.rc_rsm_at_retran = NULL;
2355         if (rsm == rack->r_ctl.rc_end_appl)
2356                 rack->r_ctl.rc_end_appl = NULL;
2357         if (rack->r_ctl.rc_tlpsend == rsm)
2358                 rack->r_ctl.rc_tlpsend = NULL;
2359         if (rack->r_ctl.rc_sacklast == rsm)
2360                 rack->r_ctl.rc_sacklast = NULL;
2361         if (rack->rc_free_cnt < rack_free_cache) {
2362                 memset(rsm, 0, sizeof(struct rack_sendmap));
2363                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
2364                 rsm->r_limit_type = 0;
2365                 rack->rc_free_cnt++;
2366                 return;
2367         }
2368         rack->r_ctl.rc_num_maps_alloced--;
2369         uma_zfree(rack_zone, rsm);
2370 }
2371
2372 static uint32_t
2373 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
2374 {
2375         uint64_t srtt, bw, len, tim;
2376         uint32_t segsiz, def_len, minl;
2377
2378         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
2379         def_len = rack_def_data_window * segsiz;
2380         if (rack->rc_gp_filled == 0) {
2381                 /*
2382                  * We have no measurement (IW is in flight?) so
2383                  * we can only guess using our data_window sysctl
2384                  * value (usually 100MSS).
2385                  */
2386                 return (def_len);
2387         }
2388         /*
2389          * Now we have a number of factors to consider.
2390          *
2391          * 1) We have a desired BDP which is usually
2392          *    at least 2.
2393          * 2) We have a minimum number of rtt's usually 1 SRTT
2394          *    but we allow it too to be more.
2395          * 3) We want to make sure a measurement last N useconds (if
2396          *    we have set rack_min_measure_usec.
2397          *
2398          * We handle the first concern here by trying to create a data
2399          * window of max(rack_def_data_window, DesiredBDP). The
2400          * second concern we handle in not letting the measurement
2401          * window end normally until at least the required SRTT's
2402          * have gone by which is done further below in
2403          * rack_enough_for_measurement(). Finally the third concern
2404          * we also handle here by calculating how long that time
2405          * would take at the current BW and then return the
2406          * max of our first calculation and that length. Note
2407          * that if rack_min_measure_usec is 0, we don't deal
2408          * with concern 3. Also for both Concern 1 and 3 an
2409          * application limited period could end the measurement
2410          * earlier.
2411          *
2412          * So lets calculate the BDP with the "known" b/w using
2413          * the SRTT has our rtt and then multiply it by the
2414          * goal.
2415          */
2416         bw = rack_get_bw(rack);
2417         srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
2418         len = bw * srtt;
2419         len /= (uint64_t)HPTS_USEC_IN_SEC;
2420         len *= max(1, rack_goal_bdp);
2421         /* Now we need to round up to the nearest MSS */
2422         len = roundup(len, segsiz);
2423         if (rack_min_measure_usec) {
2424                 /* Now calculate our min length for this b/w */
2425                 tim = rack_min_measure_usec;
2426                 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
2427                 if (minl == 0)
2428                         minl = 1;
2429                 minl = roundup(minl, segsiz);
2430                 if (len < minl)
2431                         len = minl;
2432         }
2433         /*
2434          * Now if we have a very small window we want
2435          * to attempt to get the window that is
2436          * as small as possible. This happens on
2437          * low b/w connections and we don't want to
2438          * span huge numbers of rtt's between measurements.
2439          *
2440          * We basically include 2 over our "MIN window" so
2441          * that the measurement can be shortened (possibly) by
2442          * an ack'ed packet.
2443          */
2444         if (len < def_len)
2445                 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
2446         else
2447                 return (max((uint32_t)len, def_len));
2448
2449 }
2450
2451 static int
2452 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack)
2453 {
2454         uint32_t tim, srtts, segsiz;
2455
2456         /*
2457          * Has enough time passed for the GP measurement to be valid?
2458          */
2459         if ((tp->snd_max == tp->snd_una) ||
2460             (th_ack == tp->snd_max)){
2461                 /* All is acked */
2462                 return (1);
2463         }
2464         if (SEQ_LT(th_ack, tp->gput_seq)) {
2465                 /* Not enough bytes yet */
2466                 return (0);
2467         }
2468         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
2469         if (SEQ_LT(th_ack, tp->gput_ack) &&
2470             ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
2471                 /* Not enough bytes yet */
2472                 return (0);
2473         }
2474         if (rack->r_ctl.rc_first_appl &&
2475             (rack->r_ctl.rc_first_appl->r_start == th_ack)) {
2476                 /*
2477                  * We are up to the app limited point
2478                  * we have to measure irrespective of the time..
2479                  */
2480                 return (1);
2481         }
2482         /* Now what about time? */
2483         srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
2484         tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
2485         if (tim >= srtts) {
2486                 return (1);
2487         }
2488         /* Nope not even a full SRTT has passed */
2489         return (0);
2490 }
2491
2492
2493 static void
2494 rack_log_timely(struct tcp_rack *rack,
2495                 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
2496                 uint64_t up_bnd, int line, uint8_t method)
2497 {
2498         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2499                 union tcp_log_stackspecific log;
2500                 struct timeval tv;
2501
2502                 memset(&log, 0, sizeof(log));
2503                 log.u_bbr.flex1 = logged;
2504                 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
2505                 log.u_bbr.flex2 <<= 4;
2506                 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
2507                 log.u_bbr.flex2 <<= 4;
2508                 log.u_bbr.flex2 |= rack->rc_gp_incr;
2509                 log.u_bbr.flex2 <<= 4;
2510                 log.u_bbr.flex2 |= rack->rc_gp_bwred;
2511                 log.u_bbr.flex3 = rack->rc_gp_incr;
2512                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
2513                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
2514                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
2515                 log.u_bbr.flex7 = rack->rc_gp_bwred;
2516                 log.u_bbr.flex8 = method;
2517                 log.u_bbr.cur_del_rate = cur_bw;
2518                 log.u_bbr.delRate = low_bnd;
2519                 log.u_bbr.bw_inuse = up_bnd;
2520                 log.u_bbr.rttProp = rack_get_bw(rack);
2521                 log.u_bbr.pkt_epoch = line;
2522                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
2523                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2524                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2525                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
2526                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
2527                 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
2528                 log.u_bbr.cwnd_gain <<= 1;
2529                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
2530                 log.u_bbr.cwnd_gain <<= 1;
2531                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
2532                 log.u_bbr.cwnd_gain <<= 1;
2533                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
2534                 log.u_bbr.lost = rack->r_ctl.rc_loss_count;
2535                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2536                     &rack->rc_inp->inp_socket->so_rcv,
2537                     &rack->rc_inp->inp_socket->so_snd,
2538                     TCP_TIMELY_WORK, 0,
2539                     0, &log, false, &tv);
2540         }
2541 }
2542
2543 static int
2544 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
2545 {
2546         /*
2547          * Before we increase we need to know if
2548          * the estimate just made was less than
2549          * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
2550          *
2551          * If we already are pacing at a fast enough
2552          * rate to push us faster there is no sense of
2553          * increasing.
2554          *
2555          * We first caculate our actual pacing rate (ss or ca multipler
2556          * times our cur_bw).
2557          *
2558          * Then we take the last measured rate and multipy by our
2559          * maximum pacing overage to give us a max allowable rate.
2560          *
2561          * If our act_rate is smaller than our max_allowable rate
2562          * then we should increase. Else we should hold steady.
2563          *
2564          */
2565         uint64_t act_rate, max_allow_rate;
2566
2567         if (rack_timely_no_stopping)
2568                 return (1);
2569
2570         if ((cur_bw == 0) || (last_bw_est == 0)) {
2571                 /*
2572                  * Initial startup case or
2573                  * everything is acked case.
2574                  */
2575                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
2576                                 __LINE__, 9);
2577                 return (1);
2578         }
2579         if (mult <= 100) {
2580                 /*
2581                  * We can always pace at or slightly above our rate.
2582                  */
2583                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
2584                                 __LINE__, 9);
2585                 return (1);
2586         }
2587         act_rate = cur_bw * (uint64_t)mult;
2588         act_rate /= 100;
2589         max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
2590         max_allow_rate /= 100;
2591         if (act_rate < max_allow_rate) {
2592                 /*
2593                  * Here the rate we are actually pacing at
2594                  * is smaller than 10% above our last measurement.
2595                  * This means we are pacing below what we would
2596                  * like to try to achieve (plus some wiggle room).
2597                  */
2598                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
2599                                 __LINE__, 9);
2600                 return (1);
2601         } else {
2602                 /*
2603                  * Here we are already pacing at least rack_max_per_above(10%)
2604                  * what we are getting back. This indicates most likely
2605                  * that we are being limited (cwnd/rwnd/app) and can't
2606                  * get any more b/w. There is no sense of trying to
2607                  * raise up the pacing rate its not speeding us up
2608                  * and we already are pacing faster than we are getting.
2609                  */
2610                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
2611                                 __LINE__, 8);
2612                 return (0);
2613         }
2614 }
2615
2616 static void
2617 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
2618 {
2619         /*
2620          * When we drag bottom, we want to assure
2621          * that no multiplier is below 1.0, if so
2622          * we want to restore it to at least that.
2623          */
2624         if (rack->r_ctl.rack_per_of_gp_rec  < 100) {
2625                 /* This is unlikely we usually do not touch recovery */
2626                 rack->r_ctl.rack_per_of_gp_rec = 100;
2627         }
2628         if (rack->r_ctl.rack_per_of_gp_ca < 100) {
2629                 rack->r_ctl.rack_per_of_gp_ca = 100;
2630         }
2631         if (rack->r_ctl.rack_per_of_gp_ss < 100) {
2632                 rack->r_ctl.rack_per_of_gp_ss = 100;
2633         }
2634 }
2635
2636 static void
2637 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
2638 {
2639         if (rack->r_ctl.rack_per_of_gp_ca > 100) {
2640                 rack->r_ctl.rack_per_of_gp_ca = 100;
2641         }
2642         if (rack->r_ctl.rack_per_of_gp_ss > 100) {
2643                 rack->r_ctl.rack_per_of_gp_ss = 100;
2644         }
2645 }
2646
2647 static void
2648 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
2649 {
2650         int32_t  calc, logged, plus;
2651
2652         logged = 0;
2653
2654         if (override) {
2655                 /*
2656                  * override is passed when we are
2657                  * loosing b/w and making one last
2658                  * gasp at trying to not loose out
2659                  * to a new-reno flow.
2660                  */
2661                 goto extra_boost;
2662         }
2663         /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */
2664         if (rack->rc_gp_incr &&
2665             ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
2666                 /*
2667                  * Reset and get 5 strokes more before the boost. Note
2668                  * that the count is 0 based so we have to add one.
2669                  */
2670 extra_boost:
2671                 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
2672                 rack->rc_gp_timely_inc_cnt = 0;
2673         } else
2674                 plus = (uint32_t)rack_gp_increase_per;
2675         /* Must be at least 1% increase for true timely increases */
2676         if ((plus < 1) &&
2677             ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
2678                 plus = 1;
2679         if (rack->rc_gp_saw_rec &&
2680             (rack->rc_gp_no_rec_chg == 0) &&
2681             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
2682                                   rack->r_ctl.rack_per_of_gp_rec)) {
2683                 /* We have been in recovery ding it too */
2684                 calc = rack->r_ctl.rack_per_of_gp_rec + plus;
2685                 if (calc > 0xffff)
2686                         calc = 0xffff;
2687                 logged |= 1;
2688                 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
2689                 if (rack_per_upper_bound_ss &&
2690                     (rack->rc_dragged_bottom == 0) &&
2691                     (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
2692                         rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
2693         }
2694         if (rack->rc_gp_saw_ca &&
2695             (rack->rc_gp_saw_ss == 0) &&
2696             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
2697                                   rack->r_ctl.rack_per_of_gp_ca)) {
2698                 /* In CA */
2699                 calc = rack->r_ctl.rack_per_of_gp_ca + plus;
2700                 if (calc > 0xffff)
2701                         calc = 0xffff;
2702                 logged |= 2;
2703                 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
2704                 if (rack_per_upper_bound_ca &&
2705                     (rack->rc_dragged_bottom == 0) &&
2706                     (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
2707                         rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
2708         }
2709         if (rack->rc_gp_saw_ss &&
2710             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
2711                                   rack->r_ctl.rack_per_of_gp_ss)) {
2712                 /* In SS */
2713                 calc = rack->r_ctl.rack_per_of_gp_ss + plus;
2714                 if (calc > 0xffff)
2715                         calc = 0xffff;
2716                 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
2717                 if (rack_per_upper_bound_ss &&
2718                     (rack->rc_dragged_bottom == 0) &&
2719                     (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
2720                         rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
2721                 logged |= 4;
2722         }
2723         if (logged &&
2724             (rack->rc_gp_incr == 0)){
2725                 /* Go into increment mode */
2726                 rack->rc_gp_incr = 1;
2727                 rack->rc_gp_timely_inc_cnt = 0;
2728         }
2729         if (rack->rc_gp_incr &&
2730             logged &&
2731             (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
2732                 rack->rc_gp_timely_inc_cnt++;
2733         }
2734         rack_log_timely(rack,  logged, plus, 0, 0,
2735                         __LINE__, 1);
2736 }
2737
2738 static uint32_t
2739 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
2740 {
2741         /*
2742          * norm_grad = rtt_diff / minrtt;
2743          * new_per = curper  * (1 - B * norm_grad)
2744          *
2745          * B = rack_gp_decrease_per (default 10%)
2746          * rtt_dif = input var current rtt-diff
2747          * curper = input var current percentage
2748          * minrtt = from rack filter
2749          *
2750          */
2751         uint64_t perf;
2752
2753         perf = (((uint64_t)curper * ((uint64_t)1000000 -
2754                     ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
2755                      (((uint64_t)rtt_diff * (uint64_t)1000000)/
2756                       (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
2757                      (uint64_t)1000000)) /
2758                 (uint64_t)1000000);
2759         if (perf > curper) {
2760                 /* TSNH */
2761                 perf = curper - 1;
2762         }
2763         return ((uint32_t)perf);
2764 }
2765
2766 static uint32_t
2767 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
2768 {
2769         /*
2770          *                                   highrttthresh
2771          * result = curper * (1 - (B * ( 1 -  ------          ))
2772          *                                     gp_srtt
2773          *
2774          * B = rack_gp_decrease_per (default 10%)
2775          * highrttthresh = filter_min * rack_gp_rtt_maxmul
2776          */
2777         uint64_t perf;
2778         uint32_t highrttthresh;
2779
2780         highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
2781
2782         perf =  (((uint64_t)curper * ((uint64_t)1000000 -
2783                                     ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
2784                                         ((uint64_t)highrttthresh * (uint64_t)1000000) /
2785                                                     (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
2786         return (perf);
2787 }
2788
2789
2790 static void
2791 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
2792 {
2793         uint64_t logvar, logvar2, logvar3;
2794         uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
2795
2796         if (rack->rc_gp_incr) {
2797                 /* Turn off increment counting  */
2798                 rack->rc_gp_incr = 0;
2799                 rack->rc_gp_timely_inc_cnt = 0;
2800         }
2801         ss_red = ca_red = rec_red = 0;
2802         logged = 0;
2803         /* Calculate the reduction value */
2804         if (rtt_diff < 0) {
2805                 rtt_diff *= -1;
2806         }
2807         /* Must be at least 1% reduction */
2808         if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
2809                 /* We have been in recovery ding it too */
2810                 if (timely_says == 2) {
2811                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
2812                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
2813                         if (alt < new_per)
2814                                 val = alt;
2815                         else
2816                                 val = new_per;
2817                 } else
2818                          val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
2819                 if (rack->r_ctl.rack_per_of_gp_rec > val) {
2820                         rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
2821                         rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
2822                 } else {
2823                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
2824                         rec_red = 0;
2825                 }
2826                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
2827                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
2828                 logged |= 1;
2829         }
2830         if (rack->rc_gp_saw_ss) {
2831                 /* Sent in SS */
2832                 if (timely_says == 2) {
2833                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
2834                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
2835                         if (alt < new_per)
2836                                 val = alt;
2837                         else
2838                                 val = new_per;
2839                 } else
2840                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
2841                 if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
2842                         ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
2843                         rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
2844                 } else {
2845                         ss_red = new_per;
2846                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
2847                         logvar = new_per;
2848                         logvar <<= 32;
2849                         logvar |= alt;
2850                         logvar2 = (uint32_t)rtt;
2851                         logvar2 <<= 32;
2852                         logvar2 |= (uint32_t)rtt_diff;
2853                         logvar3 = rack_gp_rtt_maxmul;
2854                         logvar3 <<= 32;
2855                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
2856                         rack_log_timely(rack, timely_says,
2857                                         logvar2, logvar3,
2858                                         logvar, __LINE__, 10);
2859                 }
2860                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
2861                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
2862                 logged |= 4;
2863         } else  if (rack->rc_gp_saw_ca) {
2864                 /* Sent in CA */
2865                 if (timely_says == 2) {
2866                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
2867                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
2868                         if (alt < new_per)
2869                                 val = alt;
2870                         else
2871                                 val = new_per;
2872                 } else
2873                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
2874                 if (rack->r_ctl.rack_per_of_gp_ca > val) {
2875                         ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
2876                         rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
2877                 } else {
2878                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
2879                         ca_red = 0;
2880                         logvar = new_per;
2881                         logvar <<= 32;
2882                         logvar |= alt;
2883                         logvar2 = (uint32_t)rtt;
2884                         logvar2 <<= 32;
2885                         logvar2 |= (uint32_t)rtt_diff;
2886                         logvar3 = rack_gp_rtt_maxmul;
2887                         logvar3 <<= 32;
2888                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
2889                         rack_log_timely(rack, timely_says,
2890                                         logvar2, logvar3,
2891                                         logvar, __LINE__, 10);
2892                 }
2893                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
2894                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
2895                 logged |= 2;
2896         }
2897         if (rack->rc_gp_timely_dec_cnt < 0x7) {
2898                 rack->rc_gp_timely_dec_cnt++;
2899                 if (rack_timely_dec_clear &&
2900                     (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
2901                         rack->rc_gp_timely_dec_cnt = 0;
2902         }
2903         logvar = ss_red;
2904         logvar <<= 32;
2905         logvar |= ca_red;
2906         rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
2907                         __LINE__, 2);
2908 }
2909
2910 static void
2911 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
2912                      uint32_t rtt, uint32_t line, uint8_t reas)
2913 {
2914         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2915                 union tcp_log_stackspecific log;
2916                 struct timeval tv;
2917
2918                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2919                 log.u_bbr.flex1 = line;
2920                 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
2921                 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
2922                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
2923                 log.u_bbr.flex5 = rtt;
2924                 log.u_bbr.flex6 = rack->rc_highly_buffered;
2925                 log.u_bbr.flex6 <<= 1;
2926                 log.u_bbr.flex6 |= rack->forced_ack;
2927                 log.u_bbr.flex6 <<= 1;
2928                 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
2929                 log.u_bbr.flex6 <<= 1;
2930                 log.u_bbr.flex6 |= rack->in_probe_rtt;
2931                 log.u_bbr.flex6 <<= 1;
2932                 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
2933                 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
2934                 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
2935                 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
2936                 log.u_bbr.flex8 = reas;
2937                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2938                 log.u_bbr.delRate = rack_get_bw(rack);
2939                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
2940                 log.u_bbr.cur_del_rate <<= 32;
2941                 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
2942                 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
2943                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
2944                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2945                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
2946                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
2947                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
2948                 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
2949                 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
2950                 log.u_bbr.rttProp = us_cts;
2951                 log.u_bbr.rttProp <<= 32;
2952                 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
2953                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2954                     &rack->rc_inp->inp_socket->so_rcv,
2955                     &rack->rc_inp->inp_socket->so_snd,
2956                     BBR_LOG_RTT_SHRINKS, 0,
2957                     0, &log, false, &rack->r_ctl.act_rcv_time);
2958         }
2959 }
2960
2961 static void
2962 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
2963 {
2964         uint64_t bwdp;
2965
2966         bwdp = rack_get_bw(rack);
2967         bwdp *= (uint64_t)rtt;
2968         bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
2969         rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
2970         if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
2971                 /*
2972                  * A window protocol must be able to have 4 packets
2973                  * outstanding as the floor in order to function
2974                  * (especially considering delayed ack :D).
2975                  */
2976                 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
2977         }
2978 }
2979
2980 static void
2981 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
2982 {
2983         /**
2984          * ProbeRTT is a bit different in rack_pacing than in
2985          * BBR. It is like BBR in that it uses the lowering of
2986          * the RTT as a signal that we saw something new and
2987          * counts from there for how long between. But it is
2988          * different in that its quite simple. It does not
2989          * play with the cwnd and wait until we get down
2990          * to N segments outstanding and hold that for
2991          * 200ms. Instead it just sets the pacing reduction
2992          * rate to a set percentage (70 by default) and hold
2993          * that for a number of recent GP Srtt's.
2994          */
2995         uint32_t segsiz;
2996
2997         if (rack->rc_gp_dyn_mul == 0)
2998                 return;
2999
3000         if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
3001                 /* We are idle */
3002                 return;
3003         }
3004         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3005             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3006                 /*
3007                  * Stop the goodput now, the idea here is
3008                  * that future measurements with in_probe_rtt
3009                  * won't register if they are not greater so
3010                  * we want to get what info (if any) is available
3011                  * now.
3012                  */
3013                 rack_do_goodput_measurement(rack->rc_tp, rack,
3014                                             rack->rc_tp->snd_una, __LINE__);
3015         }
3016         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3017         rack->r_ctl.rc_time_probertt_entered = us_cts;
3018         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3019                      rack->r_ctl.rc_pace_min_segs);
3020         rack->in_probe_rtt = 1;
3021         rack->measure_saw_probe_rtt = 1;
3022         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3023         rack->r_ctl.rc_time_probertt_starts = 0;
3024         rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
3025         if (rack_probertt_use_min_rtt_entry)
3026                 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3027         else
3028                 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
3029         rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3030                              __LINE__, RACK_RTTS_ENTERPROBE);
3031 }
3032
3033 static void
3034 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
3035 {
3036         struct rack_sendmap *rsm;
3037         uint32_t segsiz;
3038
3039         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3040                      rack->r_ctl.rc_pace_min_segs);
3041         rack->in_probe_rtt = 0;
3042         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3043             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3044                 /*
3045                  * Stop the goodput now, the idea here is
3046                  * that future measurements with in_probe_rtt
3047                  * won't register if they are not greater so
3048                  * we want to get what info (if any) is available
3049                  * now.
3050                  */
3051                 rack_do_goodput_measurement(rack->rc_tp, rack,
3052                                             rack->rc_tp->snd_una, __LINE__);
3053         } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
3054                 /*
3055                  * We don't have enough data to make a measurement.
3056                  * So lets just stop and start here after exiting
3057                  * probe-rtt. We probably are not interested in
3058                  * the results anyway.
3059                  */
3060                 rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
3061         }
3062         /*
3063          * Measurements through the current snd_max are going
3064          * to be limited by the slower pacing rate.
3065          *
3066          * We need to mark these as app-limited so we
3067          * don't collapse the b/w.
3068          */
3069         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3070         if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
3071                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3072                         rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
3073                 else {
3074                         /*
3075                          * Go out to the end app limited and mark
3076                          * this new one as next and move the end_appl up
3077                          * to this guy.
3078                          */
3079                         if (rack->r_ctl.rc_end_appl)
3080                                 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
3081                         rack->r_ctl.rc_end_appl = rsm;
3082                 }
3083                 rsm->r_flags |= RACK_APP_LIMITED;
3084                 rack->r_ctl.rc_app_limited_cnt++;
3085         }
3086         /*
3087          * Now, we need to examine our pacing rate multipliers.
3088          * If its under 100%, we need to kick it back up to
3089          * 100%. We also don't let it be over our "max" above
3090          * the actual rate i.e. 100% + rack_clamp_atexit_prtt.
3091          * Note setting clamp_atexit_prtt to 0 has the effect
3092          * of setting CA/SS to 100% always at exit (which is
3093          * the default behavior).
3094          */
3095         if (rack_probertt_clear_is) {
3096                 rack->rc_gp_incr = 0;
3097                 rack->rc_gp_bwred = 0;
3098                 rack->rc_gp_timely_inc_cnt = 0;
3099                 rack->rc_gp_timely_dec_cnt = 0;
3100         }
3101         /* Do we do any clamping at exit? */
3102         if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
3103                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
3104                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
3105         }
3106         if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
3107                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
3108                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
3109         }
3110         /*
3111          * Lets set rtt_diff to 0, so that we will get a "boost"
3112          * after exiting.
3113          */
3114         rack->r_ctl.rc_rtt_diff = 0;
3115
3116         /* Clear all flags so we start fresh */
3117         rack->rc_tp->t_bytes_acked = 0;
3118         rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND;
3119         /*
3120          * If configured to, set the cwnd and ssthresh to
3121          * our targets.
3122          */
3123         if (rack_probe_rtt_sets_cwnd) {
3124                 uint64_t ebdp;
3125                 uint32_t setto;
3126
3127                 /* Set ssthresh so we get into CA once we hit our target */
3128                 if (rack_probertt_use_min_rtt_exit == 1) {
3129                         /* Set to min rtt */
3130                         rack_set_prtt_target(rack, segsiz,
3131                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3132                 } else if (rack_probertt_use_min_rtt_exit == 2) {
3133                         /* Set to current gp rtt */
3134                         rack_set_prtt_target(rack, segsiz,
3135                                              rack->r_ctl.rc_gp_srtt);
3136                 } else if (rack_probertt_use_min_rtt_exit == 3) {
3137                         /* Set to entry gp rtt */
3138                         rack_set_prtt_target(rack, segsiz,
3139                                              rack->r_ctl.rc_entry_gp_rtt);
3140                 } else  {
3141                         uint64_t sum;
3142                         uint32_t setval;
3143
3144                         sum = rack->r_ctl.rc_entry_gp_rtt;
3145                         sum *= 10;
3146                         sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
3147                         if (sum >= 20) {
3148                                 /*
3149                                  * A highly buffered path needs
3150                                  * cwnd space for timely to work.
3151                                  * Lets set things up as if
3152                                  * we are heading back here again.
3153                                  */
3154                                 setval = rack->r_ctl.rc_entry_gp_rtt;
3155                         } else if (sum >= 15) {
3156                                 /*
3157                                  * Lets take the smaller of the
3158                                  * two since we are just somewhat
3159                                  * buffered.
3160                                  */
3161                                 setval = rack->r_ctl.rc_gp_srtt;
3162                                 if (setval > rack->r_ctl.rc_entry_gp_rtt)
3163                                         setval = rack->r_ctl.rc_entry_gp_rtt;
3164                         } else {
3165                                 /*
3166                                  * Here we are not highly buffered
3167                                  * and should pick the min we can to
3168                                  * keep from causing loss.
3169                                  */
3170                                 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3171                         }
3172                         rack_set_prtt_target(rack, segsiz,
3173                                              setval);
3174                 }
3175                 if (rack_probe_rtt_sets_cwnd > 1) {
3176                         /* There is a percentage here to boost */
3177                         ebdp = rack->r_ctl.rc_target_probertt_flight;
3178                         ebdp *= rack_probe_rtt_sets_cwnd;
3179                         ebdp /= 100;
3180                         setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
3181                 } else
3182                         setto = rack->r_ctl.rc_target_probertt_flight;
3183                 rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
3184                 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
3185                         /* Enforce a min */
3186                         rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
3187                 }
3188                 /* If we set in the cwnd also set the ssthresh point so we are in CA */
3189                 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
3190         }
3191         rack_log_rtt_shrinks(rack,  us_cts,
3192                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3193                              __LINE__, RACK_RTTS_EXITPROBE);
3194         /* Clear times last so log has all the info */
3195         rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
3196         rack->r_ctl.rc_time_probertt_entered = us_cts;
3197         rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3198         rack->r_ctl.rc_time_of_last_probertt = us_cts;
3199 }
3200
3201 static void
3202 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
3203 {
3204         /* Check in on probe-rtt */
3205         if (rack->rc_gp_filled == 0) {
3206                 /* We do not do p-rtt unless we have gp measurements */
3207                 return;
3208         }
3209         if (rack->in_probe_rtt) {
3210                 uint64_t no_overflow;
3211                 uint32_t endtime, must_stay;
3212
3213                 if (rack->r_ctl.rc_went_idle_time &&
3214                     ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
3215                         /*
3216                          * We went idle during prtt, just exit now.
3217                          */
3218                         rack_exit_probertt(rack, us_cts);
3219                 } else if (rack_probe_rtt_safety_val &&
3220                     TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
3221                     ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
3222                         /*
3223                          * Probe RTT safety value triggered!
3224                          */
3225                         rack_log_rtt_shrinks(rack,  us_cts,
3226                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3227                                              __LINE__, RACK_RTTS_SAFETY);
3228                         rack_exit_probertt(rack, us_cts);
3229                 }
3230                 /* Calculate the max we will wait */
3231                 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
3232                 if (rack->rc_highly_buffered)
3233                         endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
3234                 /* Calculate the min we must wait */
3235                 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
3236                 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
3237                     TSTMP_LT(us_cts, endtime)) {
3238                         uint32_t calc;
3239                         /* Do we lower more? */
3240 no_exit:
3241                         if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
3242                                 calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
3243                         else
3244                                 calc = 0;
3245                         calc /= max(rack->r_ctl.rc_gp_srtt, 1);
3246                         if (calc) {
3247                                 /* Maybe */
3248                                 calc *= rack_per_of_gp_probertt_reduce;
3249                                 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
3250                                 /* Limit it too */
3251                                 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
3252                                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
3253                         }
3254                         /* We must reach target or the time set */
3255                         return;
3256                 }
3257                 if (rack->r_ctl.rc_time_probertt_starts == 0) {
3258                         if ((TSTMP_LT(us_cts, must_stay) &&
3259                              rack->rc_highly_buffered) ||
3260                              (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
3261                               rack->r_ctl.rc_target_probertt_flight)) {
3262                                 /* We are not past the must_stay time */
3263                                 goto no_exit;
3264                         }
3265                         rack_log_rtt_shrinks(rack,  us_cts,
3266                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3267                                              __LINE__, RACK_RTTS_REACHTARGET);
3268                         rack->r_ctl.rc_time_probertt_starts = us_cts;
3269                         if (rack->r_ctl.rc_time_probertt_starts == 0)
3270                                 rack->r_ctl.rc_time_probertt_starts = 1;
3271                         /* Restore back to our rate we want to pace at in prtt */
3272                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3273                 }
3274                 /*
3275                  * Setup our end time, some number of gp_srtts plus 200ms.
3276                  */
3277                 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
3278                                (uint64_t)rack_probertt_gpsrtt_cnt_mul);
3279                 if (rack_probertt_gpsrtt_cnt_div)
3280                         endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
3281                 else
3282                         endtime = 0;
3283                 endtime += rack_min_probertt_hold;
3284                 endtime += rack->r_ctl.rc_time_probertt_starts;
3285                 if (TSTMP_GEQ(us_cts,  endtime)) {
3286                         /* yes, exit probertt  */
3287                         rack_exit_probertt(rack, us_cts);
3288                 }
3289
3290         } else  if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
3291                 /* Go into probertt, its been too long since we went lower  */
3292                 rack_enter_probertt(rack, us_cts);
3293         }
3294 }
3295
/*
 * Adjust the goodput pacing multipliers after a measurement by calling
 * rack_increase_bw_mul() or rack_decrease_bw_mul().
 *
 * Nothing is done when rc_gp_dyn_mul or rc_always_pace is zero, or when
 * use_fixed_rate or in_probe_rtt is set.
 *
 * A band is built around r_ctl.last_gp_comp_bw:
 *   up_bnd  = last_gp_comp_bw + (last_gp_comp_bw * rack_gp_per_bw_mul_up) / 100
 *   low_bnd = last_gp_comp_bw - (last_gp_comp_bw * rack_gp_per_bw_mul_down) / 100
 * and last_bw_est is compared against it:
 *  - timely_says == 2 with rc_no_push_at_mrtt set: decrease.
 *  - last_bw_est below low_bnd with no losses: last_gp_comp_bw is reset to
 *    the current b/w and, while rc_gp_timely_dec_cnt is under
 *    rack_timely_max_push_drop (or timely_says is 0), an override raise is
 *    attempted; once that budget is used up control jumps to use_timely.
 *  - last_bw_est above up_bnd with no losses and timely_says != 2:
 *    last_gp_comp_bw is reset and the multiplier is raised while rc_gp_incr
 *    is under rack_timely_max_push_rise (or timely_says is 0); if the ss/ca
 *    multiplier already equals its non-zero upper bound control jumps to
 *    use_timely instead.
 *  - otherwise (use_timely): timely_says == 0 raises, non-zero lowers
 *    unless the inner "losing ground" check asks for one more raise.
 *
 * rack        - connection state whose r_ctl fields and flags are updated.
 * timely_says - verdict as produced by rack_make_timely_judgement()
 *               (0, 1 or 2).
 * last_bw_est - b/w estimate compared against the band; the visible caller
 *               passes the bytes-per-second of the latest measurement.
 * rtt, rtt_diff - only forwarded to rack_decrease_bw_mul().
 *
 * NOTE(review): low_bnd is an unsigned subtraction and would wrap if
 * rack_gp_per_bw_mul_down were above 100 -- confirm the tunable is bounded.
 */
3296 static void
3297 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
3298                        uint32_t rtt, int32_t rtt_diff)
3299 {
3300         uint64_t cur_bw, up_bnd, low_bnd, subfr;
3301         uint32_t losses;
3302
3303         if ((rack->rc_gp_dyn_mul == 0) ||
3304             (rack->use_fixed_rate) ||
3305             (rack->in_probe_rtt) ||
3306             (rack->rc_always_pace == 0)) {
3307                 /* No dynamic GP multiplier in play */
3308                 return;
3309         }
3310         losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
3311         cur_bw = rack_get_bw(rack);
3312         /* Calculate the upper and lower bounds around last_gp_comp_bw */
3313         up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
3314         up_bnd /= 100;
3315         up_bnd += rack->r_ctl.last_gp_comp_bw;
3316
3317         subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
3318         subfr /= 100;
3319         low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
3320         if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
3321                 /*
3322                  * This is the case where our RTT is above
3323                  * the max target and we have been configured
3324                  * to just do timely, with no bonus push up.
3325                  *
3326                  * There are two configurations. Set to 1, we
3327                  * just do timely if we are over our max. Set
3328                  * above 1, the multipliers are first validated to
3329                  * be at or below 100 and then decremented per timely.
3330                  */
3331                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3332                                 __LINE__, 3);
3333                 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
3334                         rack_validate_multipliers_at_or_below_100(rack);
3335                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
3336         } else if ((last_bw_est < low_bnd) && !losses) {
3337                 /*
3338                  * The estimate fell below the lower bound with no
3339                  * losses, which means we are losing ground. This could
3340                  * be because another flow entered and we are competing
3341                  * for b/w with it. This will push the RTT up which
3342                  * makes timely unusable unless we want to get shoved
3343                  * into a corner and just be backed off (the age
3344                  * old problem with delay based CC).
3345                  *
3346                  * On the other hand if it was a route change we
3347                  * would like to stay somewhat contained and not
3348                  * blow out the buffers.
3349                  */
3350                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3351                                 __LINE__, 3);
3352                 rack->r_ctl.last_gp_comp_bw = cur_bw;
3353                 if (rack->rc_gp_bwred == 0) {
3354                         /* Go into reduction counting */
3355                         rack->rc_gp_bwred = 1;
3356                         rack->rc_gp_timely_dec_cnt = 0;
3357                 }
3358                 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
3359                     (timely_says == 0)) {
3360                         /*
3361                          * Push another time with a faster pacing
3362                          * to try to gain back (we include override to
3363                          * get a full raise factor).
3364                          */
3365                         if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
3366                             (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
3367                             (timely_says == 0) ||
3368                             (rack_down_raise_thresh == 0)) {
3369                                 /*
3370                                  * Do an override up in b/w if we were
3371                                  * at or below the threshold; if the
3372                                  * threshold is zero we always raise.
3373                                  */
3374                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
3375                         } else {
3376                                 /* Log it stays the same */
3377                                 rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
3378                                                 __LINE__, 11);
3379
3380                         }
3381                         rack->rc_gp_timely_dec_cnt++;
3382                         /* Not really incrementing, so clear the increase counters */
3383                         rack->rc_gp_incr = 0;
3384                         rack->rc_gp_timely_inc_cnt = 0;
3385                 } else {
3386                         /*
3387                          * Let's just use the RTT
3388                          * information and give up
3389                          * pushing.
3390                          */
3391                         goto use_timely;
3392                 }
3393         }  else if ((timely_says != 2) &&
3394                     !losses &&
3395                     (last_bw_est > up_bnd)) {
3396                 /*
3397                  * We are increasing b/w, so let's keep going, updating
3398                  * our b/w and ignoring any timely input, unless
3399                  * of course we are at our max raise (if there is one).
3400                  */
3401
3402                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3403                                 __LINE__, 3);
3404                 rack->r_ctl.last_gp_comp_bw = cur_bw;
3405                 if (rack->rc_gp_saw_ss &&
3406                     rack_per_upper_bound_ss &&
3407                      (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
3408                             /*
3409                              * In cases where we can't go higher
3410                              * we should just use timely.
3411                              */
3412                             goto use_timely;
3413                 }
3414                 if (rack->rc_gp_saw_ca &&
3415                     rack_per_upper_bound_ca &&
3416                     (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
3417                             /*
3418                              * In cases where we can't go higher
3419                              * we should just use timely.
3420                              */
3421                             goto use_timely;
3422                 }
3423                 rack->rc_gp_bwred = 0;
3424                 rack->rc_gp_timely_dec_cnt = 0;
3425                 /* You get a set number of pushes if timely is trying to reduce  */
3426                 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
3427                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3428                 } else {
3429                         /* Log it stays the same */
3430                         rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
3431                             __LINE__, 12);
3432
3433                 }
3434                 return;
3435         } else {
3436                 /*
3437                  * We are staying between the lower and upper range bounds
3438                  * so use timely to decide.
3439                  */
3440                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3441                                 __LINE__, 3);
3442 use_timely:
3443                 if (timely_says) {
3444                         rack->rc_gp_incr = 0;
3445                         rack->rc_gp_timely_inc_cnt = 0;
3446                         if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
3447                             !losses &&
3448                             (last_bw_est < low_bnd)) {
3449                                 /* We are losing ground */
3450                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3451                                 rack->rc_gp_timely_dec_cnt++;
3452                                 /* Not really incrementing, so clear the increase counters */
3453                                 rack->rc_gp_incr = 0;
3454                                 rack->rc_gp_timely_inc_cnt = 0;
3455                         } else
3456                                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
3457                 } else  {
3458                         rack->rc_gp_bwred = 0;
3459                         rack->rc_gp_timely_dec_cnt = 0;
3460                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3461                 }
3462         }
3463 }
3464
/*
 * Classify the current RTT against the value get_filter_value_small()
 * reports for r_ctl.rc_gp_min_rtt (called min_rtt below) and return the
 * timely verdict:
 *
 *   2 - rtt >= min_rtt * rack_gp_rtt_maxmul (reduce).
 *   0 - rtt <= min_rtt + (min_rtt * rack_gp_rtt_minmul) /
 *       max(rack_gp_rtt_mindiv, 1) (increase).
 *   otherwise decided by the sign of rtt_diff:
 *   0 - rtt_diff <= 0 (increase).
 *   1 - rtt_diff  > 0 (reduce).
 *
 * Every outcome is reported through rack_log_timely(). For logging only,
 * log_rtt_a_diff packs rtt into the upper 32 bits and rtt_diff into the
 * lower 32 bits, and log_mult carries prev_rtt in its lower 32 bits (with
 * the threshold that was hit in the upper 32 bits for verdicts taken from
 * the first two tests). prev_rtt is not used for the decision itself.
 */
3465 static int32_t
3466 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
3467 {
3468         int32_t timely_says;
3469         uint64_t log_mult, log_rtt_a_diff;
3470
3471         log_rtt_a_diff = rtt;
3472         log_rtt_a_diff <<= 32;
3473         log_rtt_a_diff |= (uint32_t)rtt_diff;
3474         if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
3475                     rack_gp_rtt_maxmul)) {
3476                 /* Reduce the b/w multiplier */
3477                 timely_says = 2;
3478                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3479                 log_mult <<= 32;
3480                 log_mult |= prev_rtt;
3481                 rack_log_timely(rack,  timely_says, log_mult,
3482                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3483                                 log_rtt_a_diff, __LINE__, 4);
3484         } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3485                            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3486                             max(rack_gp_rtt_mindiv , 1)))) {
3487                 /* Increase the b/w multiplier */
3488                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3489                         ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3490                          max(rack_gp_rtt_mindiv , 1));
3491                 log_mult <<= 32;
3492                 log_mult |= prev_rtt;
3493                 timely_says = 0;
3494                 rack_log_timely(rack,  timely_says, log_mult ,
3495                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3496                                 log_rtt_a_diff, __LINE__, 5);
3497         } else {
3498                 /*
3499                  * Use a gradient to decide. The timely gradient
3500                  * is:
3501                  * grad = rc_rtt_diff / min_rtt;
3502                  *
3503                  * Anything below or equal to 0 is an
3504                  * increase indication. Anything above
3505                  * zero is a decrease. Note we take care
3506                  * of the actual gradient calculation
3507                  * in the reduction (it is not needed for
3508                  * increase).
3509                  */
3510                 log_mult = prev_rtt;
3511                 if (rtt_diff <= 0) {
3512                         /*
3513                          * rtt_diff is zero or negative, increase the
3514                          * b/w multiplier
3515                          */
3516                         timely_says = 0;
3517                         rack_log_timely(rack,  timely_says, log_mult,
3518                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
3519                 } else {
3520                         /* Reduce the b/w multiplier */
3521                         timely_says = 1;
3522                         rack_log_timely(rack,  timely_says, log_mult,
3523                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
3524                 }
3525         }
3526         return (timely_says);
3527 }
3528
3529 static void
3530 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
3531                             tcp_seq th_ack, int line)
3532 {
3533         uint64_t tim, bytes_ps, ltim, stim, utim;
3534         uint32_t segsiz, bytes, reqbytes, us_cts;
3535         int32_t gput, new_rtt_diff, timely_says;
3536
3537         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
3538         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3539         if (TSTMP_GEQ(us_cts, tp->gput_ts))
3540                 tim = us_cts - tp->gput_ts;
3541         else
3542                 tim = 0;
3543
3544         if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts))
3545                 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
3546         else
3547                 stim = 0;
3548         /*
3549          * Use the larger of the send time or ack time. This prevents us
3550          * from being influenced by ack artifacts to come up with too
3551          * high of measurement. Note that since we are spanning over many more
3552          * bytes in most of our measurements hopefully that is less likely to
3553          * occur.
3554          */
3555         if (tim > stim)
3556                 utim = max(tim, 1);
3557         else
3558                 utim = max(stim, 1);
3559         /* Lets validate utim */
3560         ltim = max(1, (utim/HPTS_USEC_IN_MSEC));
3561         gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
3562         reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
3563         if ((tim == 0) && (stim == 0)) {
3564                 /*
3565                  * Invalid measurement time, maybe
3566                  * all on one ack/one send?
3567                  */
3568                 bytes = 0;
3569                 bytes_ps = 0;
3570                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3571                                            0, 0, 0, 10, __LINE__, NULL);
3572                 goto skip_measurement;
3573         }
3574         if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
3575                 /* We never made a us_rtt measurement? */
3576                 bytes = 0;
3577                 bytes_ps = 0;
3578                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3579                                            0, 0, 0, 10, __LINE__, NULL);
3580                 goto skip_measurement;
3581         }
3582         /*
3583          * Calculate the maximum possible b/w this connection
3584          * could have. We base our calculation on the lowest
3585          * rtt we have seen during the measurement and the
3586          * largest rwnd the client has given us in that time. This
3587          * forms a BDP that is the maximum that we could ever
3588          * get to the client. Anything larger is not valid.
3589          *
3590          * I originally had code here that rejected measurements
3591          * where the time was less than 1/2 the latest us_rtt.
3592          * But after thinking on that I realized its wrong since
3593          * say you had a 150Mbps or even 1Gbps link, and you
3594          * were a long way away.. example I am in Europe (100ms rtt)
3595          * talking to my 1Gbps link in S.C. Now measuring say 150,000
3596          * bytes my time would be 1.2ms, and yet my rtt would say
3597          * the measurement was invalid the time was < 50ms. The
3598          * same thing is true for 150Mb (8ms of time).
3599          *
3600          * A better way I realized is to look at what the maximum
3601          * the connection could possibly do. This is gated on
3602          * the lowest RTT we have seen and the highest rwnd.
3603          * We should in theory never exceed that, if we are
3604          * then something on the path is storing up packets
3605          * and then feeding them all at once to our endpoint
3606          * messing up our measurement.
3607          */
3608         rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
3609         rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
3610         rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
3611         if (SEQ_LT(th_ack, tp->gput_seq)) {
3612                 /* No measurement can be made */
3613                 bytes = 0;
3614                 bytes_ps = 0;
3615                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3616                                            0, 0, 0, 10, __LINE__, NULL);
3617                 goto skip_measurement;
3618         } else
3619                 bytes = (th_ack - tp->gput_seq);
3620         bytes_ps = (uint64_t)bytes;
3621         /*
3622          * Don't measure a b/w for pacing unless we have gotten at least
3623          * an initial windows worth of data in this measurement interval.
3624          *
3625          * Small numbers of bytes get badly influenced by delayed ack and
3626          * other artifacts. Note we take the initial window or our
3627          * defined minimum GP (defaulting to 10 which hopefully is the
3628          * IW).
3629          */
3630         if (rack->rc_gp_filled == 0) {
3631                 /*
3632                  * The initial estimate is special. We
3633                  * have blasted out an IW worth of packets
3634                  * without a real valid ack ts results. We
3635                  * then setup the app_limited_needs_set flag,
3636                  * this should get the first ack in (probably 2
3637                  * MSS worth) to be recorded as the timestamp.
3638                  * We thus allow a smaller number of bytes i.e.
3639                  * IW - 2MSS.
3640                  */
3641                 reqbytes -= (2 * segsiz);
3642                 /* Also lets fill previous for our first measurement to be neutral */
3643                 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
3644         }
3645         if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
3646                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3647                                            rack->r_ctl.rc_app_limited_cnt,
3648                                            0, 0, 10, __LINE__, NULL);
3649                 goto skip_measurement;
3650         }
3651         /*
3652          * We now need to calculate the Timely like status so
3653          * we can update (possibly) the b/w multipliers.
3654          */
3655         new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
3656         if (rack->rc_gp_filled == 0) {
3657                 /* No previous reading */
3658                 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
3659         } else {
3660                 if (rack->measure_saw_probe_rtt == 0) {
3661                         /*
3662                          * We don't want a probertt to be counted
3663                          * since it will be negative incorrectly. We
3664                          * expect to be reducing the RTT when we
3665                          * pace at a slower rate.
3666                          */
3667                         rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
3668                         rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
3669                 }
3670         }
3671         timely_says = rack_make_timely_judgement(rack,
3672                 rack->r_ctl.rc_gp_srtt,
3673                 rack->r_ctl.rc_rtt_diff,
3674                 rack->r_ctl.rc_prev_gp_srtt
3675                 );
3676         bytes_ps *= HPTS_USEC_IN_SEC;
3677         bytes_ps /= utim;
3678         if (bytes_ps > rack->r_ctl.last_max_bw) {
3679                 /*
3680                  * Something is on path playing
3681                  * since this b/w is not possible based
3682                  * on our BDP (highest rwnd and lowest rtt
3683                  * we saw in the measurement window).
3684                  *
3685                  * Another option here would be to
3686                  * instead skip the measurement.
3687                  */
3688                 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
3689                                            bytes_ps, rack->r_ctl.last_max_bw, 0,
3690                                            11, __LINE__, NULL);
3691                 bytes_ps = rack->r_ctl.last_max_bw;
3692         }
3693         /* We store gp for b/w in bytes per second  */
3694         if (rack->rc_gp_filled == 0) {
3695                 /* Initial measurment */
3696                 if (bytes_ps) {
3697                         rack->r_ctl.gp_bw = bytes_ps;
3698                         rack->rc_gp_filled = 1;
3699                         rack->r_ctl.num_avg = 1;
3700                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
3701                 } else {
3702                         rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3703                                                    rack->r_ctl.rc_app_limited_cnt,
3704                                                    0, 0, 10, __LINE__, NULL);
3705                 }
3706                 if (rack->rc_inp->inp_in_hpts &&
3707                     (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
3708                         /*
3709                          * Ok we can't trust the pacer in this case
3710                          * where we transition from un-paced to paced.
3711                          * Or for that matter when the burst mitigation
3712                          * was making a wild guess and got it wrong.
3713                          * Stop the pacer and clear up all the aggregate
3714                          * delays etc.
3715                          */
3716                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3717                         rack->r_ctl.rc_hpts_flags = 0;
3718                         rack->r_ctl.rc_last_output_to = 0;
3719                 }
3720         } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
3721                 /* Still a small number run an average */
3722                 rack->r_ctl.gp_bw += bytes_ps;
3723                 rack->r_ctl.num_avg++;
3724                 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
3725                         /* We have collected enought to move forward */
3726                         rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
3727                 }
3728         } else {
3729                 /*
3730                  * We want to take 1/wma of the goodput and add in to 7/8th
3731                  * of the old value weighted by the srtt. So if your measurement
3732                  * period is say 2 SRTT's long you would get 1/4 as the
3733                  * value, if it was like 1/2 SRTT then you would get 1/16th.
3734                  *
3735                  * But we must be careful not to take too much i.e. if the
3736                  * srtt is say 20ms and the measurement is taken over
3737                  * 400ms our weight would be 400/20 i.e. 20. On the
3738                  * other hand if we get a measurement over 1ms with a
3739                  * 10ms rtt we only want to take a much smaller portion.
3740                  */
3741                 uint64_t  resid_bw, subpart, addpart, srtt;
3742
3743                 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
3744                 if (srtt == 0) {
3745                         /*
3746                          * Strange why did t_srtt go back to zero?
3747                          */
3748                         if (rack->r_ctl.rc_rack_min_rtt)
3749                                 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
3750                         else
3751                                 srtt = HPTS_USEC_IN_MSEC;
3752                 }
3753                 /*
3754                  * XXXrrs: Note for reviewers, in playing with
3755                  * dynamic pacing I discovered this GP calculation
3756                  * as done originally leads to some undesired results.
3757                  * Basically you can get longer measurements contributing
3758                  * too much to the WMA. Thus I changed it if you are doing
3759                  * dynamic adjustments to only do the aportioned adjustment
3760                  * if we have a very small (time wise) measurement. Longer
3761                  * measurements just get there weight (defaulting to 1/8)
3762                  * add to the WMA. We may want to think about changing
3763                  * this to always do that for both sides i.e. dynamic
3764                  * and non-dynamic... but considering lots of folks
3765                  * were playing with this I did not want to change the
3766                  * calculation per.se. without your thoughts.. Lawerence?
3767                  * Peter??
3768                  */
3769                 if (rack->rc_gp_dyn_mul == 0) {
3770                         subpart = rack->r_ctl.gp_bw * utim;
3771                         subpart /= (srtt * 8);
3772                         if (subpart < (rack->r_ctl.gp_bw / 2)) {
3773                                 /*
3774                                  * The b/w update takes no more
3775                                  * away then 1/2 our running total
3776                                  * so factor it in.
3777                                  */
3778                                 addpart = bytes_ps * utim;
3779                                 addpart /= (srtt * 8);
3780                         } else {
3781                                 /*
3782                                  * Don't allow a single measurement
3783                                  * to account for more than 1/2 of the
3784                                  * WMA. This could happen on a retransmission
3785                                  * where utim becomes huge compared to
3786                                  * srtt (multiple retransmissions when using
3787                                  * the sending rate which factors in all the
3788                                  * transmissions from the first one).
3789                                  */
3790                                 subpart = rack->r_ctl.gp_bw / 2;
3791                                 addpart = bytes_ps / 2;
3792                         }
3793                         resid_bw = rack->r_ctl.gp_bw - subpart;
3794                         rack->r_ctl.gp_bw = resid_bw + addpart;
3795                 } else {
3796                         if ((utim / srtt) <= 1) {
3797                                 /*
3798                                  * The b/w update was over a small period
3799                                  * of time. The idea here is to prevent a small
3800                                  * measurement time period from counting
3801                                  * too much. So we scale it based on the
3802                                  * time so it attributes less than 1/rack_wma_divisor
3803                                  * of its measurement.
3804                                  */
3805                                 subpart = rack->r_ctl.gp_bw * utim;
3806                                 subpart /= (srtt * rack_wma_divisor);
3807                                 addpart = bytes_ps * utim;
3808                                 addpart /= (srtt * rack_wma_divisor);
3809                         } else {
3810                                 /*
3811                                  * The scaled measurement was long
3812                                  * enough so lets just add in the
3813                                  * portion of the measurment i.e. 1/rack_wma_divisor
3814                                  */
3815                                 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
3816                                 addpart = bytes_ps / rack_wma_divisor;
3817                         }
3818                         if ((rack->measure_saw_probe_rtt == 0) ||
3819                             (bytes_ps > rack->r_ctl.gp_bw)) {
3820                                 /*
3821                                  * For probe-rtt we only add it in
3822                                  * if its larger, all others we just
3823                                  * add in.
3824                                  */
3825                                 resid_bw = rack->r_ctl.gp_bw - subpart;
3826                                 rack->r_ctl.gp_bw = resid_bw + addpart;
3827                         }
3828                 }
3829         }
3830         /* We do not update any multipliers if we are in or have seen a probe-rtt */
3831         if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
3832                 rack_update_multiplier(rack, timely_says, bytes_ps,
3833                                        rack->r_ctl.rc_gp_srtt,
3834                                        rack->r_ctl.rc_rtt_diff);
3835         rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
3836                                    rack_get_bw(rack), 3, line, NULL);
3837         /* reset the gp srtt and setup the new prev */
3838         rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
3839         /* Record the lost count for the next measurement */
3840         rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
3841         /*
3842          * We restart our diffs based on the gpsrtt in the
3843          * measurement window.
3844          */
3845         rack->rc_gp_rtt_set = 0;
3846         rack->rc_gp_saw_rec = 0;
3847         rack->rc_gp_saw_ca = 0;
3848         rack->rc_gp_saw_ss = 0;
3849         rack->rc_dragged_bottom = 0;
3850 skip_measurement:
3851
3852 #ifdef STATS
3853         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
3854                                  gput);
3855         /*
3856          * XXXLAS: This is a temporary hack, and should be
3857          * chained off VOI_TCP_GPUT when stats(9) grows an
3858          * API to deal with chained VOIs.
3859          */
3860         if (tp->t_stats_gput_prev > 0)
3861                 stats_voi_update_abs_s32(tp->t_stats,
3862                                          VOI_TCP_GPUT_ND,
3863                                          ((gput - tp->t_stats_gput_prev) * 100) /
3864                                          tp->t_stats_gput_prev);
3865 #endif
3866         tp->t_flags &= ~TF_GPUTINPROG;
3867         tp->t_stats_gput_prev = gput;
3868         /*
3869          * Now are we app limited now and there is space from where we
3870          * were to where we want to go?
3871          *
3872          * We don't do the other case i.e. non-applimited here since
3873          * the next send will trigger us picking up the missing data.
3874          */
3875         if (rack->r_ctl.rc_first_appl &&
3876             TCPS_HAVEESTABLISHED(tp->t_state) &&
3877             rack->r_ctl.rc_app_limited_cnt &&
3878             (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
3879             ((rack->r_ctl.rc_first_appl->r_start - th_ack) >
3880              max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
3881                 /*
3882                  * Yep there is enough outstanding to make a measurement here.
3883                  */
3884                 struct rack_sendmap *rsm, fe;
3885
3886                 tp->t_flags |= TF_GPUTINPROG;
3887                 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
3888                 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
3889                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
3890                 rack->app_limited_needs_set = 0;
3891                 tp->gput_seq = th_ack;
3892                 if (rack->in_probe_rtt)
3893                         rack->measure_saw_probe_rtt = 1;
3894                 else if ((rack->measure_saw_probe_rtt) &&
3895                          (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
3896                         rack->measure_saw_probe_rtt = 0;
3897                 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) {
3898                         /* There is a full window to gain info from */
3899                         tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
3900                 } else {
3901                         /* We can only measure up to the applimited point */
3902                         tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack);
3903                 }
3904                 /*
3905                  * Now we need to find the timestamp of the send at tp->gput_seq
3906                  * for the send based measurement.
3907                  */
3908                 fe.r_start = tp->gput_seq;
3909                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3910                 if (rsm) {
3911                         /* Ok send-based limit is set */
3912                         if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
3913                                 /*
3914                                  * Move back to include the earlier part
3915                                  * so our ack time lines up right (this may
3916                                  * make an overlapping measurement but thats
3917                                  * ok).
3918                                  */
3919                                 tp->gput_seq = rsm->r_start;
3920                         }
3921                         if (rsm->r_flags & RACK_ACKED)
3922                                 tp->gput_ts = rsm->r_ack_arrival;
3923                         else
3924                                 rack->app_limited_needs_set = 1;
3925                         rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
3926                 } else {
3927                         /*
3928                          * If we don't find the rsm due to some
3929                          * send-limit set the current time, which
3930                          * basically disables the send-limit.
3931                          */
3932                         rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL);
3933                 }
3934                 rack_log_pacing_delay_calc(rack,
3935                                            tp->gput_seq,
3936                                            tp->gput_ack,
3937                                            (uint64_t)rsm,
3938                                            tp->gput_ts,
3939                                            rack->r_ctl.rc_app_limited_cnt,
3940                                            9,
3941                                            __LINE__, NULL);
3942         }
3943 }
3944
3945 /*
3946  * CC wrapper hook functions
3947  */
3948 static void
3949 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
3950     uint16_t type, int32_t recovery)
3951 {
3952         INP_WLOCK_ASSERT(tp->t_inpcb);
3953         tp->ccv->nsegs = nsegs;
3954         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
3955         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
3956                 uint32_t max;
3957
3958                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
3959                 if (tp->ccv->bytes_this_ack > max) {
3960                         tp->ccv->bytes_this_ack = max;
3961                 }
3962         }
3963         if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd)
3964                 tp->ccv->flags |= CCF_CWND_LIMITED;
3965         else
3966                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
3967 #ifdef STATS
3968         stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
3969             ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
3970 #endif
3971         if ((tp->t_flags & TF_GPUTINPROG) &&
3972             rack_enough_for_measurement(tp, rack, th->th_ack)) {
3973                 /* Measure the Goodput */
3974                 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__);
3975 #ifdef NETFLIX_PEAKRATE
3976                 if ((type == CC_ACK) &&
3977                     (tp->t_maxpeakrate)) {
3978                         /*
3979                          * We update t_peakrate_thr. This gives us roughly
3980                          * one update per round trip time. Note
3981                          * it will only be used if pace_always is off i.e
3982                          * we don't do this for paced flows.
3983                          */
3984                         tcp_update_peakrate_thr(tp);
3985                 }
3986 #endif
3987         }
3988         if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) {
3989                 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
3990                          nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
3991                 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
3992                         tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
3993                         tp->ccv->flags |= CCF_ABC_SENTAWND;
3994                 }
3995         } else {
3996                 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
3997                 tp->t_bytes_acked = 0;
3998         }
3999         if (CC_ALGO(tp)->ack_received != NULL) {
4000                 /* XXXLAS: Find a way to live without this */
4001                 tp->ccv->curack = th->th_ack;
4002                 CC_ALGO(tp)->ack_received(tp->ccv, type);
4003         }
4004 #ifdef STATS
4005         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
4006 #endif
4007         if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
4008                 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
4009         }
4010 #ifdef NETFLIX_PEAKRATE
4011         /* we enforce max peak rate if it is set and we are not pacing */
4012         if ((rack->rc_always_pace == 0) &&
4013             tp->t_peakrate_thr &&
4014             (tp->snd_cwnd > tp->t_peakrate_thr)) {
4015                 tp->snd_cwnd = tp->t_peakrate_thr;
4016         }
4017 #endif
4018 }
4019
4020 static void
4021 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
4022 {
4023         struct tcp_rack *rack;
4024
4025         rack = (struct tcp_rack *)tp->t_fb_ptr;
4026         INP_WLOCK_ASSERT(tp->t_inpcb);
4027         /*
4028          * If we are doing PRR and have enough
4029          * room to send <or> we are pacing and prr
4030          * is disabled we will want to see if we
4031          * can send data (by setting r_wanted_output to
4032          * true).
4033          */
4034         if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
4035             rack->rack_no_prr)
4036                 rack->r_wanted_output = 1;
4037 }
4038
4039 static void
4040 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
4041 {
4042         struct tcp_rack *rack;
4043         uint32_t orig_cwnd;
4044
4045
4046         orig_cwnd = tp->snd_cwnd;
4047         INP_WLOCK_ASSERT(tp->t_inpcb);
4048         rack = (struct tcp_rack *)tp->t_fb_ptr;
4049         if (rack->rc_not_backing_off == 0) {
4050                 /* only alert CC if we alerted when we entered */
4051                 if (CC_ALGO(tp)->post_recovery != NULL) {
4052                         tp->ccv->curack = th->th_ack;
4053                         CC_ALGO(tp)->post_recovery(tp->ccv);
4054                 }
4055                 if (tp->snd_cwnd > tp->snd_ssthresh) {
4056                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
4057                         tp->snd_cwnd = tp->snd_ssthresh;
4058                 }
4059         }
4060         if ((rack->rack_no_prr == 0) &&
4061             (rack->r_ctl.rc_prr_sndcnt > 0)) {
4062                 /* Suck the next prr cnt back into cwnd */
4063                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
4064                 rack->r_ctl.rc_prr_sndcnt = 0;
4065                 rack_log_to_prr(rack, 1, 0);
4066         }
4067         rack_log_to_prr(rack, 14, orig_cwnd);
4068         tp->snd_recover = tp->snd_una;
4069         EXIT_RECOVERY(tp->t_flags);
4070 }
4071
4072 static void
4073 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
4074 {
4075         struct tcp_rack *rack;
4076
4077         INP_WLOCK_ASSERT(tp->t_inpcb);
4078
4079         rack = (struct tcp_rack *)tp->t_fb_ptr;
4080         switch (type) {
4081         case CC_NDUPACK:
4082                 tp->t_flags &= ~TF_WASFRECOVERY;
4083                 tp->t_flags &= ~TF_WASCRECOVERY;
4084                 if (!IN_FASTRECOVERY(tp->t_flags)) {
4085                         rack->r_ctl.rc_prr_delivered = 0;
4086                         rack->r_ctl.rc_prr_out = 0;
4087                         if (rack->rack_no_prr == 0) {
4088                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4089                                 rack_log_to_prr(rack, 2, 0);
4090                         }
4091                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
4092                         tp->snd_recover = tp->snd_max;
4093                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4094                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4095                 }
4096                 break;
4097         case CC_ECN:
4098                 if (!IN_CONGRECOVERY(tp->t_flags) ||
4099                     /*
4100                      * Allow ECN reaction on ACK to CWR, if
4101                      * that data segment was also CE marked.
4102                      */
4103                     SEQ_GEQ(th->th_ack, tp->snd_recover)) {
4104                         EXIT_CONGRECOVERY(tp->t_flags);
4105                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
4106                         tp->snd_recover = tp->snd_max + 1;
4107                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4108                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4109                 }
4110                 break;
4111         case CC_RTO:
4112                 tp->t_dupacks = 0;
4113                 tp->t_bytes_acked = 0;
4114                 EXIT_RECOVERY(tp->t_flags);
4115                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
4116                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
4117                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
4118                 if (tp->t_flags2 & TF2_ECN_PERMIT)
4119                         tp->t_flags2 |= TF2_ECN_SND_CWR;
4120                 break;
4121         case CC_RTO_ERR:
4122                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
4123                 /* RTO was unnecessary, so reset everything. */
4124                 tp->snd_cwnd = tp->snd_cwnd_prev;
4125                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
4126                 tp->snd_recover = tp->snd_recover_prev;
4127                 if (tp->t_flags & TF_WASFRECOVERY) {
4128                         ENTER_FASTRECOVERY(tp->t_flags);
4129                         tp->t_flags &= ~TF_WASFRECOVERY;
4130                 }
4131                 if (tp->t_flags & TF_WASCRECOVERY) {
4132                         ENTER_CONGRECOVERY(tp->t_flags);
4133                         tp->t_flags &= ~TF_WASCRECOVERY;
4134                 }
4135                 tp->snd_nxt = tp->snd_max;
4136                 tp->t_badrxtwin = 0;
4137                 break;
4138         }
4139         /*
4140          * If we are below our max rtt, don't
4141          * signal the CC control to change things.
4142          * instead set it up so that we are in
4143          * recovery but not going to back off.
4144          */
4145
4146         if (rack->rc_highly_buffered) {
4147                 /*
4148                  * Do we use the higher rtt for
4149                  * our threshold to not backoff (like CDG)?
4150                  */
4151                 uint32_t rtt_mul, rtt_div;
4152
4153                 if (rack_use_max_for_nobackoff) {
4154                         rtt_mul = (rack_gp_rtt_maxmul - 1);
4155                         rtt_div = 1;
4156                 } else {
4157                         rtt_mul = rack_gp_rtt_minmul;
4158                         rtt_div = max(rack_gp_rtt_mindiv , 1);
4159                 }
4160                 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt +
4161                                                ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) /
4162                                                 rtt_div))) {
4163                         /* below our min threshold */
4164                         rack->rc_not_backing_off = 1;
4165                         ENTER_RECOVERY(rack->rc_tp->t_flags);
4166                         rack_log_rtt_shrinks(rack, 0,
4167                                              rtt_mul,
4168                                              rtt_div,
4169                                              RACK_RTTS_NOBACKOFF);
4170                         return;
4171                 }
4172         }
4173         rack->rc_not_backing_off = 0;
4174         if (CC_ALGO(tp)->cong_signal != NULL) {
4175                 if (th != NULL)
4176                         tp->ccv->curack = th->th_ack;
4177                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
4178         }
4179 }
4180
4181
4182
4183 static inline void
4184 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
4185 {
4186         uint32_t i_cwnd;
4187
4188         INP_WLOCK_ASSERT(tp->t_inpcb);
4189
4190 #ifdef NETFLIX_STATS
4191         KMOD_TCPSTAT_INC(tcps_idle_restarts);
4192         if (tp->t_state == TCPS_ESTABLISHED)
4193                 KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
4194 #endif
4195         if (CC_ALGO(tp)->after_idle != NULL)
4196                 CC_ALGO(tp)->after_idle(tp->ccv);
4197
4198         if (tp->snd_cwnd == 1)
4199                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
4200         else
4201                 i_cwnd = rc_init_window(rack);
4202
4203         /*
4204          * Being idle is no differnt than the initial window. If the cc
4205          * clamps it down below the initial window raise it to the initial
4206          * window.
4207          */
4208         if (tp->snd_cwnd < i_cwnd) {
4209                 tp->snd_cwnd = i_cwnd;
4210         }
4211 }
4212
4213
/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * following conditions are met:
 *      - There is no delayed ack timer in progress.
 *      - Our last ack wasn't a 0-sized window. We never want to delay
 *        the ack that opens up a 0-sized window.
 *      - LRO wasn't used for this segment. We make sure by checking that the
 *        segment size is not larger than the MSS.
 *      - Delayed acks are enabled or this is a half-synchronized T/TCP
 *        connection.
 *
 * Both arguments are parenthesized in the expansion so that any expression
 * (e.g. "&cb" or a conditional expression) may be passed safely. Note that
 * 'tp' is still evaluated several times, so it must not have side effects.
 */
#define DELAY_ACK(tp, tlen)                        \
        ((((tp)->t_flags & TF_RXWIN0SENT) == 0) && \
        (((tp)->t_flags & TF_DELACK) == 0) &&      \
        ((tlen) <= (tp)->t_maxseg) &&              \
        ((tp)->t_delayed_ack || ((tp)->t_flags & TF_NEEDSYN)))
4230
4231 static struct rack_sendmap *
4232 rack_find_lowest_rsm(struct tcp_rack *rack)
4233 {
4234         struct rack_sendmap *rsm;
4235
4236         /*
4237          * Walk the time-order transmitted list looking for an rsm that is
4238          * not acked. This will be the one that was sent the longest time
4239          * ago that is still outstanding.
4240          */
4241         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
4242                 if (rsm->r_flags & RACK_ACKED) {
4243                         continue;
4244                 }
4245                 goto finish;
4246         }
4247 finish:
4248         return (rsm);
4249 }
4250
4251 static struct rack_sendmap *
4252 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
4253 {
4254         struct rack_sendmap *prsm;
4255
4256         /*
4257          * Walk the sequence order list backward until we hit and arrive at
4258          * the highest seq not acked. In theory when this is called it
4259          * should be the last segment (which it was not).
4260          */
4261         counter_u64_add(rack_find_high, 1);
4262         prsm = rsm;
4263         RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
4264                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
4265                         continue;
4266                 }
4267                 return (prsm);
4268         }
4269         return (NULL);
4270 }
4271
4272
4273 static uint32_t
4274 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
4275 {
4276         int32_t lro;
4277         uint32_t thresh;
4278
4279         /*
4280          * lro is the flag we use to determine if we have seen reordering.
4281          * If it gets set we have seen reordering. The reorder logic either
4282          * works in one of two ways:
4283          *
4284          * If reorder-fade is configured, then we track the last time we saw
4285          * re-ordering occur. If we reach the point where enough time as
4286          * passed we no longer consider reordering has occuring.
4287          *
4288          * Or if reorder-face is 0, then once we see reordering we consider
4289          * the connection to alway be subject to reordering and just set lro
4290          * to 1.
4291          *
4292          * In the end if lro is non-zero we add the extra time for
4293          * reordering in.
4294          */
4295         if (srtt == 0)
4296                 srtt = 1;
4297         if (rack->r_ctl.rc_reorder_ts) {
4298                 if (rack->r_ctl.rc_reorder_fade) {
4299                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
4300                                 lro = cts - rack->r_ctl.rc_reorder_ts;
4301                                 if (lro == 0) {
4302                                         /*
4303                                          * No time as passed since the last
4304                                          * reorder, mark it as reordering.
4305                                          */
4306                                         lro = 1;
4307                                 }
4308                         } else {
4309                                 /* Negative time? */
4310                                 lro = 0;
4311                         }
4312                         if (lro > rack->r_ctl.rc_reorder_fade) {
4313                                 /* Turn off reordering seen too */
4314                                 rack->r_ctl.rc_reorder_ts = 0;
4315                                 lro = 0;
4316                         }
4317                 } else {
4318                         /* Reodering does not fade */
4319                         lro = 1;
4320                 }
4321         } else {
4322                 lro = 0;
4323         }
4324         thresh = srtt + rack->r_ctl.rc_pkt_delay;
4325         if (lro) {
4326                 /* It must be set, if not you get 1/4 rtt */
4327                 if (rack->r_ctl.rc_reorder_shift)
4328                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
4329                 else
4330                         thresh += (srtt >> 2);
4331         } else {
4332                 thresh += 1;
4333         }
4334         /* We don't let the rack timeout be above a RTO */
4335         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
4336                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
4337         }
4338         /* And we don't want it above the RTO max either */
4339         if (thresh > rack_rto_max) {
4340                 thresh = rack_rto_max;
4341         }
4342         return (thresh);
4343 }
4344
4345 static uint32_t
4346 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
4347                      struct rack_sendmap *rsm, uint32_t srtt)
4348 {
4349         struct rack_sendmap *prsm;
4350         uint32_t thresh, len;
4351         int segsiz;
4352
4353         if (srtt == 0)
4354                 srtt = 1;
4355         if (rack->r_ctl.rc_tlp_threshold)
4356                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
4357         else
4358                 thresh = (srtt * 2);
4359
4360         /* Get the previous sent packet, if any  */
4361         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4362         counter_u64_add(rack_enter_tlp_calc, 1);
4363         len = rsm->r_end - rsm->r_start;
4364         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
4365                 /* Exactly like the ID */
4366                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
4367                         uint32_t alt_thresh;
4368                         /*
4369                          * Compensate for delayed-ack with the d-ack time.
4370                          */
4371                         counter_u64_add(rack_used_tlpmethod, 1);
4372                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
4373                         if (alt_thresh > thresh)
4374                                 thresh = alt_thresh;
4375                 }
4376         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
4377                 /* 2.1 behavior */
4378                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
4379                 if (prsm && (len <= segsiz)) {
4380                         /*
4381                          * Two packets outstanding, thresh should be (2*srtt) +
4382                          * possible inter-packet delay (if any).
4383                          */
4384                         uint32_t inter_gap = 0;
4385                         int idx, nidx;
4386
4387                         counter_u64_add(rack_used_tlpmethod, 1);
4388                         idx = rsm->r_rtr_cnt - 1;
4389                         nidx = prsm->r_rtr_cnt - 1;
4390                         if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
4391                                 /* Yes it was sent later (or at the same time) */
4392                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
4393                         }
4394                         thresh += inter_gap;
4395                 } else  if (len <= segsiz) {
4396                         /*
4397                          * Possibly compensate for delayed-ack.
4398                          */
4399                         uint32_t alt_thresh;
4400
4401                         counter_u64_add(rack_used_tlpmethod2, 1);
4402                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
4403                         if (alt_thresh > thresh)
4404                                 thresh = alt_thresh;
4405                 }
4406         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
4407                 /* 2.2 behavior */
4408                 if (len <= segsiz) {
4409                         uint32_t alt_thresh;
4410                         /*
4411                          * Compensate for delayed-ack with the d-ack time.
4412                          */
4413                         counter_u64_add(rack_used_tlpmethod, 1);
4414                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
4415                         if (alt_thresh > thresh)
4416                                 thresh = alt_thresh;
4417                 }
4418         }
4419         /* Not above an RTO */
4420         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
4421                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
4422         }
4423         /* Not above a RTO max */
4424         if (thresh > rack_rto_max) {
4425                 thresh = rack_rto_max;
4426         }
4427         /* Apply user supplied min TLP */
4428         if (thresh < rack_tlp_min) {
4429                 thresh = rack_tlp_min;
4430         }
4431         return (thresh);
4432 }
4433
4434 static uint32_t
4435 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
4436 {
4437         /*
4438          * We want the rack_rtt which is the
4439          * last rtt we measured. However if that
4440          * does not exist we fallback to the srtt (which
4441          * we probably will never do) and then as a last
4442          * resort we use RACK_INITIAL_RTO if no srtt is
4443          * yet set.
4444          */
4445         if (rack->rc_rack_rtt)
4446                 return(rack->rc_rack_rtt);
4447         else if (tp->t_srtt == 0)
4448                 return(RACK_INITIAL_RTO);
4449         return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
4450 }
4451
4452 static struct rack_sendmap *
4453 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
4454 {
4455         /*
4456          * Check to see that we don't need to fall into recovery. We will
4457          * need to do so if our oldest transmit is past the time we should
4458          * have had an ack.
4459          */
4460         struct tcp_rack *rack;
4461         struct rack_sendmap *rsm;
4462         int32_t idx;
4463         uint32_t srtt, thresh;
4464
4465         rack = (struct tcp_rack *)tp->t_fb_ptr;
4466         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
4467                 return (NULL);
4468         }
4469         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4470         if (rsm == NULL)
4471                 return (NULL);
4472
4473         if (rsm->r_flags & RACK_ACKED) {
4474                 rsm = rack_find_lowest_rsm(rack);
4475                 if (rsm == NULL)
4476                         return (NULL);
4477         }
4478         idx = rsm->r_rtr_cnt - 1;
4479         srtt = rack_grab_rtt(tp, rack);
4480         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
4481         if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) {
4482                 return (NULL);
4483         }
4484         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
4485                 return (NULL);
4486         }
4487         /* Ok if we reach here we are over-due and this guy can be sent */
4488         if (IN_RECOVERY(tp->t_flags) == 0) {
4489                 /*
4490                  * For the one that enters us into recovery record undo
4491                  * info.
4492                  */
4493                 rack->r_ctl.rc_rsm_start = rsm->r_start;
4494                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4495                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4496         }
4497         rack_cong_signal(tp, NULL, CC_NDUPACK);
4498         return (rsm);
4499 }
4500
4501 static uint32_t
4502 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
4503 {
4504         int32_t t;
4505         int32_t tt;
4506         uint32_t ret_val;
4507
4508         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
4509         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
4510             rack_persist_min, rack_persist_max);
4511         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
4512                 tp->t_rxtshift++;
4513         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
4514         ret_val = (uint32_t)tt;
4515         return (ret_val);
4516 }
4517
4518 static uint32_t
4519 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
4520 {
4521         /*
4522          * Start the FR timer, we do this based on getting the first one in
4523          * the rc_tmap. Note that if its NULL we must stop the timer. in all
4524          * events we need to stop the running timer (if its running) before
4525          * starting the new one.
4526          */
4527         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
4528         uint32_t srtt_cur;
4529         int32_t idx;
4530         int32_t is_tlp_timer = 0;
4531         struct rack_sendmap *rsm;
4532
4533         if (rack->t_timers_stopped) {
4534                 /* All timers have been stopped none are to run */
4535                 return (0);
4536         }
4537         if (rack->rc_in_persist) {
4538                 /* We can't start any timer in persists */
4539                 return (rack_get_persists_timer_val(tp, rack));
4540         }
4541         rack->rc_on_min_to = 0;
4542         if ((tp->t_state < TCPS_ESTABLISHED) ||
4543             ((tp->t_flags & TF_SACK_PERMIT) == 0))
4544                 goto activate_rxt;
4545         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4546         if ((rsm == NULL) || sup_rack) {
4547                 /* Nothing on the send map */
4548 activate_rxt:
4549                 time_since_sent = 0;
4550                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4551                 if (rsm) {
4552                         idx = rsm->r_rtr_cnt - 1;
4553                         if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
4554                                 tstmp_touse = rsm->r_tim_lastsent[idx];
4555                         else
4556                                 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
4557                         if (TSTMP_GT(cts, tstmp_touse))
4558                             time_since_sent = cts - tstmp_touse;
4559                 }
4560                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
4561                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
4562                         to = TICKS_2_MSEC(tp->t_rxtcur);
4563                         if (to > time_since_sent)
4564                                 to -= time_since_sent;
4565                         else
4566                                 to = rack->r_ctl.rc_min_to;
4567                         if (to == 0)
4568                                 to = 1;
4569                         return (to);
4570                 }
4571                 return (0);
4572         }
4573         if (rsm->r_flags & RACK_ACKED) {
4574                 rsm = rack_find_lowest_rsm(rack);
4575                 if (rsm == NULL) {
4576                         /* No lowest? */
4577                         goto activate_rxt;
4578                 }
4579         }
4580         if (rack->sack_attack_disable) {
4581                 /*
4582                  * We don't want to do
4583                  * any TLP's if you are an attacker.
4584                  * Though if you are doing what
4585                  * is expected you may still have
4586                  * SACK-PASSED marks.
4587                  */
4588                 goto activate_rxt;
4589         }
4590         /* Convert from ms to usecs */
4591         if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
4592                 if ((tp->t_flags & TF_SENTFIN) &&
4593                     ((tp->snd_max - tp->snd_una) == 1) &&
4594                     (rsm->r_flags & RACK_HAS_FIN)) {
4595                         /*
4596                          * We don't start a rack timer if all we have is a
4597                          * FIN outstanding.
4598                          */
4599                         goto activate_rxt;
4600                 }
4601                 if ((rack->use_rack_rr == 0) &&
4602                     (IN_RECOVERY(tp->t_flags)) &&
4603                     (rack->rack_no_prr == 0) &&
4604                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
4605                         /*
4606                          * We are not cheating, in recovery  and
4607                          * not enough ack's to yet get our next
4608                          * retransmission out.
4609                          *
4610                          * Note that classified attackers do not
4611                          * get to use the rack-cheat.
4612                          */
4613                         goto activate_tlp;
4614                 }
4615                 srtt = rack_grab_rtt(tp, rack);
4616                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
4617                 idx = rsm->r_rtr_cnt - 1;
4618                 exp = rsm->r_tim_lastsent[idx] + thresh;
4619                 if (SEQ_GEQ(exp, cts)) {
4620                         to = exp - cts;
4621                         if (to < rack->r_ctl.rc_min_to) {
4622                                 to = rack->r_ctl.rc_min_to;
4623                                 if (rack->r_rr_config == 3)
4624                                         rack->rc_on_min_to = 1;
4625                         }
4626                 } else {
4627                         to = rack->r_ctl.rc_min_to;
4628                         if (rack->r_rr_config == 3)
4629                                 rack->rc_on_min_to = 1;
4630                 }
4631         } else {
4632                 /* Ok we need to do a TLP not RACK */
4633 activate_tlp:
4634                 if ((rack->rc_tlp_in_progress != 0) &&
4635                     (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
4636                         /*
4637                          * The previous send was a TLP and we have sent
4638                          * N TLP's without sending new data.
4639                          */
4640                         goto activate_rxt;
4641                 }
4642                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
4643                 if (rsm == NULL) {
4644                         /* We found no rsm to TLP with. */
4645                         goto activate_rxt;
4646                 }
4647                 if (rsm->r_flags & RACK_HAS_FIN) {
4648                         /* If its a FIN we dont do TLP */
4649                         rsm = NULL;
4650                         goto activate_rxt;
4651                 }
4652                 idx = rsm->r_rtr_cnt - 1;
4653                 time_since_sent = 0;
4654                 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
4655                         tstmp_touse = rsm->r_tim_lastsent[idx];
4656                 else
4657                         tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
4658                 if (TSTMP_GT(cts, tstmp_touse))
4659                     time_since_sent = cts - tstmp_touse;
4660                 is_tlp_timer = 1;
4661                 if (tp->t_srtt) {
4662                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
4663                         srtt = TICKS_2_MSEC(srtt_cur);
4664                 } else
4665                         srtt = RACK_INITIAL_RTO;
4666                 /*
4667                  * If the SRTT is not keeping up and the
4668                  * rack RTT has spiked we want to use
4669                  * the last RTT not the smoothed one.
4670                  */
4671                 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack)))
4672                         srtt = rack_grab_rtt(tp, rack);
4673                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
4674                 if (thresh > time_since_sent)
4675                         to = thresh - time_since_sent;
4676                 else {
4677                         to = rack->r_ctl.rc_min_to;
4678                         rack_log_alt_to_to_cancel(rack,
4679                                                   thresh,               /* flex1 */
4680                                                   time_since_sent,      /* flex2 */
4681                                                   tstmp_touse,          /* flex3 */
4682                                                   rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
4683                                                   rsm->r_tim_lastsent[idx],
4684                                                   srtt,
4685                                                   idx, 99);
4686                 }
4687                 if (to > TCPTV_REXMTMAX) {
4688                         /*
4689                          * If the TLP time works out to larger than the max
4690                          * RTO lets not do TLP.. just RTO.
4691                          */
4692                         goto activate_rxt;
4693                 }
4694         }
4695         if (is_tlp_timer == 0) {
4696                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
4697         } else {
4698                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
4699         }
4700         if (to == 0)
4701                 to = 1;
4702         return (to);
4703 }
4704
4705 static void
4706 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
4707 {
4708         if (rack->rc_in_persist == 0) {
4709                 if (tp->t_flags & TF_GPUTINPROG) {
4710                         /*
4711                          * Stop the goodput now, the calling of the
4712                          * measurement function clears the flag.
4713                          */
4714                         rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__);
4715                 }
4716 #ifdef NETFLIX_SHARED_CWND
4717                 if (rack->r_ctl.rc_scw) {
4718                         tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
4719                         rack->rack_scwnd_is_idle = 1;
4720                 }
4721 #endif
4722                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
4723                 if (rack->r_ctl.rc_went_idle_time == 0)
4724                         rack->r_ctl.rc_went_idle_time = 1;
4725                 rack_timer_cancel(tp, rack, cts, __LINE__);
4726                 tp->t_rxtshift = 0;
4727                 rack->rc_in_persist = 1;
4728         }
4729 }
4730
4731 static void
4732 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
4733 {
4734         if (rack->rc_inp->inp_in_hpts)  {
4735                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
4736                 rack->r_ctl.rc_hpts_flags  = 0;
4737         }
4738 #ifdef NETFLIX_SHARED_CWND
4739         if (rack->r_ctl.rc_scw) {
4740                 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
4741                 rack->rack_scwnd_is_idle = 0;
4742         }
4743 #endif
4744         if (rack->rc_gp_dyn_mul &&
4745             (rack->use_fixed_rate == 0) &&
4746             (rack->rc_always_pace)) {
4747                 /*
4748                  * Do we count this as if a probe-rtt just
4749                  * finished?
4750                  */
4751                 uint32_t time_idle, idle_min;
4752
4753                 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
4754                 idle_min = rack_min_probertt_hold;
4755                 if (rack_probertt_gpsrtt_cnt_div) {
4756                         uint64_t extra;
4757                         extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
4758                                 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
4759                         extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
4760                         idle_min += (uint32_t)extra;
4761                 }
4762                 if (time_idle >= idle_min)  {
4763                         /* Yes, we count it as a probe-rtt. */
4764                         uint32_t us_cts;
4765
4766                         us_cts = tcp_get_usecs(NULL);
4767                         if (rack->in_probe_rtt == 0) {
4768                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
4769                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
4770                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
4771                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
4772                         } else {
4773                                 rack_exit_probertt(rack, us_cts);
4774                         }
4775                 }
4776
4777         }
4778         rack->rc_in_persist = 0;
4779         rack->r_ctl.rc_went_idle_time = 0;
4780         tp->t_rxtshift = 0;
4781         rack->r_ctl.rc_agg_delayed = 0;
4782         rack->r_early = 0;
4783         rack->r_late = 0;
4784         rack->r_ctl.rc_agg_early = 0;
4785 }
4786
4787 static void
4788 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
4789                    struct hpts_diag *diag, struct timeval *tv)
4790 {
4791         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
4792                 union tcp_log_stackspecific log;
4793
4794                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4795                 log.u_bbr.flex1 = diag->p_nxt_slot;
4796                 log.u_bbr.flex2 = diag->p_cur_slot;
4797                 log.u_bbr.flex3 = diag->slot_req;
4798                 log.u_bbr.flex4 = diag->inp_hptsslot;
4799                 log.u_bbr.flex5 = diag->slot_remaining;
4800                 log.u_bbr.flex6 = diag->need_new_to;
4801                 log.u_bbr.flex7 = diag->p_hpts_active;
4802                 log.u_bbr.flex8 = diag->p_on_min_sleep;
4803                 /* Hijack other fields as needed  */
4804                 log.u_bbr.epoch = diag->have_slept;
4805                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
4806                 log.u_bbr.pkts_out = diag->co_ret;
4807                 log.u_bbr.applimited = diag->hpts_sleep_time;
4808                 log.u_bbr.delivered = diag->p_prev_slot;
4809                 log.u_bbr.inflight = diag->p_runningtick;
4810                 log.u_bbr.bw_inuse = diag->wheel_tick;
4811                 log.u_bbr.rttProp = diag->wheel_cts;
4812                 log.u_bbr.timeStamp = cts;
4813                 log.u_bbr.delRate = diag->maxticks;
4814                 log.u_bbr.cur_del_rate = diag->p_curtick;
4815                 log.u_bbr.cur_del_rate <<= 32;
4816                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
4817                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
4818                     &rack->rc_inp->inp_socket->so_rcv,
4819                     &rack->rc_inp->inp_socket->so_snd,
4820                     BBR_LOG_HPTSDIAG, 0,
4821                     0, &log, false, tv);
4822         }
4823
4824 }
4825
4826 static void
4827 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
4828       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
4829 {
4830         struct hpts_diag diag;
4831         struct inpcb *inp;
4832         struct timeval tv;
4833         uint32_t delayed_ack = 0;
4834         uint32_t hpts_timeout;
4835         uint8_t stopped;
4836         uint32_t left = 0;
4837         uint32_t us_cts;
4838
4839         inp = tp->t_inpcb;
4840         if ((tp->t_state == TCPS_CLOSED) ||
4841             (tp->t_state == TCPS_LISTEN)) {
4842                 return;
4843         }
4844         if (inp->inp_in_hpts) {
4845                 /* Already on the pacer */
4846                 return;
4847         }
4848         stopped = rack->rc_tmr_stopped;
4849         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
4850                 left = rack->r_ctl.rc_timer_exp - cts;
4851         }
4852         rack->r_ctl.rc_timer_exp = 0;
4853         rack->r_ctl.rc_hpts_flags = 0;
4854         us_cts = tcp_get_usecs(&tv);
4855         /* Now early/late accounting */
4856         if (rack->r_early) {
4857                 /*
4858                  * We have a early carry over set,
4859                  * we can always add more time so we
4860                  * can always make this compensation.
4861                  */
4862                 slot += rack->r_ctl.rc_agg_early;
4863                 rack->r_early = 0;
4864                 rack->r_ctl.rc_agg_early = 0;
4865         }
4866         if (rack->r_late) {
4867                 /*
4868                  * This is harder, we can
4869                  * compensate some but it
4870                  * really depends on what
4871                  * the current pacing time is.
4872                  */
4873                 if (rack->r_ctl.rc_agg_delayed >= slot) {
4874                         /*
4875                          * We can't compensate for it all.
4876                          * And we have to have some time
4877                          * on the clock. We always have a min
4878                          * 10 slots (10 x 10 i.e. 100 usecs).
4879                          */
4880                         if (slot <= HPTS_TICKS_PER_USEC) {
4881                                 /* We gain delay */
4882                                 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
4883                                 slot = HPTS_TICKS_PER_USEC;
4884                         } else {
4885                                 /* We take off some */
4886                                 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
4887                                 slot = HPTS_TICKS_PER_USEC;
4888                         }
4889                 } else {
4890
4891                         slot -= rack->r_ctl.rc_agg_delayed;
4892                         rack->r_ctl.rc_agg_delayed = 0;
4893                         /* Make sure we have 100 useconds at minimum */
4894                         if (slot < HPTS_TICKS_PER_USEC) {
4895                                 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
4896                                 slot = HPTS_TICKS_PER_USEC;
4897                         }
4898                         if (rack->r_ctl.rc_agg_delayed == 0)
4899                                 rack->r_late = 0;
4900                 }
4901         }
4902         if (slot) {
4903                 /* We are pacing too */
4904                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
4905         }
4906         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
4907 #ifdef NETFLIX_EXP_DETECTION
4908         if (rack->sack_attack_disable &&
4909             (slot < tcp_sad_pacing_interval)) {
4910                 /*
4911                  * We have a potential attacker on
4912                  * the line. We have possibly some
4913                  * (or now) pacing time set. We want to
4914                  * slow down the processing of sacks by some
4915                  * amount (if it is an attacker). Set the default
4916                  * slot for attackers in place (unless the orginal
4917                  * interval is longer). Its stored in
4918                  * micro-seconds, so lets convert to msecs.
4919                  */
4920                 slot = tcp_sad_pacing_interval;
4921         }
4922 #endif
4923         if (tp->t_flags & TF_DELACK) {
4924                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
4925                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
4926         }
4927         if (delayed_ack && ((hpts_timeout == 0) ||
4928                             (delayed_ack < hpts_timeout)))
4929                 hpts_timeout = delayed_ack;
4930         else
4931                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
4932         /*
4933          * If no timers are going to run and we will fall off the hptsi
4934          * wheel, we resort to a keep-alive timer if its configured.
4935          */
4936         if ((hpts_timeout == 0) &&
4937             (slot == 0)) {
4938                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
4939                     (tp->t_state <= TCPS_CLOSING)) {
4940                         /*
4941                          * Ok we have no timer (persists, rack, tlp, rxt  or
4942                          * del-ack), we don't have segments being paced. So
4943                          * all that is left is the keepalive timer.
4944                          */
4945                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
4946                                 /* Get the established keep-alive time */
4947                                 hpts_timeout = TP_KEEPIDLE(tp);
4948                         } else {
4949                                 /* Get the initial setup keep-alive time */
4950                                 hpts_timeout = TP_KEEPINIT(tp);
4951                         }
4952                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
4953                         if (rack->in_probe_rtt) {
4954                                 /*
4955                                  * We want to instead not wake up a long time from
4956                                  * now but to wake up about the time we would
4957                                  * exit probe-rtt and initiate a keep-alive ack.
4958                                  * This will get us out of probe-rtt and update
4959                                  * our min-rtt.
4960                                  */
4961                                 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC);
4962                         }
4963                 }
4964         }
4965         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
4966             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
4967                 /*
4968                  * RACK, TLP, persists and RXT timers all are restartable
4969                  * based on actions input .. i.e we received a packet (ack
4970                  * or sack) and that changes things (rw, or snd_una etc).
4971                  * Thus we can restart them with a new value. For
4972                  * keep-alive, delayed_ack we keep track of what was left
4973                  * and restart the timer with a smaller value.
4974                  */
4975                 if (left < hpts_timeout)
4976                         hpts_timeout = left;
4977         }
4978         if (hpts_timeout) {
4979                 /*
4980                  * Hack alert for now we can't time-out over 2,147,483
4981                  * seconds (a bit more than 596 hours), which is probably ok
4982                  * :).
4983                  */
4984                 if (hpts_timeout > 0x7ffffffe)
4985                         hpts_timeout = 0x7ffffffe;
4986                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
4987         }
4988         if ((rack->rc_gp_filled == 0) &&
4989             (hpts_timeout < slot) &&
4990             (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
4991                 /*
4992                  * We have no good estimate yet for the
4993                  * old clunky burst mitigation or the
4994                  * real pacing. And the tlp or rxt is smaller
4995                  * than the pacing calculation. Lets not
4996                  * pace that long since we know the calculation
4997                  * so far is not accurate.
4998                  */
4999                 slot = hpts_timeout;
5000         }
5001         rack->r_ctl.last_pacing_time = slot;
5002         if (slot) {
5003                 rack->r_ctl.rc_last_output_to = us_cts + slot;
5004                 if (rack->rc_always_pace || rack->r_mbuf_queue) {
5005                         if ((rack->rc_gp_filled == 0) ||
5006                             rack->pacing_longer_than_rtt) {
5007                                 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
5008                         } else {
5009                                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
5010                                 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
5011                                     (rack->r_rr_config != 3))
5012                                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
5013                                 else
5014                                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
5015                         }
5016                 }
5017                 if ((rack->use_rack_rr) &&
5018                     (rack->r_rr_config < 2) &&
5019                     ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) {
5020                         /*
5021                          * Arrange for the hpts to kick back in after the
5022                          * t-o if the t-o does not cause a send.
5023                          */
5024                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout),
5025                                                    __LINE__, &diag);
5026                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5027                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
5028                 } else {
5029                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
5030                                                    __LINE__, &diag);
5031                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5032                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
5033                 }
5034         } else if (hpts_timeout) {
5035                 if (rack->rc_always_pace || rack->r_mbuf_queue) {
5036                         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)  {
5037                                 /* For a rack timer, don't wake us */
5038                                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
5039                                 if  (rack->r_rr_config != 3)
5040                                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
5041                                 else
5042                                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
5043                         } else {
5044                                 /* All other timers wake us up */
5045                                 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
5046                                 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
5047                         }
5048                 }
5049                 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout),
5050                                            __LINE__, &diag);
5051                 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5052                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
5053         } else {
5054                 /* No timer starting */
5055 #ifdef INVARIANTS
5056                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
5057                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
5058                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
5059                 }
5060 #endif
5061         }
5062         rack->rc_tmr_stopped = 0;
5063         if (slot)
5064                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
5065 }
5066
5067 /*
5068  * RACK Timer, here we simply do logging and house keeping.
5069  * the normal rack_output() function will call the
5070  * appropriate thing to check if we need to do a RACK retransmit.
5071  * We return 1, saying don't proceed with rack_output only
5072  * when all timers have been stopped (destroyed PCB?).
5073  */
5074 static int
5075 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5076 {
5077         /*
5078          * This timer simply provides an internal trigger to send out data.
5079          * The check_recovery_mode call will see if there are needed
5080          * retransmissions, if so we will enter fast-recovery. The output
5081          * call may or may not do the same thing depending on sysctl
5082          * settings.
5083          */
5084         struct rack_sendmap *rsm;
5085         int32_t recovery;
5086
5087         if (tp->t_timers->tt_flags & TT_STOPPED) {
5088                 return (1);
5089         }
5090         recovery = IN_RECOVERY(tp->t_flags);
5091         counter_u64_add(rack_to_tot, 1);
5092         if (rack->r_state && (rack->r_state != tp->t_state))
5093                 rack_set_state(tp, rack);
5094         rack->rc_on_min_to = 0;
5095         rsm = rack_check_recovery_mode(tp, cts);
5096         rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
5097         if (rsm) {
5098                 uint32_t rtt;
5099
5100                 rack->r_ctl.rc_resend = rsm;
5101                 if (rack->use_rack_rr) {
5102                         /*
5103                          * Don't accumulate extra pacing delay
5104                          * we are allowing the rack timer to
5105                          * over-ride pacing i.e. rrr takes precedence
5106                          * if the pacing interval is longer than the rrr
5107                          * time (in other words we get the min pacing
5108                          * time versus rrr pacing time).
5109                          */
5110                         rack->r_timer_override = 1;
5111                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
5112                 }
5113                 rtt = rack->rc_rack_rtt;
5114                 if (rtt == 0)
5115                         rtt = 1;
5116                 if (rack->rack_no_prr == 0) {
5117                         if ((recovery == 0) &&
5118                             (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
5119                                 /*
5120                                  * The rack-timeout that enter's us into recovery
5121                                  * will force out one MSS and set us up so that we
5122                                  * can do one more send in 2*rtt (transitioning the
5123                                  * rack timeout into a rack-tlp).
5124                                  */
5125                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5126                                 rack->r_timer_override = 1;
5127                                 rack_log_to_prr(rack, 3, 0);
5128                         } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
5129                                    rack->use_rack_rr) {
5130                                 /*
5131                                  * When a rack timer goes, if the rack rr is
5132                                  * on, arrange it so we can send a full segment
5133                                  * overriding prr (though we pay a price for this
5134                                  * for future new sends).
5135                                  */
5136                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5137                                 rack_log_to_prr(rack, 4, 0);
5138                         }
5139                 }
5140         }
5141         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
5142         if (rsm == NULL) {
5143                 /* restart a timer and return 1 */
5144                 rack_start_hpts_timer(rack, tp, cts,
5145                                       0, 0, 0);
5146                 return (1);
5147         }
5148         return (0);
5149 }
5150
5151 static __inline void
5152 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
5153                struct rack_sendmap *rsm, uint32_t start)
5154 {
5155         int idx;
5156
5157         nrsm->r_start = start;
5158         nrsm->r_end = rsm->r_end;
5159         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
5160         nrsm->r_flags = rsm->r_flags;
5161         nrsm->r_dupack = rsm->r_dupack;
5162         nrsm->usec_orig_send = rsm->usec_orig_send;
5163         nrsm->r_rtr_bytes = 0;
5164         rsm->r_end = nrsm->r_start;
5165         nrsm->r_just_ret = rsm->r_just_ret;
5166         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
5167                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
5168         }
5169 }
5170
5171 static struct rack_sendmap *
5172 rack_merge_rsm(struct tcp_rack *rack,
5173                struct rack_sendmap *l_rsm,
5174                struct rack_sendmap *r_rsm)
5175 {
5176         /*
5177          * We are merging two ack'd RSM's,
5178          * the l_rsm is on the left (lower seq
5179          * values) and the r_rsm is on the right
5180          * (higher seq value). The simplest way
5181          * to merge these is to move the right
5182          * one into the left. I don't think there
5183          * is any reason we need to try to find
5184          * the oldest (or last oldest retransmitted).
5185          */
5186         struct rack_sendmap *rm;
5187
5188         l_rsm->r_end = r_rsm->r_end;
5189         if (l_rsm->r_dupack < r_rsm->r_dupack)
5190                 l_rsm->r_dupack = r_rsm->r_dupack;
5191         if (r_rsm->r_rtr_bytes)
5192                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
5193         if (r_rsm->r_in_tmap) {
5194                 /* This really should not happen */
5195                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
5196                 r_rsm->r_in_tmap = 0;
5197         }
5198
5199         /* Now the flags */
5200         if (r_rsm->r_flags & RACK_HAS_FIN)
5201                 l_rsm->r_flags |= RACK_HAS_FIN;
5202         if (r_rsm->r_flags & RACK_TLP)
5203                 l_rsm->r_flags |= RACK_TLP;
5204         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
5205                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
5206         if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
5207             ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
5208                 /*
5209                  * If both are app-limited then let the
5210                  * free lower the count. If right is app
5211                  * limited and left is not, transfer.
5212                  */
5213                 l_rsm->r_flags |= RACK_APP_LIMITED;
5214                 r_rsm->r_flags &= ~RACK_APP_LIMITED;
5215                 if (r_rsm == rack->r_ctl.rc_first_appl)
5216                         rack->r_ctl.rc_first_appl = l_rsm;
5217         }
5218         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
5219 #ifdef INVARIANTS
5220         if (rm != r_rsm) {
5221                 panic("removing head in rack:%p rsm:%p rm:%p",
5222                       rack, r_rsm, rm);
5223         }
5224 #endif
5225         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
5226                 /* Transfer the split limit to the map we free */
5227                 r_rsm->r_limit_type = l_rsm->r_limit_type;
5228                 l_rsm->r_limit_type = 0;
5229         }
5230         rack_free(rack, r_rsm);
5231         return(l_rsm);
5232 }
5233
5234 /*
5235  * TLP Timer, here we simply setup what segment we want to
5236  * have the TLP expire on, the normal rack_output() will then
5237  * send it out.
5238  *
5239  * We return 1, saying don't proceed with rack_output only
5240  * when all timers have been stopped (destroyed PCB?).
5241  */
5242 static int
5243 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5244 {
5245         /*
5246          * Tail Loss Probe.
5247          */
5248         struct rack_sendmap *rsm = NULL;
5249         struct rack_sendmap *insret;
5250         struct socket *so;
5251         uint32_t amm, old_prr_snd = 0;
5252         uint32_t out, avail;
5253         int collapsed_win = 0;
5254
5255         if (tp->t_timers->tt_flags & TT_STOPPED) {
5256                 return (1);
5257         }
5258         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
5259                 /* Its not time yet */
5260                 return (0);
5261         }
5262         if (ctf_progress_timeout_check(tp, true)) {
5263                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
5264                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5265                 return (1);
5266         }
5267         /*
5268          * A TLP timer has expired. We have been idle for 2 rtts. So we now
5269          * need to figure out how to force a full MSS segment out.
5270          */
5271         rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
5272         counter_u64_add(rack_tlp_tot, 1);
5273         if (rack->r_state && (rack->r_state != tp->t_state))
5274                 rack_set_state(tp, rack);
5275         so = tp->t_inpcb->inp_socket;
5276 #ifdef KERN_TLS
5277         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
5278                 /*
5279                  * For hardware TLS we do *not* want to send
5280                  * new data, lets instead just do a retransmission.
5281                  */
5282                 goto need_retran;
5283         }
5284 #endif
5285         avail = sbavail(&so->so_snd);
5286         out = tp->snd_max - tp->snd_una;
5287         if (out > tp->snd_wnd) {
5288                 /* special case, we need a retransmission */
5289                 collapsed_win = 1;
5290                 goto need_retran;
5291         }
5292         /*
5293          * Check our send oldest always settings, and if
5294          * there is an oldest to send jump to the need_retran.
5295          */
5296         if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
5297                 goto need_retran;
5298
5299         if (avail > out) {
5300                 /* New data is available */
5301                 amm = avail - out;
5302                 if (amm > ctf_fixed_maxseg(tp)) {
5303                         amm = ctf_fixed_maxseg(tp);
5304                         if ((amm + out) > tp->snd_wnd) {
5305                                 /* We are rwnd limited */
5306                                 goto need_retran;
5307                         }
5308                 } else if (amm < ctf_fixed_maxseg(tp)) {
5309                         /* not enough to fill a MTU */
5310                         goto need_retran;
5311                 }
5312                 if (IN_RECOVERY(tp->t_flags)) {
5313                         /* Unlikely */
5314                         if (rack->rack_no_prr == 0) {
5315                                 old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
5316                                 if (out + amm <= tp->snd_wnd) {
5317                                         rack->r_ctl.rc_prr_sndcnt = amm;
5318                                         rack_log_to_prr(rack, 4, 0);
5319                                 }
5320                         } else
5321                                 goto need_retran;
5322                 } else {
5323                         /* Set the send-new override */
5324                         if (out + amm <= tp->snd_wnd)
5325                                 rack->r_ctl.rc_tlp_new_data = amm;
5326                         else
5327                                 goto need_retran;
5328                 }
5329                 rack->r_ctl.rc_tlpsend = NULL;
5330                 counter_u64_add(rack_tlp_newdata, 1);
5331                 goto send;
5332         }
5333 need_retran:
5334         /*
5335          * Ok we need to arrange the last un-acked segment to be re-sent, or
5336          * optionally the first un-acked segment.
5337          */
5338         if (collapsed_win == 0) {
5339                 if (rack_always_send_oldest)
5340                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5341                 else {
5342                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
5343                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
5344                                 rsm = rack_find_high_nonack(rack, rsm);
5345                         }
5346                 }
5347                 if (rsm == NULL) {
5348                         counter_u64_add(rack_tlp_does_nada, 1);
5349 #ifdef TCP_BLACKBOX
5350                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
5351 #endif
5352                         goto out;
5353                 }
5354         } else {
5355                 /*
5356                  * We must find the last segment
5357                  * that was acceptable by the client.
5358                  */
5359                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
5360                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
5361                                 /* Found one */
5362                                 break;
5363                         }
5364                 }
5365                 if (rsm == NULL) {
5366                         /* None? if so send the first */
5367                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
5368                         if (rsm == NULL) {
5369                                 counter_u64_add(rack_tlp_does_nada, 1);
5370 #ifdef TCP_BLACKBOX
5371                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
5372 #endif
5373                                 goto out;
5374                         }
5375                 }
5376         }
5377         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
5378                 /*
5379                  * We need to split this the last segment in two.
5380                  */
5381                 struct rack_sendmap *nrsm;
5382
5383
5384                 nrsm = rack_alloc_full_limit(rack);
5385                 if (nrsm == NULL) {
5386                         /*
5387                          * No memory to split, we will just exit and punt
5388                          * off to the RXT timer.
5389                          */
5390                         counter_u64_add(rack_tlp_does_nada, 1);
5391                         goto out;
5392                 }
5393                 rack_clone_rsm(rack, nrsm, rsm,
5394                                (rsm->r_end - ctf_fixed_maxseg(tp)));
5395                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
5396 #ifdef INVARIANTS
5397                 if (insret != NULL) {
5398                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
5399                               nrsm, insret, rack, rsm);
5400                 }
5401 #endif
5402                 if (rsm->r_in_tmap) {
5403                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5404                         nrsm->r_in_tmap = 1;
5405                 }
5406                 rsm->r_flags &= (~RACK_HAS_FIN);
5407                 rsm = nrsm;
5408         }
5409         rack->r_ctl.rc_tlpsend = rsm;
5410 send:
5411         rack->r_timer_override = 1;
5412         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
5413         return (0);
5414 out:
5415         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
5416         return (0);
5417 }
5418
5419 /*
5420  * Delayed ack Timer, here we simply need to setup the
5421  * ACK_NOW flag and remove the DELACK flag. From there
5422  * the output routine will send the ack out.
5423  *
5424  * We only return 1, saying don't proceed, if all timers
5425  * are stopped (destroyed PCB?).
5426  */
5427 static int
5428 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5429 {
5430         if (tp->t_timers->tt_flags & TT_STOPPED) {
5431                 return (1);
5432         }
5433         rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
5434         tp->t_flags &= ~TF_DELACK;
5435         tp->t_flags |= TF_ACKNOW;
5436         KMOD_TCPSTAT_INC(tcps_delack);
5437         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
5438         return (0);
5439 }
5440
5441 /*
5442  * Persists timer, here we simply send the
5443  * same thing as a keepalive will.
5444  * the one byte send.
5445  *
5446  * We only return 1, saying don't proceed, if all timers
5447  * are stopped (destroyed PCB?).
5448  */
5449 static int
5450 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5451 {
5452         struct tcptemp *t_template;
5453         struct inpcb *inp;
5454         int32_t retval = 1;
5455
5456         inp = tp->t_inpcb;
5457
5458         if (tp->t_timers->tt_flags & TT_STOPPED) {
5459                 return (1);
5460         }
5461         if (rack->rc_in_persist == 0)
5462                 return (0);
5463         if (ctf_progress_timeout_check(tp, false)) {
5464                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
5465                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
5466                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
5467                 return (1);
5468         }
5469         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
5470         /*
5471          * Persistence timer into zero window. Force a byte to be output, if
5472          * possible.
5473          */
5474         KMOD_TCPSTAT_INC(tcps_persisttimeo);
5475         /*
5476          * Hack: if the peer is dead/unreachable, we do not time out if the
5477          * window is closed.  After a full backoff, drop the connection if
5478          * the idle time (no responses to probes) reaches the maximum
5479          * backoff that we would use if retransmitting.
5480          */
5481         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
5482             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
5483             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
5484                 KMOD_TCPSTAT_INC(tcps_persistdrop);
5485                 retval = 1;
5486                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
5487                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
5488                 goto out;
5489         }
5490         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
5491             tp->snd_una == tp->snd_max)
5492                 rack_exit_persist(tp, rack, cts);
5493         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
5494         /*
5495          * If the user has closed the socket then drop a persisting
5496          * connection after a much reduced timeout.
5497          */
5498         if (tp->t_state > TCPS_CLOSE_WAIT &&
5499             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
5500                 retval = 1;
5501                 KMOD_TCPSTAT_INC(tcps_persistdrop);
5502                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
5503                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
5504                 goto out;
5505         }
5506         t_template = tcpip_maketemplate(rack->rc_inp);
5507         if (t_template) {
5508                 /* only set it if we were answered */
5509                 if (rack->forced_ack == 0) {
5510                         rack->forced_ack = 1;
5511                         rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
5512                 }
5513                 tcp_respond(tp, t_template->tt_ipgen,
5514                             &t_template->tt_t, (struct mbuf *)NULL,
5515                             tp->rcv_nxt, tp->snd_una - 1, 0);
5516                 /* This sends an ack */
5517                 if (tp->t_flags & TF_DELACK)
5518                         tp->t_flags &= ~TF_DELACK;
5519                 free(t_template, M_TEMP);
5520         }
5521         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
5522                 tp->t_rxtshift++;
5523 out:
5524         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
5525         rack_start_hpts_timer(rack, tp, cts,
5526                               0, 0, 0);
5527         return (retval);
5528 }
5529
5530 /*
5531  * If a keepalive goes off, we had no other timers
5532  * happening. We always return 1 here since this
5533  * routine either drops the connection or sends
5534  * out a segment with respond.
5535  */
5536 static int
5537 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5538 {
5539         struct tcptemp *t_template;
5540         struct inpcb *inp;
5541
5542         if (tp->t_timers->tt_flags & TT_STOPPED) {
5543                 return (1);
5544         }
5545         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
5546         inp = tp->t_inpcb;
5547         rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
5548         /*
5549          * Keep-alive timer went off; send something or drop connection if
5550          * idle for too long.
5551          */
5552         KMOD_TCPSTAT_INC(tcps_keeptimeo);
5553         if (tp->t_state < TCPS_ESTABLISHED)
5554                 goto dropit;
5555         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
5556             tp->t_state <= TCPS_CLOSING) {
5557                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
5558                         goto dropit;
5559                 /*
5560                  * Send a packet designed to force a response if the peer is
5561                  * up and reachable: either an ACK if the connection is
5562                  * still alive, or an RST if the peer has closed the
5563                  * connection due to timeout or reboot. Using sequence
5564                  * number tp->snd_una-1 causes the transmitted zero-length
5565                  * segment to lie outside the receive window; by the
5566                  * protocol spec, this requires the correspondent TCP to
5567                  * respond.
5568                  */
5569                 KMOD_TCPSTAT_INC(tcps_keepprobe);
5570                 t_template = tcpip_maketemplate(inp);
5571                 if (t_template) {
5572                         if (rack->forced_ack == 0) {
5573                                 rack->forced_ack = 1;
5574                                 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
5575                         }
5576                         tcp_respond(tp, t_template->tt_ipgen,
5577                             &t_template->tt_t, (struct mbuf *)NULL,
5578                             tp->rcv_nxt, tp->snd_una - 1, 0);
5579                         free(t_template, M_TEMP);
5580                 }
5581         }
5582         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
5583         return (1);
5584 dropit:
5585         KMOD_TCPSTAT_INC(tcps_keepdrops);
5586         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
5587         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
5588         return (1);
5589 }
5590
5591 /*
5592  * Retransmit helper function, clear up all the ack
5593  * flags and take care of important book keeping.
5594  */
5595 static void
5596 rack_remxt_tmr(struct tcpcb *tp)
5597 {
5598         /*
5599          * The retransmit timer went off, all sack'd blocks must be
5600          * un-acked.
5601          */
5602         struct rack_sendmap *rsm, *trsm = NULL;
5603         struct tcp_rack *rack;
5604         int32_t cnt = 0;
5605
5606         rack = (struct tcp_rack *)tp->t_fb_ptr;
5607         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
5608         rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
5609         if (rack->r_state && (rack->r_state != tp->t_state))
5610                 rack_set_state(tp, rack);
5611         /*
5612          * Ideally we would like to be able to
5613          * mark SACK-PASS on anything not acked here.
5614          * However, if we do that we would burst out
5615          * all that data 1ms apart. This would be unwise,
5616          * so for now we will just let the normal rxt timer
5617          * and tlp timer take care of it.
5618          */
5619         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
5620                 if (rsm->r_flags & RACK_ACKED) {
5621                         cnt++;
5622                         rsm->r_dupack = 0;
5623                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
5624                         if (rsm->r_in_tmap == 0) {
5625                                 /* We must re-add it back to the tlist */
5626                                 if (trsm == NULL) {
5627                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
5628                                 } else {
5629                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
5630                                 }
5631                                 rsm->r_in_tmap = 1;
5632                         }
5633                 }
5634                 trsm = rsm;
5635                 if (rsm->r_flags & RACK_ACKED)
5636                         rsm->r_flags |= RACK_WAS_ACKED;
5637                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
5638         }
5639         /* Clear the count (we just un-acked them) */
5640         rack->r_ctl.rc_sacked = 0;
5641         rack->r_ctl.rc_agg_delayed = 0;
5642         rack->r_early = 0;
5643         rack->r_ctl.rc_agg_early = 0;
5644         rack->r_late = 0;
5645         /* Clear the tlp rtx mark */
5646         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
5647         rack->r_ctl.rc_prr_sndcnt = 0;
5648         rack_log_to_prr(rack, 6, 0);
5649         rack->r_timer_override = 1;
5650 }
5651
5652 static void
5653 rack_cc_conn_init(struct tcpcb *tp)
5654 {
5655         struct tcp_rack *rack;
5656
5657
5658         rack = (struct tcp_rack *)tp->t_fb_ptr;
5659         cc_conn_init(tp);
5660         /*
5661          * We want a chance to stay in slowstart as
5662          * we create a connection. TCP spec says that
5663          * initially ssthresh is infinite. For our
5664          * purposes that is the snd_wnd.
5665          */
5666         if (tp->snd_ssthresh < tp->snd_wnd) {
5667                 tp->snd_ssthresh = tp->snd_wnd;
5668         }
5669         /*
5670          * We also want to assure a IW worth of
5671          * data can get inflight.
5672          */
5673         if (rc_init_window(rack) < tp->snd_cwnd)
5674                 tp->snd_cwnd = rc_init_window(rack);
5675 }
5676
5677 /*
5678  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
5679  * we will setup to retransmit the lowest seq number outstanding.
5680  */
5681 static int
5682 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5683 {
5684         int32_t rexmt;
5685         struct inpcb *inp;
5686         int32_t retval = 0;
5687         bool isipv6;
5688
5689         inp = tp->t_inpcb;
5690         if (tp->t_timers->tt_flags & TT_STOPPED) {
5691                 return (1);
5692         }
5693         if (ctf_progress_timeout_check(tp, false)) {
5694                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
5695                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
5696                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
5697                 return (1);
5698         }
5699         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
5700         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
5701             (tp->snd_una == tp->snd_max)) {
5702                 /* Nothing outstanding .. nothing to do */
5703                 return (0);
5704         }
5705         /*
5706          * Retransmission timer went off.  Message has not been acked within
5707          * retransmit interval.  Back off to a longer retransmit interval
5708          * and retransmit one segment.
5709          */
5710         rack_remxt_tmr(tp);
5711         if ((rack->r_ctl.rc_resend == NULL) ||
5712             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
5713                 /*
5714                  * If the rwnd collapsed on
5715                  * the one we are retransmitting
5716                  * it does not count against the
5717                  * rxt count.
5718                  */
5719                 tp->t_rxtshift++;
5720         }
5721         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
5722                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
5723                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
5724                 retval = 1;
5725                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
5726                 tcp_set_inp_to_drop(rack->rc_inp,
5727                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
5728                 goto out;
5729         }
5730         if (tp->t_state == TCPS_SYN_SENT) {
5731                 /*
5732                  * If the SYN was retransmitted, indicate CWND to be limited
5733                  * to 1 segment in cc_conn_init().
5734                  */
5735                 tp->snd_cwnd = 1;
5736         } else if (tp->t_rxtshift == 1) {
5737                 /*
5738                  * first retransmit; record ssthresh and cwnd so they can be
5739                  * recovered if this turns out to be a "bad" retransmit. A
5740                  * retransmit is considered "bad" if an ACK for this segment
5741                  * is received within RTT/2 interval; the assumption here is
5742                  * that the ACK was already in flight.  See "On Estimating
5743                  * End-to-End Network Path Properties" by Allman and Paxson
5744                  * for more details.
5745                  */
5746                 tp->snd_cwnd_prev = tp->snd_cwnd;
5747                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
5748                 tp->snd_recover_prev = tp->snd_recover;
5749                 if (IN_FASTRECOVERY(tp->t_flags))
5750                         tp->t_flags |= TF_WASFRECOVERY;
5751                 else
5752                         tp->t_flags &= ~TF_WASFRECOVERY;
5753                 if (IN_CONGRECOVERY(tp->t_flags))
5754                         tp->t_flags |= TF_WASCRECOVERY;
5755                 else
5756                         tp->t_flags &= ~TF_WASCRECOVERY;
5757                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
5758                 tp->t_flags |= TF_PREVVALID;
5759         } else
5760                 tp->t_flags &= ~TF_PREVVALID;
5761         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
5762         if ((tp->t_state == TCPS_SYN_SENT) ||
5763             (tp->t_state == TCPS_SYN_RECEIVED))
5764                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
5765         else
5766                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
5767         TCPT_RANGESET(tp->t_rxtcur, rexmt,
5768            max(MSEC_2_TICKS(rack_rto_min), rexmt),
5769            MSEC_2_TICKS(rack_rto_max));
5770         /*
5771          * We enter the path for PLMTUD if connection is established or, if
5772          * connection is FIN_WAIT_1 status, reason for the last is that if
5773          * amount of data we send is very small, we could send it in couple
5774          * of packets and process straight to FIN. In that case we won't
5775          * catch ESTABLISHED state.
5776          */
5777 #ifdef INET6
5778         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
5779 #else
5780         isipv6 = false;
5781 #endif
5782         if (((V_tcp_pmtud_blackhole_detect == 1) ||
5783             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
5784             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
5785             ((tp->t_state == TCPS_ESTABLISHED) ||
5786             (tp->t_state == TCPS_FIN_WAIT_1))) {
5787
5788                 /*
5789                  * Idea here is that at each stage of mtu probe (usually,
5790                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
5791                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
5792                  * should take care of that.
5793                  */
5794                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
5795                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
5796                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
5797                     tp->t_rxtshift % 2 == 0)) {
5798                         /*
5799                          * Enter Path MTU Black-hole Detection mechanism: -
5800                          * Disable Path MTU Discovery (IP "DF" bit). -
5801                          * Reduce MTU to lower value than what we negotiated
5802                          * with peer.
5803                          */
5804                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
5805                                 /* Record that we may have found a black hole. */
5806                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
5807                                 /* Keep track of previous MSS. */
5808                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
5809                         }
5810
5811                         /*
5812                          * Reduce the MSS to blackhole value or to the
5813                          * default in an attempt to retransmit.
5814                          */
5815 #ifdef INET6
5816                         if (isipv6 &&
5817                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
5818                                 /* Use the sysctl tuneable blackhole MSS. */
5819                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
5820                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5821                         } else if (isipv6) {
5822                                 /* Use the default MSS. */
5823                                 tp->t_maxseg = V_tcp_v6mssdflt;
5824                                 /*
5825                                  * Disable Path MTU Discovery when we switch
5826                                  * to minmss.
5827                                  */
5828                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5829                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5830                         }
5831 #endif
5832 #if defined(INET6) && defined(INET)
5833                         else
5834 #endif
5835 #ifdef INET
5836                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
5837                                 /* Use the sysctl tuneable blackhole MSS. */
5838                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
5839                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
5840                         } else {
5841                                 /* Use the default MSS. */
5842                                 tp->t_maxseg = V_tcp_mssdflt;
5843                                 /*
5844                                  * Disable Path MTU Discovery when we switch
5845                                  * to minmss.
5846                                  */
5847                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
5848                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
5849                         }
5850 #endif
5851                 } else {
5852                         /*
5853                          * If further retransmissions are still unsuccessful
5854                          * with a lowered MTU, maybe this isn't a blackhole
5855                          * and we restore the previous MSS and blackhole
5856                          * detection flags. The limit '6' is determined by
5857                          * giving each probe stage (1448, 1188, 524) 2
5858                          * chances to recover.
5859                          */
5860                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
5861                             (tp->t_rxtshift >= 6)) {
5862                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
5863                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
5864                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
5865                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
5866                         }
5867                 }
5868         }
5869         /*
5870          * If we backed off this far, our srtt estimate is probably bogus.
5871          * Clobber it so we'll take the next rtt measurement as our srtt;
5872          * move the current srtt into rttvar to keep the current retransmit
5873          * times until then.
5874          */
5875         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
5876 #ifdef INET6
5877                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
5878                         in6_losing(tp->t_inpcb);
5879                 else
5880 #endif
5881                         in_losing(tp->t_inpcb);
5882                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
5883                 tp->t_srtt = 0;
5884         }
5885         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5886         tp->snd_recover = tp->snd_max;
5887         tp->t_flags |= TF_ACKNOW;
5888         tp->t_rtttime = 0;
5889         rack_cong_signal(tp, NULL, CC_RTO);
5890 out:
5891         return (retval);
5892 }
5893
5894 static int
5895 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
5896 {
5897         int32_t ret = 0;
5898         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
5899
5900         if (timers == 0) {
5901                 return (0);
5902         }
5903         if (tp->t_state == TCPS_LISTEN) {
5904                 /* no timers on listen sockets */
5905                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
5906                         return (0);
5907                 return (1);
5908         }
5909         if ((timers & PACE_TMR_RACK) &&
5910             rack->rc_on_min_to) {
5911                 /*
5912                  * For the rack timer when we
5913                  * are on a min-timeout (which means rrr_conf = 3)
5914                  * we don't want to check the timer. It may
5915                  * be going off for a pace and thats ok we
5916                  * want to send the retransmit (if its ready).
5917                  *
5918                  * If its on a normal rack timer (non-min) then
5919                  * we will check if its expired.
5920                  */
5921                 goto skip_time_check;
5922         }
5923         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
5924                 uint32_t left;
5925
5926                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
5927                         ret = -1;
5928                         rack_log_to_processing(rack, cts, ret, 0);
5929                         return (0);
5930                 }
5931                 if (hpts_calling == 0) {
5932                         /*
5933                          * A user send or queued mbuf (sack) has called us? We
5934                          * return 0 and let the pacing guards
5935                          * deal with it if they should or
5936                          * should not cause a send.
5937                          */
5938                         ret = -2;
5939                         rack_log_to_processing(rack, cts, ret, 0);
5940                         return (0);
5941                 }
5942                 /*
5943                  * Ok our timer went off early and we are not paced false
5944                  * alarm, go back to sleep.
5945                  */
5946                 ret = -3;
5947                 left = rack->r_ctl.rc_timer_exp - cts;
5948                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
5949                 rack_log_to_processing(rack, cts, ret, left);
5950                 return (1);
5951         }
5952 skip_time_check:
5953         rack->rc_tmr_stopped = 0;
5954         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
5955         if (timers & PACE_TMR_DELACK) {
5956                 ret = rack_timeout_delack(tp, rack, cts);
5957         } else if (timers & PACE_TMR_RACK) {
5958                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
5959                 ret = rack_timeout_rack(tp, rack, cts);
5960         } else if (timers & PACE_TMR_TLP) {
5961                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
5962                 ret = rack_timeout_tlp(tp, rack, cts);
5963         } else if (timers & PACE_TMR_RXT) {
5964                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
5965                 ret = rack_timeout_rxt(tp, rack, cts);
5966         } else if (timers & PACE_TMR_PERSIT) {
5967                 ret = rack_timeout_persist(tp, rack, cts);
5968         } else if (timers & PACE_TMR_KEEP) {
5969                 ret = rack_timeout_keepalive(tp, rack, cts);
5970         }
5971         rack_log_to_processing(rack, cts, ret, timers);
5972         return (ret);
5973 }
5974
5975 static void
5976 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
5977 {
5978         struct timeval tv;
5979         uint32_t us_cts, flags_on_entry;
5980         uint8_t hpts_removed = 0;
5981
5982
5983         flags_on_entry = rack->r_ctl.rc_hpts_flags;
5984         us_cts = tcp_get_usecs(&tv);
5985         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
5986             ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
5987              ((tp->snd_max - tp->snd_una) == 0))) {
5988                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
5989                 hpts_removed = 1;
5990                 /* If we were not delayed cancel out the flag. */
5991                 if ((tp->snd_max - tp->snd_una) == 0)
5992                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
5993                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
5994         }
5995         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
5996                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
5997                 if (rack->rc_inp->inp_in_hpts &&
5998                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
5999                         /*
6000                          * Canceling timer's when we have no output being
6001                          * paced. We also must remove ourselves from the
6002                          * hpts.
6003                          */
6004                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6005                         hpts_removed = 1;
6006                 }
6007                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
6008         }
6009         if (hpts_removed == 0)
6010                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
6011 }
6012
/*
 * Timer-stop hook for this stack: intentionally does nothing with either
 * argument.
 */
static void
rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
        (void)tp;
        (void)timer_type;
}
6018
6019 static int
6020 rack_stopall(struct tcpcb *tp)
6021 {
6022         struct tcp_rack *rack;
6023         rack = (struct tcp_rack *)tp->t_fb_ptr;
6024         rack->t_timers_stopped = 1;
6025         return (0);
6026 }
6027
/*
 * Timer-activate hook for this stack: intentionally does nothing with any
 * of its arguments.
 */
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
{
        (void)tp;
        (void)timer_type;
        (void)delta;
}
6033
/*
 * Timer-active query hook for this stack: reports 0 for every timer type,
 * regardless of the arguments.
 */
static int
rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
        (void)tp;
        (void)timer_type;
        return (0);
}
6039
6040 static void
6041 rack_stop_all_timers(struct tcpcb *tp)
6042 {
6043         struct tcp_rack *rack;
6044
6045         /*
6046          * Assure no timers are running.
6047          */
6048         if (tcp_timer_active(tp, TT_PERSIST)) {
6049                 /* We enter in persists, set the flag appropriately */
6050                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6051                 rack->rc_in_persist = 1;
6052         }
6053         tcp_timer_suspend(tp, TT_PERSIST);
6054         tcp_timer_suspend(tp, TT_REXMT);
6055         tcp_timer_suspend(tp, TT_KEEP);
6056         tcp_timer_suspend(tp, TT_DELACK);
6057 }
6058
6059 static void
6060 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
6061     struct rack_sendmap *rsm, uint32_t ts)
6062 {
6063         int32_t idx;
6064
6065         rsm->r_rtr_cnt++;
6066         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
6067         rsm->r_dupack = 0;
6068         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
6069                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
6070                 rsm->r_flags |= RACK_OVERMAX;
6071         }
6072         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
6073                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
6074                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
6075         }
6076         idx = rsm->r_rtr_cnt - 1;
6077         rsm->r_tim_lastsent[idx] = ts;
6078         if (rsm->r_flags & RACK_ACKED) {
6079                 /* Problably MTU discovery messing with us */
6080                 rsm->r_flags &= ~RACK_ACKED;
6081                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
6082         }
6083         if (rsm->r_in_tmap) {
6084                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6085                 rsm->r_in_tmap = 0;
6086         }
6087         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6088         rsm->r_in_tmap = 1;
6089         if (rsm->r_flags & RACK_SACK_PASSED) {
6090                 /* We have retransmitted due to the SACK pass */
6091                 rsm->r_flags &= ~RACK_SACK_PASSED;
6092                 rsm->r_flags |= RACK_WAS_SACKPASS;
6093         }
6094 }
6095
6096
6097 static uint32_t
6098 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
6099     struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
6100 {
6101         /*
6102          * We (re-)transmitted starting at rsm->r_start for some length
6103          * (possibly less than r_end.
6104          */
6105         struct rack_sendmap *nrsm, *insret;
6106         uint32_t c_end;
6107         int32_t len;
6108
6109         len = *lenp;
6110         c_end = rsm->r_start + len;
6111         if (SEQ_GEQ(c_end, rsm->r_end)) {
6112                 /*
6113                  * We retransmitted the whole piece or more than the whole
6114                  * slopping into the next rsm.
6115                  */
6116                 rack_update_rsm(tp, rack, rsm, ts);
6117                 if (c_end == rsm->r_end) {
6118                         *lenp = 0;
6119                         return (0);
6120                 } else {
6121                         int32_t act_len;
6122
6123                         /* Hangs over the end return whats left */
6124                         act_len = rsm->r_end - rsm->r_start;
6125                         *lenp = (len - act_len);
6126                         return (rsm->r_end);
6127                 }
6128                 /* We don't get out of this block. */
6129         }
6130         /*
6131          * Here we retransmitted less than the whole thing which means we
6132          * have to split this into what was transmitted and what was not.
6133          */
6134         nrsm = rack_alloc_full_limit(rack);
6135         if (nrsm == NULL) {
6136                 /*
6137                  * We can't get memory, so lets not proceed.
6138                  */
6139                 *lenp = 0;
6140                 return (0);
6141         }
6142         /*
6143          * So here we are going to take the original rsm and make it what we
6144          * retransmitted. nrsm will be the tail portion we did not
6145          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
6146          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
6147          * 1, 6 and the new piece will be 6, 11.
6148          */
6149         rack_clone_rsm(rack, nrsm, rsm, c_end);
6150         nrsm->r_dupack = 0;
6151         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
6152         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6153 #ifdef INVARIANTS
6154         if (insret != NULL) {
6155                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6156                       nrsm, insret, rack, rsm);
6157         }
6158 #endif
6159         if (rsm->r_in_tmap) {
6160                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6161                 nrsm->r_in_tmap = 1;
6162         }
6163         rsm->r_flags &= (~RACK_HAS_FIN);
6164         rack_update_rsm(tp, rack, rsm, ts);
6165         *lenp = 0;
6166         return (0);
6167 }
6168
6169
6170 static void
6171 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
6172     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
6173     uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts)
6174 {
6175         struct tcp_rack *rack;
6176         struct rack_sendmap *rsm, *nrsm, *insret, fe;
6177         register uint32_t snd_max, snd_una;
6178
6179         /*
6180          * Add to the RACK log of packets in flight or retransmitted. If
6181          * there is a TS option we will use the TS echoed, if not we will
6182          * grab a TS.
6183          *
6184          * Retransmissions will increment the count and move the ts to its
6185          * proper place. Note that if options do not include TS's then we
6186          * won't be able to effectively use the ACK for an RTT on a retran.
6187          *
6188          * Notes about r_start and r_end. Lets consider a send starting at
6189          * sequence 1 for 10 bytes. In such an example the r_start would be
6190          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
6191          * This means that r_end is actually the first sequence for the next
6192          * slot (11).
6193          *
6194          */
6195         /*
6196          * If err is set what do we do XXXrrs? should we not add the thing?
6197          * -- i.e. return if err != 0 or should we pretend we sent it? --
6198          * i.e. proceed with add ** do this for now.
6199          */
6200         INP_WLOCK_ASSERT(tp->t_inpcb);
6201         if (err)
6202                 /*
6203                  * We don't log errors -- we could but snd_max does not
6204                  * advance in this case either.
6205                  */
6206                 return;
6207
6208         if (th_flags & TH_RST) {
6209                 /*
6210                  * We don't log resets and we return immediately from
6211                  * sending
6212                  */
6213                 return;
6214         }
6215         rack = (struct tcp_rack *)tp->t_fb_ptr;
6216         snd_una = tp->snd_una;
6217         if (SEQ_LEQ((seq_out + len), snd_una)) {
6218                 /* Are sending an old segment to induce an ack (keep-alive)? */
6219                 return;
6220         }
6221         if (SEQ_LT(seq_out, snd_una)) {
6222                 /* huh? should we panic? */
6223                 uint32_t end;
6224
6225                 end = seq_out + len;
6226                 seq_out = snd_una;
6227                 if (SEQ_GEQ(end, seq_out))
6228                         len = end - seq_out;
6229                 else
6230                         len = 0;
6231         }
6232         snd_max = tp->snd_max;
6233         if (th_flags & (TH_SYN | TH_FIN)) {
6234                 /*
6235                  * The call to rack_log_output is made before bumping
6236                  * snd_max. This means we can record one extra byte on a SYN
6237                  * or FIN if seq_out is adding more on and a FIN is present
6238                  * (and we are not resending).
6239                  */
6240                 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
6241                         len++;
6242                 if (th_flags & TH_FIN)
6243                         len++;
6244                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
6245                         /*
6246                          * The add/update as not been done for the FIN/SYN
6247                          * yet.
6248                          */
6249                         snd_max = tp->snd_nxt;
6250                 }
6251         }
6252         if (len == 0) {
6253                 /* We don't log zero window probes */
6254                 return;
6255         }
6256         rack->r_ctl.rc_time_last_sent = ts;
6257         if (IN_RECOVERY(tp->t_flags)) {
6258                 rack->r_ctl.rc_prr_out += len;
6259         }
6260         /* First question is it a retransmission or new? */
6261         if (seq_out == snd_max) {
6262                 /* Its new */
6263 again:
6264                 rsm = rack_alloc(rack);
6265                 if (rsm == NULL) {
6266                         /*
6267                          * Hmm out of memory and the tcb got destroyed while
6268                          * we tried to wait.
6269                          */
6270                         return;
6271                 }
6272                 if (th_flags & TH_FIN) {
6273                         rsm->r_flags = RACK_HAS_FIN;
6274                 } else {
6275                         rsm->r_flags = 0;
6276                 }
6277                 rsm->r_tim_lastsent[0] = ts;
6278                 rsm->r_rtr_cnt = 1;
6279                 rsm->r_rtr_bytes = 0;
6280                 rsm->usec_orig_send = us_cts;
6281                 if (th_flags & TH_SYN) {
6282                         /* The data space is one beyond snd_una */
6283                         rsm->r_flags |= RACK_HAS_SIN;
6284                         rsm->r_start = seq_out + 1;
6285                         rsm->r_end = rsm->r_start + (len - 1);
6286                 } else {
6287                         /* Normal case */
6288                         rsm->r_start = seq_out;
6289                         rsm->r_end = rsm->r_start + len;
6290                 }
6291                 rsm->r_dupack = 0;
6292                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
6293                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
6294 #ifdef INVARIANTS
6295                 if (insret != NULL) {
6296                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6297                               nrsm, insret, rack, rsm);
6298                 }
6299 #endif
6300                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6301                 rsm->r_in_tmap = 1;
6302                 /*
6303                  * Special case detection, is there just a single
6304                  * packet outstanding when we are not in recovery?
6305                  *
6306                  * If this is true mark it so.
6307                  */
6308                 if ((IN_RECOVERY(tp->t_flags) == 0) &&
6309                     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
6310                         struct rack_sendmap *prsm;
6311
6312                         prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
6313                         if (prsm)
6314                                 prsm->r_one_out_nr = 1;
6315                 }
6316                 return;
6317         }
6318         /*
6319          * If we reach here its a retransmission and we need to find it.
6320          */
6321         memset(&fe, 0, sizeof(fe));
6322 more:
6323         if (hintrsm && (hintrsm->r_start == seq_out)) {
6324                 rsm = hintrsm;
6325                 hintrsm = NULL;
6326         } else {
6327                 /* No hints sorry */
6328                 rsm = NULL;
6329         }
6330         if ((rsm) && (rsm->r_start == seq_out)) {
6331                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
6332                 if (len == 0) {
6333                         return;
6334                 } else {
6335                         goto more;
6336                 }
6337         }
6338         /* Ok it was not the last pointer go through it the hard way. */
6339 refind:
6340         fe.r_start = seq_out;
6341         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
6342         if (rsm) {
6343                 if (rsm->r_start == seq_out) {
6344                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
6345                         if (len == 0) {
6346                                 return;
6347                         } else {
6348                                 goto refind;
6349                         }
6350                 }
6351                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
6352                         /* Transmitted within this piece */
6353                         /*
6354                          * Ok we must split off the front and then let the
6355                          * update do the rest
6356                          */
6357                         nrsm = rack_alloc_full_limit(rack);
6358                         if (nrsm == NULL) {
6359                                 rack_update_rsm(tp, rack, rsm, ts);
6360                                 return;
6361                         }
6362                         /*
6363                          * copy rsm to nrsm and then trim the front of rsm
6364                          * to not include this part.
6365                          */
6366                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
6367                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6368 #ifdef INVARIANTS
6369                         if (insret != NULL) {
6370                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6371                                       nrsm, insret, rack, rsm);
6372                         }
6373 #endif
6374                         if (rsm->r_in_tmap) {
6375                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6376                                 nrsm->r_in_tmap = 1;
6377                         }
6378                         rsm->r_flags &= (~RACK_HAS_FIN);
6379                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
6380                         if (len == 0) {
6381                                 return;
6382                         } else if (len > 0)
6383                                 goto refind;
6384                 }
6385         }
6386         /*
6387          * Hmm not found in map did they retransmit both old and on into the
6388          * new?
6389          */
6390         if (seq_out == tp->snd_max) {
6391                 goto again;
6392         } else if (SEQ_LT(seq_out, tp->snd_max)) {
6393 #ifdef INVARIANTS
6394                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
6395                     seq_out, len, tp->snd_una, tp->snd_max);
6396                 printf("Starting Dump of all rack entries\n");
6397                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6398                         printf("rsm:%p start:%u end:%u\n",
6399                             rsm, rsm->r_start, rsm->r_end);
6400                 }
6401                 printf("Dump complete\n");
6402                 panic("seq_out not found rack:%p tp:%p",
6403                     rack, tp);
6404 #endif
6405         } else {
6406 #ifdef INVARIANTS
6407                 /*
6408                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
6409                  * flag)
6410                  */
6411                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
6412                     seq_out, len, tp->snd_max, tp);
6413 #endif
6414         }
6415 }
6416
6417 /*
6418  * Record one of the RTT updates from an ack into
6419  * our sample structure.
6420  */
6421
6422 static void
6423 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
6424                     int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
6425 {
6426         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
6427             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
6428                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
6429         }
6430         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
6431             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
6432                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
6433         }
6434         if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
6435             if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
6436                 rack->r_ctl.rc_gp_lowrtt = us_rtt;
6437             if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
6438                     rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
6439         }
6440         if ((confidence == 1) &&
6441             ((rsm == NULL) ||
6442              (rsm->r_just_ret) ||
6443              (rsm->r_one_out_nr &&
6444               len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
6445                 /*
6446                  * If the rsm had a just return
6447                  * hit it then we can't trust the
6448                  * rtt measurement for buffer deterimination
6449                  * Note that a confidence of 2, indicates
6450                  * SACK'd which overrides the r_just_ret or
6451                  * the r_one_out_nr. If it was a CUM-ACK and
6452                  * we had only two outstanding, but get an
6453                  * ack for only 1. Then that also lowers our
6454                  * confidence.
6455                  */
6456                 confidence = 0;
6457         }
6458         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
6459             (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
6460                 if (rack->r_ctl.rack_rs.confidence == 0) {
6461                         /*
6462                          * We take anything with no current confidence
6463                          * saved.
6464                          */
6465                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
6466                         rack->r_ctl.rack_rs.confidence = confidence;
6467                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
6468                 } else if (confidence || rack->r_ctl.rack_rs.confidence) {
6469                         /*
6470                          * Once we have a confident number,
6471                          * we can update it with a smaller
6472                          * value since this confident number
6473                          * may include the DSACK time until
6474                          * the next segment (the second one) arrived.
6475                          */
6476                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
6477                         rack->r_ctl.rack_rs.confidence = confidence;
6478                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
6479                 }
6480
6481         }
6482         rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
6483         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
6484         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
6485         rack->r_ctl.rack_rs.rs_rtt_cnt++;
6486 }
6487
6488 /*
6489  * Collect new round-trip time estimate
6490  * and update averages and current timeout.
6491  */
6492 static void
6493 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
6494 {
6495         int32_t delta;
6496         uint32_t o_srtt, o_var;
6497         int32_t hrtt_up = 0;
6498         int32_t rtt;
6499
6500         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
6501                 /* No valid sample */
6502                 return;
6503         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
6504                 /* We are to use the lowest RTT seen in a single ack */
6505                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
6506         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
6507                 /* We are to use the highest RTT seen in a single ack */
6508                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
6509         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
6510                 /* We are to use the average RTT seen in a single ack */
6511                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
6512                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
6513         } else {
6514 #ifdef INVARIANTS
6515                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
6516 #endif
6517                 return;
6518         }
6519         if (rtt == 0)
6520                 rtt = 1;
6521         if (rack->rc_gp_rtt_set == 0) {
6522                 /*
6523                  * With no RTT we have to accept
6524                  * even one we are not confident of.
6525                  */
6526                 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
6527                 rack->rc_gp_rtt_set = 1;
6528         } else if (rack->r_ctl.rack_rs.confidence) {
6529                 /* update the running gp srtt */
6530                 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
6531                 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
6532         }
6533         if (rack->r_ctl.rack_rs.confidence) {
6534                 /*
6535                  * record the low and high for highly buffered path computation,
6536                  * we only do this if we are confident (not a retransmission).
6537                  */
6538                 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
6539                         rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
6540                         hrtt_up = 1;
6541                 }
6542                 if (rack->rc_highly_buffered == 0) {
6543                         /*
6544                          * Currently once we declare a path has
6545                          * highly buffered there is no going
6546                          * back, which may be a problem...
6547                          */
6548                         if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
6549                                 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
6550                                                      rack->r_ctl.rc_highest_us_rtt,
6551                                                      rack->r_ctl.rc_lowest_us_rtt,
6552                                                      RACK_RTTS_SEEHBP);
6553                                 rack->rc_highly_buffered = 1;
6554                         }
6555                 }
6556         }
6557         if ((rack->r_ctl.rack_rs.confidence) ||
6558             (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
6559                 /*
6560                  * If we are highly confident of it <or> it was
6561                  * never retransmitted we accept it as the last us_rtt.
6562                  */
6563                 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
6564                 /* The lowest rtt can be set if its was not retransmited */
6565                 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
6566                         rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
6567                         if (rack->r_ctl.rc_lowest_us_rtt == 0)
6568                                 rack->r_ctl.rc_lowest_us_rtt = 1;
6569                 }
6570         }
6571         rack_log_rtt_sample(rack, rtt);
6572         o_srtt = tp->t_srtt;
6573         o_var = tp->t_rttvar;
6574         rack = (struct tcp_rack *)tp->t_fb_ptr;
6575         if (tp->t_srtt != 0) {
6576                 /*
6577                  * srtt is stored as fixed point with 5 bits after the
6578                  * binary point (i.e., scaled by 8).  The following magic is
6579                  * equivalent to the smoothing algorithm in rfc793 with an
6580                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
6581                  * Adjust rtt to origin 0.
6582                  */
6583                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
6584                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6585
6586                 tp->t_srtt += delta;
6587                 if (tp->t_srtt <= 0)
6588                         tp->t_srtt = 1;
6589
6590                 /*
6591                  * We accumulate a smoothed rtt variance (actually, a
6592                  * smoothed mean difference), then set the retransmit timer
6593                  * to smoothed rtt + 4 times the smoothed variance. rttvar
6594                  * is stored as fixed point with 4 bits after the binary
6595                  * point (scaled by 16).  The following is equivalent to
6596                  * rfc793 smoothing with an alpha of .75 (rttvar =
6597                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
6598                  * wired-in beta.
6599                  */
6600                 if (delta < 0)
6601                         delta = -delta;
6602                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
6603                 tp->t_rttvar += delta;
6604                 if (tp->t_rttvar <= 0)
6605                         tp->t_rttvar = 1;
6606                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
6607                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6608         } else {
6609                 /*
6610                  * No rtt measurement yet - use the unsmoothed rtt. Set the
6611                  * variance to half the rtt (so our first retransmit happens
6612                  * at 3*rtt).
6613                  */
6614                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
6615                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
6616                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6617         }
6618         KMOD_TCPSTAT_INC(tcps_rttupdated);
6619         tp->t_rttupdated++;
6620 #ifdef STATS
6621         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
6622 #endif
6623         tp->t_rxtshift = 0;
6624
6625         /*
6626          * the retransmit should happen at rtt + 4 * rttvar. Because of the
6627          * way we do the smoothing, srtt and rttvar will each average +1/2
6628          * tick of bias.  When we compute the retransmit timer, we want 1/2
6629          * tick of rounding and 1 extra tick because of +-1/2 tick
6630          * uncertainty in the firing of the timer.  The bias will give us
6631          * exactly the 1.5 tick we need.  But, because the bias is
6632          * statistical, we have to test that we don't drop below the minimum
6633          * feasible timer (which is 2 ticks).
6634          */
6635         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6636            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
6637         tp->t_softerror = 0;
6638 }
6639
6640 static void
6641 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
6642     uint32_t t, uint32_t cts)
6643 {
6644         /*
6645          * For this RSM, we acknowledged the data from a previous
6646          * transmission, not the last one we made. This means we did a false
6647          * retransmit.
6648          */
6649         struct tcp_rack *rack;
6650
6651         if (rsm->r_flags & RACK_HAS_FIN) {
6652                 /*
6653                  * The sending of the FIN often is multiple sent when we
6654                  * have everything outstanding ack'd. We ignore this case
6655                  * since its over now.
6656                  */
6657                 return;
6658         }
6659         if (rsm->r_flags & RACK_TLP) {
6660                 /*
6661                  * We expect TLP's to have this occur.
6662                  */
6663                 return;
6664         }
6665         rack = (struct tcp_rack *)tp->t_fb_ptr;
6666         /* should we undo cc changes and exit recovery? */
6667         if (IN_RECOVERY(tp->t_flags)) {
6668                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
6669                         /*
6670                          * Undo what we ratched down and exit recovery if
6671                          * possible
6672                          */
6673                         EXIT_RECOVERY(tp->t_flags);
6674                         tp->snd_recover = tp->snd_una;
6675                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
6676                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
6677                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
6678                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
6679                 }
6680         }
6681         if (rsm->r_flags & RACK_WAS_SACKPASS) {
6682                 /*
6683                  * We retransmitted based on a sack and the earlier
6684                  * retransmission ack'd it - re-ordering is occuring.
6685                  */
6686                 counter_u64_add(rack_reorder_seen, 1);
6687                 rack->r_ctl.rc_reorder_ts = cts;
6688         }
6689         counter_u64_add(rack_badfr, 1);
6690         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
6691 }
6692
6693 static void
6694 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
6695 {
6696         /*
6697          * Apply to filter the inbound us-rtt at us_cts.
6698          */
6699         uint32_t old_rtt;
6700
6701         old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
6702         apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
6703                                us_rtt, us_cts);
6704         if (rack->r_ctl.last_pacing_time &&
6705             rack->rc_gp_dyn_mul &&
6706             (rack->r_ctl.last_pacing_time > us_rtt))
6707                 rack->pacing_longer_than_rtt = 1;
6708         else
6709                 rack->pacing_longer_than_rtt = 0;
6710         if (old_rtt > us_rtt) {
6711                 /* We just hit a new lower rtt time */
6712                 rack_log_rtt_shrinks(rack,  us_cts,  old_rtt,
6713                                      __LINE__, RACK_RTTS_NEWRTT);
6714                 /*
6715                  * Only count it if its lower than what we saw within our
6716                  * calculated range.
6717                  */
6718                 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
6719                         if (rack_probertt_lower_within &&
6720                             rack->rc_gp_dyn_mul &&
6721                             (rack->use_fixed_rate == 0) &&
6722                             (rack->rc_always_pace)) {
6723                                 /*
6724                                  * We are seeing a new lower rtt very close
6725                                  * to the time that we would have entered probe-rtt.
6726                                  * This is probably due to the fact that a peer flow
6727                                  * has entered probe-rtt. Lets go in now too.
6728                                  */
6729                                 uint32_t val;
6730
6731                                 val = rack_probertt_lower_within * rack_time_between_probertt;
6732                                 val /= 100;
6733                                 if ((rack->in_probe_rtt == 0)  &&
6734                                     ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
6735                                         rack_enter_probertt(rack, us_cts);
6736                                 }
6737                         }
6738                         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
6739                 }
6740         }
6741 }
6742
6743 static int
6744 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
6745     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
6746 {
6747         int32_t i;
6748         uint32_t t, len_acked;
6749
6750         if ((rsm->r_flags & RACK_ACKED) ||
6751             (rsm->r_flags & RACK_WAS_ACKED))
6752                 /* Already done */
6753                 return (0);
6754
6755         if (ack_type == CUM_ACKED) {
6756                 if (SEQ_GT(th_ack, rsm->r_end))
6757                         len_acked = rsm->r_end - rsm->r_start;
6758                 else
6759                         len_acked = th_ack - rsm->r_start;
6760         } else
6761                 len_acked = rsm->r_end - rsm->r_start;
6762         if (rsm->r_rtr_cnt == 1) {
6763                 uint32_t us_rtt;
6764
6765                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
6766                 if ((int)t <= 0)
6767                         t = 1;
6768                 if (!tp->t_rttlow || tp->t_rttlow > t)
6769                         tp->t_rttlow = t;
6770                 if (!rack->r_ctl.rc_rack_min_rtt ||
6771                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
6772                         rack->r_ctl.rc_rack_min_rtt = t;
6773                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
6774                                 rack->r_ctl.rc_rack_min_rtt = 1;
6775                         }
6776                 }
6777                 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send;
6778                 if (us_rtt == 0)
6779                         us_rtt = 1;
6780                 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
6781                 if (ack_type == SACKED)
6782                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
6783                 else {
6784                         /*
6785                          * For cum-ack we are only confident if what
6786                          * is being acked is included in a measurement.
6787                          * Otherwise it could be an idle period that
6788                          * includes Delayed-ack time.
6789                          */
6790                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
6791                                             (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt);
6792                 }
6793                 if ((rsm->r_flags & RACK_TLP) &&
6794                     (!IN_RECOVERY(tp->t_flags))) {
6795                         /* Segment was a TLP and our retrans matched */
6796                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
6797                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
6798                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
6799                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
6800                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
6801                                 /*
6802                                  * When we enter recovery we need to assure
6803                                  * we send one packet.
6804                                  */
6805                                 if (rack->rack_no_prr == 0) {
6806                                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
6807                                         rack_log_to_prr(rack, 7, 0);
6808                                 }
6809                         }
6810                 }
6811                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
6812                         /* New more recent rack_tmit_time */
6813                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
6814                         rack->rc_rack_rtt = t;
6815                 }
6816                 return (1);
6817         }
6818         /*
6819          * We clear the soft/rxtshift since we got an ack.
6820          * There is no assurance we will call the commit() function
6821          * so we need to clear these to avoid incorrect handling.
6822          */
6823         tp->t_rxtshift = 0;
6824         tp->t_softerror = 0;
6825         if ((to->to_flags & TOF_TS) &&
6826             (ack_type == CUM_ACKED) &&
6827             (to->to_tsecr) &&
6828             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
6829                 /*
6830                  * Now which timestamp does it match? In this block the ACK
6831                  * must be coming from a previous transmission.
6832                  */
6833                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
6834                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
6835                                 t = cts - rsm->r_tim_lastsent[i];
6836                                 if ((int)t <= 0)
6837                                         t = 1;
6838                                 if ((i + 1) < rsm->r_rtr_cnt) {
6839                                         /* Likely */
6840                                         rack_earlier_retran(tp, rsm, t, cts);
6841                                 }
6842                                 if (!tp->t_rttlow || tp->t_rttlow > t)
6843                                         tp->t_rttlow = t;
6844                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
6845                                         rack->r_ctl.rc_rack_min_rtt = t;
6846                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
6847                                                 rack->r_ctl.rc_rack_min_rtt = 1;
6848                                         }
6849                                 }
6850                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
6851                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
6852                                         /* New more recent rack_tmit_time */
6853                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
6854                                         rack->rc_rack_rtt = t;
6855                                 }
6856                                 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm,
6857                                                     rsm->r_rtr_cnt);
6858                                 return (1);
6859                         }
6860                 }
6861                 goto ts_not_found;
6862         } else {
6863                 /*
6864                  * Ok its a SACK block that we retransmitted. or a windows
6865                  * machine without timestamps. We can tell nothing from the
6866                  * time-stamp since its not there or the time the peer last
6867                  * recieved a segment that moved forward its cum-ack point.
6868                  */
6869 ts_not_found:
6870                 i = rsm->r_rtr_cnt - 1;
6871                 t = cts - rsm->r_tim_lastsent[i];
6872                 if ((int)t <= 0)
6873                         t = 1;
6874                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
6875                         /*
6876                          * We retransmitted and the ack came back in less
6877                          * than the smallest rtt we have observed. We most
6878                          * likey did an improper retransmit as outlined in
6879                          * 4.2 Step 3 point 2 in the rack-draft.
6880                          */
6881                         i = rsm->r_rtr_cnt - 2;
6882                         t = cts - rsm->r_tim_lastsent[i];
6883                         rack_earlier_retran(tp, rsm, t, cts);
6884                 } else if (rack->r_ctl.rc_rack_min_rtt) {
6885                         /*
6886                          * We retransmitted it and the retransmit did the
6887                          * job.
6888                          */
6889                         if (!rack->r_ctl.rc_rack_min_rtt ||
6890                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
6891                                 rack->r_ctl.rc_rack_min_rtt = t;
6892                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
6893                                         rack->r_ctl.rc_rack_min_rtt = 1;
6894                                 }
6895                         }
6896                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
6897                                 /* New more recent rack_tmit_time */
6898                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
6899                                 rack->rc_rack_rtt = t;
6900                         }
6901                         return (1);
6902                 }
6903         }
6904         return (0);
6905 }
6906
6907 /*
6908  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
6909  */
6910 static void
6911 rack_log_sack_passed(struct tcpcb *tp,
6912     struct tcp_rack *rack, struct rack_sendmap *rsm)
6913 {
6914         struct rack_sendmap *nrsm;
6915
6916         nrsm = rsm;
6917         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
6918             rack_head, r_tnext) {
6919                 if (nrsm == rsm) {
6920                         /* Skip orginal segment he is acked */
6921                         continue;
6922                 }
6923                 if (nrsm->r_flags & RACK_ACKED) {
6924                         /*
6925                          * Skip ack'd segments, though we
6926                          * should not see these, since tmap
6927                          * should not have ack'd segments.
6928                          */
6929                         continue;
6930                 }
6931                 if (nrsm->r_flags & RACK_SACK_PASSED) {
6932                         /*
6933                          * We found one that is already marked
6934                          * passed, we have been here before and
6935                          * so all others below this are marked.
6936                          */
6937                         break;
6938                 }
6939                 nrsm->r_flags |= RACK_SACK_PASSED;
6940                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
6941         }
6942 }
6943
6944 static void
6945 rack_need_set_test(struct tcpcb *tp,
6946                    struct tcp_rack *rack,
6947                    struct rack_sendmap *rsm,
6948                    tcp_seq th_ack,
6949                    int line,
6950                    int use_which)
6951 {
6952
6953         if ((tp->t_flags & TF_GPUTINPROG) &&
6954             SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
6955                 /*
6956                  * We were app limited, and this ack
6957                  * butts up or goes beyond the point where we want
6958                  * to start our next measurement. We need
6959                  * to record the new gput_ts as here and
6960                  * possibly update the start sequence.
6961                  */
6962                 uint32_t seq, ts;
6963
6964                 if (rsm->r_rtr_cnt > 1) {
6965                         /*
6966                          * This is a retransmit, can we
6967                          * really make any assessment at this
6968                          * point?  We are not really sure of
6969                          * the timestamp, is it this or the
6970                          * previous transmission?
6971                          *
6972                          * Lets wait for something better that
6973                          * is not retransmitted.
6974                          */
6975                         return;
6976                 }
6977                 seq = tp->gput_seq;
6978                 ts = tp->gput_ts;
6979                 rack->app_limited_needs_set = 0;
6980                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
6981                 /* Do we start at a new end? */
6982                 if ((use_which == RACK_USE_BEG) &&
6983                     SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
6984                         /*
6985                          * When we get an ACK that just eats
6986                          * up some of the rsm, we set RACK_USE_BEG
6987                          * since whats at r_start (i.e. th_ack)
6988                          * is left unacked and thats where the
6989                          * measurement not starts.
6990                          */
6991                         tp->gput_seq = rsm->r_start;
6992                         rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
6993                 }
6994                 if ((use_which == RACK_USE_END) &&
6995                     SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
6996                             /*
6997                              * We use the end when the cumack
6998                              * is moving forward and completely
6999                              * deleting the rsm passed so basically
7000                              * r_end holds th_ack.
7001                              *
7002                              * For SACK's we also want to use the end
7003                              * since this piece just got sacked and
7004                              * we want to target anything after that
7005                              * in our measurement.
7006                              */
7007                             tp->gput_seq = rsm->r_end;
7008                             rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
7009                 }
7010                 if (use_which == RACK_USE_END_OR_THACK) {
7011                         /*
7012                          * special case for ack moving forward,
7013                          * not a sack, we need to move all the
7014                          * way up to where this ack cum-ack moves
7015                          * to.
7016                          */
7017                         if (SEQ_GT(th_ack, rsm->r_end))
7018                                 tp->gput_seq = th_ack;
7019                         else
7020                                 tp->gput_seq = rsm->r_end;
7021                         rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send;
7022                 }
7023                 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
7024                         /*
7025                          * We moved beyond this guy's range, re-calculate
7026                          * the new end point.
7027                          */
7028                         if (rack->rc_gp_filled == 0) {
7029                                 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
7030                         } else {
7031                                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
7032                         }
7033                 }
7034                 /*
7035                  * We are moving the goal post, we may be able to clear the
7036                  * measure_saw_probe_rtt flag.
7037                  */
7038                 if ((rack->in_probe_rtt == 0) &&
7039                     (rack->measure_saw_probe_rtt) &&
7040                     (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
7041                         rack->measure_saw_probe_rtt = 0;
7042                 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
7043                                            seq, tp->gput_seq, 0, 5, line, NULL);
7044                 if (rack->rc_gp_filled &&
7045                     ((tp->gput_ack - tp->gput_seq) <
7046                      max(rc_init_window(rack), (MIN_GP_WIN *
7047                                                 ctf_fixed_maxseg(tp))))) {
7048                         /*
7049                          * There is no sense of continuing this measurement
7050                          * because its too small to gain us anything we
7051                          * trust. Skip it and that way we can start a new
7052                          * measurement quicker.
7053                          */
7054                         rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
7055                                                    0, 0, 0, 6, __LINE__, NULL);
7056                         tp->t_flags &= ~TF_GPUTINPROG;
7057                 }
7058         }
7059 }
7060
7061 static uint32_t
7062 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
7063                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
7064 {
7065         uint32_t start, end, changed = 0;
7066         struct rack_sendmap stack_map;
7067         struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
7068         int32_t used_ref = 1;
7069         int moved = 0;
7070
7071         start = sack->start;
7072         end = sack->end;
7073         rsm = *prsm;
7074         memset(&fe, 0, sizeof(fe));
7075 do_rest_ofb:
7076         if ((rsm == NULL) ||
7077             (SEQ_LT(end, rsm->r_start)) ||
7078             (SEQ_GEQ(start, rsm->r_end)) ||
7079             (SEQ_LT(start, rsm->r_start))) {
7080                 /*
7081                  * We are not in the right spot,
7082                  * find the correct spot in the tree.
7083                  */
7084                 used_ref = 0;
7085                 fe.r_start = start;
7086                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
7087                 moved++;
7088         }
7089         if (rsm == NULL) {
7090                 /* TSNH */
7091                 goto out;
7092         }
7093         /* Ok we have an ACK for some piece of this rsm */
7094         if (rsm->r_start != start) {
7095                 if ((rsm->r_flags & RACK_ACKED) == 0) {
7096                         /**
7097                          * Need to split this in two pieces the before and after,
7098                          * the before remains in the map, the after must be
7099                          * added. In other words we have:
7100                          * rsm        |--------------|
7101                          * sackblk        |------->
7102                          * rsm will become
7103                          *     rsm    |---|
7104                          * and nrsm will be  the sacked piece
7105                          *     nrsm       |----------|
7106                          *
7107                          * But before we start down that path lets
7108                          * see if the sack spans over on top of
7109                          * the next guy and it is already sacked.
7110                          */
7111                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7112                         if (next && (next->r_flags & RACK_ACKED) &&
7113                             SEQ_GEQ(end, next->r_start)) {
7114                                 /**
7115                                  * So the next one is already acked, and
7116                                  * we can thus by hookery use our stack_map
7117                                  * to reflect the piece being sacked and
7118                                  * then adjust the two tree entries moving
7119                                  * the start and ends around. So we start like:
7120                                  *  rsm     |------------|             (not-acked)
7121                                  *  next                 |-----------| (acked)
7122                                  *  sackblk        |-------->
7123                                  *  We want to end like so:
7124                                  *  rsm     |------|                   (not-acked)
7125                                  *  next           |-----------------| (acked)
7126                                  *  nrsm           |-----|
7127                                  * Where nrsm is a temporary stack piece we
7128                                  * use to update all the gizmos.
7129                                  */
7130                                 /* Copy up our fudge block */
7131                                 nrsm = &stack_map;
7132                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
7133                                 /* Now adjust our tree blocks */
7134                                 rsm->r_end = start;
7135                                 next->r_start = start;
7136                                 /* Clear out the dup ack count of the remainder */
7137                                 rsm->r_dupack = 0;
7138                                 rsm->r_just_ret = 0;
7139                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7140                                 /* Now lets make sure our fudge block is right */
7141                                 nrsm->r_start = start;
7142                                 /* Now lets update all the stats and such */
7143                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
7144                                 if (rack->app_limited_needs_set)
7145                                         rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
7146                                 changed += (nrsm->r_end - nrsm->r_start);
7147                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
7148                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
7149                                         counter_u64_add(rack_reorder_seen, 1);
7150                                         rack->r_ctl.rc_reorder_ts = cts;
7151                                 }
7152                                 /*
7153                                  * Now we want to go up from rsm (the
7154                                  * one left un-acked) to the next one
7155                                  * in the tmap. We do this so when
7156                                  * we walk backwards we include marking
7157                                  * sack-passed on rsm (The one passed in
7158                                  * is skipped since it is generally called
7159                                  * on something sacked before removing it
7160                                  * from the tmap).
7161                                  */
7162                                 if (rsm->r_in_tmap) {
7163                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
7164                                         /*
7165                                          * Now that we have the next
7166                                          * one walk backwards from there.
7167                                          */
7168                                         if (nrsm && nrsm->r_in_tmap)
7169                                                 rack_log_sack_passed(tp, rack, nrsm);
7170                                 }
7171                                 /* Now are we done? */
7172                                 if (SEQ_LT(end, next->r_end) ||
7173                                     (end == next->r_end)) {
7174                                         /* Done with block */
7175                                         goto out;
7176                                 }
7177                                 counter_u64_add(rack_sack_used_next_merge, 1);
7178                                 /* Postion for the next block */
7179                                 start = next->r_end;
7180                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
7181                                 if (rsm == NULL)
7182                                         goto out;
7183                         } else {
7184                                 /**
7185                                  * We can't use any hookery here, so we
7186                                  * need to split the map. We enter like
7187                                  * so:
7188                                  *  rsm      |--------|
7189                                  *  sackblk       |----->
7190                                  * We will add the new block nrsm and
7191                                  * that will be the new portion, and then
7192                                  * fall through after reseting rsm. So we
7193                                  * split and look like this:
7194                                  *  rsm      |----|
7195                                  *  sackblk       |----->
7196                                  *  nrsm          |---|
7197                                  * We then fall through reseting
7198                                  * rsm to nrsm, so the next block
7199                                  * picks it up.
7200                                  */
7201                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
7202                                 if (nrsm == NULL) {
7203                                         /*
7204                                          * failed XXXrrs what can we do but loose the sack
7205                                          * info?
7206                                          */
7207                                         goto out;
7208                                 }
7209                                 counter_u64_add(rack_sack_splits, 1);
7210                                 rack_clone_rsm(rack, nrsm, rsm, start);
7211                                 rsm->r_just_ret = 0;
7212                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7213 #ifdef INVARIANTS
7214                                 if (insret != NULL) {
7215                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7216                                               nrsm, insret, rack, rsm);
7217                                 }
7218 #endif
7219                                 if (rsm->r_in_tmap) {
7220                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7221                                         nrsm->r_in_tmap = 1;
7222                                 }
7223                                 rsm->r_flags &= (~RACK_HAS_FIN);
7224                                 /* Position us to point to the new nrsm that starts the sack blk */
7225                                 rsm = nrsm;
7226                         }
7227                 } else {
7228                         /* Already sacked this piece */
7229                         counter_u64_add(rack_sack_skipped_acked, 1);
7230                         moved++;
7231                         if (end == rsm->r_end) {
7232                                 /* Done with block */
7233                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7234                                 goto out;
7235                         } else if (SEQ_LT(end, rsm->r_end)) {
7236                                 /* A partial sack to a already sacked block */
7237                                 moved++;
7238                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7239                                 goto out;
7240                         } else {
7241                                 /*
7242                                  * The end goes beyond this guy
7243                                  * repostion the start to the
7244                                  * next block.
7245                                  */
7246                                 start = rsm->r_end;
7247                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7248                                 if (rsm == NULL)
7249                                         goto out;
7250                         }
7251                 }
7252         }
7253         if (SEQ_GEQ(end, rsm->r_end)) {
7254                 /**
7255                  * The end of this block is either beyond this guy or right
7256                  * at this guy. I.e.:
7257                  *  rsm ---                 |-----|
7258                  *  end                     |-----|
7259                  *  <or>
7260                  *  end                     |---------|
7261                  */
7262                 if ((rsm->r_flags & RACK_ACKED) == 0) {
7263                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
7264                         changed += (rsm->r_end - rsm->r_start);
7265                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7266                         if (rsm->r_in_tmap) /* should be true */
7267                                 rack_log_sack_passed(tp, rack, rsm);
7268                         /* Is Reordering occuring? */
7269                         if (rsm->r_flags & RACK_SACK_PASSED) {
7270                                 rsm->r_flags &= ~RACK_SACK_PASSED;
7271                                 counter_u64_add(rack_reorder_seen, 1);
7272                                 rack->r_ctl.rc_reorder_ts = cts;
7273                         }
7274                         if (rack->app_limited_needs_set)
7275                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
7276                         rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
7277                         rsm->r_flags |= RACK_ACKED;
7278                         rsm->r_flags &= ~RACK_TLP;
7279                         if (rsm->r_in_tmap) {
7280                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7281                                 rsm->r_in_tmap = 0;
7282                         }
7283                 } else {
7284                         counter_u64_add(rack_sack_skipped_acked, 1);
7285                         moved++;
7286                 }
7287                 if (end == rsm->r_end) {
7288                         /* This block only - done, setup for next  */
7289                         goto out;
7290                 }
7291                 /*
7292                  * There is more not coverend by this rsm move on
7293                  * to the next block in the RB tree.
7294                  */
7295                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7296                 start = rsm->r_end;
7297                 rsm = nrsm;
7298                 if (rsm == NULL)
7299                         goto out;
7300                 goto do_rest_ofb;
7301         }
7302         /**
7303          * The end of this sack block is smaller than
7304          * our rsm i.e.:
7305          *  rsm ---                 |-----|
7306          *  end                     |--|
7307          */
7308         if ((rsm->r_flags & RACK_ACKED) == 0) {
7309                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7310                 if (prev && (prev->r_flags & RACK_ACKED)) {
7311                         /**
7312                          * Goal, we want the right remainder of rsm to shrink
7313                          * in place and span from (rsm->r_start = end) to rsm->r_end.
7314                          * We want to expand prev to go all the way
7315                          * to prev->r_end <- end.
7316                          * so in the tree we have before:
7317                          *   prev     |--------|         (acked)
7318                          *   rsm               |-------| (non-acked)
7319                          *   sackblk           |-|
7320                          * We churn it so we end up with
7321                          *   prev     |----------|       (acked)
7322                          *   rsm                 |-----| (non-acked)
7323                          *   nrsm              |-| (temporary)
7324                          */
7325                         nrsm = &stack_map;
7326                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
7327                         prev->r_end = end;
7328                         rsm->r_start = end;
7329                         /* Now adjust nrsm (stack copy) to be
7330                          * the one that is the small
7331                          * piece that was "sacked".
7332                          */
7333                         nrsm->r_end = end;
7334                         rsm->r_dupack = 0;
7335                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7336                         /*
7337                          * Now nrsm is our new little piece
7338                          * that is acked (which was merged
7339                          * to prev). Update the rtt and changed
7340                          * based on that. Also check for reordering.
7341                          */
7342                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
7343                         if (rack->app_limited_needs_set)
7344                                 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
7345                         changed += (nrsm->r_end - nrsm->r_start);
7346                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
7347                         if (nrsm->r_flags & RACK_SACK_PASSED) {
7348                                 counter_u64_add(rack_reorder_seen, 1);
7349                                 rack->r_ctl.rc_reorder_ts = cts;
7350                         }
7351                         rsm = prev;
7352                         counter_u64_add(rack_sack_used_prev_merge, 1);
7353                 } else {
7354                         /**
7355                          * This is the case where our previous
7356                          * block is not acked either, so we must
7357                          * split the block in two.
7358                          */
7359                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
7360                         if (nrsm == NULL) {
7361                                 /* failed rrs what can we do but loose the sack info? */
7362                                 goto out;
7363                         }
7364                         /**
7365                          * In this case nrsm becomes
7366                          * nrsm->r_start = end;
7367                          * nrsm->r_end = rsm->r_end;
7368                          * which is un-acked.
7369                          * <and>
7370                          * rsm->r_end = nrsm->r_start;
7371                          * i.e. the remaining un-acked
7372                          * piece is left on the left
7373                          * hand side.
7374                          *
7375                          * So we start like this
7376                          * rsm      |----------| (not acked)
7377                          * sackblk  |---|
7378                          * build it so we have
7379                          * rsm      |---|         (acked)
7380                          * nrsm         |------|  (not acked)
7381                          */
7382                         counter_u64_add(rack_sack_splits, 1);
7383                         rack_clone_rsm(rack, nrsm, rsm, end);
7384                         rsm->r_flags &= (~RACK_HAS_FIN);
7385                         rsm->r_just_ret = 0;
7386                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7387 #ifdef INVARIANTS
7388                         if (insret != NULL) {
7389                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7390                                       nrsm, insret, rack, rsm);
7391                         }
7392 #endif
7393                         if (rsm->r_in_tmap) {
7394                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7395                                 nrsm->r_in_tmap = 1;
7396                         }
7397                         nrsm->r_dupack = 0;
7398                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
7399                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
7400                         changed += (rsm->r_end - rsm->r_start);
7401                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
7402                         if (rsm->r_in_tmap) /* should be true */
7403                                 rack_log_sack_passed(tp, rack, rsm);
7404                         /* Is Reordering occuring? */
7405                         if (rsm->r_flags & RACK_SACK_PASSED) {
7406                                 rsm->r_flags &= ~RACK_SACK_PASSED;
7407                                 counter_u64_add(rack_reorder_seen, 1);
7408                                 rack->r_ctl.rc_reorder_ts = cts;
7409                         }
7410                         if (rack->app_limited_needs_set)
7411                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
7412                         rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
7413                         rsm->r_flags |= RACK_ACKED;
7414                         rsm->r_flags &= ~RACK_TLP;
7415                         if (rsm->r_in_tmap) {
7416                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7417                                 rsm->r_in_tmap = 0;
7418                         }
7419                 }
7420         } else if (start != end){
7421                 /*
7422                  * The block was already acked.
7423                  */
7424                 counter_u64_add(rack_sack_skipped_acked, 1);
7425                 moved++;
7426         }
7427 out:
7428         if (rsm && (rsm->r_flags & RACK_ACKED)) {
7429                 /*
7430                  * Now can we merge where we worked
7431                  * with either the previous or
7432                  * next block?
7433                  */
7434                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7435                 while (next) {
7436                     if (next->r_flags & RACK_ACKED) {
7437                         /* yep this and next can be merged */
7438                         rsm = rack_merge_rsm(rack, rsm, next);
7439                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7440                     } else
7441                             break;
7442                 }
7443                 /* Now what about the previous? */
7444                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7445                 while (prev) {
7446                     if (prev->r_flags & RACK_ACKED) {
7447                         /* yep the previous and this can be merged */
7448                         rsm = rack_merge_rsm(rack, prev, rsm);
7449                         prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7450                     } else
7451                             break;
7452                 }
7453         }
7454         if (used_ref == 0) {
7455                 counter_u64_add(rack_sack_proc_all, 1);
7456         } else {
7457                 counter_u64_add(rack_sack_proc_short, 1);
7458         }
7459         /* Save off the next one for quick reference. */
7460         if (rsm)
7461                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7462         else
7463                 nrsm = NULL;
7464         *prsm = rack->r_ctl.rc_sacklast = nrsm;
7465         /* Pass back the moved. */
7466         *moved_two = moved;
7467         return (changed);
7468 }
7469
7470 static void inline
7471 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
7472 {
7473         struct rack_sendmap *tmap;
7474
7475         tmap = NULL;
7476         while (rsm && (rsm->r_flags & RACK_ACKED)) {
7477                 /* Its no longer sacked, mark it so */
7478                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7479 #ifdef INVARIANTS
7480                 if (rsm->r_in_tmap) {
7481                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
7482                               rack, rsm, rsm->r_flags);
7483                 }
7484 #endif
7485                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
7486                 /* Rebuild it into our tmap */
7487                 if (tmap == NULL) {
7488                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7489                         tmap = rsm;
7490                 } else {
7491                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
7492                         tmap = rsm;
7493                 }
7494                 tmap->r_in_tmap = 1;
7495                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7496         }
7497         /*
7498          * Now lets possibly clear the sack filter so we start
7499          * recognizing sacks that cover this area.
7500          */
7501         sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
7502
7503 }
7504
7505 static void
7506 rack_do_decay(struct tcp_rack *rack)
7507 {
7508         struct timeval res;
7509
7510 #define timersub(tvp, uvp, vvp)                                         \
7511         do {                                                            \
7512                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
7513                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
7514                 if ((vvp)->tv_usec < 0) {                               \
7515                         (vvp)->tv_sec--;                                \
7516                         (vvp)->tv_usec += 1000000;                      \
7517                 }                                                       \
7518         } while (0)
7519
7520         timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
7521 #undef timersub
7522
7523         rack->r_ctl.input_pkt++;
7524         if ((rack->rc_in_persist) ||
7525             (res.tv_sec >= 1) ||
7526             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
7527                 /*
7528                  * Check for decay of non-SAD,
7529                  * we want all SAD detection metrics to
7530                  * decay 1/4 per second (or more) passed.
7531                  */
7532                 uint32_t pkt_delta;
7533
7534                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
7535                 /* Update our saved tracking values */
7536                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
7537                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
7538                 /* Now do we escape without decay? */
7539 #ifdef NETFLIX_EXP_DETECTION
7540                 if (rack->rc_in_persist ||
7541                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
7542                     (pkt_delta < tcp_sad_low_pps)){
7543                         /*
7544                          * We don't decay idle connections
7545                          * or ones that have a low input pps.
7546                          */
7547                         return;
7548                 }
7549                 /* Decay the counters */
7550                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
7551                                                         tcp_sad_decay_val);
7552                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
7553                                                          tcp_sad_decay_val);
7554                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
7555                                                                tcp_sad_decay_val);
7556                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
7557                                                                 tcp_sad_decay_val);
7558 #endif
7559         }
7560 }
7561
7562 static void
7563 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
7564 {
7565         uint32_t changed, entered_recovery = 0;
7566         struct tcp_rack *rack;
7567         struct rack_sendmap *rsm, *rm;
7568         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
7569         register uint32_t th_ack;
7570         int32_t i, j, k, num_sack_blks = 0;
7571         uint32_t cts, acked, ack_point, sack_changed = 0;
7572         int loop_start = 0, moved_two = 0;
7573         uint32_t tsused;
7574
7575
7576         INP_WLOCK_ASSERT(tp->t_inpcb);
7577         if (th->th_flags & TH_RST) {
7578                 /* We don't log resets */
7579                 return;
7580         }
7581         rack = (struct tcp_rack *)tp->t_fb_ptr;
7582         cts = tcp_ts_getticks();
7583         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
7584         changed = 0;
7585         th_ack = th->th_ack;
7586         if (rack->sack_attack_disable == 0)
7587                 rack_do_decay(rack);
7588         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
7589                 /*
7590                  * You only get credit for
7591                  * MSS and greater (and you get extra
7592                  * credit for larger cum-ack moves).
7593                  */
7594                 int ac;
7595
7596                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
7597                 rack->r_ctl.ack_count += ac;
7598                 counter_u64_add(rack_ack_total, ac);
7599         }
7600         if (rack->r_ctl.ack_count > 0xfff00000) {
7601                 /*
7602                  * reduce the number to keep us under
7603                  * a uint32_t.
7604                  */
7605                 rack->r_ctl.ack_count /= 2;
7606                 rack->r_ctl.sack_count /= 2;
7607         }
7608         if (SEQ_GT(th_ack, tp->snd_una)) {
7609                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
7610                 tp->t_acktime = ticks;
7611         }
7612         if (rsm && SEQ_GT(th_ack, rsm->r_start))
7613                 changed = th_ack - rsm->r_start;
7614         if (changed) {
7615                 /*
7616                  * The ACK point is advancing to th_ack, we must drop off
7617                  * the packets in the rack log and calculate any eligble
7618                  * RTT's.
7619                  */
7620                 rack->r_wanted_output = 1;
7621 more:
7622                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
7623                 if (rsm == NULL) {
7624                         if ((th_ack - 1) == tp->iss) {
7625                                 /*
7626                                  * For the SYN incoming case we will not
7627                                  * have called tcp_output for the sending of
7628                                  * the SYN, so there will be no map. All
7629                                  * other cases should probably be a panic.
7630                                  */
7631                                 goto proc_sack;
7632                         }
7633                         if (tp->t_flags & TF_SENTFIN) {
7634                                 /* if we send a FIN we will not hav a map */
7635                                 goto proc_sack;
7636                         }
7637 #ifdef INVARIANTS
7638                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
7639                               tp,
7640                               th, tp->t_state, rack,
7641                               tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
7642 #endif
7643                         goto proc_sack;
7644                 }
7645                 if (SEQ_LT(th_ack, rsm->r_start)) {
7646                         /* Huh map is missing this */
7647 #ifdef INVARIANTS
7648                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
7649                                rsm->r_start,
7650                                th_ack, tp->t_state, rack->r_state);
7651 #endif
7652                         goto proc_sack;
7653                 }
7654                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
7655                 /* Now do we consume the whole thing? */
7656                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
7657                         /* Its all consumed. */
7658                         uint32_t left;
7659                         uint8_t newly_acked;
7660
7661                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
7662                         rsm->r_rtr_bytes = 0;
7663                         /* Record the time of highest cumack sent */
7664                         rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send;
7665                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7666 #ifdef INVARIANTS
7667                         if (rm != rsm) {
7668                                 panic("removing head in rack:%p rsm:%p rm:%p",
7669                                       rack, rsm, rm);
7670                         }
7671 #endif
7672                         if (rsm->r_in_tmap) {
7673                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7674                                 rsm->r_in_tmap = 0;
7675                         }
7676                         newly_acked = 1;
7677                         if (rsm->r_flags & RACK_ACKED) {
7678                                 /*
7679                                  * It was acked on the scoreboard -- remove
7680                                  * it from total
7681                                  */
7682                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7683                                 newly_acked = 0;
7684                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
7685                                 /*
7686                                  * There are segments ACKED on the
7687                                  * scoreboard further up. We are seeing
7688                                  * reordering.
7689                                  */
7690                                 rsm->r_flags &= ~RACK_SACK_PASSED;
7691                                 counter_u64_add(rack_reorder_seen, 1);
7692                                 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
7693                                 rsm->r_flags |= RACK_ACKED;
7694                                 rack->r_ctl.rc_reorder_ts = cts;
7695                         }
7696                         left = th_ack - rsm->r_end;
7697                         if (rack->app_limited_needs_set && newly_acked)
7698                                 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
7699                         /* Free back to zone */
7700                         rack_free(rack, rsm);
7701                         if (left) {
7702                                 goto more;
7703                         }
7704                         goto proc_sack;
7705                 }
7706                 if (rsm->r_flags & RACK_ACKED) {
7707                         /*
7708                          * It was acked on the scoreboard -- remove it from
7709                          * total for the part being cum-acked.
7710                          */
7711                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
7712                 }
7713                 /*
7714                  * Clear the dup ack count for
7715                  * the piece that remains.
7716                  */
7717                 rsm->r_dupack = 0;
7718                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7719                 if (rsm->r_rtr_bytes) {
7720                         /*
7721                          * It was retransmitted adjust the
7722                          * sack holes for what was acked.
7723                          */
7724                         int ack_am;
7725
7726                         ack_am = (th_ack - rsm->r_start);
7727                         if (ack_am >= rsm->r_rtr_bytes) {
7728                                 rack->r_ctl.rc_holes_rxt -= ack_am;
7729                                 rsm->r_rtr_bytes -= ack_am;
7730                         }
7731                 }
7732                 /*
7733                  * Update where the piece starts and record
7734                  * the time of send of highest cumack sent.
7735                  */
7736                 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send;
7737                 rsm->r_start = th_ack;
7738                 if (rack->app_limited_needs_set)
7739                         rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
7740
7741         }
7742 proc_sack:
7743         /* Check for reneging */
7744         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
7745         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
7746                 /*
7747                  * The peer has moved snd_una up to
7748                  * the edge of this send, i.e. one
7749                  * that it had previously acked. The only
7750                  * way that can be true if the peer threw
7751                  * away data (space issues) that it had
7752                  * previously sacked (else it would have
7753                  * given us snd_una up to (rsm->r_end).
7754                  * We need to undo the acked markings here.
7755                  *
7756                  * Note we have to look to make sure th_ack is
7757                  * our rsm->r_start in case we get an old ack
7758                  * where th_ack is behind snd_una.
7759                  */
7760                 rack_peer_reneges(rack, rsm, th->th_ack);
7761         }
7762         if ((to->to_flags & TOF_SACK) == 0) {
7763                 /* We are done nothing left */
7764                 goto out;
7765         }
7766         /* Sack block processing */
7767         if (SEQ_GT(th_ack, tp->snd_una))
7768                 ack_point = th_ack;
7769         else
7770                 ack_point = tp->snd_una;
7771         for (i = 0; i < to->to_nsacks; i++) {
7772                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
7773                       &sack, sizeof(sack));
7774                 sack.start = ntohl(sack.start);
7775                 sack.end = ntohl(sack.end);
7776                 if (SEQ_GT(sack.end, sack.start) &&
7777                     SEQ_GT(sack.start, ack_point) &&
7778                     SEQ_LT(sack.start, tp->snd_max) &&
7779                     SEQ_GT(sack.end, ack_point) &&
7780                     SEQ_LEQ(sack.end, tp->snd_max)) {
7781                         sack_blocks[num_sack_blks] = sack;
7782                         num_sack_blks++;
7783 #ifdef NETFLIX_STATS
7784                 } else if (SEQ_LEQ(sack.start, th_ack) &&
7785                            SEQ_LEQ(sack.end, th_ack)) {
7786                         /*
7787                          * Its a D-SACK block.
7788                          */
7789                         tcp_record_dsack(sack.start, sack.end);
7790 #endif
7791                 }
7792
7793         }
7794         /*
7795          * Sort the SACK blocks so we can update the rack scoreboard with
7796          * just one pass.
7797          */
7798         num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
7799                                          num_sack_blks, th->th_ack);
7800         ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
7801         if (num_sack_blks == 0)  {
7802                 /* Nothing to sack (DSACKs?) */
7803                 goto out_with_totals;
7804         }
7805         if (num_sack_blks < 2) {
7806                 /* Only one, we don't need to sort */
7807                 goto do_sack_work;
7808         }
7809         /* Sort the sacks */
7810         for (i = 0; i < num_sack_blks; i++) {
7811                 for (j = i + 1; j < num_sack_blks; j++) {
7812                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
7813                                 sack = sack_blocks[i];
7814                                 sack_blocks[i] = sack_blocks[j];
7815                                 sack_blocks[j] = sack;
7816                         }
7817                 }
7818         }
7819         /*
7820          * Now are any of the sack block ends the same (yes some
7821          * implementations send these)?
7822          */
7823 again:
7824         if (num_sack_blks == 0)
7825                 goto out_with_totals;
7826         if (num_sack_blks > 1) {
7827                 for (i = 0; i < num_sack_blks; i++) {
7828                         for (j = i + 1; j < num_sack_blks; j++) {
7829                                 if (sack_blocks[i].end == sack_blocks[j].end) {
7830                                         /*
7831                                          * Ok these two have the same end we
7832                                          * want the smallest end and then
7833                                          * throw away the larger and start
7834                                          * again.
7835                                          */
7836                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
7837                                                 /*
7838                                                  * The second block covers
7839                                                  * more area use that
7840                                                  */
7841                                                 sack_blocks[i].start = sack_blocks[j].start;
7842                                         }
7843                                         /*
7844                                          * Now collapse out the dup-sack and
7845                                          * lower the count
7846                                          */
7847                                         for (k = (j + 1); k < num_sack_blks; k++) {
7848                                                 sack_blocks[j].start = sack_blocks[k].start;
7849                                                 sack_blocks[j].end = sack_blocks[k].end;
7850                                                 j++;
7851                                         }
7852                                         num_sack_blks--;
7853                                         goto again;
7854                                 }
7855                         }
7856                 }
7857         }
7858 do_sack_work:
7859         /*
7860          * First lets look to see if
7861          * we have retransmitted and
7862          * can use the transmit next?
7863          */
7864         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7865         if (rsm &&
7866             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
7867             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
7868                 /*
7869                  * We probably did the FR and the next
7870                  * SACK in continues as we would expect.
7871                  */
7872                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
7873                 if (acked) {
7874                         rack->r_wanted_output = 1;
7875                         changed += acked;
7876                         sack_changed += acked;
7877                 }
7878                 if (num_sack_blks == 1) {
7879                         /*
7880                          * This is what we would expect from
7881                          * a normal implementation to happen
7882                          * after we have retransmitted the FR,
7883                          * i.e the sack-filter pushes down
7884                          * to 1 block and the next to be retransmitted
7885                          * is the sequence in the sack block (has more
7886                          * are acked). Count this as ACK'd data to boost
7887                          * up the chances of recovering any false positives.
7888                          */
7889                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
7890                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
7891                         counter_u64_add(rack_express_sack, 1);
7892                         if (rack->r_ctl.ack_count > 0xfff00000) {
7893                                 /*
7894                                  * reduce the number to keep us under
7895                                  * a uint32_t.
7896                                  */
7897                                 rack->r_ctl.ack_count /= 2;
7898                                 rack->r_ctl.sack_count /= 2;
7899                         }
7900                         goto out_with_totals;
7901                 } else {
7902                         /*
7903                          * Start the loop through the
7904                          * rest of blocks, past the first block.
7905                          */
7906                         moved_two = 0;
7907                         loop_start = 1;
7908                 }
7909         }
7910         /* Its a sack of some sort */
7911         rack->r_ctl.sack_count++;
7912         if (rack->r_ctl.sack_count > 0xfff00000) {
7913                 /*
7914                  * reduce the number to keep us under
7915                  * a uint32_t.
7916                  */
7917                 rack->r_ctl.ack_count /= 2;
7918                 rack->r_ctl.sack_count /= 2;
7919         }
7920         counter_u64_add(rack_sack_total, 1);
7921         if (rack->sack_attack_disable) {
7922                 /* An attacker disablement is in place */
7923                 if (num_sack_blks > 1) {
7924                         rack->r_ctl.sack_count += (num_sack_blks - 1);
7925                         rack->r_ctl.sack_moved_extra++;
7926                         counter_u64_add(rack_move_some, 1);
7927                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
7928                                 rack->r_ctl.sack_moved_extra /= 2;
7929                                 rack->r_ctl.sack_noextra_move /= 2;
7930                         }
7931                 }
7932                 goto out;
7933         }
7934         rsm = rack->r_ctl.rc_sacklast;
7935         for (i = loop_start; i < num_sack_blks; i++) {
7936                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
7937                 if (acked) {
7938                         rack->r_wanted_output = 1;
7939                         changed += acked;
7940                         sack_changed += acked;
7941                 }
7942                 if (moved_two) {
7943                         /*
7944                          * If we did not get a SACK for at least a MSS and
7945                          * had to move at all, or if we moved more than our
7946                          * threshold, it counts against the "extra" move.
7947                          */
7948                         rack->r_ctl.sack_moved_extra += moved_two;
7949                         counter_u64_add(rack_move_some, 1);
7950                 } else {
7951                         /*
7952                          * else we did not have to move
7953                          * any more than we would expect.
7954                          */
7955                         rack->r_ctl.sack_noextra_move++;
7956                         counter_u64_add(rack_move_none, 1);
7957                 }
7958                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
7959                         /*
7960                          * If the SACK was not a full MSS then
7961                          * we add to sack_count the number of
7962                          * MSS's (or possibly more than
7963                          * a MSS if its a TSO send) we had to skip by.
7964                          */
7965                         rack->r_ctl.sack_count += moved_two;
7966                         counter_u64_add(rack_sack_total, moved_two);
7967                 }
7968                 /*
7969                  * Now we need to setup for the next
7970                  * round. First we make sure we won't
7971                  * exceed the size of our uint32_t on
7972                  * the various counts, and then clear out
7973                  * moved_two.
7974                  */
7975                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
7976                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
7977                         rack->r_ctl.sack_moved_extra /= 2;
7978                         rack->r_ctl.sack_noextra_move /= 2;
7979                 }
7980                 if (rack->r_ctl.sack_count > 0xfff00000) {
7981                         rack->r_ctl.ack_count /= 2;
7982                         rack->r_ctl.sack_count /= 2;
7983                 }
7984                 moved_two = 0;
7985         }
7986 out_with_totals:
7987         if (num_sack_blks > 1) {
7988                 /*
7989                  * You get an extra stroke if
7990                  * you have more than one sack-blk, this
7991                  * could be where we are skipping forward
7992                  * and the sack-filter is still working, or
7993                  * it could be an attacker constantly
7994                  * moving us.
7995                  */
7996                 rack->r_ctl.sack_moved_extra++;
7997                 counter_u64_add(rack_move_some, 1);
7998         }
7999 out:
8000 #ifdef NETFLIX_EXP_DETECTION
8001         if ((rack->do_detection || tcp_force_detection) &&
8002             tcp_sack_to_ack_thresh &&
8003             tcp_sack_to_move_thresh &&
8004             ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
8005                 /*
8006                  * We have thresholds set to find
8007                  * possible attackers and disable sack.
8008                  * Check them.
8009                  */
8010                 uint64_t ackratio, moveratio, movetotal;
8011
8012                 /* Log detecting */
8013                 rack_log_sad(rack, 1);
8014                 ackratio = (uint64_t)(rack->r_ctl.sack_count);
8015                 ackratio *= (uint64_t)(1000);
8016                 if (rack->r_ctl.ack_count)
8017                         ackratio /= (uint64_t)(rack->r_ctl.ack_count);
8018                 else {
8019                         /* We really should not hit here */
8020                         ackratio = 1000;
8021                 }
8022                 if ((rack->sack_attack_disable  == 0) &&
8023                     (ackratio > rack_highest_sack_thresh_seen))
8024                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
8025                 movetotal = rack->r_ctl.sack_moved_extra;
8026                 movetotal += rack->r_ctl.sack_noextra_move;
8027                 moveratio = rack->r_ctl.sack_moved_extra;
8028                 moveratio *= (uint64_t)1000;
8029                 if (movetotal)
8030                         moveratio /= movetotal;
8031                 else {
8032                         /* No moves, thats pretty good */
8033                         moveratio = 0;
8034                 }
8035                 if ((rack->sack_attack_disable == 0) &&
8036                     (moveratio > rack_highest_move_thresh_seen))
8037                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
8038                 if (rack->sack_attack_disable == 0) {
8039                         if ((ackratio > tcp_sack_to_ack_thresh) &&
8040                             (moveratio > tcp_sack_to_move_thresh)) {
8041                                 /* Disable sack processing */
8042                                 rack->sack_attack_disable = 1;
8043                                 if (rack->r_rep_attack == 0) {
8044                                         rack->r_rep_attack = 1;
8045                                         counter_u64_add(rack_sack_attacks_detected, 1);
8046                                 }
8047                                 if (tcp_attack_on_turns_on_logging) {
8048                                         /*
8049                                          * Turn on logging, used for debugging
8050                                          * false positives.
8051                                          */
8052                                         rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
8053                                 }
8054                                 /* Clamp the cwnd at flight size */
8055                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
8056                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
8057                                 rack_log_sad(rack, 2);
8058                         }
8059                 } else {
8060                         /* We are sack-disabled check for false positives */
8061                         if ((ackratio <= tcp_restoral_thresh) ||
8062                             (rack->r_ctl.rc_num_maps_alloced  < tcp_map_minimum)) {
8063                                 rack->sack_attack_disable  = 0;
8064                                 rack_log_sad(rack, 3);
8065                                 /* Restart counting */
8066                                 rack->r_ctl.sack_count = 0;
8067                                 rack->r_ctl.sack_moved_extra = 0;
8068                                 rack->r_ctl.sack_noextra_move = 1;
8069                                 rack->r_ctl.ack_count = max(1,
8070                                       (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
8071
8072                                 if (rack->r_rep_reverse == 0) {
8073                                         rack->r_rep_reverse = 1;
8074                                         counter_u64_add(rack_sack_attacks_reversed, 1);
8075                                 }
8076                                 /* Restore the cwnd */
8077                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
8078                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
8079                         }
8080                 }
8081         }
8082 #endif
8083         if (changed) {
8084                 /* Something changed cancel the rack timer */
8085                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
8086         }
8087         tsused = tcp_ts_getticks();
8088         rsm = tcp_rack_output(tp, rack, tsused);
8089         if ((!IN_RECOVERY(tp->t_flags)) &&
8090             rsm) {
8091                 /* Enter recovery */
8092                 rack->r_ctl.rc_rsm_start = rsm->r_start;
8093                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8094                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8095                 entered_recovery = 1;
8096                 rack_cong_signal(tp, NULL, CC_NDUPACK);
8097                 /*
8098                  * When we enter recovery we need to assure we send
8099                  * one packet.
8100                  */
8101                 if (rack->rack_no_prr == 0) {
8102                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
8103                         rack_log_to_prr(rack, 8, 0);
8104                 }
8105                 rack->r_timer_override = 1;
8106                 rack->r_early = 0;
8107                 rack->r_ctl.rc_agg_early = 0;
8108         } else if (IN_RECOVERY(tp->t_flags) &&
8109                    rsm &&
8110                    (rack->r_rr_config == 3)) {
8111                 /*
8112                  * Assure we can output and we get no
8113                  * remembered pace time except the retransmit.
8114                  */
8115                 rack->r_timer_override = 1;
8116                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
8117                 rack->r_ctl.rc_resend = rsm;
8118         }
8119         if (IN_RECOVERY(tp->t_flags) &&
8120             (rack->rack_no_prr == 0) &&
8121             (entered_recovery == 0)) {
8122                 /* Deal with PRR here (in recovery only) */
8123                 uint32_t pipe, snd_una;
8124
8125                 rack->r_ctl.rc_prr_delivered += changed;
8126                 /* Compute prr_sndcnt */
8127                 if (SEQ_GT(tp->snd_una, th_ack)) {
8128                         snd_una = tp->snd_una;
8129                 } else {
8130                         snd_una = th_ack;
8131                 }
8132                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
8133                 if (pipe > tp->snd_ssthresh) {
8134                         long sndcnt;
8135
8136                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
8137                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
8138                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
8139                         else {
8140                                 rack->r_ctl.rc_prr_sndcnt = 0;
8141                                 rack_log_to_prr(rack, 9, 0);
8142                                 sndcnt = 0;
8143                         }
8144                         sndcnt++;
8145                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
8146                                 sndcnt -= rack->r_ctl.rc_prr_out;
8147                         else
8148                                 sndcnt = 0;
8149                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
8150                         rack_log_to_prr(rack, 10, 0);
8151                 } else {
8152                         uint32_t limit;
8153
8154                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
8155                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
8156                         else
8157                                 limit = 0;
8158                         if (changed > limit)
8159                                 limit = changed;
8160                         limit += ctf_fixed_maxseg(tp);
8161                         if (tp->snd_ssthresh > pipe) {
8162                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
8163                                 rack_log_to_prr(rack, 11, 0);
8164                         } else {
8165                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
8166                                 rack_log_to_prr(rack, 12, 0);
8167                         }
8168                 }
8169                 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
8170                      ((rack->rc_inp->inp_in_hpts == 0) &&
8171                       ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
8172                         /*
8173                          * If you are pacing output you don't want
8174                          * to override.
8175                          */
8176                         rack->r_early = 0;
8177                         rack->r_ctl.rc_agg_early = 0;
8178                         rack->r_timer_override = 1;
8179                 }
8180         }
8181 }
8182
8183 static void
8184 rack_strike_dupack(struct tcp_rack *rack)
8185 {
8186         struct rack_sendmap *rsm;
8187
8188         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
8189         if (rsm && (rsm->r_dupack < 0xff)) {
8190                 rsm->r_dupack++;
8191                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
8192                         rack->r_wanted_output = 1;
8193                         rack->r_timer_override = 1;
8194                         rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
8195                 } else {
8196                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
8197                 }
8198         }
8199 }
8200
8201 static void
8202 rack_check_bottom_drag(struct tcpcb *tp,
8203                        struct tcp_rack *rack,
8204                        struct socket *so, int32_t acked)
8205 {
8206         uint32_t segsiz, minseg;
8207
8208         segsiz = ctf_fixed_maxseg(tp);
8209         if (so->so_snd.sb_flags & SB_TLS_IFNET) {
8210                 minseg = rack->r_ctl.rc_pace_min_segs;
8211         } else {
8212                 minseg = segsiz;
8213         }
8214         if (tp->snd_max == tp->snd_una) {
8215                 /*
8216                  * We are doing dynamic pacing and we are way
8217                  * under. Basically everything got acked while
8218                  * we were still waiting on the pacer to expire.
8219                  *
8220                  * This means we need to boost the b/w in
8221                  * addition to any earlier boosting of
8222                  * the multipler.
8223                  */
8224                 rack->rc_dragged_bottom = 1;
8225                 rack_validate_multipliers_at_or_above100(rack);
8226                 /*
8227                  * Lets use the segment bytes acked plus
8228                  * the lowest RTT seen as the basis to
8229                  * form a b/w estimate. This will be off
8230                  * due to the fact that the true estimate
8231                  * should be around 1/2 the time of the RTT
8232                  * but we can settle for that.
8233                  */
8234                 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
8235                     acked) {
8236                         uint64_t bw, calc_bw, rtt;
8237
8238                         rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8239                         bw = acked;
8240                         calc_bw = bw * 1000000;
8241                         calc_bw /= rtt;
8242                         if (rack->r_ctl.last_max_bw &&
8243                             (rack->r_ctl.last_max_bw < calc_bw)) {
8244                                 /*
8245                                  * If we have a last calculated max bw
8246                                  * enforce it.
8247                                  */
8248                                 calc_bw = rack->r_ctl.last_max_bw;
8249                         }
8250                         /* now plop it in */
8251                         if (rack->rc_gp_filled == 0) {
8252                                 if (calc_bw > ONE_POINT_TWO_MEG) {
8253                                         /*
8254                                          * If we have no measurement
8255                                          * don't let us set in more than
8256                                          * 1.2Mbps. If we are still too
8257                                          * low after pacing with this we
8258                                          * will hopefully have a max b/w
8259                                          * available to sanity check things.
8260                                          */
8261                                         calc_bw = ONE_POINT_TWO_MEG;
8262                                 }
8263                                 rack->r_ctl.rc_rtt_diff = 0;
8264                                 rack->r_ctl.gp_bw = calc_bw;
8265                                 rack->rc_gp_filled = 1;
8266                                 rack->r_ctl.num_avg = RACK_REQ_AVG;
8267                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
8268                         } else if (calc_bw > rack->r_ctl.gp_bw) {
8269                                 rack->r_ctl.rc_rtt_diff = 0;
8270                                 rack->r_ctl.num_avg = RACK_REQ_AVG;
8271                                 rack->r_ctl.gp_bw = calc_bw;
8272                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
8273                         } else
8274                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
8275                         /*
8276                          * For acks over 1mss we do a extra boost to simulate
8277                          * where we would get 2 acks (we want 110 for the mul).
8278                          */
8279                         if (acked > segsiz)
8280                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
8281                 } else {
8282                         /*
8283                          * Huh, this should not be, settle
8284                          * for just an old increase.
8285                          */
8286                         rack_increase_bw_mul(rack, -1, 0, 0, 1);
8287                 }
8288         } else if ((IN_RECOVERY(tp->t_flags) == 0) &&
8289                    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
8290                                                minseg)) &&
8291                    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
8292                    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
8293                    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
8294                     (segsiz * rack_req_segs))) {
8295                 /*
8296                  * We are doing dynamic GP pacing and
8297                  * we have everything except 1MSS or less
8298                  * bytes left out. We are still pacing away.
8299                  * And there is data that could be sent, This
8300                  * means we are inserting delayed ack time in
8301                  * our measurements because we are pacing too slow.
8302                  */
8303                 rack_validate_multipliers_at_or_above100(rack);
8304                 rack->rc_dragged_bottom = 1;
8305                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
8306         }
8307 }
8308
8309 /*
8310  * A return value of 1 means we do not need to call rack_process_data();
8311  * a return value of 0 means rack_process_data() can be called.
8312  * For ret_val: if it's 0 the TCP is locked, if it's non-zero
8313  * it's unlocked and probably unsafe to touch the TCB.
8314  */
8315 static int
8316 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
8317     struct tcpcb *tp, struct tcpopt *to,
8318     uint32_t tiwin, int32_t tlen,
8319     int32_t * ofia, int32_t thflags, int32_t * ret_val)
8320 {
8321         int32_t ourfinisacked = 0;
8322         int32_t nsegs, acked_amount;
8323         int32_t acked;
8324         struct mbuf *mfree;
8325         struct tcp_rack *rack;
8326         int32_t under_pacing = 0;
8327         int32_t recovery = 0;
8328
8329         rack = (struct tcp_rack *)tp->t_fb_ptr;
8330         if (SEQ_GT(th->th_ack, tp->snd_max)) {
8331                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
8332                 rack->r_wanted_output = 1;
8333                 return (1);
8334         }
8335         if (rack->rc_gp_filled &&
8336             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
8337                 under_pacing = 1;
8338         }
8339         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
8340                 if (rack->rc_in_persist)
8341                         tp->t_rxtshift = 0;
8342                 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
8343                         rack_strike_dupack(rack);
8344                 rack_log_ack(tp, to, th);
8345         }
8346         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
8347                 /*
8348                  * Old ack, behind (or duplicate to) the last one rcv'd
8349                  * Note: Should mark that reordering is occurring! We should also
8350                  * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
8351                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
8352                  * retran and> ack 3
8353                  */
8354                 return (0);
8355         }
8356         /*
8357          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
8358          * something we sent.
8359          */
8360         if (tp->t_flags & TF_NEEDSYN) {
8361                 /*
8362                  * T/TCP: Connection was half-synchronized, and our SYN has
8363                  * been ACK'd (so connection is now fully synchronized).  Go
8364                  * to non-starred state, increment snd_una for ACK of SYN,
8365                  * and check if we can do window scaling.
8366                  */
8367                 tp->t_flags &= ~TF_NEEDSYN;
8368                 tp->snd_una++;
8369                 /* Do window scaling? */
8370                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
8371                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
8372                         tp->rcv_scale = tp->request_r_scale;
8373                         /* Send window already scaled. */
8374                 }
8375         }
8376         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8377         INP_WLOCK_ASSERT(tp->t_inpcb);
8378
8379         acked = BYTES_THIS_ACK(tp, th);
8380         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
8381         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
8382         /*
8383          * If we just performed our first retransmit, and the ACK arrives
8384          * within our recovery window, then it was a mistake to do the
8385          * retransmit in the first place.  Recover our original cwnd and
8386          * ssthresh, and proceed to transmit where we left off.
8387          */
8388         if (tp->t_flags & TF_PREVVALID) {
8389                 tp->t_flags &= ~TF_PREVVALID;
8390                 if (tp->t_rxtshift == 1 &&
8391                     (int)(ticks - tp->t_badrxtwin) < 0)
8392                         rack_cong_signal(tp, th, CC_RTO_ERR);
8393         }
8394         if (acked) {
8395                 /* assure we are not backed off */
8396                 tp->t_rxtshift = 0;
8397                 rack->rc_tlp_in_progress = 0;
8398                 rack->r_ctl.rc_tlp_cnt_out = 0;
8399                 /*
8400                  * If it is the RXT timer we want to
8401                  * stop it, so we can restart a TLP.
8402                  */
8403                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
8404                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
8405 #ifdef NETFLIX_HTTP_LOGGING
8406                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
8407 #endif
8408         }
8409         /*
8410          * If we have a timestamp reply, update smoothed round trip time. If
8411          * no timestamp is present but transmit timer is running and timed
8412          * sequence number was acked, update smoothed round trip time. Since
8413          * we now have an rtt measurement, cancel the timer backoff (cf.,
8414          * Phil Karn's retransmit alg.). Recompute the initial retransmit
8415          * timer.
8416          *
8417          * Some boxes send broken timestamp replies during the SYN+ACK
8418          * phase, ignore timestamps of 0 or we could calculate a huge RTT
8419          * and blow up the retransmit timer.
8420          */
8421         /*
8422          * If all outstanding data is acked, stop retransmit timer and
8423          * remember to restart (more output or persist). If there is more
8424          * data to be acked, restart retransmit timer, using current
8425          * (possibly backed-off) value.
8426          */
8427         if (acked == 0) {
8428                 if (ofia)
8429                         *ofia = ourfinisacked;
8430                 return (0);
8431         }
8432         if (rack->r_ctl.rc_early_recovery) {
8433                 if (IN_RECOVERY(tp->t_flags)) {
8434                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
8435                             (SEQ_LT(th->th_ack, tp->snd_max))) {
8436                                 tcp_rack_partialack(tp, th);
8437                         } else {
8438                                 rack_post_recovery(tp, th);
8439                                 recovery = 1;
8440                         }
8441                 }
8442         }
8443         /*
8444          * Let the congestion control algorithm update congestion control
8445          * related information. This typically means increasing the
8446          * congestion window.
8447          */
8448         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
8449         SOCKBUF_LOCK(&so->so_snd);
8450         acked_amount = min(acked, (int)sbavail(&so->so_snd));
8451         tp->snd_wnd -= acked_amount;
8452         mfree = sbcut_locked(&so->so_snd, acked_amount);
8453         if ((sbused(&so->so_snd) == 0) &&
8454             (acked > acked_amount) &&
8455             (tp->t_state >= TCPS_FIN_WAIT_1) &&
8456             (tp->t_flags & TF_SENTFIN)) {
8457                 /*
8458                  * We must be sure our fin
8459                  * was sent and acked (we can be
8460                  * in FIN_WAIT_1 without having
8461                  * sent the fin).
8462                  */
8463                 ourfinisacked = 1;
8464         }
8465         /* NB: sowwakeup_locked() does an implicit unlock. */
8466         sowwakeup_locked(so);
8467         m_freem(mfree);
8468         if (rack->r_ctl.rc_early_recovery == 0) {
8469                 if (IN_RECOVERY(tp->t_flags)) {
8470                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
8471                             (SEQ_LT(th->th_ack, tp->snd_max))) {
8472                                 tcp_rack_partialack(tp, th);
8473                         } else {
8474                                 rack_post_recovery(tp, th);
8475                         }
8476                 }
8477         }
8478         tp->snd_una = th->th_ack;
8479         if (SEQ_GT(tp->snd_una, tp->snd_recover))
8480                 tp->snd_recover = tp->snd_una;
8481
8482         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
8483                 tp->snd_nxt = tp->snd_una;
8484         }
8485         if (under_pacing &&
8486             (rack->use_fixed_rate == 0) &&
8487             (rack->in_probe_rtt == 0) &&
8488             rack->rc_gp_dyn_mul &&
8489             rack->rc_always_pace) {
8490                 /* Check if we are dragging bottom */
8491                 rack_check_bottom_drag(tp, rack, so, acked);
8492         }
8493         if (tp->snd_una == tp->snd_max) {
8494                 /* Nothing left outstanding */
8495                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
8496                 if (rack->r_ctl.rc_went_idle_time == 0)
8497                         rack->r_ctl.rc_went_idle_time = 1;
8498                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
8499                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
8500                         tp->t_acktime = 0;
8501                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
8502                 /* Set need output so persist might get set */
8503                 rack->r_wanted_output = 1;
8504                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
8505                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
8506                     (sbavail(&so->so_snd) == 0) &&
8507                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
8508                         /*
8509                          * The socket was gone and the
8510                          * peer sent data, time to
8511                          * reset him.
8512                          */
8513                         *ret_val = 1;
8514                         /* tcp_close will kill the inp pre-log the Reset */
8515                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
8516                         tp = tcp_close(tp);
8517                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
8518                         return (1);
8519
8520                 }
8521         }
8522         if (ofia)
8523                 *ofia = ourfinisacked;
8524         return (0);
8525 }
8526
8527 static void
8528 rack_collapsed_window(struct tcp_rack *rack)
8529 {
8530         /*
8531          * Now we must walk the
8532          * send map and divide the
8533          * ones left stranded. These
8534          * guys can't cause us to abort
8535          * the connection and are really
8536          * "unsent". However if a buggy
8537          * client actually did keep some
8538          * of the data i.e. collapsed the win
8539          * and refused to ack and then opened
8540          * the win and acked that data. We would
8541          * get into an ack war, the simplier
8542          * method then of just pretending we
8543          * did not send those segments something
8544          * won't work.
8545          */
8546         struct rack_sendmap *rsm, *nrsm, fe, *insret;
8547         tcp_seq max_seq;
8548
8549         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
8550         memset(&fe, 0, sizeof(fe));
8551         fe.r_start = max_seq;
8552         /* Find the first seq past or at maxseq */
8553         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
8554         if (rsm == NULL) {
8555                 /* Nothing to do strange */
8556                 rack->rc_has_collapsed = 0;
8557                 return;
8558         }
8559         /*
8560          * Now do we need to split at
8561          * the collapse point?
8562          */
8563         if (SEQ_GT(max_seq, rsm->r_start)) {
8564                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8565                 if (nrsm == NULL) {
8566                         /* We can't get a rsm, mark all? */
8567                         nrsm = rsm;
8568                         goto no_split;
8569                 }
8570                 /* Clone it */
8571                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
8572                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8573 #ifdef INVARIANTS
8574                 if (insret != NULL) {
8575                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8576                               nrsm, insret, rack, rsm);
8577                 }
8578 #endif
8579                 if (rsm->r_in_tmap) {
8580                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8581                         nrsm->r_in_tmap = 1;
8582                 }
8583                 /*
8584                  * Set in the new RSM as the
8585                  * collapsed starting point
8586                  */
8587                 rsm = nrsm;
8588         }
8589 no_split:
8590         counter_u64_add(rack_collapsed_win, 1);
8591         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
8592                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
8593                 rack->rc_has_collapsed = 1;
8594         }
8595 }
8596
8597 static void
8598 rack_un_collapse_window(struct tcp_rack *rack)
8599 {
8600         struct rack_sendmap *rsm;
8601
8602         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
8603                 if (rsm->r_flags & RACK_RWND_COLLAPSED)
8604                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
8605                 else
8606                         break;
8607         }
8608         rack->rc_has_collapsed = 0;
8609 }
8610
8611 static void
8612 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
8613                         int32_t tlen, int32_t tfo_syn)
8614 {
8615         if (DELAY_ACK(tp, tlen) || tfo_syn) {
8616                 if (rack->rc_dack_mode &&
8617                     (tlen > 500) &&
8618                     (rack->rc_dack_toggle == 1)) {
8619                         goto no_delayed_ack;
8620                 }
8621                 rack_timer_cancel(tp, rack,
8622                                   rack->r_ctl.rc_rcvtime, __LINE__);
8623                 tp->t_flags |= TF_DELACK;
8624         } else {
8625 no_delayed_ack:
8626                 rack->r_wanted_output = 1;
8627                 tp->t_flags |= TF_ACKNOW;
8628                 if (rack->rc_dack_mode) {
8629                         if (tp->t_flags & TF_DELACK)
8630                                 rack->rc_dack_toggle = 1;
8631                         else
8632                                 rack->rc_dack_toggle = 0;
8633                 }
8634         }
8635 }
8636 /*
8637  * Return value of 1, the TCB is unlocked and most
8638  * likely gone, return value of 0, the TCP is still
8639  * locked.
8640  */
8641 static int
8642 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
8643     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
8644     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
8645 {
8646         /*
8647          * Update window information. Don't look at window if no ACK: TAC's
8648          * send garbage on first SYN.
8649          */
8650         int32_t nsegs;
8651         int32_t tfo_syn;
8652         struct tcp_rack *rack;
8653
8654         rack = (struct tcp_rack *)tp->t_fb_ptr;
8655         INP_WLOCK_ASSERT(tp->t_inpcb);
8656         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8657         if ((thflags & TH_ACK) &&
8658             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
8659             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
8660             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
8661                 /* keep track of pure window updates */
8662                 if (tlen == 0 &&
8663                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
8664                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
8665                 tp->snd_wnd = tiwin;
8666                 tp->snd_wl1 = th->th_seq;
8667                 tp->snd_wl2 = th->th_ack;
8668                 if (tp->snd_wnd > tp->max_sndwnd)
8669                         tp->max_sndwnd = tp->snd_wnd;
8670                 rack->r_wanted_output = 1;
8671         } else if (thflags & TH_ACK) {
8672                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
8673                         tp->snd_wnd = tiwin;
8674                         tp->snd_wl1 = th->th_seq;
8675                         tp->snd_wl2 = th->th_ack;
8676                 }
8677         }
8678         if (tp->snd_wnd < ctf_outstanding(tp))
8679                 /* The peer collapsed the window */
8680                 rack_collapsed_window(rack);
8681         else if (rack->rc_has_collapsed)
8682                 rack_un_collapse_window(rack);
8683         /* Was persist timer active and now we have window space? */
8684         if ((rack->rc_in_persist != 0) &&
8685             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
8686                                 rack->r_ctl.rc_pace_min_segs))) {
8687                 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
8688                 tp->snd_nxt = tp->snd_max;
8689                 /* Make sure we output to start the timer */
8690                 rack->r_wanted_output = 1;
8691         }
8692         /* Do we enter persists? */
8693         if ((rack->rc_in_persist == 0) &&
8694             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8695             TCPS_HAVEESTABLISHED(tp->t_state) &&
8696             (tp->snd_max == tp->snd_una) &&
8697             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8698             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
8699                 /*
8700                  * Here the rwnd is less than
8701                  * the pacing size, we are established,
8702                  * nothing is outstanding, and there is
8703                  * data to send. Enter persists.
8704                  */
8705                 tp->snd_nxt = tp->snd_una;
8706                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
8707         }
8708         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
8709                 m_freem(m);
8710                 return (0);
8711         }
8712         /*
8713          * don't process the URG bit, ignore them drag
8714          * along the up.
8715          */
8716         tp->rcv_up = tp->rcv_nxt;
8717         INP_WLOCK_ASSERT(tp->t_inpcb);
8718
8719         /*
8720          * Process the segment text, merging it into the TCP sequencing
8721          * queue, and arranging for acknowledgment of receipt if necessary.
8722          * This process logically involves adjusting tp->rcv_wnd as data is
8723          * presented to the user (this happens in tcp_usrreq.c, case
8724          * PRU_RCVD).  If a FIN has already been received on this connection
8725          * then we just ignore the text.
8726          */
8727         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
8728                    IS_FASTOPEN(tp->t_flags));
8729         if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
8730             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8731                 tcp_seq save_start = th->th_seq;
8732                 tcp_seq save_rnxt  = tp->rcv_nxt;
8733                 int     save_tlen  = tlen;
8734
8735                 m_adj(m, drop_hdrlen);  /* delayed header drop */
8736                 /*
8737                  * Insert segment which includes th into TCP reassembly
8738                  * queue with control block tp.  Set thflags to whether
8739                  * reassembly now includes a segment with FIN.  This handles
8740                  * the common case inline (segment is the next to be
8741                  * received on an established connection, and the queue is
8742                  * empty), avoiding linkage into and removal from the queue
8743                  * and repetition of various conversions. Set DELACK for
8744                  * segments received in order, but ack immediately when
8745                  * segments are out of order (so fast retransmit can work).
8746                  */
8747                 if (th->th_seq == tp->rcv_nxt &&
8748                     SEGQ_EMPTY(tp) &&
8749                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
8750                     tfo_syn)) {
8751 #ifdef NETFLIX_SB_LIMITS
8752                         u_int mcnt, appended;
8753
8754                         if (so->so_rcv.sb_shlim) {
8755                                 mcnt = m_memcnt(m);
8756                                 appended = 0;
8757                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8758                                     CFO_NOSLEEP, NULL) == false) {
8759                                         counter_u64_add(tcp_sb_shlim_fails, 1);
8760                                         m_freem(m);
8761                                         return (0);
8762                                 }
8763                         }
8764 #endif
8765                         rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
8766                         tp->rcv_nxt += tlen;
8767                         if (tlen &&
8768                             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
8769                             (tp->t_fbyte_in == 0)) {
8770                                 tp->t_fbyte_in = ticks;
8771                                 if (tp->t_fbyte_in == 0)
8772                                         tp->t_fbyte_in = 1;
8773                                 if (tp->t_fbyte_out && tp->t_fbyte_in)
8774                                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
8775                         }
8776                         thflags = th->th_flags & TH_FIN;
8777                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
8778                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
8779                         SOCKBUF_LOCK(&so->so_rcv);
8780                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
8781                                 m_freem(m);
8782                         } else
8783 #ifdef NETFLIX_SB_LIMITS
8784                                 appended =
8785 #endif
8786                                         sbappendstream_locked(&so->so_rcv, m, 0);
8787                         /* NB: sorwakeup_locked() does an implicit unlock. */
8788                         sorwakeup_locked(so);
8789 #ifdef NETFLIX_SB_LIMITS
8790                         if (so->so_rcv.sb_shlim && appended != mcnt)
8791                                 counter_fo_release(so->so_rcv.sb_shlim,
8792                                     mcnt - appended);
8793 #endif
8794                 } else {
8795                         /*
8796                          * XXX: Due to the header drop above "th" is
8797                          * theoretically invalid by now.  Fortunately
8798                          * m_adj() doesn't actually frees any mbufs when
8799                          * trimming from the head.
8800                          */
8801                         tcp_seq temp = save_start;
8802                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
8803                         tp->t_flags |= TF_ACKNOW;
8804                 }
8805                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
8806                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
8807                                 /*
8808                                  * DSACK actually handled in the fastpath
8809                                  * above.
8810                                  */
8811                                 RACK_OPTS_INC(tcp_sack_path_1);
8812                                 tcp_update_sack_list(tp, save_start,
8813                                     save_start + save_tlen);
8814                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
8815                                 if ((tp->rcv_numsacks >= 1) &&
8816                                     (tp->sackblks[0].end == save_start)) {
8817                                         /*
8818                                          * Partial overlap, recorded at todrop
8819                                          * above.
8820                                          */
8821                                         RACK_OPTS_INC(tcp_sack_path_2a);
8822                                         tcp_update_sack_list(tp,
8823                                             tp->sackblks[0].start,
8824                                             tp->sackblks[0].end);
8825                                 } else {
8826                                         RACK_OPTS_INC(tcp_sack_path_2b);
8827                                         tcp_update_dsack_list(tp, save_start,
8828                                             save_start + save_tlen);
8829                                 }
8830                         } else if (tlen >= save_tlen) {
8831                                 /* Update of sackblks. */
8832                                 RACK_OPTS_INC(tcp_sack_path_3);
8833                                 tcp_update_dsack_list(tp, save_start,
8834                                     save_start + save_tlen);
8835                         } else if (tlen > 0) {
8836                                 RACK_OPTS_INC(tcp_sack_path_4);
8837                                 tcp_update_dsack_list(tp, save_start,
8838                                     save_start + tlen);
8839                         }
8840                 }
8841         } else {
8842                 m_freem(m);
8843                 thflags &= ~TH_FIN;
8844         }
8845
8846         /*
8847          * If FIN is received ACK the FIN and let the user know that the
8848          * connection is closing.
8849          */
8850         if (thflags & TH_FIN) {
8851                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
8852                         socantrcvmore(so);
8853                         /*
8854                          * If connection is half-synchronized (ie NEEDSYN
8855                          * flag on) then delay ACK, so it may be piggybacked
8856                          * when SYN is sent. Otherwise, since we received a
8857                          * FIN then no more input can be expected, send ACK
8858                          * now.
8859                          */
8860                         if (tp->t_flags & TF_NEEDSYN) {
8861                                 rack_timer_cancel(tp, rack,
8862                                     rack->r_ctl.rc_rcvtime, __LINE__);
8863                                 tp->t_flags |= TF_DELACK;
8864                         } else {
8865                                 tp->t_flags |= TF_ACKNOW;
8866                         }
8867                         tp->rcv_nxt++;
8868                 }
8869                 switch (tp->t_state) {
8870
8871                         /*
8872                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
8873                          * CLOSE_WAIT state.
8874                          */
8875                 case TCPS_SYN_RECEIVED:
8876                         tp->t_starttime = ticks;
8877                         /* FALLTHROUGH */
8878                 case TCPS_ESTABLISHED:
8879                         rack_timer_cancel(tp, rack,
8880                             rack->r_ctl.rc_rcvtime, __LINE__);
8881                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
8882                         break;
8883
8884                         /*
8885                          * If still in FIN_WAIT_1 STATE FIN has not been
8886                          * acked so enter the CLOSING state.
8887                          */
8888                 case TCPS_FIN_WAIT_1:
8889                         rack_timer_cancel(tp, rack,
8890                             rack->r_ctl.rc_rcvtime, __LINE__);
8891                         tcp_state_change(tp, TCPS_CLOSING);
8892                         break;
8893
8894                         /*
8895                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
8896                          * starting the time-wait timer, turning off the
8897                          * other standard timers.
8898                          */
8899                 case TCPS_FIN_WAIT_2:
8900                         rack_timer_cancel(tp, rack,
8901                             rack->r_ctl.rc_rcvtime, __LINE__);
8902                         tcp_twstart(tp);
8903                         return (1);
8904                 }
8905         }
8906         /*
8907          * Return any desired output.
8908          */
8909         if ((tp->t_flags & TF_ACKNOW) ||
8910             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
8911                 rack->r_wanted_output = 1;
8912         }
8913         INP_WLOCK_ASSERT(tp->t_inpcb);
8914         return (0);
8915 }
8916
8917 /*
8918  * Here nothing is really faster, its just that we
8919  * have broken out the fast-data path also just like
8920  * the fast-ack.
8921  */
8922 static int
8923 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
8924     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
8925     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
8926 {
8927         int32_t nsegs;
8928         int32_t newsize = 0;    /* automatic sockbuf scaling */
8929         struct tcp_rack *rack;
8930 #ifdef NETFLIX_SB_LIMITS
8931         u_int mcnt, appended;
8932 #endif
8933 #ifdef TCPDEBUG
8934         /*
8935          * The size of tcp_saveipgen must be the size of the max ip header,
8936          * now IPv6.
8937          */
8938         u_char tcp_saveipgen[IP6_HDR_LEN];
8939         struct tcphdr tcp_savetcp;
8940         short ostate = 0;
8941
8942 #endif
8943         /*
8944          * If last ACK falls within this segment's sequence numbers, record
8945          * the timestamp. NOTE that the test is modified according to the
8946          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
8947          */
8948         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
8949                 return (0);
8950         }
8951         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
8952                 return (0);
8953         }
8954         if (tiwin && tiwin != tp->snd_wnd) {
8955                 return (0);
8956         }
8957         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
8958                 return (0);
8959         }
8960         if (__predict_false((to->to_flags & TOF_TS) &&
8961             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
8962                 return (0);
8963         }
8964         if (__predict_false((th->th_ack != tp->snd_una))) {
8965                 return (0);
8966         }
8967         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
8968                 return (0);
8969         }
8970         if ((to->to_flags & TOF_TS) != 0 &&
8971             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
8972                 tp->ts_recent_age = tcp_ts_getticks();
8973                 tp->ts_recent = to->to_tsval;
8974         }
8975         rack = (struct tcp_rack *)tp->t_fb_ptr;
8976         /*
8977          * This is a pure, in-sequence data packet with nothing on the
8978          * reassembly queue and we have enough buffer space to take it.
8979          */
8980         nsegs = max(1, m->m_pkthdr.lro_nsegs);
8981
8982 #ifdef NETFLIX_SB_LIMITS
8983         if (so->so_rcv.sb_shlim) {
8984                 mcnt = m_memcnt(m);
8985                 appended = 0;
8986                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
8987                     CFO_NOSLEEP, NULL) == false) {
8988                         counter_u64_add(tcp_sb_shlim_fails, 1);
8989                         m_freem(m);
8990                         return (1);
8991                 }
8992         }
8993 #endif
8994         /* Clean receiver SACK report if present */
8995         if (tp->rcv_numsacks)
8996                 tcp_clean_sackreport(tp);
8997         KMOD_TCPSTAT_INC(tcps_preddat);
8998         tp->rcv_nxt += tlen;
8999         if (tlen &&
9000             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
9001             (tp->t_fbyte_in == 0)) {
9002                 tp->t_fbyte_in = ticks;
9003                 if (tp->t_fbyte_in == 0)
9004                         tp->t_fbyte_in = 1;
9005                 if (tp->t_fbyte_out && tp->t_fbyte_in)
9006                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
9007         }
9008         /*
9009          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
9010          */
9011         tp->snd_wl1 = th->th_seq;
9012         /*
9013          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
9014          */
9015         tp->rcv_up = tp->rcv_nxt;
9016         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
9017         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
9018 #ifdef TCPDEBUG
9019         if (so->so_options & SO_DEBUG)
9020                 tcp_trace(TA_INPUT, ostate, tp,
9021                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
9022 #endif
9023         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
9024
9025         /* Add data to socket buffer. */
9026         SOCKBUF_LOCK(&so->so_rcv);
9027         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9028                 m_freem(m);
9029         } else {
9030                 /*
9031                  * Set new socket buffer size. Give up when limit is
9032                  * reached.
9033                  */
9034                 if (newsize)
9035                         if (!sbreserve_locked(&so->so_rcv,
9036                             newsize, so, NULL))
9037                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
9038                 m_adj(m, drop_hdrlen);  /* delayed header drop */
9039 #ifdef NETFLIX_SB_LIMITS
9040                 appended =
9041 #endif
9042                         sbappendstream_locked(&so->so_rcv, m, 0);
9043                 ctf_calc_rwin(so, tp);
9044         }
9045         /* NB: sorwakeup_locked() does an implicit unlock. */
9046         sorwakeup_locked(so);
9047 #ifdef NETFLIX_SB_LIMITS
9048         if (so->so_rcv.sb_shlim && mcnt != appended)
9049                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
9050 #endif
9051         rack_handle_delayed_ack(tp, rack, tlen, 0);
9052         if (tp->snd_una == tp->snd_max)
9053                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
9054         return (1);
9055 }
9056
9057 /*
9058  * This subfunction is used to try to highly optimize the
9059  * fast path. We again allow window updates that are
9060  * in sequence to remain in the fast-path. We also add
9061  * in the __predict's to attempt to help the compiler.
9062  * Note that if we return a 0, then we can *not* process
9063  * it and the caller should push the packet into the
9064  * slow-path.
9065  */
9066 static int
9067 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
9068     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9069     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
9070 {
9071         int32_t acked;
9072         int32_t nsegs;
9073 #ifdef TCPDEBUG
9074         /*
9075          * The size of tcp_saveipgen must be the size of the max ip header,
9076          * now IPv6.
9077          */
9078         u_char tcp_saveipgen[IP6_HDR_LEN];
9079         struct tcphdr tcp_savetcp;
9080         short ostate = 0;
9081 #endif
9082         int32_t under_pacing = 0;
9083         struct tcp_rack *rack;
9084
9085         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
9086                 /* Old ack, behind (or duplicate to) the last one rcv'd */
9087                 return (0);
9088         }
9089         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
9090                 /* Above what we have sent? */
9091                 return (0);
9092         }
9093         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
9094                 /* We are retransmitting */
9095                 return (0);
9096         }
9097         if (__predict_false(tiwin == 0)) {
9098                 /* zero window */
9099                 return (0);
9100         }
9101         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
9102                 /* We need a SYN or a FIN, unlikely.. */
9103                 return (0);
9104         }
9105         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
9106                 /* Timestamp is behind .. old ack with seq wrap? */
9107                 return (0);
9108         }
9109         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
9110                 /* Still recovering */
9111                 return (0);
9112         }
9113         rack = (struct tcp_rack *)tp->t_fb_ptr;
9114         if (rack->r_ctl.rc_sacked) {
9115                 /* We have sack holes on our scoreboard */
9116                 return (0);
9117         }
9118         /* Ok if we reach here, we can process a fast-ack */
9119         if (rack->rc_gp_filled &&
9120             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
9121                 under_pacing = 1;
9122         }
9123         nsegs = max(1, m->m_pkthdr.lro_nsegs);
9124         rack_log_ack(tp, to, th);
9125         /* Did the window get updated? */
9126         if (tiwin != tp->snd_wnd) {
9127                 tp->snd_wnd = tiwin;
9128                 tp->snd_wl1 = th->th_seq;
9129                 if (tp->snd_wnd > tp->max_sndwnd)
9130                         tp->max_sndwnd = tp->snd_wnd;
9131         }
9132         /* Do we exit persists? */
9133         if ((rack->rc_in_persist != 0) &&
9134             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
9135                                rack->r_ctl.rc_pace_min_segs))) {
9136                 rack_exit_persist(tp, rack, cts);
9137         }
9138         /* Do we enter persists? */
9139         if ((rack->rc_in_persist == 0) &&
9140             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
9141             TCPS_HAVEESTABLISHED(tp->t_state) &&
9142             (tp->snd_max == tp->snd_una) &&
9143             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
9144             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
9145                 /*
9146                  * Here the rwnd is less than
9147                  * the pacing size, we are established,
9148                  * nothing is outstanding, and there is
9149                  * data to send. Enter persists.
9150                  */
9151                 tp->snd_nxt = tp->snd_una;
9152                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
9153         }
9154         /*
9155          * If last ACK falls within this segment's sequence numbers, record
9156          * the timestamp. NOTE that the test is modified according to the
9157          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
9158          */
9159         if ((to->to_flags & TOF_TS) != 0 &&
9160             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
9161                 tp->ts_recent_age = tcp_ts_getticks();
9162                 tp->ts_recent = to->to_tsval;
9163         }
9164         /*
9165          * This is a pure ack for outstanding data.
9166          */
9167         KMOD_TCPSTAT_INC(tcps_predack);
9168
9169         /*
9170          * "bad retransmit" recovery.
9171          */
9172         if (tp->t_flags & TF_PREVVALID) {
9173                 tp->t_flags &= ~TF_PREVVALID;
9174                 if (tp->t_rxtshift == 1 &&
9175                     (int)(ticks - tp->t_badrxtwin) < 0)
9176                         rack_cong_signal(tp, th, CC_RTO_ERR);
9177         }
9178         /*
9179          * Recalculate the transmit timer / rtt.
9180          *
9181          * Some boxes send broken timestamp replies during the SYN+ACK
9182          * phase, ignore timestamps of 0 or we could calculate a huge RTT
9183          * and blow up the retransmit timer.
9184          */
9185         acked = BYTES_THIS_ACK(tp, th);
9186
9187 #ifdef TCP_HHOOK
9188         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
9189         hhook_run_tcp_est_in(tp, th, to);
9190 #endif
9191
9192         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
9193         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
9194         sbdrop(&so->so_snd, acked);
9195         if (acked) {
9196                 /* assure we are not backed off */
9197                 tp->t_rxtshift = 0;
9198                 rack->rc_tlp_in_progress = 0;
9199                 rack->r_ctl.rc_tlp_cnt_out = 0;
9200                 /*
9201                  * If it is the RXT timer we want to
9202                  * stop it, so we can restart a TLP.
9203                  */
9204                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
9205                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
9206 #ifdef NETFLIX_HTTP_LOGGING
9207                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
9208 #endif
9209         }
9210         /*
9211          * Let the congestion control algorithm update congestion control
9212          * related information. This typically means increasing the
9213          * congestion window.
9214          */
9215         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
9216
9217         tp->snd_una = th->th_ack;
9218         if (tp->snd_wnd < ctf_outstanding(tp)) {
9219                 /* The peer collapsed the window */
9220                 rack_collapsed_window(rack);
9221         } else if (rack->rc_has_collapsed)
9222                 rack_un_collapse_window(rack);
9223
9224         /*
9225          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
9226          */
9227         tp->snd_wl2 = th->th_ack;
9228         tp->t_dupacks = 0;
9229         m_freem(m);
9230         /* ND6_HINT(tp);         *//* Some progress has been made. */
9231
9232         /*
9233          * If all outstanding data are acked, stop retransmit timer,
9234          * otherwise restart timer using current (possibly backed-off)
9235          * value. If process is waiting for space, wakeup/selwakeup/signal.
9236          * If data are ready to send, let tcp_output decide between more
9237          * output or persist.
9238          */
9239 #ifdef TCPDEBUG
9240         if (so->so_options & SO_DEBUG)
9241                 tcp_trace(TA_INPUT, ostate, tp,
9242                     (void *)tcp_saveipgen,
9243                     &tcp_savetcp, 0);
9244 #endif
9245         if (under_pacing &&
9246             (rack->use_fixed_rate == 0) &&
9247             (rack->in_probe_rtt == 0) &&
9248             rack->rc_gp_dyn_mul &&
9249             rack->rc_always_pace) {
9250                 /* Check if we are dragging bottom */
9251                 rack_check_bottom_drag(tp, rack, so, acked);
9252         }
9253         if (tp->snd_una == tp->snd_max) {
9254                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
9255                 if (rack->r_ctl.rc_went_idle_time == 0)
9256                         rack->r_ctl.rc_went_idle_time = 1;
9257                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
9258                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
9259                         tp->t_acktime = 0;
9260                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
9261         }
9262         /* Wake up the socket if we have room to write more */
9263         sowwakeup(so);
9264         if (sbavail(&so->so_snd)) {
9265                 rack->r_wanted_output = 1;
9266         }
9267         return (1);
9268 }
9269
9270 /*
9271  * Return value of 1, the TCB is unlocked and most
9272  * likely gone, return value of 0, the TCP is still
9273  * locked.
9274  */
9275 static int
9276 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
9277     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9278     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9279 {
9280         int32_t ret_val = 0;
9281         int32_t todrop;
9282         int32_t ourfinisacked = 0;
9283         struct tcp_rack *rack;
9284
9285         ctf_calc_rwin(so, tp);
9286         /*
9287          * If the state is SYN_SENT: if seg contains an ACK, but not for our
9288          * SYN, drop the input. if seg contains a RST, then drop the
9289          * connection. if seg does not contain SYN, then drop it. Otherwise
9290          * this is an acceptable SYN segment initialize tp->rcv_nxt and
9291          * tp->irs if seg contains ack then advance tp->snd_una if seg
9292          * contains an ECE and ECN support is enabled, the stream is ECN
9293          * capable. if SYN has been acked change to ESTABLISHED else
9294          * SYN_RCVD state arrange for segment to be acked (eventually)
9295          * continue processing rest of data/controls.
9296          */
9297         if ((thflags & TH_ACK) &&
9298             (SEQ_LEQ(th->th_ack, tp->iss) ||
9299             SEQ_GT(th->th_ack, tp->snd_max))) {
9300                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9301                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9302                 return (1);
9303         }
9304         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
9305                 TCP_PROBE5(connect__refused, NULL, tp,
9306                     mtod(m, const char *), tp, th);
9307                 tp = tcp_drop(tp, ECONNREFUSED);
9308                 ctf_do_drop(m, tp);
9309                 return (1);
9310         }
9311         if (thflags & TH_RST) {
9312                 ctf_do_drop(m, tp);
9313                 return (1);
9314         }
9315         if (!(thflags & TH_SYN)) {
9316                 ctf_do_drop(m, tp);
9317                 return (1);
9318         }
9319         tp->irs = th->th_seq;
9320         tcp_rcvseqinit(tp);
9321         rack = (struct tcp_rack *)tp->t_fb_ptr;
9322         if (thflags & TH_ACK) {
9323                 int tfo_partial = 0;
9324
9325                 KMOD_TCPSTAT_INC(tcps_connects);
9326                 soisconnected(so);
9327 #ifdef MAC
9328                 mac_socketpeer_set_from_mbuf(m, so);
9329 #endif
9330                 /* Do window scaling on this connection? */
9331                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
9332                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
9333                         tp->rcv_scale = tp->request_r_scale;
9334                 }
9335                 tp->rcv_adv += min(tp->rcv_wnd,
9336                     TCP_MAXWIN << tp->rcv_scale);
9337                 /*
9338                  * If not all the data that was sent in the TFO SYN
9339                  * has been acked, resend the remainder right away.
9340                  */
9341                 if (IS_FASTOPEN(tp->t_flags) &&
9342                     (tp->snd_una != tp->snd_max)) {
9343                         tp->snd_nxt = th->th_ack;
9344                         tfo_partial = 1;
9345                 }
9346                 /*
9347                  * If there's data, delay ACK; if there's also a FIN ACKNOW
9348                  * will be turned on later.
9349                  */
9350                 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
9351                         rack_timer_cancel(tp, rack,
9352                                           rack->r_ctl.rc_rcvtime, __LINE__);
9353                         tp->t_flags |= TF_DELACK;
9354                 } else {
9355                         rack->r_wanted_output = 1;
9356                         tp->t_flags |= TF_ACKNOW;
9357                         rack->rc_dack_toggle = 0;
9358                 }
9359                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
9360                     (V_tcp_do_ecn == 1)) {
9361                         tp->t_flags2 |= TF2_ECN_PERMIT;
9362                         KMOD_TCPSTAT_INC(tcps_ecn_shs);
9363                 }
9364                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
9365                         /*
9366                          * We advance snd_una for the
9367                          * fast open case. If th_ack is
9368                          * acknowledging data beyond
9369                          * snd_una we can't just call
9370                          * ack-processing since the
9371                          * data stream in our send-map
9372                          * will start at snd_una + 1 (one
9373                          * beyond the SYN). If its just
9374                          * equal we don't need to do that
9375                          * and there is no send_map.
9376                          */
9377                         tp->snd_una++;
9378                 }
9379                 /*
9380                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
9381                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
9382                  */
9383                 tp->t_starttime = ticks;
9384                 if (tp->t_flags & TF_NEEDFIN) {
9385                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
9386                         tp->t_flags &= ~TF_NEEDFIN;
9387                         thflags &= ~TH_SYN;
9388                 } else {
9389                         tcp_state_change(tp, TCPS_ESTABLISHED);
9390                         TCP_PROBE5(connect__established, NULL, tp,
9391                             mtod(m, const char *), tp, th);
9392                         rack_cc_conn_init(tp);
9393                 }
9394         } else {
9395                 /*
9396                  * Received initial SYN in SYN-SENT[*] state => simultaneous
9397                  * open.  If segment contains CC option and there is a
9398                  * cached CC, apply TAO test. If it succeeds, connection is *
9399                  * half-synchronized. Otherwise, do 3-way handshake:
9400                  * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
9401                  * there was no CC option, clear cached CC value.
9402                  */
9403                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
9404                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
9405         }
9406         INP_WLOCK_ASSERT(tp->t_inpcb);
9407         /*
9408          * Advance th->th_seq to correspond to first data byte. If data,
9409          * trim to stay within window, dropping FIN if necessary.
9410          */
9411         th->th_seq++;
9412         if (tlen > tp->rcv_wnd) {
9413                 todrop = tlen - tp->rcv_wnd;
9414                 m_adj(m, -todrop);
9415                 tlen = tp->rcv_wnd;
9416                 thflags &= ~TH_FIN;
9417                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
9418                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
9419         }
9420         tp->snd_wl1 = th->th_seq - 1;
9421         tp->rcv_up = th->th_seq;
9422         /*
9423          * Client side of transaction: already sent SYN and data. If the
9424          * remote host used T/TCP to validate the SYN, our data will be
9425          * ACK'd; if so, enter normal data segment processing in the middle
9426          * of step 5, ack processing. Otherwise, goto step 6.
9427          */
9428         if (thflags & TH_ACK) {
9429                 /* For syn-sent we need to possibly update the rtt */
9430                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
9431                         uint32_t t;
9432
9433                         t = tcp_ts_getticks() - to->to_tsecr;
9434                         if (!tp->t_rttlow || tp->t_rttlow > t)
9435                                 tp->t_rttlow = t;
9436                         tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2);
9437                         tcp_rack_xmit_timer_commit(rack, tp);
9438                 }
9439                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
9440                         return (ret_val);
9441                 /* We may have changed to FIN_WAIT_1 above */
9442                 if (tp->t_state == TCPS_FIN_WAIT_1) {
9443                         /*
9444                          * In FIN_WAIT_1 STATE in addition to the processing
9445                          * for the ESTABLISHED state if our FIN is now
9446                          * acknowledged then enter FIN_WAIT_2.
9447                          */
9448                         if (ourfinisacked) {
9449                                 /*
9450                                  * If we can't receive any more data, then
9451                                  * closing user can proceed. Starting the
9452                                  * timer is contrary to the specification,
9453                                  * but if we don't get a FIN we'll hang
9454                                  * forever.
9455                                  *
9456                                  * XXXjl: we should release the tp also, and
9457                                  * use a compressed state.
9458                                  */
9459                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9460                                         soisdisconnected(so);
9461                                         tcp_timer_activate(tp, TT_2MSL,
9462                                             (tcp_fast_finwait2_recycle ?
9463                                             tcp_finwait2_timeout :
9464                                             TP_MAXIDLE(tp)));
9465                                 }
9466                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
9467                         }
9468                 }
9469         }
9470         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9471            tiwin, thflags, nxt_pkt));
9472 }
9473
9474 /*
9475  * Return value of 1, the TCB is unlocked and most
9476  * likely gone, return value of 0, the TCP is still
9477  * locked.
9478  */
9479 static int
9480 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
9481     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9482     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9483 {
9484         struct tcp_rack *rack;
9485         int32_t ret_val = 0;
9486         int32_t ourfinisacked = 0;
9487
9488         ctf_calc_rwin(so, tp);
9489         if ((thflags & TH_ACK) &&
9490             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
9491             SEQ_GT(th->th_ack, tp->snd_max))) {
9492                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9493                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9494                 return (1);
9495         }
9496         rack = (struct tcp_rack *)tp->t_fb_ptr;
9497         if (IS_FASTOPEN(tp->t_flags)) {
9498                 /*
9499                  * When a TFO connection is in SYN_RECEIVED, the
9500                  * only valid packets are the initial SYN, a
9501                  * retransmit/copy of the initial SYN (possibly with
9502                  * a subset of the original data), a valid ACK, a
9503                  * FIN, or a RST.
9504                  */
9505                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
9506                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9507                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9508                         return (1);
9509                 } else if (thflags & TH_SYN) {
9510                         /* non-initial SYN is ignored */
9511                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
9512                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
9513                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
9514                                 ctf_do_drop(m, NULL);
9515                                 return (0);
9516                         }
9517                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
9518                         ctf_do_drop(m, NULL);
9519                         return (0);
9520                 }
9521         }
9522         if ((thflags & TH_RST) ||
9523             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9524                 return (ctf_process_rst(m, th, so, tp));
9525         /*
9526          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9527          * it's less than ts_recent, drop it.
9528          */
9529         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9530             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9531                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9532                         return (ret_val);
9533         }
9534         /*
9535          * In the SYN-RECEIVED state, validate that the packet belongs to
9536          * this connection before trimming the data to fit the receive
9537          * window.  Check the sequence number versus IRS since we know the
9538          * sequence numbers haven't wrapped.  This is a partial fix for the
9539          * "LAND" DoS attack.
9540          */
9541         if (SEQ_LT(th->th_seq, tp->irs)) {
9542                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
9543                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9544                 return (1);
9545         }
9546         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9547                 return (ret_val);
9548         }
9549         /*
9550          * If last ACK falls within this segment's sequence numbers, record
9551          * its timestamp. NOTE: 1) That the test incorporates suggestions
9552          * from the latest proposal of the tcplw@cray.com list (Braden
9553          * 1993/04/26). 2) That updating only on newer timestamps interferes
9554          * with our earlier PAWS tests, so this check should be solely
9555          * predicated on the sequence space of this segment. 3) That we
9556          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9557          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9558          * SEG.Len, This modified check allows us to overcome RFC1323's
9559          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9560          * p.869. In such cases, we can still calculate the RTT correctly
9561          * when RCV.NXT == Last.ACK.Sent.
9562          */
9563         if ((to->to_flags & TOF_TS) != 0 &&
9564             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9565             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9566             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9567                 tp->ts_recent_age = tcp_ts_getticks();
9568                 tp->ts_recent = to->to_tsval;
9569         }
9570         tp->snd_wnd = tiwin;
9571         /*
9572          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9573          * is on (half-synchronized state), then queue data for later
9574          * processing; else drop segment and return.
9575          */
9576         if ((thflags & TH_ACK) == 0) {
9577                 if (IS_FASTOPEN(tp->t_flags)) {
9578                         rack_cc_conn_init(tp);
9579                 }
9580                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9581                     tiwin, thflags, nxt_pkt));
9582         }
9583         KMOD_TCPSTAT_INC(tcps_connects);
9584         soisconnected(so);
9585         /* Do window scaling? */
9586         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
9587             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
9588                 tp->rcv_scale = tp->request_r_scale;
9589         }
9590         /*
9591          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
9592          * FIN-WAIT-1
9593          */
9594         tp->t_starttime = ticks;
9595         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
9596                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
9597                 tp->t_tfo_pending = NULL;
9598         }
9599         if (tp->t_flags & TF_NEEDFIN) {
9600                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
9601                 tp->t_flags &= ~TF_NEEDFIN;
9602         } else {
9603                 tcp_state_change(tp, TCPS_ESTABLISHED);
9604                 TCP_PROBE5(accept__established, NULL, tp,
9605                     mtod(m, const char *), tp, th);
9606                 /*
9607                  * TFO connections call cc_conn_init() during SYN
9608                  * processing.  Calling it again here for such connections
9609                  * is not harmless as it would undo the snd_cwnd reduction
9610                  * that occurs when a TFO SYN|ACK is retransmitted.
9611                  */
9612                 if (!IS_FASTOPEN(tp->t_flags))
9613                         rack_cc_conn_init(tp);
9614         }
9615         /*
9616          * Account for the ACK of our SYN prior to
9617          * regular ACK processing below, except for
9618          * simultaneous SYN, which is handled later.
9619          */
9620         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
9621                 tp->snd_una++;
9622         /*
9623          * If segment contains data or ACK, will call tcp_reass() later; if
9624          * not, do so now to pass queued data to user.
9625          */
9626         if (tlen == 0 && (thflags & TH_FIN) == 0)
9627                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
9628                     (struct mbuf *)0);
9629         tp->snd_wl1 = th->th_seq - 1;
9630         /* For syn-recv we need to possibly update the rtt */
9631         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
9632                 uint32_t t;
9633
9634                 t = tcp_ts_getticks() - to->to_tsecr;
9635                 if (!tp->t_rttlow || tp->t_rttlow > t)
9636                         tp->t_rttlow = t;
9637                 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2);
9638                 tcp_rack_xmit_timer_commit(rack, tp);
9639         }
9640         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
9641                 return (ret_val);
9642         }
9643         if (tp->t_state == TCPS_FIN_WAIT_1) {
9644                 /* We could have went to FIN_WAIT_1 (or EST) above */
9645                 /*
9646                  * In FIN_WAIT_1 STATE in addition to the processing for the
9647                  * ESTABLISHED state if our FIN is now acknowledged then
9648                  * enter FIN_WAIT_2.
9649                  */
9650                 if (ourfinisacked) {
9651                         /*
9652                          * If we can't receive any more data, then closing
9653                          * user can proceed. Starting the timer is contrary
9654                          * to the specification, but if we don't get a FIN
9655                          * we'll hang forever.
9656                          *
9657                          * XXXjl: we should release the tp also, and use a
9658                          * compressed state.
9659                          */
9660                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
9661                                 soisdisconnected(so);
9662                                 tcp_timer_activate(tp, TT_2MSL,
9663                                     (tcp_fast_finwait2_recycle ?
9664                                     tcp_finwait2_timeout :
9665                                     TP_MAXIDLE(tp)));
9666                         }
9667                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
9668                 }
9669         }
9670         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9671             tiwin, thflags, nxt_pkt));
9672 }
9673
9674 /*
9675  * Return value of 1, the TCB is unlocked and most
9676  * likely gone, return value of 0, the TCP is still
9677  * locked.
9678  */
9679 static int
9680 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
9681     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9682     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9683 {
9684         int32_t ret_val = 0;
9685         struct tcp_rack *rack;
9686
9687         /*
9688          * Header prediction: check for the two common cases of a
9689          * uni-directional data xfer.  If the packet has no control flags,
9690          * is in-sequence, the window didn't change and we're not
9691          * retransmitting, it's a candidate.  If the length is zero and the
9692          * ack moved forward, we're the sender side of the xfer.  Just free
9693          * the data acked & wake any higher level process that was blocked
9694          * waiting for space.  If the length is non-zero and the ack didn't
9695          * move, we're the receiver side.  If we're getting packets in-order
9696          * (the reassembly queue is empty), add the data toc The socket
9697          * buffer and note that we need a delayed ack. Make sure that the
9698          * hidden state-flags are also off. Since we check for
9699          * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
9700          */
9701         rack = (struct tcp_rack *)tp->t_fb_ptr;
9702         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
9703             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
9704             __predict_true(SEGQ_EMPTY(tp)) &&
9705             __predict_true(th->th_seq == tp->rcv_nxt)) {
9706                 if (tlen == 0) {
9707                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
9708                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
9709                                 return (0);
9710                         }
9711                 } else {
9712                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
9713                             tiwin, nxt_pkt, iptos)) {
9714                                 return (0);
9715                         }
9716                 }
9717         }
9718         ctf_calc_rwin(so, tp);
9719
9720         if ((thflags & TH_RST) ||
9721             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9722                 return (ctf_process_rst(m, th, so, tp));
9723
9724         /*
9725          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9726          * synchronized state.
9727          */
9728         if (thflags & TH_SYN) {
9729                 ctf_challenge_ack(m, th, tp, &ret_val);
9730                 return (ret_val);
9731         }
9732         /*
9733          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9734          * it's less than ts_recent, drop it.
9735          */
9736         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9737             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9738                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9739                         return (ret_val);
9740         }
9741         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9742                 return (ret_val);
9743         }
9744         /*
9745          * If last ACK falls within this segment's sequence numbers, record
9746          * its timestamp. NOTE: 1) That the test incorporates suggestions
9747          * from the latest proposal of the tcplw@cray.com list (Braden
9748          * 1993/04/26). 2) That updating only on newer timestamps interferes
9749          * with our earlier PAWS tests, so this check should be solely
9750          * predicated on the sequence space of this segment. 3) That we
9751          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9752          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9753          * SEG.Len, This modified check allows us to overcome RFC1323's
9754          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9755          * p.869. In such cases, we can still calculate the RTT correctly
9756          * when RCV.NXT == Last.ACK.Sent.
9757          */
9758         if ((to->to_flags & TOF_TS) != 0 &&
9759             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9760             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9761             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9762                 tp->ts_recent_age = tcp_ts_getticks();
9763                 tp->ts_recent = to->to_tsval;
9764         }
9765         /*
9766          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9767          * is on (half-synchronized state), then queue data for later
9768          * processing; else drop segment and return.
9769          */
9770         if ((thflags & TH_ACK) == 0) {
9771                 if (tp->t_flags & TF_NEEDSYN) {
9772
9773                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9774                             tiwin, thflags, nxt_pkt));
9775
9776                 } else if (tp->t_flags & TF_ACKNOW) {
9777                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9778                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1;
9779                         return (ret_val);
9780                 } else {
9781                         ctf_do_drop(m, NULL);
9782                         return (0);
9783                 }
9784         }
9785         /*
9786          * Ack processing.
9787          */
9788         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9789                 return (ret_val);
9790         }
9791         if (sbavail(&so->so_snd)) {
9792                 if (ctf_progress_timeout_check(tp, true)) {
9793                         rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
9794                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
9795                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9796                         return (1);
9797                 }
9798         }
9799         /* State changes only happen in rack_process_data() */
9800         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9801             tiwin, thflags, nxt_pkt));
9802 }
9803
9804 /*
9805  * Return value of 1, the TCB is unlocked and most
9806  * likely gone, return value of 0, the TCP is still
9807  * locked.
9808  */
9809 static int
9810 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
9811     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9812     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9813 {
9814         int32_t ret_val = 0;
9815
9816         ctf_calc_rwin(so, tp);
9817         if ((thflags & TH_RST) ||
9818             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9819                 return (ctf_process_rst(m, th, so, tp));
9820         /*
9821          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9822          * synchronized state.
9823          */
9824         if (thflags & TH_SYN) {
9825                 ctf_challenge_ack(m, th, tp, &ret_val);
9826                 return (ret_val);
9827         }
9828         /*
9829          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9830          * it's less than ts_recent, drop it.
9831          */
9832         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9833             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9834                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9835                         return (ret_val);
9836         }
9837         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9838                 return (ret_val);
9839         }
9840         /*
9841          * If last ACK falls within this segment's sequence numbers, record
9842          * its timestamp. NOTE: 1) That the test incorporates suggestions
9843          * from the latest proposal of the tcplw@cray.com list (Braden
9844          * 1993/04/26). 2) That updating only on newer timestamps interferes
9845          * with our earlier PAWS tests, so this check should be solely
9846          * predicated on the sequence space of this segment. 3) That we
9847          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9848          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9849          * SEG.Len, This modified check allows us to overcome RFC1323's
9850          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9851          * p.869. In such cases, we can still calculate the RTT correctly
9852          * when RCV.NXT == Last.ACK.Sent.
9853          */
9854         if ((to->to_flags & TOF_TS) != 0 &&
9855             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9856             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9857             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9858                 tp->ts_recent_age = tcp_ts_getticks();
9859                 tp->ts_recent = to->to_tsval;
9860         }
9861         /*
9862          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9863          * is on (half-synchronized state), then queue data for later
9864          * processing; else drop segment and return.
9865          */
9866         if ((thflags & TH_ACK) == 0) {
9867                 if (tp->t_flags & TF_NEEDSYN) {
9868                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9869                             tiwin, thflags, nxt_pkt));
9870
9871                 } else if (tp->t_flags & TF_ACKNOW) {
9872                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
9873                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
9874                         return (ret_val);
9875                 } else {
9876                         ctf_do_drop(m, NULL);
9877                         return (0);
9878                 }
9879         }
9880         /*
9881          * Ack processing.
9882          */
9883         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
9884                 return (ret_val);
9885         }
9886         if (sbavail(&so->so_snd)) {
9887                 if (ctf_progress_timeout_check(tp, true)) {
9888                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
9889                                                 tp, tick, PROGRESS_DROP, __LINE__);
9890                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
9891                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
9892                         return (1);
9893                 }
9894         }
9895         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
9896             tiwin, thflags, nxt_pkt));
9897 }
9898
9899 static int
9900 rack_check_data_after_close(struct mbuf *m,
9901     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
9902 {
9903         struct tcp_rack *rack;
9904
9905         rack = (struct tcp_rack *)tp->t_fb_ptr;
9906         if (rack->rc_allow_data_af_clo == 0) {
9907         close_now:
9908                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
9909                 /* tcp_close will kill the inp pre-log the Reset */
9910                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
9911                 tp = tcp_close(tp);
9912                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
9913                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
9914                 return (1);
9915         }
9916         if (sbavail(&so->so_snd) == 0)
9917                 goto close_now;
9918         /* Ok we allow data that is ignored and a followup reset */
9919         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
9920         tp->rcv_nxt = th->th_seq + *tlen;
9921         tp->t_flags2 |= TF2_DROP_AF_DATA;
9922         rack->r_wanted_output = 1;
9923         *tlen = 0;
9924         return (0);
9925 }
9926
9927 /*
9928  * Return value of 1, the TCB is unlocked and most
9929  * likely gone, return value of 0, the TCP is still
9930  * locked.
9931  */
9932 static int
9933 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
9934     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
9935     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
9936 {
9937         int32_t ret_val = 0;
9938         int32_t ourfinisacked = 0;
9939
9940         ctf_calc_rwin(so, tp);
9941
9942         if ((thflags & TH_RST) ||
9943             (tp->t_fin_is_rst && (thflags & TH_FIN)))
9944                 return (ctf_process_rst(m, th, so, tp));
9945         /*
9946          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
9947          * synchronized state.
9948          */
9949         if (thflags & TH_SYN) {
9950                 ctf_challenge_ack(m, th, tp, &ret_val);
9951                 return (ret_val);
9952         }
9953         /*
9954          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
9955          * it's less than ts_recent, drop it.
9956          */
9957         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
9958             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
9959                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
9960                         return (ret_val);
9961         }
9962         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
9963                 return (ret_val);
9964         }
9965         /*
9966          * If new data are received on a connection after the user processes
9967          * are gone, then RST the other end.
9968          */
9969         if ((so->so_state & SS_NOFDREF) && tlen) {
9970                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
9971                         return (1);
9972         }
9973         /*
9974          * If last ACK falls within this segment's sequence numbers, record
9975          * its timestamp. NOTE: 1) That the test incorporates suggestions
9976          * from the latest proposal of the tcplw@cray.com list (Braden
9977          * 1993/04/26). 2) That updating only on newer timestamps interferes
9978          * with our earlier PAWS tests, so this check should be solely
9979          * predicated on the sequence space of this segment. 3) That we
9980          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
9981          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
9982          * SEG.Len, This modified check allows us to overcome RFC1323's
9983          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
9984          * p.869. In such cases, we can still calculate the RTT correctly
9985          * when RCV.NXT == Last.ACK.Sent.
9986          */
9987         if ((to->to_flags & TOF_TS) != 0 &&
9988             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
9989             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
9990             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
9991                 tp->ts_recent_age = tcp_ts_getticks();
9992                 tp->ts_recent = to->to_tsval;
9993         }
9994         /*
9995          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
9996          * is on (half-synchronized state), then queue data for later
9997          * processing; else drop segment and return.
9998          */
9999         if ((thflags & TH_ACK) == 0) {
10000                 if (tp->t_flags & TF_NEEDSYN) {
10001                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10002                             tiwin, thflags, nxt_pkt));
10003                 } else if (tp->t_flags & TF_ACKNOW) {
10004                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10005                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
10006                         return (ret_val);
10007                 } else {
10008                         ctf_do_drop(m, NULL);
10009                         return (0);
10010                 }
10011         }
10012         /*
10013          * Ack processing.
10014          */
10015         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10016                 return (ret_val);
10017         }
10018         if (ourfinisacked) {
10019                 /*
10020                  * If we can't receive any more data, then closing user can
10021                  * proceed. Starting the timer is contrary to the
10022                  * specification, but if we don't get a FIN we'll hang
10023                  * forever.
10024                  *
10025                  * XXXjl: we should release the tp also, and use a
10026                  * compressed state.
10027                  */
10028                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10029                         soisdisconnected(so);
10030                         tcp_timer_activate(tp, TT_2MSL,
10031                             (tcp_fast_finwait2_recycle ?
10032                             tcp_finwait2_timeout :
10033                             TP_MAXIDLE(tp)));
10034                 }
10035                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
10036         }
10037         if (sbavail(&so->so_snd)) {
10038                 if (ctf_progress_timeout_check(tp, true)) {
10039                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
10040                                                 tp, tick, PROGRESS_DROP, __LINE__);
10041                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
10042                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10043                         return (1);
10044                 }
10045         }
10046         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10047             tiwin, thflags, nxt_pkt));
10048 }
10049
10050 /*
10051  * Return value of 1, the TCB is unlocked and most
10052  * likely gone, return value of 0, the TCP is still
10053  * locked.
10054  */
10055 static int
10056 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
10057     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10058     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
10059 {
10060         int32_t ret_val = 0;
10061         int32_t ourfinisacked = 0;
10062
10063         ctf_calc_rwin(so, tp);
10064
10065         if ((thflags & TH_RST) ||
10066             (tp->t_fin_is_rst && (thflags & TH_FIN)))
10067                 return (ctf_process_rst(m, th, so, tp));
10068         /*
10069          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
10070          * synchronized state.
10071          */
10072         if (thflags & TH_SYN) {
10073                 ctf_challenge_ack(m, th, tp, &ret_val);
10074                 return (ret_val);
10075         }
10076         /*
10077          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
10078          * it's less than ts_recent, drop it.
10079          */
10080         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
10081             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
10082                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
10083                         return (ret_val);
10084         }
10085         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
10086                 return (ret_val);
10087         }
10088         /*
10089          * If new data are received on a connection after the user processes
10090          * are gone, then RST the other end.
10091          */
10092         if ((so->so_state & SS_NOFDREF) && tlen) {
10093                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
10094                         return (1);
10095         }
10096         /*
10097          * If last ACK falls within this segment's sequence numbers, record
10098          * its timestamp. NOTE: 1) That the test incorporates suggestions
10099          * from the latest proposal of the tcplw@cray.com list (Braden
10100          * 1993/04/26). 2) That updating only on newer timestamps interferes
10101          * with our earlier PAWS tests, so this check should be solely
10102          * predicated on the sequence space of this segment. 3) That we
10103          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
10104          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
10105          * SEG.Len, This modified check allows us to overcome RFC1323's
10106          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
10107          * p.869. In such cases, we can still calculate the RTT correctly
10108          * when RCV.NXT == Last.ACK.Sent.
10109          */
10110         if ((to->to_flags & TOF_TS) != 0 &&
10111             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
10112             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
10113             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
10114                 tp->ts_recent_age = tcp_ts_getticks();
10115                 tp->ts_recent = to->to_tsval;
10116         }
10117         /*
10118          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
10119          * is on (half-synchronized state), then queue data for later
10120          * processing; else drop segment and return.
10121          */
10122         if ((thflags & TH_ACK) == 0) {
10123                 if (tp->t_flags & TF_NEEDSYN) {
10124                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10125                             tiwin, thflags, nxt_pkt));
10126                 } else if (tp->t_flags & TF_ACKNOW) {
10127                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10128                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1;
10129                         return (ret_val);
10130                 } else {
10131                         ctf_do_drop(m, NULL);
10132                         return (0);
10133                 }
10134         }
10135         /*
10136          * Ack processing.
10137          */
10138         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10139                 return (ret_val);
10140         }
10141         if (ourfinisacked) {
10142                 tcp_twstart(tp);
10143                 m_freem(m);
10144                 return (1);
10145         }
10146         if (sbavail(&so->so_snd)) {
10147                 if (ctf_progress_timeout_check(tp, true)) {
10148                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
10149                                                 tp, tick, PROGRESS_DROP, __LINE__);
10150                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
10151                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10152                         return (1);
10153                 }
10154         }
10155         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10156             tiwin, thflags, nxt_pkt));
10157 }
10158
10159 /*
10160  * Return value of 1, the TCB is unlocked and most
10161  * likely gone, return value of 0, the TCP is still
10162  * locked.
10163  */
10164 static int
10165 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
10166     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10167     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
10168 {
10169         int32_t ret_val = 0;
10170         int32_t ourfinisacked = 0;
10171
10172         ctf_calc_rwin(so, tp);
10173
10174         if ((thflags & TH_RST) ||
10175             (tp->t_fin_is_rst && (thflags & TH_FIN)))
10176                 return (ctf_process_rst(m, th, so, tp));
10177         /*
10178          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
10179          * synchronized state.
10180          */
10181         if (thflags & TH_SYN) {
10182                 ctf_challenge_ack(m, th, tp, &ret_val);
10183                 return (ret_val);
10184         }
10185         /*
10186          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
10187          * it's less than ts_recent, drop it.
10188          */
10189         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
10190             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
10191                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
10192                         return (ret_val);
10193         }
10194         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
10195                 return (ret_val);
10196         }
10197         /*
10198          * If new data are received on a connection after the user processes
10199          * are gone, then RST the other end.
10200          */
10201         if ((so->so_state & SS_NOFDREF) && tlen) {
10202                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
10203                         return (1);
10204         }
10205         /*
10206          * If last ACK falls within this segment's sequence numbers, record
10207          * its timestamp. NOTE: 1) That the test incorporates suggestions
10208          * from the latest proposal of the tcplw@cray.com list (Braden
10209          * 1993/04/26). 2) That updating only on newer timestamps interferes
10210          * with our earlier PAWS tests, so this check should be solely
10211          * predicated on the sequence space of this segment. 3) That we
10212          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
10213          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
10214          * SEG.Len, This modified check allows us to overcome RFC1323's
10215          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
10216          * p.869. In such cases, we can still calculate the RTT correctly
10217          * when RCV.NXT == Last.ACK.Sent.
10218          */
10219         if ((to->to_flags & TOF_TS) != 0 &&
10220             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
10221             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
10222             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
10223                 tp->ts_recent_age = tcp_ts_getticks();
10224                 tp->ts_recent = to->to_tsval;
10225         }
10226         /*
10227          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
10228          * is on (half-synchronized state), then queue data for later
10229          * processing; else drop segment and return.
10230          */
10231         if ((thflags & TH_ACK) == 0) {
10232                 if (tp->t_flags & TF_NEEDSYN) {
10233                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10234                             tiwin, thflags, nxt_pkt));
10235                 } else if (tp->t_flags & TF_ACKNOW) {
10236                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10237                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
10238                         return (ret_val);
10239                 } else {
10240                         ctf_do_drop(m, NULL);
10241                         return (0);
10242                 }
10243         }
10244         /*
10245          * case TCPS_LAST_ACK: Ack processing.
10246          */
10247         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10248                 return (ret_val);
10249         }
10250         if (ourfinisacked) {
10251                 tp = tcp_close(tp);
10252                 ctf_do_drop(m, tp);
10253                 return (1);
10254         }
10255         if (sbavail(&so->so_snd)) {
10256                 if (ctf_progress_timeout_check(tp, true)) {
10257                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
10258                                                 tp, tick, PROGRESS_DROP, __LINE__);
10259                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
10260                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10261                         return (1);
10262                 }
10263         }
10264         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10265             tiwin, thflags, nxt_pkt));
10266 }
10267
10268
10269 /*
10270  * Return value of 1, the TCB is unlocked and most
10271  * likely gone, return value of 0, the TCP is still
10272  * locked.
10273  */
10274 static int
10275 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
10276     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10277     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
10278 {
10279         int32_t ret_val = 0;
10280         int32_t ourfinisacked = 0;
10281
10282         ctf_calc_rwin(so, tp);
10283
10284         /* Reset receive buffer auto scaling when not in bulk receive mode. */
10285         if ((thflags & TH_RST) ||
10286             (tp->t_fin_is_rst && (thflags & TH_FIN)))
10287                 return (ctf_process_rst(m, th, so, tp));
10288         /*
10289          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
10290          * synchronized state.
10291          */
10292         if (thflags & TH_SYN) {
10293                 ctf_challenge_ack(m, th, tp, &ret_val);
10294                 return (ret_val);
10295         }
10296         /*
10297          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
10298          * it's less than ts_recent, drop it.
10299          */
10300         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
10301             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
10302                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
10303                         return (ret_val);
10304         }
10305         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
10306                 return (ret_val);
10307         }
10308         /*
10309          * If new data are received on a connection after the user processes
10310          * are gone, then RST the other end.
10311          */
10312         if ((so->so_state & SS_NOFDREF) &&
10313             tlen) {
10314                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
10315                         return (1);
10316         }
10317         /*
10318          * If last ACK falls within this segment's sequence numbers, record
10319          * its timestamp. NOTE: 1) That the test incorporates suggestions
10320          * from the latest proposal of the tcplw@cray.com list (Braden
10321          * 1993/04/26). 2) That updating only on newer timestamps interferes
10322          * with our earlier PAWS tests, so this check should be solely
10323          * predicated on the sequence space of this segment. 3) That we
10324          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
10325          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
10326          * SEG.Len, This modified check allows us to overcome RFC1323's
10327          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
10328          * p.869. In such cases, we can still calculate the RTT correctly
10329          * when RCV.NXT == Last.ACK.Sent.
10330          */
10331         if ((to->to_flags & TOF_TS) != 0 &&
10332             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
10333             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
10334             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
10335                 tp->ts_recent_age = tcp_ts_getticks();
10336                 tp->ts_recent = to->to_tsval;
10337         }
10338         /*
10339          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
10340          * is on (half-synchronized state), then queue data for later
10341          * processing; else drop segment and return.
10342          */
10343         if ((thflags & TH_ACK) == 0) {
10344                 if (tp->t_flags & TF_NEEDSYN) {
10345                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10346                             tiwin, thflags, nxt_pkt));
10347                 } else if (tp->t_flags & TF_ACKNOW) {
10348                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
10349                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
10350                         return (ret_val);
10351                 } else {
10352                         ctf_do_drop(m, NULL);
10353                         return (0);
10354                 }
10355         }
10356         /*
10357          * Ack processing.
10358          */
10359         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
10360                 return (ret_val);
10361         }
10362         if (sbavail(&so->so_snd)) {
10363                 if (ctf_progress_timeout_check(tp, true)) {
10364                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
10365                                                 tp, tick, PROGRESS_DROP, __LINE__);
10366                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
10367                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10368                         return (1);
10369                 }
10370         }
10371         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
10372             tiwin, thflags, nxt_pkt));
10373 }
10374
10375 static void inline
10376 rack_clear_rate_sample(struct tcp_rack *rack)
10377 {
10378         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
10379         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
10380         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
10381 }
10382
10383 static void
10384 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line)
10385 {
10386         uint64_t bw_est, rate_wanted;
10387         uint32_t tls_seg = 0;
10388         int chged = 0;
10389         uint32_t user_max;
10390
10391         user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
10392 #ifdef KERN_TLS
10393         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
10394                 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd);
10395                 if (tls_seg != rack->r_ctl.rc_pace_min_segs)
10396                         chged = 1;
10397                 rack->r_ctl.rc_pace_min_segs = tls_seg;
10398         } else
10399 #endif
10400         {
10401                 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
10402                         chged = 1;
10403                 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
10404         }
10405         if (rack->use_fixed_rate || rack->rc_force_max_seg) {
10406                 if (user_max != rack->r_ctl.rc_pace_max_segs)
10407                         chged = 1;
10408         }
10409         if (rack->rc_force_max_seg) {
10410                 rack->r_ctl.rc_pace_max_segs = user_max;
10411         } else if (rack->use_fixed_rate) {
10412                 bw_est = rack_get_bw(rack);
10413                 if ((rack->r_ctl.crte == NULL) ||
10414                     (bw_est != rack->r_ctl.crte->rate))  {
10415                         rack->r_ctl.rc_pace_max_segs = user_max;
10416                 } else {
10417                         /* We are pacing right at the hardware rate */
10418                         uint32_t segsiz;
10419
10420                         segsiz = min(ctf_fixed_maxseg(tp),
10421                                      rack->r_ctl.rc_pace_min_segs);
10422                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
10423                                                            bw_est, segsiz, 0,
10424                                                            rack->r_ctl.crte, NULL);
10425                 }
10426         } else if (rack->rc_always_pace) {
10427                 if (rack->r_ctl.gp_bw ||
10428 #ifdef NETFLIX_PEAKRATE
10429                     rack->rc_tp->t_maxpeakrate ||
10430 #endif
10431                     rack->r_ctl.init_rate) {
10432                         /* We have a rate of some sort set */
10433                         uint32_t  orig;
10434
10435                         bw_est = rack_get_bw(rack);
10436                         orig = rack->r_ctl.rc_pace_max_segs;
10437                         rate_wanted = rack_get_output_bw(rack, bw_est, NULL);
10438                         if (rate_wanted) {
10439                                 /* We have something */
10440                                 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
10441                                                                                    rate_wanted,
10442                                                                                    ctf_fixed_maxseg(rack->rc_tp));
10443                         } else
10444                                 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
10445                         if (orig != rack->r_ctl.rc_pace_max_segs)
10446                                 chged = 1;
10447                 } else if ((rack->r_ctl.gp_bw == 0) &&
10448                            (rack->r_ctl.rc_pace_max_segs == 0)) {
10449                         /*
10450                          * If we have nothing limit us to bursting
10451                          * out IW sized pieces.
10452                          */
10453                         chged = 1;
10454                         rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
10455                 }
10456         }
10457         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10458                 chged = 1;
10459                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
10460         }
10461 #ifdef KERN_TLS
10462         uint32_t orig;
10463
10464         if (tls_seg != 0) {
10465                 orig = rack->r_ctl.rc_pace_max_segs;
10466                 if (rack_hw_tls_max_seg > 1) {
10467                         rack->r_ctl.rc_pace_max_segs /= tls_seg;
10468                         if (rack_hw_tls_max_seg > rack->r_ctl.rc_pace_max_segs)
10469                                 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
10470                 } else {
10471                         rack->r_ctl.rc_pace_max_segs = 1;
10472                 }
10473                 if (rack->r_ctl.rc_pace_max_segs == 0)
10474                         rack->r_ctl.rc_pace_max_segs = 1;
10475                 rack->r_ctl.rc_pace_max_segs *= tls_seg;
10476                 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10477                         /* We can't go over the max bytes (usually 64k) */
10478                         rack->r_ctl.rc_pace_max_segs = ((PACE_MAX_IP_BYTES / tls_seg) * tls_seg);
10479                 }
10480                 if (orig != rack->r_ctl.rc_pace_max_segs)
10481                         chged = 1;
10482         }
10483 #endif
10484         if (chged)
10485                 rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2);
10486 }
10487
10488 static int
10489 rack_init(struct tcpcb *tp)
10490 {
              /*
               * Allocate a zeroed struct tcp_rack from rack_pcb_zone, hang it
               * off tp->t_fb_ptr and seed it from the global rack_* variables.
               * If tp already has unacknowledged data (snd_una != snd_max) a
               * single send-map entry covering snd_una..snd_max is created for
               * it.  Finally the timers are restarted through the hpts.
               *
               * Returns 0 on success, or ENOMEM when either the control block
               * or the send-map entry cannot be allocated; in both failure
               * cases tp->t_fb_ptr is NULL on return.
               */
10491         struct tcp_rack *rack = NULL;
10492         struct rack_sendmap *insret;
10493         uint32_t iwin, snt, us_cts;
10494
10495         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
10496         if (tp->t_fb_ptr == NULL) {
10497                 /*
10498                  * We need to allocate memory but can't. The INP and INP_INFO
10499                  * locks are held and they are recursive (this happens during
10500                  * setup), so a scheme to drop the locks fails :(
10501                  *
10502                  */
10503                 return (ENOMEM);
10504         }
10505         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
10506
10507         rack = (struct tcp_rack *)tp->t_fb_ptr;
10508         RB_INIT(&rack->r_ctl.rc_mtree);
10509         TAILQ_INIT(&rack->r_ctl.rc_free);
10510         TAILQ_INIT(&rack->r_ctl.rc_tmap);
10511         rack->rc_tp = tp;
              /*
               * NOTE(review): t_inpcb is tested for NULL here, yet it is
               * dereferenced unconditionally below when inp_flags2 is updated,
               * so it is presumably never NULL at this point -- confirm.
               */
10512         if (tp->t_inpcb) {
10513                 rack->rc_inp = tp->t_inpcb;
10514         }
10515         /* Probably not needed (the block was just zeroed) but let's be sure */
10516         rack_clear_rate_sample(rack);
10517         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
10518         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
10519         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
10520         if (use_rack_rr)
10521                 rack->use_rack_rr = 1;
10522         if (V_tcp_delack_enabled)
10523                 tp->t_delayed_ack = 1;
10524         else
10525                 tp->t_delayed_ack = 0;
10526         if (rack_enable_shared_cwnd)
10527                 rack->rack_enable_scwnd = 1;
10528         rack->rc_user_set_max_segs = rack_hptsi_segments;
10529         rack->rc_force_max_seg = 0;
10530         if (rack_use_imac_dack)
10531                 rack->rc_dack_mode = 1;
10532         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
10533         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
10534         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
10535         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
10536         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
10537         rack->r_ctl.rc_early_recovery = rack_early_recovery;
              /* Lowest starts at the maximum and highest at 0 so any sample replaces them. */
10538         rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
10539         rack->r_ctl.rc_highest_us_rtt = 0;
10540         if (rack_disable_prr)
10541                 rack->rack_no_prr = 1;
10542         if (rack_gp_no_rec_chg)
10543                 rack->rc_gp_no_rec_chg = 1;
10544         rack->rc_always_pace = rack_pace_every_seg;
10545         if (rack_enable_mqueue_for_nonpaced)
10546                 rack->r_mbuf_queue = 1;
10547         else
10548                 rack->r_mbuf_queue = 0;
              /*
               * INP_SUPPORTS_MBUFQ is set on the inpcb only when mbuf queuing
               * or always-pace is enabled; otherwise it is cleared.
               */
10549         if  (rack->r_mbuf_queue || rack->rc_always_pace)
10550                 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
10551         else
10552                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10553         rack_set_pace_segments(tp, rack, __LINE__);
10554         if (rack_limits_scwnd)
10555                 rack->r_limit_scw  = 1;
10556         else
10557                 rack->r_limit_scw  = 0;
10558         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
10559         rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
10560         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
10561         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
10562         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
10563         rack->r_ctl.rc_min_to = rack_min_to;
              /* act_rcv_time is the time base for every timestamp derived below. */
10564         microuptime(&rack->r_ctl.act_rcv_time);
10565         rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
10566         rack->r_running_late = 0;
10567         rack->r_running_early = 0;
10568         rack->rc_init_win = rack_default_init_window;
10569         rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
10570         if (rack_do_dyn_mul) {
10571                 /* When dynamic adjustment is on CA needs to start at 100% */
10572                 rack->rc_gp_dyn_mul = 1;
                      /*
                       * NOTE(review): for a non-zero rack_do_dyn_mul below 100
                       * nothing is assigned here, so rack_per_of_gp_ca keeps the
                       * 0 left by the memset above -- confirm that is intended.
                       */
10573                 if (rack_do_dyn_mul >= 100)
10574                         rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
10575         } else
10576                 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
10577         rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
10578         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
10579         rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
10580         setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
10581                                 rack_probertt_filter_life);
10582         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
10583         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
10584         rack->r_ctl.rc_time_of_last_probertt = us_cts;
10585         rack->r_ctl.rc_time_probertt_starts = 0;
10586         /* Do we force on detection? */
              /*
               * The else below binds to the assignment after the #endif, so
               * without NETFLIX_EXP_DETECTION do_detection is always cleared.
               */
10587 #ifdef NETFLIX_EXP_DETECTION
10588         if (tcp_force_detection)
10589                 rack->do_detection = 1;
10590         else
10591 #endif
10592                 rack->do_detection = 0;
10593         if (rack_non_rxt_use_cr)
10594                 rack->rack_rec_nonrxt_use_cr = 1;
10595         if (tp->snd_una != tp->snd_max) {
10596                 /* Create a send map for the current outstanding data */
10597                 struct rack_sendmap *rsm;
10598
10599                 rsm = rack_alloc(rack);
10600                 if (rsm == NULL) {
                              /*
                               * Give the control block back and detach it from tp.
                               * NOTE(review): t_delayed_ack and inp_flags2 were
                               * already changed above and are not restored here.
                               */
10601                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
10602                         tp->t_fb_ptr = NULL;
10603                         return (ENOMEM);
10604                 }
                      /* One entry spans all of snd_una..snd_max, stamped with "now". */
10605                 rsm->r_flags = RACK_OVERMAX;
10606                 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
10607                 rsm->r_rtr_cnt = 1;
10608                 rsm->r_rtr_bytes = 0;
10609                 rsm->r_start = tp->snd_una;
10610                 rsm->r_end = tp->snd_max;
10611                 rsm->usec_orig_send = us_cts;
10612                 rsm->r_dupack = 0;
10613                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
10614 #ifdef INVARIANTS
10615                 if (insret != NULL) {
10616                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
10617                               insret, rack, rsm);
10618                 }
10619 #endif
10620                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10621                 rsm->r_in_tmap = 1;
10622         }
10623         /* Cancel the GP measurement in progress */
10624         tp->t_flags &= ~TF_GPUTINPROG;
              /* snt is how far snd_max has advanced past iss, or 0 if it has not. */
10625         if (SEQ_GT(tp->snd_max, tp->iss))
10626                 snt = tp->snd_max - tp->iss;
10627         else
10628                 snt = 0;
10629         iwin = rc_init_window(rack);
10630         if (snt < iwin) {
10631                 /* We are not past the initial window
10632                  * so we need to make sure cwnd is
10633                  * correct (at least iwin).
10634                  */
10635                 if (tp->snd_cwnd < iwin)
10636                         tp->snd_cwnd = iwin;
10637                 /*
10638                  * If we are within the initial window
10639                  * we want ssthresh to be unlimited. Setting
10640                  * it to the rwnd (which the default stack does
10641                  * and older racks) is not really a good idea
10642                  * since we want to be in SS and grow both the
10643                  * cwnd and the rwnd (via dynamic rwnd growth). If
10644                  * we set it to the rwnd then as the peer grows its
10645                  * rwnd we will be stuck in CA and never hit SS.
10646                  *
10647                  * It is far better to raise it up high (this takes the
10648                  * risk that there has been a loss already, probably
10649                  * we should have an indicator in all stacks of loss
10650                  * but we don't), but considering the normal use this
10651                  * is a risk worth taking. The consequences of not
10652                  * hitting SS are far worse than going one more time
10653                  * into it early on (before we have sent even an IW).
10654                  * It is highly unlikely that we will have had a loss
10655                  * before getting the IW out.
10656                  */
10657                 tp->snd_ssthresh = 0xffffffff;
10658         }
10659         rack_stop_all_timers(tp);
10660         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
10661         rack_log_rtt_shrinks(rack,  us_cts,  0,
10662                              __LINE__, RACK_RTTS_INIT);
10663         return (0);
10664 }
10665
10666 static int
10667 rack_handoff_ok(struct tcpcb *tp)
10668 {
10669         /*
10670          * Returns 0 when tp may be handed to rack, EAGAIN while it is
10671          * still in a SYN state, and EINVAL when rack can never be used.
10672          */
10673         switch (tp->t_state) {
10674         case TCPS_CLOSED:
10675         case TCPS_LISTEN:
10676                 /* Always fine in these states, though it may not stick. */
10677                 return (0);
10678         case TCPS_SYN_SENT:
10679         case TCPS_SYN_RECEIVED:
10680                 /* Too early to tell; we must reach ESTAB or beyond first. */
10681                 return (EAGAIN);
10682         default:
10683                 break;
10684         }
10685         /* No SACK on this connection and SACK is required: never rack. */
10686         if (((tp->t_flags & TF_SACK_PERMIT) == 0) &&
10687             (rack_sack_not_required == 0))
10688                 return (EINVAL);
10689         return (0);
10690 }
10691
10692 static void
10693 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
10694 {
10695         if (tp->t_fb_ptr) {
10696                 struct tcp_rack *rack;
10697                 struct rack_sendmap *rsm, *nrsm, *rm;
10698
10699                 rack = (struct tcp_rack *)tp->t_fb_ptr;
10700 #ifdef NETFLIX_SHARED_CWND
10701                 if (rack->r_ctl.rc_scw) {
10702                         uint32_t limit;
10703
10704                         if (rack->r_limit_scw)
10705                                 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
10706                         else
10707                                 limit = 0;
10708                         tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
10709                                                   rack->r_ctl.rc_scw_index,
10710                                                   limit);
10711                         rack->r_ctl.rc_scw = NULL;
10712                 }
10713 #endif
10714                 /* rack does not use force data but other stacks may clear it */
10715                 tp->t_flags &= ~TF_FORCEDATA;
10716                 if (tp->t_inpcb) {
10717                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10718                         tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
10719                         tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
10720                 }
10721 #ifdef TCP_BLACKBOX
10722                 tcp_log_flowend(tp);
10723 #endif
10724                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
10725                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
10726 #ifdef INVARIANTS
10727                         if (rm != rsm) {
10728                                 panic("At fini, rack:%p rsm:%p rm:%p",
10729                                       rack, rsm, rm);
10730                         }
10731 #endif
10732                         uma_zfree(rack_zone, rsm);
10733                 }
10734                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
10735                 while (rsm) {
10736                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
10737                         uma_zfree(rack_zone, rsm);
10738                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
10739                 }
10740                 rack->rc_free_cnt = 0;
10741                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
10742                 tp->t_fb_ptr = NULL;
10743         }
10744         /* Cancel the GP measurement in progress */
10745         tp->t_flags &= ~TF_GPUTINPROG;
10746         /* Make sure snd_nxt is correctly set */
10747         tp->snd_nxt = tp->snd_max;
10748 }
10749
10750
10751 static void
10752 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
10753 {
10754         /*
10755          * Store the rack_do_* routine matching tp->t_state in
10756          * rack->r_substate and record that state in rack->r_state.
10757          * LISTEN, CLOSED, TIME_WAIT and any unknown state change nothing.
10758          */
10759         switch (tp->t_state) {
10760         case TCPS_SYN_SENT:
10761                 rack->r_substate = rack_do_syn_sent;
10762                 break;
10763         case TCPS_ESTABLISHED:
10764                 rack_set_pace_segments(tp, rack, __LINE__);
10765                 rack->r_substate = rack_do_established;
10766                 break;
10767         case TCPS_SYN_RECEIVED:
10768                 rack->r_substate = rack_do_syn_recv;
10769                 break;
10770         case TCPS_CLOSE_WAIT:
10771                 rack->r_substate = rack_do_close_wait;
10772                 break;
10773         case TCPS_FIN_WAIT_1:
10774                 rack->r_substate = rack_do_fin_wait_1;
10775                 break;
10776         case TCPS_FIN_WAIT_2:
10777                 rack->r_substate = rack_do_fin_wait_2;
10778                 break;
10779         case TCPS_CLOSING:
10780                 rack->r_substate = rack_do_closing;
10781                 break;
10782         case TCPS_LAST_ACK:
10783                 rack->r_substate = rack_do_lastack;
10784                 break;
10785         case TCPS_LISTEN:
10786         case TCPS_CLOSED:
10787         case TCPS_TIME_WAIT:
10788         default:
10789                 /* Nothing to select; r_state and r_substate stay as they are. */
10790                 return;
10791         }
10792         /* Only reached for a state that was given a routine above. */
10793         rack->r_state = tp->t_state;
10794 }
10795
10796
10797 static void
10798 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
10799 {
10800         /*
10801          * An ack was processed but output was either never called or
10802          * was turned away because the hpts was already running.  A
10803          * pacer timer is armed; check that it is the one we want and
10804          * replace it when it is not.  The sb argument is not used.
10805          */
10806         struct rack_sendmap *head;
10807         int armed;
10808
10809         armed = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
10810         if ((armed == PACE_TMR_PERSIT) && rack->rc_in_persist)
10811                 return;
10812         head = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
10813         if ((armed == PACE_TMR_RXT) &&
10814             ((head == NULL) || (tp->t_state < TCPS_ESTABLISHED))) {
10815                 /* An RXT is what we expect here. */
10816                 return;
10817         }
10818         if (head == NULL) {
10819                 /* Nothing is on the rc_tmap list. */
10820                 if (tp->t_flags & TF_DELACK) {
10821                         /* A delayed ack is owed; fine if its timer is armed. */
10822                         if (armed == PACE_TMR_DELACK)
10823                                 return;
10824                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
10825                            (armed == PACE_TMR_RXT)) {
10826                         /*
10827                          * After an ENOBUFS we can have nothing outstanding
10828                          * with the RXT (and the hptsi timer) up.
10829                          * NOTE(review): the RXT test above already returned
10830                          * for an empty list, so this arm looks unreachable.
10831                          */
10832                         return;
10833                 } else if ((V_tcp_always_keepalive ||
10834                             (rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE)) &&
10835                            (tp->t_state <= TCPS_CLOSING) &&
10836                            (armed == PACE_TMR_KEEP) &&
10837                            (tp->snd_una == tp->snd_max)) {
10838                         /* Keep-alive is wanted and is what is armed. */
10839                         return;
10840                 }
10841         }
10842         if (armed == PACE_TMR_DELACK) {
10843                 /*
10844                  * A delayed ack due before the rxt/tlp/rack timer would
10845                  * be the timer in control.  The expiry time itself is
10846                  * not checked; we trust the code that armed it.
10847                  */
10848                 return;
10849         }
10850         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
10851             ((armed == PACE_TMR_RXT) ||
10852              (armed == PACE_TMR_RACK) ||
10853              (armed == PACE_TMR_TLP))) {
10854                 /* With data outstanding any of RXT, RACK or TLP will do. */
10855                 return;
10856         }
10857         /*
10858          * The armed timer is not the one we want.  Pull the connection
10859          * off the hpts if it is there, cancel the timer and start over.
10860          */
10861         if (rack->rc_inp->inp_in_hpts) {
10862                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
10863                         uint32_t now;
10864
10865                         now = tcp_get_usecs(NULL);
10866                         if (TSTMP_GT(rack->r_ctl.rc_last_output_to, now)) {
10867                                 /* rc_last_output_to is ahead of now: we are early. */
10868                                 rack->r_early = 1;
10869                                 rack->r_ctl.rc_agg_early +=
10870                                     (rack->r_ctl.rc_last_output_to - now);
10871                         }
10872                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
10873                 }
10874                 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
10875         }
10876         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10877         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
10878 }
10879
10880 static int
10881 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
10882     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
10883     int32_t nxt_pkt, struct timeval *tv)
10884 {
10885         int32_t thflags, retval, did_out = 0;
10886         int32_t way_out = 0;
10887         uint32_t cts;
10888         uint32_t tiwin;
10889         struct timespec ts;
10890         struct tcpopt to;
10891         struct tcp_rack *rack;
10892         struct rack_sendmap *rsm;
10893         int32_t prev_state = 0;
10894         uint32_t us_cts;
10895         /*
10896          * tv passed from common code is from either M_TSTMP_LRO or
10897          * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The
10898          * rack_pacing stack assumes tv always refers to 'now', so we overwrite
10899          * tv here to guarantee that.
10900          */
10901         if (m->m_flags & M_TSTMP_LRO)
10902                 tcp_get_usecs(tv);
10903
10904         cts = tcp_tv_to_mssectick(tv);
10905         rack = (struct tcp_rack *)tp->t_fb_ptr;
10906
10907         if ((m->m_flags & M_TSTMP) ||
10908             (m->m_flags & M_TSTMP_LRO)) {
10909                 mbuf_tstmp2timespec(m, &ts);
10910                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
10911                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
10912         } else
10913                 rack->r_ctl.act_rcv_time = *tv;
10914         kern_prefetch(rack, &prev_state);
10915         prev_state = 0;
10916         thflags = th->th_flags;
10917
10918         NET_EPOCH_ASSERT();
10919         INP_WLOCK_ASSERT(tp->t_inpcb);
10920         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
10921             __func__));
10922         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
10923             __func__));
10924         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
10925                 union tcp_log_stackspecific log;
10926                 struct timeval ltv;
10927 #ifdef NETFLIX_HTTP_LOGGING
10928                 struct http_sendfile_track *http_req;
10929
10930                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
10931                         http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
10932                 } else {
10933                         http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
10934                 }
10935 #endif
10936                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
10937                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
10938                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
10939                 if (rack->rack_no_prr == 0)
10940                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
10941                 else
10942                         log.u_bbr.flex1 = 0;
10943                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
10944                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
10945                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
10946                 log.u_bbr.flex3 = m->m_flags;
10947                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
10948                 if (m->m_flags & M_TSTMP) {
10949                         /* Record the hardware timestamp if present */
10950                         mbuf_tstmp2timespec(m, &ts);
10951                         ltv.tv_sec = ts.tv_sec;
10952                         ltv.tv_usec = ts.tv_nsec / 1000;
10953                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
10954                 } else if (m->m_flags & M_TSTMP_LRO) {
10955                         /* Record the LRO the arrival timestamp */
10956                         mbuf_tstmp2timespec(m, &ts);
10957                         ltv.tv_sec = ts.tv_sec;
10958                         ltv.tv_usec = ts.tv_nsec / 1000;
10959                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
10960                 }
10961                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
10962                 /* Log the rcv time */
10963                 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
10964 #ifdef NETFLIX_HTTP_LOGGING
10965                 log.u_bbr.applimited = tp->t_http_closed;
10966                 log.u_bbr.applimited <<= 8;
10967                 log.u_bbr.applimited |= tp->t_http_open;
10968                 log.u_bbr.applimited <<= 8;
10969                 log.u_bbr.applimited |= tp->t_http_req;
10970                 if (http_req) {
10971                         /* Copy out any client req info */
10972                         /* seconds */
10973                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
10974                         /* useconds */
10975                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
10976                         log.u_bbr.rttProp = http_req->timestamp;
10977                         log.u_bbr.cur_del_rate = http_req->start;
10978                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
10979                                 log.u_bbr.flex8 |= 1;
10980                         } else {
10981                                 log.u_bbr.flex8 |= 2;
10982                                 log.u_bbr.bw_inuse = http_req->end;
10983                         }
10984                         log.u_bbr.flex6 = http_req->start_seq;
10985                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
10986                                 log.u_bbr.flex8 |= 4;
10987                                 log.u_bbr.epoch = http_req->end_seq;
10988                         }
10989                 }
10990 #endif
10991                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
10992                     tlen, &log, true, &ltv);
10993         }
10994         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
10995                 way_out = 4;
10996                 retval = 0;
10997                 goto done_with_input;
10998         }
10999         /*
11000          * If a segment with the ACK-bit set arrives in the SYN-SENT state
11001          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
11002          */
11003         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
11004             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
11005                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11006                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11007                 return(1);
11008         }
11009         /*
11010          * Segment received on connection. Reset idle time and keep-alive
11011          * timer. XXX: This should be done after segment validation to
11012          * ignore broken/spoofed segs.
11013          */
11014         if  (tp->t_idle_reduce &&
11015              (tp->snd_max == tp->snd_una) &&
11016              ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
11017                 counter_u64_add(rack_input_idle_reduces, 1);
11018                 rack_cc_after_idle(rack, tp);
11019         }
11020         tp->t_rcvtime = ticks;
11021         /*
11022          * Unscale the window into a 32-bit value. For the SYN_SENT state
11023          * the scale is zero.
11024          */
11025         tiwin = th->th_win << tp->snd_scale;
11026 #ifdef STATS
11027         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
11028 #endif
11029         if (tiwin > rack->r_ctl.rc_high_rwnd)
11030                 rack->r_ctl.rc_high_rwnd = tiwin;
11031         /*
11032          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
11033          * this to occur after we've validated the segment.
11034          */
11035         if (tp->t_flags2 & TF2_ECN_PERMIT) {
11036                 if (thflags & TH_CWR) {
11037                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
11038                         tp->t_flags |= TF_ACKNOW;
11039                 }
11040                 switch (iptos & IPTOS_ECN_MASK) {
11041                 case IPTOS_ECN_CE:
11042                         tp->t_flags2 |= TF2_ECN_SND_ECE;
11043                         KMOD_TCPSTAT_INC(tcps_ecn_ce);
11044                         break;
11045                 case IPTOS_ECN_ECT0:
11046                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
11047                         break;
11048                 case IPTOS_ECN_ECT1:
11049                         KMOD_TCPSTAT_INC(tcps_ecn_ect1);
11050                         break;
11051                 }
11052
11053                 /* Process a packet differently from RFC3168. */
11054                 cc_ecnpkt_handler(tp, th, iptos);
11055
11056                 /* Congestion experienced. */
11057                 if (thflags & TH_ECE) {
11058                         rack_cong_signal(tp, th, CC_ECN);
11059                 }
11060         }
11061         /*
11062          * Parse options on any incoming segment.
11063          */
11064         tcp_dooptions(&to, (u_char *)(th + 1),
11065             (th->th_off << 2) - sizeof(struct tcphdr),
11066             (thflags & TH_SYN) ? TO_SYN : 0);
11067
11068         /*
11069          * If echoed timestamp is later than the current time, fall back to
11070          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
11071          * were used when this connection was established.
11072          */
11073         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
11074                 to.to_tsecr -= tp->ts_offset;
11075                 if (TSTMP_GT(to.to_tsecr, cts))
11076                         to.to_tsecr = 0;
11077         }
11078
11079         /*
11080          * If its the first time in we need to take care of options and
11081          * verify we can do SACK for rack!
11082          */
11083         if (rack->r_state == 0) {
11084                 /* Should be init'd by rack_init() */
11085                 KASSERT(rack->rc_inp != NULL,
11086                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
11087                 if (rack->rc_inp == NULL) {
11088                         rack->rc_inp = tp->t_inpcb;
11089                 }
11090
11091                 /*
11092                  * Process options only when we get SYN/ACK back. The SYN
11093                  * case for incoming connections is handled in tcp_syncache.
11094                  * According to RFC1323 the window field in a SYN (i.e., a
11095                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
11096                  * this is traditional behavior, may need to be cleaned up.
11097                  */
11098                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
11099                         /* Handle parallel SYN for ECN */
11100                         if (!(thflags & TH_ACK) &&
11101                             ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) &&
11102                             ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) {
11103                                 tp->t_flags2 |= TF2_ECN_PERMIT;
11104                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
11105                                 TCPSTAT_INC(tcps_ecn_shs);
11106                         }
11107                         if ((to.to_flags & TOF_SCALE) &&
11108                             (tp->t_flags & TF_REQ_SCALE)) {
11109                                 tp->t_flags |= TF_RCVD_SCALE;
11110                                 tp->snd_scale = to.to_wscale;
11111                         } else
11112                                 tp->t_flags &= ~TF_REQ_SCALE;
11113                         /*
11114                          * Initial send window.  It will be updated with the
11115                          * next incoming segment to the scaled value.
11116                          */
11117                         tp->snd_wnd = th->th_win;
11118                         if ((to.to_flags & TOF_TS) &&
11119                             (tp->t_flags & TF_REQ_TSTMP)) {
11120                                 tp->t_flags |= TF_RCVD_TSTMP;
11121                                 tp->ts_recent = to.to_tsval;
11122                                 tp->ts_recent_age = cts;
11123                         } else
11124                                 tp->t_flags &= ~TF_REQ_TSTMP;
11125                         if (to.to_flags & TOF_MSS)
11126                                 tcp_mss(tp, to.to_mss);
11127                         if ((tp->t_flags & TF_SACK_PERMIT) &&
11128                             (to.to_flags & TOF_SACKPERM) == 0)
11129                                 tp->t_flags &= ~TF_SACK_PERMIT;
11130                         if (IS_FASTOPEN(tp->t_flags)) {
11131                                 if (to.to_flags & TOF_FASTOPEN) {
11132                                         uint16_t mss;
11133
11134                                         if (to.to_flags & TOF_MSS)
11135                                                 mss = to.to_mss;
11136                                         else
11137                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
11138                                                         mss = TCP6_MSS;
11139                                                 else
11140                                                         mss = TCP_MSS;
11141                                         tcp_fastopen_update_cache(tp, mss,
11142                                             to.to_tfo_len, to.to_tfo_cookie);
11143                                 } else
11144                                         tcp_fastopen_disable_path(tp);
11145                         }
11146                 }
11147                 /*
11148                  * At this point we are at the initial call. Here we decide
11149                  * if we are doing RACK or not. We do this by seeing if
11150                  * TF_SACK_PERMIT is set and the sack-not-required is clear.
11151                  * The code now does do dup-ack counting so if you don't
11152                  * switch back you won't get rack & TLP, but you will still
11153                  * get this stack.
11154                  */
11155
11156                 if ((rack_sack_not_required == 0) &&
11157                     ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
11158                         tcp_switch_back_to_default(tp);
11159                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
11160                             tlen, iptos);
11161                         return (1);
11162                 }
11163                 /* Set the flag */
11164                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
11165                 tcp_set_hpts(tp->t_inpcb);
11166                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
11167         }
11168         if (thflags & TH_FIN)
11169                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
11170         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
11171         if ((rack->rc_gp_dyn_mul) &&
11172             (rack->use_fixed_rate == 0) &&
11173             (rack->rc_always_pace)) {
11174                 /* Check in on probertt */
11175                 rack_check_probe_rtt(rack, us_cts);
11176         }
11177         if (rack->forced_ack) {
11178                 uint32_t us_rtt;
11179
11180                 /*
11181                  * A persist or keep-alive was forced out, update our
11182                  * min rtt time. Note we do not worry about lost
11183                  * retransmissions since KEEP-ALIVES and persists
11184                  * are usually way long on times of sending (though
11185                  * if we were really paranoid or worried we could
11186                  * at least use timestamps if available to validate).
11187                  */
11188                 rack->forced_ack = 0;
11189                 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
11190                 if (us_rtt == 0)
11191                         us_rtt = 1;
11192                 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3);
11193                 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
11194         }
11195         /*
11196          * This is the one exception case where we set the rack state
11197          * always. All other times (timers etc) we must have a rack-state
11198          * set (so we assure we have done the checks above for SACK).
11199          */
11200         rack->r_ctl.rc_rcvtime = cts;
11201         if (rack->r_state != tp->t_state)
11202                 rack_set_state(tp, rack);
11203         if (SEQ_GT(th->th_ack, tp->snd_una) &&
11204             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
11205                 kern_prefetch(rsm, &prev_state);
11206         prev_state = rack->r_state;
11207         rack_clear_rate_sample(rack);
11208         retval = (*rack->r_substate) (m, th, so,
11209             tp, &to, drop_hdrlen,
11210             tlen, tiwin, thflags, nxt_pkt, iptos);
11211 #ifdef INVARIANTS
11212         if ((retval == 0) &&
11213             (tp->t_inpcb == NULL)) {
11214                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
11215                     retval, tp, prev_state);
11216         }
11217 #endif
11218         if (retval == 0) {
11219                 /*
11220                  * If retval is 1 the tcb is unlocked and most likely the tp
11221                  * is gone.
11222                  */
11223                 INP_WLOCK_ASSERT(tp->t_inpcb);
11224                 if ((rack->rc_gp_dyn_mul) &&
11225                     (rack->rc_always_pace) &&
11226                     (rack->use_fixed_rate == 0) &&
11227                     rack->in_probe_rtt &&
11228                     (rack->r_ctl.rc_time_probertt_starts == 0)) {
11229                         /*
11230                          * If we are going for target, lets recheck before
11231                          * we output.
11232                          */
11233                         rack_check_probe_rtt(rack, us_cts);
11234                 }
11235                 if (rack->set_pacing_done_a_iw == 0) {
11236                         /* How much has been acked? */
11237                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
11238                                 /* We have enough to set in the pacing segment size */
11239                                 rack->set_pacing_done_a_iw = 1;
11240                                 rack_set_pace_segments(tp, rack, __LINE__);
11241                         }
11242                 }
11243                 tcp_rack_xmit_timer_commit(rack, tp);
11244                 if (nxt_pkt == 0) {
11245                         if (rack->r_wanted_output != 0) {
11246 do_output_now:
11247                                 did_out = 1;
11248                                 (void)tp->t_fb->tfb_tcp_output(tp);
11249                         }
11250                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
11251                 }
11252                 if ((nxt_pkt == 0) &&
11253                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
11254                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
11255                      (tp->t_flags & TF_DELACK) ||
11256                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
11257                       (tp->t_state <= TCPS_CLOSING)))) {
11258                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
11259                         if ((tp->snd_max == tp->snd_una) &&
11260                             ((tp->t_flags & TF_DELACK) == 0) &&
11261                             (rack->rc_inp->inp_in_hpts) &&
11262                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11263                                 /* keep alive not needed if we are hptsi output yet */
11264                                 ;
11265                         } else {
11266                                 int late = 0;
11267                                 if (rack->rc_inp->inp_in_hpts) {
11268                                         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
11269                                                 us_cts = tcp_get_usecs(NULL);
11270                                                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
11271                                                         rack->r_early = 1;
11272                                                         rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
11273                                                 } else
11274                                                         late = 1;
11275                                                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
11276                                         }
11277                                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
11278                                 }
11279                                 if (late && (did_out == 0)) {
11280                                         /*
11281                                          * We are late in the sending
11282                                          * and we did not call the output
11283                                          * (this probably should not happen).
11284                                          */
11285                                         goto do_output_now;
11286                                 }
11287                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
11288                         }
11289                         way_out = 1;
11290                 } else if (nxt_pkt == 0) {
11291                         /* Do we have the correct timer running? */
11292                         rack_timer_audit(tp, rack, &so->so_snd);
11293                         way_out = 2;
11294                 }
11295         done_with_input:
11296                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
11297                 if (did_out)
11298                         rack->r_wanted_output = 0;
11299 #ifdef INVARIANTS
11300                 if (tp->t_inpcb == NULL) {
11301                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
11302                               did_out,
11303                               retval, tp, prev_state);
11304                 }
11305 #endif
11306         }
11307         return (retval);
11308 }
11309
11310 void
11311 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
11312     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
11313 {
11314         struct timeval tv;
11315
11316         /* First lets see if we have old packets */
11317         if (tp->t_in_pkt) {
11318                 if (ctf_do_queued_segments(so, tp, 1)) {
11319                         m_freem(m);
11320                         return;
11321                 }
11322         }
11323         if (m->m_flags & M_TSTMP_LRO) {
11324                 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
11325                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
11326         } else {
11327                 /* Should not be should we kassert instead? */
11328                 tcp_get_usecs(&tv);
11329         }
11330         if(rack_do_segment_nounlock(m, th, so, tp,
11331                                     drop_hdrlen, tlen, iptos, 0, &tv) == 0)
11332                 INP_WUNLOCK(tp->t_inpcb);
11333 }
11334
11335 struct rack_sendmap *
11336 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
11337 {
11338         struct rack_sendmap *rsm = NULL;
11339         int32_t idx;
11340         uint32_t srtt = 0, thresh = 0, ts_low = 0;
11341
11342         /* Return the next guy to be re-transmitted */
11343         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
11344                 return (NULL);
11345         }
11346         if (tp->t_flags & TF_SENTFIN) {
11347                 /* retran the end FIN? */
11348                 return (NULL);
11349         }
11350         /* ok lets look at this one */
11351         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
11352         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
11353                 goto check_it;
11354         }
11355         rsm = rack_find_lowest_rsm(rack);
11356         if (rsm == NULL) {
11357                 return (NULL);
11358         }
11359 check_it:
11360         if (rsm->r_flags & RACK_ACKED) {
11361                 return (NULL);
11362         }
11363         if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
11364             (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
11365                 /* Its not yet ready */
11366                 return (NULL);
11367         }
11368         srtt = rack_grab_rtt(tp, rack);
11369         idx = rsm->r_rtr_cnt - 1;
11370         ts_low = rsm->r_tim_lastsent[idx];
11371         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
11372         if ((tsused == ts_low) ||
11373             (TSTMP_LT(tsused, ts_low))) {
11374                 /* No time since sending */
11375                 return (NULL);
11376         }
11377         if ((tsused - ts_low) < thresh) {
11378                 /* It has not been long enough yet */
11379                 return (NULL);
11380         }
11381         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
11382             ((rsm->r_flags & RACK_SACK_PASSED) &&
11383              (rack->sack_attack_disable == 0))) {
11384                 /*
11385                  * We have passed the dup-ack threshold <or>
11386                  * a SACK has indicated this is missing.
11387                  * Note that if you are a declared attacker
11388                  * it is only the dup-ack threshold that
11389                  * will cause retransmits.
11390                  */
11391                 /* log retransmit reason */
11392                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
11393                 return (rsm);
11394         }
11395         return (NULL);
11396 }
11397
11398 static void
11399 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
11400                            uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
11401                            int line, struct rack_sendmap *rsm)
11402 {
11403         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
11404                 union tcp_log_stackspecific log;
11405                 struct timeval tv;
11406
11407                 memset(&log, 0, sizeof(log));
11408                 log.u_bbr.flex1 = slot;
11409                 log.u_bbr.flex2 = len;
11410                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
11411                 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
11412                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
11413                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
11414                 log.u_bbr.use_lt_bw = rack->app_limited_needs_set;
11415                 log.u_bbr.use_lt_bw <<= 1;
11416                 log.u_bbr.use_lt_bw = rack->rc_gp_filled;
11417                 log.u_bbr.use_lt_bw <<= 1;
11418                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
11419                 log.u_bbr.use_lt_bw <<= 1;
11420                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
11421                 log.u_bbr.pkt_epoch = line;
11422                 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
11423                 log.u_bbr.bw_inuse = bw_est;
11424                 log.u_bbr.delRate = bw;
11425                 if (rack->r_ctl.gp_bw == 0)
11426                         log.u_bbr.cur_del_rate = 0;
11427                 else
11428                         log.u_bbr.cur_del_rate = rack_get_bw(rack);
11429                 log.u_bbr.rttProp = len_time;
11430                 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
11431                 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
11432                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
11433                 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
11434                         /* We are in slow start */
11435                         log.u_bbr.flex7 = 1;
11436                 } else {
11437                         /* we are on congestion avoidance */
11438                         log.u_bbr.flex7 = 0;
11439                 }
11440                 log.u_bbr.flex8 = method;
11441                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11442                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
11443                 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
11444                 log.u_bbr.cwnd_gain <<= 1;
11445                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
11446                 log.u_bbr.cwnd_gain <<= 1;
11447                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
11448                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
11449                     &rack->rc_inp->inp_socket->so_rcv,
11450                     &rack->rc_inp->inp_socket->so_snd,
11451                     BBR_LOG_HPTSI_CALC, 0,
11452                     0, &log, false, &tv);
11453         }
11454 }
11455
11456 static uint32_t
11457 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
11458 {
11459         uint32_t new_tso, user_max;
11460
11461         user_max = rack->rc_user_set_max_segs * mss;
11462         if (rack->rc_force_max_seg) {
11463                 return (user_max);
11464         }
11465         if (rack->use_fixed_rate &&
11466             ((rack->r_ctl.crte == NULL) ||
11467              (bw != rack->r_ctl.crte->rate))) {
11468                 /* Use the user mss since we are not exactly matched */
11469                 return (user_max);
11470         }
11471         new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
11472         if (new_tso > user_max)
11473                 new_tso = user_max;
11474         return(new_tso);
11475 }
11476
11477 static void
11478 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp,
11479                      uint64_t rate, uint64_t hw_rate, int line,
11480                      int error)
11481 {
11482         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
11483                 union tcp_log_stackspecific log;
11484                 struct timeval tv;
11485
11486                 memset(&log, 0, sizeof(log));
11487                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
11488                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
11489                 log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
11490                 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
11491                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11492                 log.u_bbr.bw_inuse = rate;
11493                 log.u_bbr.flex5 = line;
11494                 log.u_bbr.flex6 = error;
11495                 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
11496                 log.u_bbr.flex8 = rack->use_fixed_rate;
11497                 log.u_bbr.flex8 <<= 1;
11498                 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
11499                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
11500                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
11501                     &rack->rc_inp->inp_socket->so_rcv,
11502                     &rack->rc_inp->inp_socket->so_snd,
11503                     BBR_LOG_HDWR_PACE, 0,
11504                     0, &log, false, &tv);
11505         }
11506 }
11507
11508 static int32_t
11509 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz)
11510 {
11511         uint64_t lentim, fill_bw;
11512
11513         /* Lets first see if we are full, if so continue with normal rate */
11514         if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
11515                 return (slot);
11516         if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
11517                 return (slot);
11518         if (rack->r_ctl.rc_last_us_rtt == 0)
11519                 return (slot);
11520         if (rack->rc_pace_fill_if_rttin_range &&
11521             (rack->r_ctl.rc_last_us_rtt >=
11522              (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
11523                 /* The rtt is huge, N * smallest, lets not fill */
11524                 return (slot);
11525         }
11526         /*
11527          * first lets calculate the b/w based on the last us-rtt
11528          * and the sndwnd.
11529          */
11530         fill_bw = rack->r_ctl.cwnd_to_use;
11531         /* Take the rwnd if its smaller */
11532         if (fill_bw > rack->rc_tp->snd_wnd)
11533                 fill_bw = rack->rc_tp->snd_wnd;
11534         fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
11535         fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
11536         /* We are below the min b/w */
11537         if (fill_bw < RACK_MIN_BW)
11538                 return (slot);
11539         /*
11540          * Ok fill_bw holds our mythical b/w to fill the cwnd
11541          * in a rtt, what does that time wise equate too?
11542          */
11543         lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
11544         lentim /= fill_bw;
11545         if (lentim < slot) {
11546                 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
11547                                            0, lentim, 12, __LINE__, NULL);
11548                 return ((int32_t)lentim);
11549         } else
11550                 return (slot);
11551 }
11552
11553 static int32_t
11554 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
11555 {
11556         struct rack_sendmap *lrsm;
11557         int32_t slot = 0;
11558         int err;
11559
11560         if (rack->rc_always_pace == 0) {
11561                 /*
11562                  * We use the most optimistic possible cwnd/srtt for
11563                  * sending calculations. This will make our
11564                  * calculation anticipate getting more through
11565                  * quicker then possible. But thats ok we don't want
11566                  * the peer to have a gap in data sending.
11567                  */
11568                 uint32_t srtt, cwnd, tr_perms = 0;
11569                 int32_t reduce = 0;
11570
11571         old_method:
11572                 /*
11573                  * We keep no precise pacing with the old method
11574                  * instead we use the pacer to mitigate bursts.
11575                  */
11576                 rack->r_ctl.rc_agg_delayed = 0;
11577                 rack->r_early = 0;
11578                 rack->r_late = 0;
11579                 rack->r_ctl.rc_agg_early = 0;
11580                 if (rack->r_ctl.rc_rack_min_rtt)
11581                         srtt = rack->r_ctl.rc_rack_min_rtt;
11582                 else
11583                         srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
11584                 if (rack->r_ctl.rc_rack_largest_cwnd)
11585                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
11586                 else
11587                         cwnd = rack->r_ctl.cwnd_to_use;
11588                 tr_perms = cwnd / srtt;
11589                 if (tr_perms == 0) {
11590                         tr_perms = ctf_fixed_maxseg(tp);
11591                 }
11592                 /*
11593                  * Calculate how long this will take to drain, if
11594                  * the calculation comes out to zero, thats ok we
11595                  * will use send_a_lot to possibly spin around for
11596                  * more increasing tot_len_this_send to the point
11597                  * that its going to require a pace, or we hit the
11598                  * cwnd. Which in that case we are just waiting for
11599                  * a ACK.
11600                  */
11601                 slot = len / tr_perms;
11602                 /* Now do we reduce the time so we don't run dry? */
11603                 if (slot && rack_slot_reduction) {
11604                         reduce = (slot / rack_slot_reduction);
11605                         if (reduce < slot) {
11606                                 slot -= reduce;
11607                         } else
11608                                 slot = 0;
11609                 }
11610                 slot *=  HPTS_USEC_IN_MSEC;
11611                 if (rsm == NULL) {
11612                         /*
11613                          * We always consider ourselves app limited with old style
11614                          * that are not retransmits. This could be the initial
11615                          * measurement, but thats ok its all setup and specially
11616                          * handled. If another send leaks out, then that too will
11617                          * be mark app-limited.
11618                          */
11619                         lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
11620                         if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) {
11621                                 rack->r_ctl.rc_first_appl = lrsm;
11622                                 lrsm->r_flags |= RACK_APP_LIMITED;
11623                                 rack->r_ctl.rc_app_limited_cnt++;
11624                         }
11625                 }
11626                 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL);
11627         } else {
11628                 uint64_t bw_est, res, lentim, rate_wanted;
11629                 uint32_t orig_val, srtt, segs, oh;
11630
11631                 if ((rack->r_rr_config == 1) && rsm) {
11632                         return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC);
11633                 }
11634                 if (rack->use_fixed_rate) {
11635                         rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
11636                 } else if ((rack->r_ctl.init_rate == 0) &&
11637 #ifdef NETFLIX_PEAKRATE
11638                            (rack->rc_tp->t_maxpeakrate == 0) &&
11639 #endif
11640                            (rack->r_ctl.gp_bw == 0)) {
11641                         /* no way to yet do an estimate */
11642                         bw_est = rate_wanted = 0;
11643                 } else {
11644                         bw_est = rack_get_bw(rack);
11645                         rate_wanted = rack_get_output_bw(rack, bw_est, rsm);
11646                 }
11647                 if ((bw_est == 0) || (rate_wanted == 0)) {
11648                         /*
11649                          * No way yet to make a b/w estimate or
11650                          * our raise is set incorrectly.
11651                          */
11652                         goto old_method;
11653                 }
11654                 /* We need to account for all the overheads */
11655                 segs = (len + segsiz - 1) / segsiz;
11656                 /*
11657                  * We need the diff between 1514 bytes (e-mtu with e-hdr)
11658                  * and how much data we put in each packet. Yes this
11659                  * means we may be off if we are larger than 1500 bytes
11660                  * or smaller. But this just makes us more conservative.
11661                  */
11662                 if (ETHERNET_SEGMENT_SIZE > segsiz)
11663                         oh = ETHERNET_SEGMENT_SIZE - segsiz;
11664                 else
11665                         oh = 0;
11666                 segs *= oh;
11667                 lentim = (uint64_t)(len + segs)  * (uint64_t)HPTS_USEC_IN_SEC;
11668                 res = lentim / rate_wanted;
11669                 slot = (uint32_t)res;
11670                 orig_val = rack->r_ctl.rc_pace_max_segs;
11671                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
11672 #ifdef KERN_TLS
11673                 /* For TLS we need to override this, possibly  */
11674                 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
11675                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
11676                 }
11677 #endif
11678                 /* Did we change the TSO size, if so log it */
11679                 if (rack->r_ctl.rc_pace_max_segs != orig_val)
11680                         rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL);
11681                 if ((rack->rc_pace_to_cwnd) &&
11682                     (rack->in_probe_rtt == 0) &&
11683                     (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
11684                         /*
11685                          * We want to pace at our rate *or* faster to
11686                          * fill the cwnd to the max if its not full.
11687                          */
11688                         slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz);
11689                 }
11690                 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
11691                     (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
11692                         if ((rack->rack_hdw_pace_ena) &&
11693                             (rack->rack_hdrw_pacing == 0) &&
11694                             (rack->rack_attempt_hdwr_pace == 0)) {
11695                                 /*
11696                                  * Lets attempt to turn on hardware pacing
11697                                  * if we can.
11698                                  */
11699                                 rack->rack_attempt_hdwr_pace = 1;
11700                                 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
11701                                                                        rack->rc_inp->inp_route.ro_nh->nh_ifp,
11702                                                                        rate_wanted,
11703                                                                        RS_PACING_GEQ,
11704                                                                        &err);
11705                                 if (rack->r_ctl.crte) {
11706                                         rack->rack_hdrw_pacing = 1;
11707                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz,
11708                                                                                                  0, rack->r_ctl.crte,
11709                                                                                                  NULL);
11710                                         rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp,
11711                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
11712                                                              err);
11713                                 }
11714                         } else if (rack->rack_hdrw_pacing &&
11715                                    (rack->r_ctl.crte->rate != rate_wanted)) {
11716                                 /* Do we need to adjust our rate? */
11717                                 const struct tcp_hwrate_limit_table *nrte;
11718
11719                                 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
11720                                                            rack->rc_tp,
11721                                                            rack->rc_inp->inp_route.ro_nh->nh_ifp,
11722                                                            rate_wanted,
11723                                                            RS_PACING_GEQ,
11724                                                            &err);
11725                                 if (nrte == NULL) {
11726                                         /* Lost the rate */
11727                                         rack->rack_hdrw_pacing = 0;
11728                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
11729                                 } else if (nrte != rack->r_ctl.crte) {
11730                                         rack->r_ctl.crte = nrte;
11731                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted,
11732                                                                                                  segsiz, 0,
11733                                                                                                  rack->r_ctl.crte,
11734                                                                                                  NULL);
11735                                         rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp,
11736                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
11737                                                              err);
11738                                 }
11739
11740                         }
11741                 }
11742                 if (rack_limit_time_with_srtt &&
11743                     (rack->use_fixed_rate == 0) &&
11744 #ifdef NETFLIX_PEAKRATE
11745                     (rack->rc_tp->t_maxpeakrate == 0) &&
11746 #endif
11747                     (rack->rack_hdrw_pacing == 0)) {
11748                         /*
11749                          * Sanity check, we do not allow the pacing delay
11750                          * to be longer than the SRTT of the path. If it is
11751                          * a slow path, then adding a packet should increase
11752                          * the RTT and compensate for this i.e. the srtt will
11753                          * be greater so the allowed pacing time will be greater.
11754                          *
11755                          * Note this restriction is not for where a peak rate
11756                          * is set, we are doing fixed pacing or hardware pacing.
11757                          */
11758                         if (rack->rc_tp->t_srtt)
11759                                 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
11760                         else
11761                                 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;    /* its in ms convert */
11762                         if (srtt < slot) {
11763                                 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL);
11764                                 slot = srtt;
11765                         }
11766                 }
11767                 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm);
11768         }
11769         if (slot)
11770                 counter_u64_add(rack_calc_nonzero, 1);
11771         else
11772                 counter_u64_add(rack_calc_zero, 1);
11773         return (slot);
11774 }
11775
11776 static void
11777 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
11778     tcp_seq startseq, uint32_t sb_offset)
11779 {
11780         struct rack_sendmap *my_rsm = NULL;
11781         struct rack_sendmap fe;
11782
11783         if (tp->t_state < TCPS_ESTABLISHED) {
11784                 /*
11785                  * We don't start any measurements if we are
11786                  * not at least established.
11787                  */
11788                 return;
11789         }
11790         tp->t_flags |= TF_GPUTINPROG;
11791         rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
11792         rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
11793         tp->gput_seq = startseq;
11794         rack->app_limited_needs_set = 0;
11795         if (rack->in_probe_rtt)
11796                 rack->measure_saw_probe_rtt = 1;
11797         else if ((rack->measure_saw_probe_rtt) &&
11798                  (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
11799                 rack->measure_saw_probe_rtt = 0;
11800         if (rack->rc_gp_filled)
11801                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
11802         else {
11803                 /* Special case initial measurement */
11804                 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL);
11805         }
11806         /*
11807          * We take a guess out into the future,
11808          * if we have no measurement and no
11809          * initial rate, we measure the first
11810          * initial-windows worth of data to
11811          * speed up getting some GP measurement and
11812          * thus start pacing.
11813          */
11814         if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
11815                 rack->app_limited_needs_set = 1;
11816                 tp->gput_ack = startseq + max(rc_init_window(rack),
11817                                               (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
11818                 rack_log_pacing_delay_calc(rack,
11819                                            tp->gput_seq,
11820                                            tp->gput_ack,
11821                                            0,
11822                                            tp->gput_ts,
11823                                            rack->r_ctl.rc_app_limited_cnt,
11824                                            9,
11825                                            __LINE__, NULL);
11826                 return;
11827         }
11828         if (sb_offset) {
11829                 /*
11830                  * We are out somewhere in the sb
11831                  * can we use the already outstanding data?
11832                  */
11833
11834                 if (rack->r_ctl.rc_app_limited_cnt == 0) {
11835                         /*
11836                          * Yes first one is good and in this case
11837                          * the tp->gput_ts is correctly set based on
11838                          * the last ack that arrived (no need to
11839                          * set things up when an ack comes in).
11840                          */
11841                         my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
11842                         if ((my_rsm == NULL) ||
11843                             (my_rsm->r_rtr_cnt != 1)) {
11844                                 /* retransmission? */
11845                                 goto use_latest;
11846                         }
11847                 } else {
11848                         if (rack->r_ctl.rc_first_appl == NULL) {
11849                                 /*
11850                                  * If rc_first_appl is NULL
11851                                  * then the cnt should be 0.
11852                                  * This is probably an error, maybe
11853                                  * a KASSERT would be approprate.
11854                                  */
11855                                 goto use_latest;
11856                         }
11857                         /*
11858                          * If we have a marker pointer to the last one that is
11859                          * app limited we can use that, but we need to set
11860                          * things up so that when it gets ack'ed we record
11861                          * the ack time (if its not already acked).
11862                          */
11863                         rack->app_limited_needs_set = 1;
11864                         /*
11865                          * We want to get to the rsm that is either
11866                          * next with space i.e. over 1 MSS or the one
11867                          * after that (after the app-limited).
11868                          */
11869                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
11870                                          rack->r_ctl.rc_first_appl);
11871                         if (my_rsm) {
11872                                 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
11873                                         /* Have to use the next one */
11874                                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
11875                                                          my_rsm);
11876                                 else {
11877                                         /* Use after the first MSS of it is acked */
11878                                         tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
11879                                         goto start_set;
11880                                 }
11881                         }
11882                         if ((my_rsm == NULL) ||
11883                             (my_rsm->r_rtr_cnt != 1)) {
11884                                 /*
11885                                  * Either its a retransmit or
11886                                  * the last is the app-limited one.
11887                                  */
11888                                 goto use_latest;
11889                         }
11890                 }
11891                 tp->gput_seq = my_rsm->r_start;
11892 start_set:
11893                 if (my_rsm->r_flags & RACK_ACKED) {
11894                         /*
11895                          * This one has been acked use the arrival ack time
11896                          */
11897                         tp->gput_ts = my_rsm->r_ack_arrival;
11898                         rack->app_limited_needs_set = 0;
11899                 }
11900                 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send;
11901                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
11902                 rack_log_pacing_delay_calc(rack,
11903                                            tp->gput_seq,
11904                                            tp->gput_ack,
11905                                            (uint64_t)my_rsm,
11906                                            tp->gput_ts,
11907                                            rack->r_ctl.rc_app_limited_cnt,
11908                                            9,
11909                                            __LINE__, NULL);
11910                 return;
11911         }
11912
11913 use_latest:
11914         /*
11915          * We don't know how long we may have been
11916          * idle or if this is the first-send. Lets
11917          * setup the flag so we will trim off
11918          * the first ack'd data so we get a true
11919          * measurement.
11920          */
11921         rack->app_limited_needs_set = 1;
11922         tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
11923         /* Find this guy so we can pull the send time */
11924         fe.r_start = startseq;
11925         my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
11926         if (my_rsm) {
11927                 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send;
11928                 if (my_rsm->r_flags & RACK_ACKED) {
11929                         /*
11930                          * Unlikely since its probably what was
11931                          * just transmitted (but I am paranoid).
11932                          */
11933                         tp->gput_ts = my_rsm->r_ack_arrival;
11934                         rack->app_limited_needs_set = 0;
11935                 }
11936                 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
11937                         /* This also is unlikely */
11938                         tp->gput_seq = my_rsm->r_start;
11939                 }
11940         } else {
11941                 /*
11942                  * TSNH unless we have some send-map limit,
11943                  * and even at that it should not be hitting
11944                  * that limit (we should have stopped sending).
11945                  */
11946                 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL);
11947         }
11948         rack_log_pacing_delay_calc(rack,
11949                                    tp->gput_seq,
11950                                    tp->gput_ack,
11951                                    (uint64_t)my_rsm,
11952                                    tp->gput_ts,
11953                                    rack->r_ctl.rc_app_limited_cnt,
11954                                    9, __LINE__, NULL);
11955 }
11956
11957 static inline uint32_t
11958 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
11959     uint32_t avail, int32_t sb_offset)
11960 {
11961         uint32_t len;
11962         uint32_t sendwin;
11963
11964         if (tp->snd_wnd > cwnd_to_use)
11965                 sendwin = cwnd_to_use;
11966         else
11967                 sendwin = tp->snd_wnd;
11968         if (ctf_outstanding(tp) >= tp->snd_wnd) {
11969                 /* We never want to go over our peers rcv-window */
11970                 len = 0;
11971         } else {
11972                 uint32_t flight;
11973
11974                 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
11975                 if (flight >= sendwin) {
11976                         /*
11977                          * We have in flight what we are allowed by cwnd (if
11978                          * it was rwnd blocking it would have hit above out
11979                          * >= tp->snd_wnd).
11980                          */
11981                         return (0);
11982                 }
11983                 len = sendwin - flight;
11984                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
11985                         /* We would send too much (beyond the rwnd) */
11986                         len = tp->snd_wnd - ctf_outstanding(tp);
11987                 }
11988                 if ((len + sb_offset) > avail) {
11989                         /*
11990                          * We don't have that much in the SB, how much is
11991                          * there?
11992                          */
11993                         len = avail - sb_offset;
11994                 }
11995         }
11996         return (len);
11997 }
11998
11999 static int
12000 rack_output(struct tcpcb *tp)
12001 {
12002         struct socket *so;
12003         uint32_t recwin;
12004         uint32_t sb_offset;
12005         int32_t len, flags, error = 0;
12006         struct mbuf *m;
12007         struct mbuf *mb;
12008         uint32_t if_hw_tsomaxsegcount = 0;
12009         uint32_t if_hw_tsomaxsegsize;
12010         int32_t segsiz, minseg;
12011         long tot_len_this_send = 0;
12012         struct ip *ip = NULL;
12013 #ifdef TCPDEBUG
12014         struct ipovly *ipov = NULL;
12015 #endif
12016         struct udphdr *udp = NULL;
12017         struct tcp_rack *rack;
12018         struct tcphdr *th;
12019         uint8_t pass = 0;
12020         uint8_t mark = 0;
12021         uint8_t wanted_cookie = 0;
12022         u_char opt[TCP_MAXOLEN];
12023         unsigned ipoptlen, optlen, hdrlen, ulen=0;
12024         uint32_t rack_seq;
12025
12026 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12027         unsigned ipsec_optlen = 0;
12028
12029 #endif
12030         int32_t idle, sendalot;
12031         int32_t sub_from_prr = 0;
12032         volatile int32_t sack_rxmit;
12033         struct rack_sendmap *rsm = NULL;
12034         int32_t tso, mtu;
12035         struct tcpopt to;
12036         int32_t slot = 0;
12037         int32_t sup_rack = 0;
12038         uint32_t cts, us_cts, delayed, early;
12039         uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
12040         uint32_t cwnd_to_use;
12041         int32_t do_a_prefetch;
12042         int32_t prefetch_rsm = 0;
12043         int force_tso = 0;
12044         int32_t orig_len;
12045         struct timeval tv;
12046         int32_t prefetch_so_done = 0;
12047         struct tcp_log_buffer *lgb = NULL;
12048         struct inpcb *inp;
12049         struct sockbuf *sb;
12050 #ifdef INET6
12051         struct ip6_hdr *ip6 = NULL;
12052         int32_t isipv6;
12053 #endif
12054         uint8_t filled_all = 0;
12055         bool hw_tls = false;
12056
12057         /* setup and take the cache hits here */
12058         rack = (struct tcp_rack *)tp->t_fb_ptr;
12059         inp = rack->rc_inp;
12060         so = inp->inp_socket;
12061         sb = &so->so_snd;
12062         kern_prefetch(sb, &do_a_prefetch);
12063         do_a_prefetch = 1;
12064         hpts_calling = inp->inp_hpts_calls;
12065 #ifdef KERN_TLS
12066         hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
12067 #endif
12068
12069         NET_EPOCH_ASSERT();
12070         INP_WLOCK_ASSERT(inp);
12071 #ifdef TCP_OFFLOAD
12072         if (tp->t_flags & TF_TOE)
12073                 return (tcp_offload_output(tp));
12074 #endif
12075         /*
12076          * For TFO connections in SYN_RECEIVED, only allow the initial
12077          * SYN|ACK and those sent by the retransmit timer.
12078          */
12079         if (IS_FASTOPEN(tp->t_flags) &&
12080             (tp->t_state == TCPS_SYN_RECEIVED) &&
12081             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
12082             (rack->r_ctl.rc_resend == NULL))         /* not a retransmit */
12083                 return (0);
12084 #ifdef INET6
12085         if (rack->r_state) {
12086                 /* Use the cache line loaded if possible */
12087                 isipv6 = rack->r_is_v6;
12088         } else {
12089                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
12090         }
12091 #endif
12092         early = 0;
12093         us_cts = tcp_get_usecs(&tv);
12094         cts = tcp_tv_to_mssectick(&tv);
12095         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
12096             inp->inp_in_hpts) {
12097                 /*
12098                  * We are on the hpts for some timer but not hptsi output.
12099                  * Remove from the hpts unconditionally.
12100                  */
12101                 rack_timer_cancel(tp, rack, cts, __LINE__);
12102         }
12103         /* Are we pacing and late? */
12104         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
12105             TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) {
12106                 /* We are delayed */
12107                 delayed = us_cts - rack->r_ctl.rc_last_output_to;
12108         } else {
12109                 delayed = 0;
12110         }
12111         /* Do the timers, which may override the pacer  */
12112         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
12113                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
12114                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
12115                         return (0);
12116                 }
12117         }
12118         if ((rack->r_timer_override) ||
12119             (delayed) ||
12120             (tp->t_state < TCPS_ESTABLISHED)) {
12121                 if (tp->t_inpcb->inp_in_hpts)
12122                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
12123         } else if (tp->t_inpcb->inp_in_hpts) {
12124                 /*
12125                  * On the hpts you can't pass even if ACKNOW is on, we will
12126                  * when the hpts fires.
12127                  */
12128                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
12129                 return (0);
12130         }
12131         inp->inp_hpts_calls = 0;
12132         /* Finish out both pacing early and late accounting */
12133         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
12134             TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
12135                 early = rack->r_ctl.rc_last_output_to - us_cts;
12136         } else
12137                 early = 0;
12138         if (delayed) {
12139                 rack->r_ctl.rc_agg_delayed += delayed;
12140                 rack->r_late = 1;
12141         } else if (early) {
12142                 rack->r_ctl.rc_agg_early += early;
12143                 rack->r_early = 1;
12144         }
12145         /* Now that early/late accounting is done turn off the flag */
12146         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
12147         rack->r_wanted_output = 0;
12148         rack->r_timer_override = 0;
12149         /*
12150          * For TFO connections in SYN_SENT or SYN_RECEIVED,
12151          * only allow the initial SYN or SYN|ACK and those sent
12152          * by the retransmit timer.
12153          */
12154         if (IS_FASTOPEN(tp->t_flags) &&
12155             ((tp->t_state == TCPS_SYN_RECEIVED) ||
12156              (tp->t_state == TCPS_SYN_SENT)) &&
12157             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
12158             (tp->t_rxtshift == 0)) {              /* not a retransmit */
12159                 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
12160                 goto just_return_nolock;
12161         }
12162         /*
12163          * Determine length of data that should be transmitted, and flags
12164          * that will be used. If there is some data or critical controls
12165          * (SYN, RST) to send, then transmit; otherwise, investigate
12166          * further.
12167          */
12168         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
12169         if (tp->t_idle_reduce) {
12170                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
12171                         rack_cc_after_idle(rack, tp);
12172         }
12173         tp->t_flags &= ~TF_LASTIDLE;
12174         if (idle) {
12175                 if (tp->t_flags & TF_MORETOCOME) {
12176                         tp->t_flags |= TF_LASTIDLE;
12177                         idle = 0;
12178                 }
12179         }
12180         if ((tp->snd_una == tp->snd_max) &&
12181             rack->r_ctl.rc_went_idle_time &&
12182             TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) {
12183                 idle = us_cts - rack->r_ctl.rc_went_idle_time;
12184                 if (idle > rack_min_probertt_hold) {
12185                         /* Count as a probe rtt */
12186                         if (rack->in_probe_rtt == 0) {
12187                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
12188                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
12189                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
12190                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
12191                         } else {
12192                                 rack_exit_probertt(rack, us_cts);
12193                         }
12194                 }
12195                 idle = 0;
12196         }
12197 again:
12198         /*
12199          * If we've recently taken a timeout, snd_max will be greater than
12200          * snd_nxt.  There may be SACK information that allows us to avoid
12201          * resending already delivered data.  Adjust snd_nxt accordingly.
12202          */
12203         sendalot = 0;
12204         us_cts = tcp_get_usecs(&tv);
12205         cts = tcp_tv_to_mssectick(&tv);
12206         tso = 0;
12207         mtu = 0;
12208         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
12209         if (so->so_snd.sb_flags & SB_TLS_IFNET) {
12210                 minseg = rack->r_ctl.rc_pace_min_segs;
12211         } else {
12212                 minseg = segsiz;
12213         }
12214         sb_offset = tp->snd_max - tp->snd_una;
12215         cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
12216 #ifdef NETFLIX_SHARED_CWND
12217         if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
12218             rack->rack_enable_scwnd) {
12219                 /* We are doing cwnd sharing */
12220                 if (rack->rc_gp_filled &&
12221                     (rack->rack_attempted_scwnd == 0) &&
12222                     (rack->r_ctl.rc_scw == NULL) &&
12223                     tp->t_lib) {
12224                         /* The pcbid is in, lets make an attempt */
12225                         counter_u64_add(rack_try_scwnd, 1);
12226                         rack->rack_attempted_scwnd = 1;
12227                         rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
12228                                                                    &rack->r_ctl.rc_scw_index,
12229                                                                    segsiz);
12230                 }
12231                 if (rack->r_ctl.rc_scw &&
12232                     (rack->rack_scwnd_is_idle == 1) &&
12233                     (rack->rc_in_persist == 0) &&
12234                     sbavail(sb)) {
12235                         /* we are no longer out of data */
12236                         tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
12237                         rack->rack_scwnd_is_idle = 0;
12238                 }
12239                 if (rack->r_ctl.rc_scw) {
12240                         /* First lets update and get the cwnd */
12241                         rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
12242                                                                     rack->r_ctl.rc_scw_index,
12243                                                                     tp->snd_cwnd, tp->snd_wnd, segsiz);
12244                 }
12245         }
12246 #endif
12247         flags = tcp_outflags[tp->t_state];
12248         while (rack->rc_free_cnt < rack_free_cache) {
12249                 rsm = rack_alloc(rack);
12250                 if (rsm == NULL) {
12251                         if (inp->inp_hpts_calls)
12252                                 /* Retry in a ms */
12253                                 slot = (1 * HPTS_USEC_IN_MSEC);
12254                         goto just_return_nolock;
12255                 }
12256                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
12257                 rack->rc_free_cnt++;
12258                 rsm = NULL;
12259         }
12260         if (inp->inp_hpts_calls)
12261                 inp->inp_hpts_calls = 0;
12262         sack_rxmit = 0;
12263         len = 0;
12264         rsm = NULL;
12265         if (flags & TH_RST) {
12266                 SOCKBUF_LOCK(sb);
12267                 goto send;
12268         }
12269         if (rack->r_ctl.rc_resend) {
12270                 /* Retransmit timer */
12271                 rsm = rack->r_ctl.rc_resend;
12272                 rack->r_ctl.rc_resend = NULL;
12273                 rsm->r_flags &= ~RACK_TLP;
12274                 len = rsm->r_end - rsm->r_start;
12275                 sack_rxmit = 1;
12276                 sendalot = 0;
12277                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
12278                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
12279                          __func__, __LINE__,
12280                          rsm->r_start, tp->snd_una, tp, rack, rsm));
12281                 sb_offset = rsm->r_start - tp->snd_una;
12282                 if (len >= segsiz)
12283                         len = segsiz;
12284         } else if ((rack->rc_in_persist == 0) &&
12285                    ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
12286                 /* We have a retransmit that takes precedence */
12287                 rsm->r_flags &= ~RACK_TLP;
12288                 if ((!IN_RECOVERY(tp->t_flags)) &&
12289                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
12290                         /* Enter recovery if not induced by a time-out */
12291                         rack->r_ctl.rc_rsm_start = rsm->r_start;
12292                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
12293                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
12294                         rack_cong_signal(tp, NULL, CC_NDUPACK);
12295                         /*
12296                          * When we enter recovery we need to assure we send
12297                          * one packet.
12298                          */
12299                         if (rack->rack_no_prr == 0) {
12300                                 rack->r_ctl.rc_prr_sndcnt = segsiz;
12301                                 rack_log_to_prr(rack, 13, 0);
12302                         }
12303                 }
12304 #ifdef INVARIANTS
12305                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
12306                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
12307                               tp, rack, rsm, rsm->r_start, tp->snd_una);
12308                 }
12309 #endif
12310                 len = rsm->r_end - rsm->r_start;
12311                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
12312                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
12313                          __func__, __LINE__,
12314                          rsm->r_start, tp->snd_una, tp, rack, rsm));
12315                 sb_offset = rsm->r_start - tp->snd_una;
12316                 /* Can we send it within the PRR boundary? */
12317                 if (rack->rack_no_prr == 0) {
12318                         if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
12319                                 /* It does not fit */
12320                                 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
12321                                     (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
12322                                         /*
12323                                          * prr is less than a segment, we
12324                                          * have more acks due in besides
12325                                          * what we need to resend. Lets not send
12326                                          * to avoid sending small pieces of
12327                                          * what we need to retransmit.
12328                                          */
12329                                         len = 0;
12330                                         goto just_return_nolock;
12331                                 }
12332                                 len = rack->r_ctl.rc_prr_sndcnt;
12333                         }
12334                 }
12335                 sendalot = 0;
12336                 if (len >= segsiz)
12337                         len = segsiz;
12338                 if (len > 0) {
12339                         sub_from_prr = 1;
12340                         sack_rxmit = 1;
12341                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
12342                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
12343                             min(len, segsiz));
12344                         counter_u64_add(rack_rtm_prr_retran, 1);
12345                 }
12346         } else  if (rack->r_ctl.rc_tlpsend) {
12347                 /* Tail loss probe */
12348                 long cwin;
12349                 long tlen;
12350
12351                 doing_tlp = 1;
12352                 /*
12353                  * Check if we can do a TLP with a RACK'd packet
12354                  * this can happen if we are not doing the rack
12355                  * cheat and we skipped to a TLP and it
12356                  * went off.
12357                  */
12358                 rsm = rack->r_ctl.rc_tlpsend;
12359                 rsm->r_flags |= RACK_TLP;
12360                 rack->r_ctl.rc_tlpsend = NULL;
12361                 sack_rxmit = 1;
12362                 tlen = rsm->r_end - rsm->r_start;
12363                 if (tlen > segsiz)
12364                         tlen = segsiz;
12365                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
12366                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
12367                          __func__, __LINE__,
12368                          rsm->r_start, tp->snd_una, tp, rack, rsm));
12369                 sb_offset = rsm->r_start - tp->snd_una;
12370                 cwin = min(tp->snd_wnd, tlen);
12371                 len = cwin;
12372         }
12373         /*
12374          * Enforce a connection sendmap count limit if set
12375          * as long as we are not retransmiting.
12376          */
12377         if ((rsm == NULL) &&
12378             (rack->do_detection == 0) &&
12379             (V_tcp_map_entries_limit > 0) &&
12380             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
12381                 counter_u64_add(rack_to_alloc_limited, 1);
12382                 if (!rack->alloc_limit_reported) {
12383                         rack->alloc_limit_reported = 1;
12384                         counter_u64_add(rack_alloc_limited_conns, 1);
12385                 }
12386                 goto just_return_nolock;
12387         }
12388         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
12389                 /* we are retransmitting the fin */
12390                 len--;
12391                 if (len) {
12392                         /*
12393                          * When retransmitting data do *not* include the
12394                          * FIN. This could happen from a TLP probe.
12395                          */
12396                         flags &= ~TH_FIN;
12397                 }
12398         }
12399 #ifdef INVARIANTS
12400         /* For debugging */
12401         rack->r_ctl.rc_rsm_at_retran = rsm;
12402 #endif
12403         /*
12404          * Get standard flags, and add SYN or FIN if requested by 'hidden'
12405          * state flags.
12406          */
12407         if (tp->t_flags & TF_NEEDFIN)
12408                 flags |= TH_FIN;
12409         if (tp->t_flags & TF_NEEDSYN)
12410                 flags |= TH_SYN;
12411         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
12412                 void *end_rsm;
12413                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
12414                 if (end_rsm)
12415                         kern_prefetch(end_rsm, &prefetch_rsm);
12416                 prefetch_rsm = 1;
12417         }
12418         SOCKBUF_LOCK(sb);
12419         /*
12420          * If snd_nxt == snd_max and we have transmitted a FIN, the
12421          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
12422          * negative length.  This can also occur when TCP opens up its
12423          * congestion window while receiving additional duplicate acks after
12424          * fast-retransmit because TCP will reset snd_nxt to snd_max after
12425          * the fast-retransmit.
12426          *
12427          * In the normal retransmit-FIN-only case, however, snd_nxt will be
12428          * set to snd_una, the sb_offset will be 0, and the length may wind
12429          * up 0.
12430          *
12431          * If sack_rxmit is true we are retransmitting from the scoreboard
12432          * in which case len is already set.
12433          */
12434         if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
12435                 uint32_t avail;
12436
12437                 avail = sbavail(sb);
12438                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
12439                         sb_offset = tp->snd_nxt - tp->snd_una;
12440                 else
12441                         sb_offset = 0;
12442                 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
12443                         if (rack->r_ctl.rc_tlp_new_data) {
12444                                 /* TLP is forcing out new data */
12445                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
12446                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
12447                                 }
12448                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
12449                                         len = tp->snd_wnd;
12450                                 else
12451                                         len = rack->r_ctl.rc_tlp_new_data;
12452                                 rack->r_ctl.rc_tlp_new_data = 0;
12453                                 new_data_tlp = doing_tlp = 1;
12454                         }  else
12455                                 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
12456                         if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) {
12457                                 /*
12458                                  * For prr=off, we need to send only 1 MSS
12459                                  * at a time. We do this because another sack could
12460                                  * be arriving that causes us to send retransmits and
12461                                  * we don't want to be on a long pace due to a larger send
12462                                  * that keeps us from sending out the retransmit.
12463                                  */
12464                                 len = segsiz;
12465                         }
12466                 } else {
12467                         uint32_t outstanding;
12468
12469                         /*
12470                          * We are inside of a SACK recovery episode and are
12471                          * sending new data, having retransmitted all the
12472                          * data possible so far in the scoreboard.
12473                          */
12474                         outstanding = tp->snd_max - tp->snd_una;
12475                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
12476                                 if (tp->snd_wnd > outstanding) {
12477                                         len = tp->snd_wnd - outstanding;
12478                                         /* Check to see if we have the data */
12479                                         if ((sb_offset + len) > avail) {
12480                                                 /* It does not all fit */
12481                                                 if (avail > sb_offset)
12482                                                         len = avail - sb_offset;
12483                                                 else
12484                                                         len = 0;
12485                                         }
12486                                 } else
12487                                         len = 0;
12488                         } else if (avail > sb_offset)
12489                                 len = avail - sb_offset;
12490                         else
12491                                 len = 0;
12492                         if (len > 0) {
12493                                 if (len > rack->r_ctl.rc_prr_sndcnt)
12494                                         len = rack->r_ctl.rc_prr_sndcnt;
12495                                 if (len > 0) {
12496                                         sub_from_prr = 1;
12497                                         counter_u64_add(rack_rtm_prr_newdata, 1);
12498                                 }
12499                         }
12500                         if (len > segsiz) {
12501                                 /*
12502                                  * We should never send more than a MSS when
12503                                  * retransmitting or sending new data in prr
12504                                  * mode unless the override flag is on. Most
12505                                  * likely the PRR algorithm is not going to
12506                                  * let us send a lot as well :-)
12507                                  */
12508                                 if (rack->r_ctl.rc_prr_sendalot == 0)
12509                                         len = segsiz;
12510                         } else if (len < segsiz) {
12511                                 /*
12512                                  * Do we send any? The idea here is if the
12513                                  * send empties the socket buffer we want to
12514                                  * do it. However if not then let's just wait
12515                                  * for our prr_sndcnt to get bigger.
12516                                  */
12517                                 long leftinsb;
12518
12519                                 leftinsb = sbavail(sb) - sb_offset;
12520                                 if (leftinsb > len) {
12521                                         /* This send does not empty the sb */
12522                                         len = 0;
12523                                 }
12524                         }
12525                 }
12526         } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
12527                 /*
12528                  * If you have not established
12529                  * and are not doing FAST OPEN
12530                  * no data please.
12531                  */
12532                 if ((sack_rxmit == 0) &&
12533                     (!IS_FASTOPEN(tp->t_flags))){
12534                         len = 0;
12535                         sb_offset = 0;
12536                 }
12537         }
12538         if (prefetch_so_done == 0) {
12539                 kern_prefetch(so, &prefetch_so_done);
12540                 prefetch_so_done = 1;
12541         }
12542         /*
12543          * Lop off SYN bit if it has already been sent.  However, if this is
12544          * SYN-SENT state and if segment contains data and if we don't know
12545          * that foreign host supports TAO, suppress sending segment.
12546          */
12547         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
12548             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
12549                 /*
12550                  * When sending additional segments following a TFO SYN|ACK,
12551                  * do not include the SYN bit.
12552                  */
12553                 if (IS_FASTOPEN(tp->t_flags) &&
12554                     (tp->t_state == TCPS_SYN_RECEIVED))
12555                         flags &= ~TH_SYN;
12556         }
12557         /*
12558          * Be careful not to send data and/or FIN on SYN segments. This
12559          * measure is needed to prevent interoperability problems with not
12560          * fully conformant TCP implementations.
12561          */
12562         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
12563                 len = 0;
12564                 flags &= ~TH_FIN;
12565         }
12566         /*
12567          * On TFO sockets, ensure no data is sent in the following cases:
12568          *
12569          *  - When retransmitting SYN|ACK on a passively-created socket
12570          *
12571          *  - When retransmitting SYN on an actively created socket
12572          *
12573          *  - When sending a zero-length cookie (cookie request) on an
12574          *    actively created socket
12575          *
12576          *  - When the socket is in the CLOSED state (RST is being sent)
12577          */
12578         if (IS_FASTOPEN(tp->t_flags) &&
12579             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
12580              ((tp->t_state == TCPS_SYN_SENT) &&
12581               (tp->t_tfo_client_cookie_len == 0)) ||
12582              (flags & TH_RST))) {
12583                 sack_rxmit = 0;
12584                 len = 0;
12585         }
12586         /* Without fast-open there should never be data sent on a SYN */
12587         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
12588                 tp->snd_nxt = tp->iss;
12589                 len = 0;
12590         }
12591         orig_len = len;
12592         if (len <= 0) {
12593                 /*
12594                  * If FIN has been sent but not acked, but we haven't been
12595                  * called to retransmit, len will be < 0.  Otherwise, window
12596                  * shrank after we sent into it.  If window shrank to 0,
12597                  * cancel pending retransmit, pull snd_nxt back to (closed)
12598                  * window, and set the persist timer if it isn't already
12599                  * going.  If the window didn't close completely, just wait
12600                  * for an ACK.
12601                  *
12602                  * We also do a general check here to ensure that we will
12603                  * set the persist timer when we have data to send, but a
12604                  * 0-byte window. This makes sure the persist timer is set
12605                  * even if the packet hits one of the "goto send" lines
12606                  * below.
12607                  */
12608                 len = 0;
12609                 if ((tp->snd_wnd == 0) &&
12610                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12611                     (tp->snd_una == tp->snd_max) &&
12612                     (sb_offset < (int)sbavail(sb))) {
12613                         tp->snd_nxt = tp->snd_una;
12614                         rack_enter_persist(tp, rack, cts);
12615                 }
12616         } else if ((rsm == NULL) &&
12617                    ((doing_tlp == 0) || (new_data_tlp == 1)) &&
12618                    (len < rack->r_ctl.rc_pace_max_segs)) {
12619                 /*
12620                  * We are not sending a maximum sized segment for
12621                  * some reason. Should we not send anything (think
12622                  * sws or persists)?
12623                  */
12624                 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) &&
12625                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
12626                     (len < minseg) &&
12627                     (len < (int)(sbavail(sb) - sb_offset))) {
12628                         /*
12629                          * Here the rwnd is less than
12630                          * the minimum pacing size, this is not a retransmit,
12631                          * we are established and
12632                          * the send is not the last in the socket buffer
12633                          * we send nothing, and we may enter persists
12634                          * if nothing is outstanding.
12635                          */
12636                         len = 0;
12637                         if (tp->snd_max == tp->snd_una) {
12638                                 /*
12639                                  * Nothing out we can
12640                                  * go into persists.
12641                                  */
12642                                 rack_enter_persist(tp, rack, cts);
12643                                 tp->snd_nxt = tp->snd_una;
12644                         }
12645                 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
12646                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
12647                            (len < (int)(sbavail(sb) - sb_offset)) &&
12648                            (len < minseg)) {
12649                         /*
12650                          * Here we are not retransmitting, and
12651                          * the cwnd is not so small that we could
12652                          * not send at least a min size (rxt timer
12653                          * not having gone off), We have 2 segments or
12654                          * more already in flight, its not the tail end
12655                          * of the socket buffer  and the cwnd is blocking
12656                          * us from sending out a minimum pacing segment size.
12657                          * Lets not send anything.
12658                          */
12659                         len = 0;
12660                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
12661                             min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
12662                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
12663                            (len < (int)(sbavail(sb) - sb_offset)) &&
12664                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
12665                         /*
12666                          * Here we have a send window but we have
12667                          * filled it up and we can't send another pacing segment.
12668                          * We also have in flight more than 2 segments
12669                          * and we are not completing the sb i.e. we allow
12670                          * the last bytes of the sb to go out even if
12671                          * its not a full pacing segment.
12672                          */
12673                         len = 0;
12674                 }
12675         }
12676         /* len will be >= 0 after this point. */
12677         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
12678         tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use));
12679         /*
12680          * Decide if we can use TCP Segmentation Offloading (if supported by
12681          * hardware).
12682          *
12683          * TSO may only be used if we are in a pure bulk sending state.  The
12684          * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
12685          * options prevent using TSO.  With TSO the TCP header is the same
12686          * (except for the sequence number) for all generated packets.  This
12687          * makes it impossible to transmit any options which vary per
12688          * generated segment or packet.
12689          *
12690          * IPv4 handling has a clear separation of ip options and ip header
12691          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
12692          * the right thing below to provide length of just ip options and thus
12693          * checking for ipoptlen is enough to decide if ip options are present.
12694          */
12695
12696 #ifdef INET6
12697         if (isipv6)
12698                 ipoptlen = ip6_optlen(tp->t_inpcb);
12699         else
12700 #endif
12701                 if (tp->t_inpcb->inp_options)
12702                         ipoptlen = tp->t_inpcb->inp_options->m_len -
12703                                 offsetof(struct ipoption, ipopt_list);
12704                 else
12705                         ipoptlen = 0;
12706 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12707         /*
12708          * Pre-calculate here as we save another lookup into the darknesses
12709          * of IPsec that way and can actually decide if TSO is ok.
12710          */
12711 #ifdef INET6
12712         if (isipv6 && IPSEC_ENABLED(ipv6))
12713                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
12714 #ifdef INET
12715         else
12716 #endif
12717 #endif                          /* INET6 */
12718 #ifdef INET
12719                 if (IPSEC_ENABLED(ipv4))
12720                         ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
12721 #endif                          /* INET */
12722 #endif
12723
12724 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
12725         ipoptlen += ipsec_optlen;
12726 #endif
12727         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
12728             (tp->t_port == 0) &&
12729             ((tp->t_flags & TF_SIGNATURE) == 0) &&
12730             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
12731             ipoptlen == 0)
12732                 tso = 1;
12733         {
12734                 uint32_t outstanding;
12735
12736                 outstanding = tp->snd_max - tp->snd_una;
12737                 if (tp->t_flags & TF_SENTFIN) {
12738                         /*
12739                          * If we sent a fin, snd_max is 1 higher than
12740                          * snd_una
12741                          */
12742                         outstanding--;
12743                 }
12744                 if (sack_rxmit) {
12745                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
12746                                 flags &= ~TH_FIN;
12747                 } else {
12748                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
12749                                    sbused(sb)))
12750                                 flags &= ~TH_FIN;
12751                 }
12752         }
12753         recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
12754             (long)TCP_MAXWIN << tp->rcv_scale);
12755
12756         /*
12757          * Sender silly window avoidance.   We transmit under the following
12758          * conditions when len is non-zero:
12759          *
12760          * - We have a full segment (or more with TSO) - This is the last
12761          * buffer in a write()/send() and we are either idle or running
12762          * NODELAY - we've timed out (e.g. persist timer) - we have more
12763          * than 1/2 the maximum send window's worth of data (receiver may be
12764          * limited the window size) - we need to retransmit
12765          */
12766         if (len) {
12767                 if (len >= segsiz) {
12768                         goto send;
12769                 }
12770                 /*
12771                  * NOTE! on localhost connections an 'ack' from the remote
12772                  * end may occur synchronously with the output and cause us
12773                  * to flush a buffer queued with moretocome.  XXX
12774                  *
12775                  */
12776                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
12777                     (idle || (tp->t_flags & TF_NODELAY)) &&
12778                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
12779                     (tp->t_flags & TF_NOPUSH) == 0) {
12780                         pass = 2;
12781                         goto send;
12782                 }
12783                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
12784                         pass = 22;
12785                         goto send;
12786                 }
12787                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
12788                         pass = 4;
12789                         goto send;
12790                 }
12791                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
12792                         pass = 5;
12793                         goto send;
12794                 }
12795                 if (sack_rxmit) {
12796                         pass = 6;
12797                         goto send;
12798                 }
12799                 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
12800                     (ctf_outstanding(tp) < (segsiz * 2))) {
12801                         /*
12802                          * We have less than two MSS outstanding (delayed ack)
12803                          * and our rwnd will not let us send a full sized
12804                          * MSS. Lets go ahead and let this small segment
12805                          * out because we want to try to have at least two
12806                          * packets inflight to not be caught by delayed ack.
12807                          */
12808                         pass = 12;
12809                         goto send;
12810                 }
12811         }
12812         /*
12813          * Sending of standalone window updates.
12814          *
12815          * Window updates are important when we close our window due to a
12816          * full socket buffer and are opening it again after the application
12817          * reads data from it.  Once the window has opened again and the
12818          * remote end starts to send again the ACK clock takes over and
12819          * provides the most current window information.
12820          *
12821          * We must avoid the silly window syndrome whereas every read from
12822          * the receive buffer, no matter how small, causes a window update
12823          * to be sent.  We also should avoid sending a flurry of window
12824          * updates when the socket buffer had queued a lot of data and the
12825          * application is doing small reads.
12826          *
12827          * Prevent a flurry of pointless window updates by only sending an
12828          * update when we can increase the advertized window by more than
12829          * 1/4th of the socket buffer capacity.  When the buffer is getting
12830          * full or is very small be more aggressive and send an update
12831          * whenever we can increase by two mss sized segments. In all other
12832          * situations the ACK's to new incoming data will carry further
12833          * window increases.
12834          *
12835          * Don't send an independent window update if a delayed ACK is
12836          * pending (it will get piggy-backed on it) or the remote side
12837          * already has done a half-close and won't send more data.  Skip
12838          * this if the connection is in T/TCP half-open state.
12839          */
12840         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
12841             !(tp->t_flags & TF_DELACK) &&
12842             !TCPS_HAVERCVDFIN(tp->t_state)) {
12843                 /*
12844                  * "adv" is the amount we could increase the window, taking
12845                  * into account that we are limited by TCP_MAXWIN <<
12846                  * tp->rcv_scale.
12847                  */
12848                 int32_t adv;
12849                 int oldwin;
12850
12851                 adv = recwin;
12852                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
12853                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
12854                         if (adv > oldwin)
12855                             adv -= oldwin;
12856                         else {
12857                                 /* We can't increase the window */
12858                                 adv = 0;
12859                         }
12860                 } else
12861                         oldwin = 0;
12862
12863                 /*
12864                  * If the new window size ends up being the same as or less
12865                  * than the old size when it is scaled, then don't force
12866                  * a window update.
12867                  */
12868                 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
12869                         goto dontupdate;
12870
12871                 if (adv >= (int32_t)(2 * segsiz) &&
12872                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
12873                      recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
12874                      so->so_rcv.sb_hiwat <= 8 * segsiz)) {
12875                         pass = 7;
12876                         goto send;
12877                 }
12878                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
12879                         pass = 23;
12880                         goto send;
12881                 }
12882         }
12883 dontupdate:
12884
12885         /*
12886          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
12887          * is also a catch-all for the retransmit timer timeout case.
12888          */
12889         if (tp->t_flags & TF_ACKNOW) {
12890                 pass = 8;
12891                 goto send;
12892         }
12893         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
12894                 pass = 9;
12895                 goto send;
12896         }
12897         /*
12898          * If our state indicates that FIN should be sent and we have not
12899          * yet done so, then we need to send.
12900          */
12901         if ((flags & TH_FIN) &&
12902             (tp->snd_nxt == tp->snd_una)) {
12903                 pass = 11;
12904                 goto send;
12905         }
12906         /*
12907          * No reason to send a segment, just return.
12908          */
12909 just_return:
12910         SOCKBUF_UNLOCK(sb);
12911 just_return_nolock:
12912         {
12913                 int app_limited = CTF_JR_SENT_DATA;
12914
12915                 if (tot_len_this_send > 0) {
12916                         /* Make sure snd_nxt is up to max */
12917                         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
12918                                 tp->snd_nxt = tp->snd_max;
12919                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
12920                 } else {
12921                         int end_window = 0;
12922                         uint32_t seq = tp->gput_ack;
12923
12924                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
12925                         if (rsm) {
12926                                 /*
12927                                  * Mark the last sent that we just-returned (hinting
12928                                  * that delayed ack may play a role in any rtt measurement).
12929                                  */
12930                                 rsm->r_just_ret = 1;
12931                         }
12932                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
12933                         rack->r_ctl.rc_agg_delayed = 0;
12934                         rack->r_early = 0;
12935                         rack->r_late = 0;
12936                         rack->r_ctl.rc_agg_early = 0;
12937                         if ((ctf_outstanding(tp) +
12938                              min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
12939                                  minseg)) >= tp->snd_wnd) {
12940                                 /* We are limited by the rwnd */
12941                                 app_limited = CTF_JR_RWND_LIMITED;
12942                         } else if (ctf_outstanding(tp) >= sbavail(sb)) {
12943                                 /* We are limited by whats available -- app limited */
12944                                 app_limited = CTF_JR_APP_LIMITED;
12945                         } else if ((idle == 0) &&
12946                                    ((tp->t_flags & TF_NODELAY) == 0) &&
12947                                    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
12948                                    (len < segsiz)) {
12949                                 /*
12950                                  * No delay is not on and the
12951                                  * user is sending less than 1MSS. This
12952                                  * brings out SWS avoidance so we
12953                                  * don't send. Another app-limited case.
12954                                  */
12955                                 app_limited = CTF_JR_APP_LIMITED;
12956                         } else if (tp->t_flags & TF_NOPUSH) {
12957                                 /*
12958                                  * The user has requested no push of
12959                                  * the last segment and we are
12960                                  * at the last segment. Another app
12961                                  * limited case.
12962                                  */
12963                                 app_limited = CTF_JR_APP_LIMITED;
12964                         } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
12965                                 /* Its the cwnd */
12966                                 app_limited = CTF_JR_CWND_LIMITED;
12967                         } else if (rack->rc_in_persist == 1) {
12968                                 /* We are in persists */
12969                                 app_limited = CTF_JR_PERSISTS;
12970                         } else if (IN_RECOVERY(tp->t_flags) &&
12971                                    (rack->rack_no_prr == 0) &&
12972                                    (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
12973                                 app_limited = CTF_JR_PRR;
12974                         } else {
12975                                 /* Now why here are we not sending? */
12976 #ifdef NOW
12977 #ifdef INVARIANTS
12978                                 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
12979 #endif
12980 #endif
12981                                 app_limited = CTF_JR_ASSESSING;
12982                         }
12983                         /*
12984                          * App limited in some fashion, for our pacing GP
12985                          * measurements we don't want any gap (even cwnd).
12986                          * Close  down the measurement window.
12987                          */
12988                         if (rack_cwnd_block_ends_measure &&
12989                             ((app_limited == CTF_JR_CWND_LIMITED) ||
12990                              (app_limited == CTF_JR_PRR))) {
12991                                 /*
12992                                  * The reason we are not sending is
12993                                  * the cwnd (or prr). We have been configured
12994                                  * to end the measurement window in
12995                                  * this case.
12996                                  */
12997                                 end_window = 1;
12998                         } else if (app_limited == CTF_JR_PERSISTS) {
12999                                 /*
13000                                  * We never end the measurement window
13001                                  * in persists, though in theory we
13002                                  * should be only entering after everything
13003                                  * is acknowledged (so we will probably
13004                                  * never come here).
13005                                  */
13006                                 end_window = 0;
13007                         } else if (rack_rwnd_block_ends_measure &&
13008                                    (app_limited == CTF_JR_RWND_LIMITED)) {
13009                                 /*
13010                                  * We are rwnd limited and have been
13011                                  * configured to end the measurement
13012                                  * window in this case.
13013                                  */
13014                                 end_window = 1;
13015                         } else if (app_limited == CTF_JR_APP_LIMITED) {
13016                                 /*
13017                                  * A true application limited period, we have
13018                                  * run out of data.
13019                                  */
13020                                 end_window = 1;
13021                         } else if (app_limited == CTF_JR_ASSESSING) {
13022                                 /*
13023                                  * In the assessing case we hit the end of
13024                                  * the if/else and had no known reason
13025                                  * This will panic us under invariants..
13026                                  *
13027                                  * If we get this out in logs we need to
13028                                  * investigate which reason we missed.
13029                                  */
13030                                 end_window = 1;
13031                         }
13032                         if (end_window) {
13033                                 uint8_t log = 0;
13034
13035                                 if ((tp->t_flags & TF_GPUTINPROG) &&
13036                                     SEQ_GT(tp->gput_ack, tp->snd_max)) {
13037                                         /* Mark the last packet as app limited */
13038                                         tp->gput_ack = tp->snd_max;
13039                                         log = 1;
13040                                 }
13041                                 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
13042                                 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
13043                                         if (rack->r_ctl.rc_app_limited_cnt == 0)
13044                                                 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
13045                                         else {
13046                                                 /*
13047                                                  * Go out to the end app limited and mark
13048                                                  * this new one as next and move the end_appl up
13049                                                  * to this guy.
13050                                                  */
13051                                                 if (rack->r_ctl.rc_end_appl)
13052                                                         rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
13053                                                 rack->r_ctl.rc_end_appl = rsm;
13054                                         }
13055                                         rsm->r_flags |= RACK_APP_LIMITED;
13056                                         rack->r_ctl.rc_app_limited_cnt++;
13057                                 }
13058                                 if (log)
13059                                         rack_log_pacing_delay_calc(rack,
13060                                                                    rack->r_ctl.rc_app_limited_cnt, seq,
13061                                                                    tp->gput_ack, 0, 0, 4, __LINE__, NULL);
13062                         }
13063                 }
13064                 if (slot) {
13065                         /* set the rack tcb into the slot N */
13066                         counter_u64_add(rack_paced_segments, 1);
13067                 } else if (tot_len_this_send) {
13068                         counter_u64_add(rack_unpaced_segments, 1);
13069                 }
13070                 /* Check if we need to go into persists or not */
13071                 if ((rack->rc_in_persist == 0) &&
13072                     (tp->snd_max == tp->snd_una) &&
13073                     TCPS_HAVEESTABLISHED(tp->t_state) &&
13074                     sbavail(sb) &&
13075                     (sbavail(sb) > tp->snd_wnd) &&
13076                     (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
13077                         /* Yes lets make sure to move to persist before timer-start */
13078                         rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
13079                 }
13080                 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
13081                 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
13082         }
13083 #ifdef NETFLIX_SHARED_CWND
13084         if ((sbavail(sb) == 0) &&
13085             rack->r_ctl.rc_scw) {
13086                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
13087                 rack->rack_scwnd_is_idle = 1;
13088         }
13089 #endif
13090         return (0);
13091
13092 send:
13093         if ((flags & TH_FIN) &&
13094             sbavail(sb)) {
13095                 /*
13096                  * We do not transmit a FIN
13097                  * with data outstanding. We
13098                  * need to make it so all data
13099                  * is acked first.
13100                  */
13101                 flags &= ~TH_FIN;
13102         }
13103         /* Enforce stack imposed max seg size if we have one */
13104         if (rack->r_ctl.rc_pace_max_segs &&
13105             (len > rack->r_ctl.rc_pace_max_segs)) {
13106                 mark = 1;
13107                 len = rack->r_ctl.rc_pace_max_segs;
13108         }
13109         SOCKBUF_LOCK_ASSERT(sb);
13110         if (len > 0) {
13111                 if (len >= segsiz)
13112                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
13113                 else
13114                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
13115         }
13116         /*
13117          * Before ESTABLISHED, force sending of initial options unless TCP
13118          * set not to do any options. NOTE: we assume that the IP/TCP header
13119          * plus TCP options always fit in a single mbuf, leaving room for a
13120          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
13121          * + optlen <= MCLBYTES
13122          */
13123         optlen = 0;
13124 #ifdef INET6
13125         if (isipv6)
13126                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
13127         else
13128 #endif
13129                 hdrlen = sizeof(struct tcpiphdr);
13130
13131         /*
13132          * Compute options for segment. We only have to care about SYN and
13133          * established connection segments.  Options for SYN-ACK segments
13134          * are handled in TCP syncache.
13135          */
13136         to.to_flags = 0;
13137         if ((tp->t_flags & TF_NOOPT) == 0) {
13138                 /* Maximum segment size. */
13139                 if (flags & TH_SYN) {
13140                         tp->snd_nxt = tp->iss;
13141                         to.to_mss = tcp_mssopt(&inp->inp_inc);
13142 #ifdef NETFLIX_TCPOUDP
13143                         if (tp->t_port)
13144                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
13145 #endif
13146                         to.to_flags |= TOF_MSS;
13147
13148                         /*
13149                          * On SYN or SYN|ACK transmits on TFO connections,
13150                          * only include the TFO option if it is not a
13151                          * retransmit, as the presence of the TFO option may
13152                          * have caused the original SYN or SYN|ACK to have
13153                          * been dropped by a middlebox.
13154                          */
13155                         if (IS_FASTOPEN(tp->t_flags) &&
13156                             (tp->t_rxtshift == 0)) {
13157                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
13158                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
13159                                         to.to_tfo_cookie =
13160                                                 (u_int8_t *)&tp->t_tfo_cookie.server;
13161                                         to.to_flags |= TOF_FASTOPEN;
13162                                         wanted_cookie = 1;
13163                                 } else if (tp->t_state == TCPS_SYN_SENT) {
13164                                         to.to_tfo_len =
13165                                                 tp->t_tfo_client_cookie_len;
13166                                         to.to_tfo_cookie =
13167                                                 tp->t_tfo_cookie.client;
13168                                         to.to_flags |= TOF_FASTOPEN;
13169                                         wanted_cookie = 1;
13170                                         /*
13171                                          * If we wind up having more data to
13172                                          * send with the SYN than can fit in
13173                                          * one segment, don't send any more
13174                                          * until the SYN|ACK comes back from
13175                                          * the other end.
13176                                          */
13177                                         sendalot = 0;
13178                                 }
13179                         }
13180                 }
13181                 /* Window scaling. */
13182                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
13183                         to.to_wscale = tp->request_r_scale;
13184                         to.to_flags |= TOF_SCALE;
13185                 }
13186                 /* Timestamps. */
13187                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
13188                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
13189                         to.to_tsval = cts + tp->ts_offset;
13190                         to.to_tsecr = tp->ts_recent;
13191                         to.to_flags |= TOF_TS;
13192                 }
13193                 /* Set receive buffer autosizing timestamp. */
13194                 if (tp->rfbuf_ts == 0 &&
13195                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
13196                         tp->rfbuf_ts = tcp_ts_getticks();
13197                 /* Selective ACK's. */
13198                 if (flags & TH_SYN)
13199                         to.to_flags |= TOF_SACKPERM;
13200                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
13201                          tp->rcv_numsacks > 0) {
13202                         to.to_flags |= TOF_SACK;
13203                         to.to_nsacks = tp->rcv_numsacks;
13204                         to.to_sacks = (u_char *)tp->sackblks;
13205                 }
13206 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13207                 /* TCP-MD5 (RFC2385). */
13208                 if (tp->t_flags & TF_SIGNATURE)
13209                         to.to_flags |= TOF_SIGNATURE;
13210 #endif                          /* TCP_SIGNATURE */
13211
13212                 /* Processing the options. */
13213                 hdrlen += optlen = tcp_addoptions(&to, opt);
13214                 /*
13215                  * If we wanted a TFO option to be added, but it was unable
13216                  * to fit, ensure no data is sent.
13217                  */
13218                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
13219                     !(to.to_flags & TOF_FASTOPEN))
13220                         len = 0;
13221         }
13222 #ifdef NETFLIX_TCPOUDP
13223         if (tp->t_port) {
13224                 if (V_tcp_udp_tunneling_port == 0) {
13225                         /* The port was removed?? */
13226                         SOCKBUF_UNLOCK(&so->so_snd);
13227                         return (EHOSTUNREACH);
13228                 }
13229                 hdrlen += sizeof(struct udphdr);
13230         }
13231 #endif
13232 #ifdef INET6
13233         if (isipv6)
13234                 ipoptlen = ip6_optlen(tp->t_inpcb);
13235         else
13236 #endif
13237                 if (tp->t_inpcb->inp_options)
13238                         ipoptlen = tp->t_inpcb->inp_options->m_len -
13239                                 offsetof(struct ipoption, ipopt_list);
13240                 else
13241                         ipoptlen = 0;
13242 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
13243         ipoptlen += ipsec_optlen;
13244 #endif
13245
13246 #ifdef KERN_TLS
13247         /* Force TSO so that TLS offload can get the mss */
13248         if (sb->sb_flags & SB_TLS_IFNET) {
13249                 force_tso = 1;
13250         }
13251 #endif
13252         /*
13253          * Adjust data length if insertion of options will bump the packet
13254          * length beyond the t_maxseg length. Clear the FIN bit because we
13255          * cut off the tail of the segment.
13256          */
13257         if (len + optlen + ipoptlen > tp->t_maxseg) {
13258                 if (tso) {
13259                         uint32_t if_hw_tsomax;
13260                         uint32_t moff;
13261                         int32_t max_len;
13262
13263                         /* extract TSO information */
13264                         if_hw_tsomax = tp->t_tsomax;
13265                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
13266                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
13267                         KASSERT(ipoptlen == 0,
13268                                 ("%s: TSO can't do IP options", __func__));
13269
13270                         /*
13271                          * Check if we should limit by maximum payload
13272                          * length:
13273                          */
13274                         if (if_hw_tsomax != 0) {
13275                                 /* compute maximum TSO length */
13276                                 max_len = (if_hw_tsomax - hdrlen -
13277                                            max_linkhdr);
13278                                 if (max_len <= 0) {
13279                                         len = 0;
13280                                 } else if (len > max_len) {
13281                                         sendalot = 1;
13282                                         len = max_len;
13283                                         mark = 2;
13284                                 }
13285                         }
13286                         /*
13287                          * Prevent the last segment from being fractional
13288                          * unless the send sockbuf can be emptied:
13289                          */
13290                         max_len = (tp->t_maxseg - optlen);
13291                         if (((sb_offset + len) < sbavail(sb)) &&
13292                             (hw_tls == 0)) {
13293                                 moff = len % (u_int)max_len;
13294                                 if (moff != 0) {
13295                                         mark = 3;
13296                                         len -= moff;
13297                                 }
13298                         }
13299                         /*
13300                          * In case there are too many small fragments don't
13301                          * use TSO:
13302                          */
13303                         if (len <= segsiz) {
13304                                 mark = 4;
13305                                 tso = 0;
13306                         }
13307                         /*
13308                          * Send the FIN in a separate segment after the bulk
13309                          * sending is done. We don't trust the TSO
13310                          * implementations to clear the FIN flag on all but
13311                          * the last segment.
13312                          */
13313                         if (tp->t_flags & TF_NEEDFIN) {
13314                                 sendalot = 4;
13315                         }
13316                 } else {
13317                         mark = 5;
13318                         if (optlen + ipoptlen >= tp->t_maxseg) {
13319                                 /*
13320                                  * Since we don't have enough space to put
13321                                  * the IP header chain and the TCP header in
13322                                  * one packet as required by RFC 7112, don't
13323                                  * send it. Also ensure that at least one
13324                                  * byte of the payload can be put into the
13325                                  * TCP segment.
13326                                  */
13327                                 SOCKBUF_UNLOCK(&so->so_snd);
13328                                 error = EMSGSIZE;
13329                                 sack_rxmit = 0;
13330                                 goto out;
13331                         }
13332                         len = tp->t_maxseg - optlen - ipoptlen;
13333                         sendalot = 5;
13334                 }
13335         } else {
13336                 tso = 0;
13337                 mark = 6;
13338         }
13339         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
13340                 ("%s: len > IP_MAXPACKET", __func__));
13341 #ifdef DIAGNOSTIC
13342 #ifdef INET6
13343         if (max_linkhdr + hdrlen > MCLBYTES)
13344 #else
13345                 if (max_linkhdr + hdrlen > MHLEN)
13346 #endif
13347                         panic("tcphdr too big");
13348 #endif
13349
13350         /*
13351          * This KASSERT is here to catch edge cases at a well defined place.
13352          * Before, those had triggered (random) panic conditions further
13353          * down.
13354          */
13355         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
13356         if ((len == 0) &&
13357             (flags & TH_FIN) &&
13358             (sbused(sb))) {
13359                 /*
13360                  * We have outstanding data, don't send a FIN by itself!
13361                  */
13362                 goto just_return;
13363         }
13364         /*
13365          * Grab a header mbuf, attaching a copy of data to be transmitted,
13366          * and initialize the header from the template for sends on this
13367          * connection.
13368          */
13369         if (len) {
13370                 uint32_t max_val;
13371                 uint32_t moff;
13372
13373                 if (rack->r_ctl.rc_pace_max_segs)
13374                         max_val = rack->r_ctl.rc_pace_max_segs;
13375                 else if (rack->rc_user_set_max_segs)
13376                         max_val = rack->rc_user_set_max_segs * segsiz;
13377                 else
13378                         max_val = len;
13379                 /*
13380                  * We allow a limit on sending with hptsi.
13381                  */
13382                 if (len > max_val) {
13383                         mark = 7;
13384                         len = max_val;
13385                 }
13386 #ifdef INET6
13387                 if (MHLEN < hdrlen + max_linkhdr)
13388                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
13389                 else
13390 #endif
13391                         m = m_gethdr(M_NOWAIT, MT_DATA);
13392
13393                 if (m == NULL) {
13394                         SOCKBUF_UNLOCK(sb);
13395                         error = ENOBUFS;
13396                         sack_rxmit = 0;
13397                         goto out;
13398                 }
13399                 m->m_data += max_linkhdr;
13400                 m->m_len = hdrlen;
13401
13402                 /*
13403                  * Start the m_copy functions from the closest mbuf to the
13404                  * sb_offset in the socket buffer chain.
13405                  */
13406                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
13407                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
13408                         m_copydata(mb, moff, (int)len,
13409                                    mtod(m, caddr_t)+hdrlen);
13410                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
13411                                 sbsndptr_adv(sb, mb, len);
13412                         m->m_len += len;
13413                 } else {
13414                         struct sockbuf *msb;
13415
13416                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
13417                                 msb = NULL;
13418                         else
13419                                 msb = sb;
13420                         m->m_next = tcp_m_copym(
13421                                 mb, moff, &len,
13422                                 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
13423                                 ((rsm == NULL) ? hw_tls : 0)
13424 #ifdef NETFLIX_COPY_ARGS
13425                                 , &filled_all
13426 #endif
13427                                 );
13428                         if (len <= (tp->t_maxseg - optlen)) {
13429                                 /*
13430                                  * Must have run out of mbufs for the copy,
13431                                  * shorten it to no longer need tso. Let's
13432                                  * not put on sendalot since we are low on
13433                                  * mbufs.
13434                                  */
13435                                 tso = 0;
13436                         }
13437                         if (m->m_next == NULL) {
13438                                 SOCKBUF_UNLOCK(sb);
13439                                 (void)m_free(m);
13440                                 error = ENOBUFS;
13441                                 sack_rxmit = 0;
13442                                 goto out;
13443                         }
13444                 }
13445                 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
13446                         if (rsm && (rsm->r_flags & RACK_TLP)) {
13447                                 /*
13448                                  * TLP should not count in retran count, but
13449                                  * in its own bin
13450                                  */
13451                                 counter_u64_add(rack_tlp_retran, 1);
13452                                 counter_u64_add(rack_tlp_retran_bytes, len);
13453                         } else {
13454                                 tp->t_sndrexmitpack++;
13455                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
13456                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
13457                         }
13458 #ifdef STATS
13459                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
13460                                                  len);
13461 #endif
13462                 } else {
13463                         KMOD_TCPSTAT_INC(tcps_sndpack);
13464                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
13465 #ifdef STATS
13466                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
13467                                                  len);
13468 #endif
13469                 }
13470                 /*
13471                  * If we're sending everything we've got, set PUSH. (This
13472                  * will keep happy those implementations which only give
13473                  * data to the user when a buffer fills or a PUSH comes in.)
13474                  */
13475                 if (sb_offset + len == sbused(sb) &&
13476                     sbused(sb) &&
13477                     !(flags & TH_SYN))
13478                         flags |= TH_PUSH;
13479
13480                 SOCKBUF_UNLOCK(sb);
13481         } else {
13482                 SOCKBUF_UNLOCK(sb);
13483                 if (tp->t_flags & TF_ACKNOW)
13484                         KMOD_TCPSTAT_INC(tcps_sndacks);
13485                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
13486                         KMOD_TCPSTAT_INC(tcps_sndctrl);
13487                 else
13488                         KMOD_TCPSTAT_INC(tcps_sndwinup);
13489
13490                 m = m_gethdr(M_NOWAIT, MT_DATA);
13491                 if (m == NULL) {
13492                         error = ENOBUFS;
13493                         sack_rxmit = 0;
13494                         goto out;
13495                 }
13496 #ifdef INET6
13497                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
13498                     MHLEN >= hdrlen) {
13499                         M_ALIGN(m, hdrlen);
13500                 } else
13501 #endif
13502                         m->m_data += max_linkhdr;
13503                 m->m_len = hdrlen;
13504         }
13505         SOCKBUF_UNLOCK_ASSERT(sb);
13506         m->m_pkthdr.rcvif = (struct ifnet *)0;
13507 #ifdef MAC
13508         mac_inpcb_create_mbuf(inp, m);
13509 #endif
13510 #ifdef INET6
13511         if (isipv6) {
13512                 ip6 = mtod(m, struct ip6_hdr *);
13513 #ifdef NETFLIX_TCPOUDP
13514                 if (tp->t_port) {
13515                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
13516                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13517                         udp->uh_dport = tp->t_port;
13518                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
13519                         udp->uh_ulen = htons(ulen);
13520                         th = (struct tcphdr *)(udp + 1);
13521                 } else
13522 #endif
13523                         th = (struct tcphdr *)(ip6 + 1);
13524                 tcpip_fillheaders(inp,
13525 #ifdef NETFLIX_TCPOUDP
13526                                   tp->t_port,
13527 #endif
13528                                   ip6, th);
13529         } else
13530 #endif                          /* INET6 */
13531         {
13532                 ip = mtod(m, struct ip *);
13533 #ifdef TCPDEBUG
13534                 ipov = (struct ipovly *)ip;
13535 #endif
13536 #ifdef NETFLIX_TCPOUDP
13537                 if (tp->t_port) {
13538                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
13539                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13540                         udp->uh_dport = tp->t_port;
13541                         ulen = hdrlen + len - sizeof(struct ip);
13542                         udp->uh_ulen = htons(ulen);
13543                         th = (struct tcphdr *)(udp + 1);
13544                 } else
13545 #endif
13546                         th = (struct tcphdr *)(ip + 1);
13547                 tcpip_fillheaders(inp,
13548 #ifdef NETFLIX_TCPOUDP
13549                                   tp->t_port,
13550 #endif
13551                                   ip, th);
13552         }
13553         /*
13554          * Fill in fields, remembering maximum advertised window for use in
13555          * delaying messages about window sizes. If resending a FIN, be sure
13556          * not to use a new sequence number.
13557          */
13558         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
13559             tp->snd_nxt == tp->snd_max)
13560                 tp->snd_nxt--;
13561         /*
13562          * If we are starting a connection, send ECN setup SYN packet. If we
13563          * are on a retransmit, we may resend those bits a number of times
13564          * as per RFC 3168.
13565          */
13566         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
13567                 if (tp->t_rxtshift >= 1) {
13568                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
13569                                 flags |= TH_ECE | TH_CWR;
13570                 } else
13571                         flags |= TH_ECE | TH_CWR;
13572         }
13573         /* Handle parallel SYN for ECN */
13574         if ((tp->t_state == TCPS_SYN_RECEIVED) &&
13575             (tp->t_flags2 & TF2_ECN_SND_ECE)) {
13576                 flags |= TH_ECE;
13577                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
13578         }
13579         if (tp->t_state == TCPS_ESTABLISHED &&
13580             (tp->t_flags2 & TF2_ECN_PERMIT)) {
13581                 /*
13582                  * If the peer has ECN, mark data packets with ECN capable
13583                  * transmission (ECT). Ignore pure ack packets,
13584                  * retransmissions.
13585                  */
13586                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
13587                     (sack_rxmit == 0)) {
13588 #ifdef INET6
13589                         if (isipv6)
13590                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
13591                         else
13592 #endif
13593                                 ip->ip_tos |= IPTOS_ECN_ECT0;
13594                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
13595                         /*
13596                          * Reply with proper ECN notifications.
13597                          * Only set CWR on new data segments.
13598                          */
13599                         if (tp->t_flags2 & TF2_ECN_SND_CWR) {
13600                                 flags |= TH_CWR;
13601                                 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
13602                         }
13603                 }
13604                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
13605                         flags |= TH_ECE;
13606         }
13607         /*
13608          * If we are doing retransmissions, then snd_nxt will not reflect
13609          * the first unsent octet.  For ACK only packets, we do not want the
13610          * sequence number of the retransmitted packet, we want the sequence
13611          * number of the next unsent octet.  So, if there is no data (and no
13612          * SYN or FIN), use snd_max instead of snd_nxt when filling in
13613          * ti_seq.  But if we are in persist state, snd_max might reflect
13614          * one byte beyond the right edge of the window, so use snd_nxt in
13615          * that case, since we know we aren't doing a retransmission.
13616          * (retransmit and persist are mutually exclusive...)
13617          */
13618         if (sack_rxmit == 0) {
13619                 if (len || (flags & (TH_SYN | TH_FIN)) ||
13620                     rack->rc_in_persist) {
13621                         th->th_seq = htonl(tp->snd_nxt);
13622                         rack_seq = tp->snd_nxt;
13623                 } else if (flags & TH_RST) {
13624                         /*
13625                          * For a Reset send the last cum ack in sequence
13626                          * (this like any other choice may still generate a
13627                          * challenge ack, if a ack-update packet is in
13628                          * flight).
13629                          */
13630                         th->th_seq = htonl(tp->snd_una);
13631                         rack_seq = tp->snd_una;
13632                 } else {
13633                         th->th_seq = htonl(tp->snd_max);
13634                         rack_seq = tp->snd_max;
13635                 }
13636         } else {
13637                 th->th_seq = htonl(rsm->r_start);
13638                 rack_seq = rsm->r_start;
13639         }
13640         th->th_ack = htonl(tp->rcv_nxt);
13641         if (optlen) {
13642                 bcopy(opt, th + 1, optlen);
13643                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
13644         }
13645         th->th_flags = flags;
13646         /*
13647          * Calculate receive window.  Don't shrink window, but avoid silly
13648          * window syndrome.
13649          * If a RST segment is sent, advertise a window of zero.
13650          */
13651         if (flags & TH_RST) {
13652                 recwin = 0;
13653         } else {
13654                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
13655                     recwin < (long)segsiz)
13656                         recwin = 0;
13657                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
13658                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
13659                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
13660         }
13661
13662         /*
13663          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
13664          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
13665          * handled in syncache.
13666          */
13667         if (flags & TH_SYN)
13668                 th->th_win = htons((u_short)
13669                                    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
13670         else {
13671                 /* Avoid shrinking window with window scaling. */
13672                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
13673                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
13674         }
13675         /*
13676          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
13677          * window.  This may cause the remote transmitter to stall.  This
13678          * flag tells soreceive() to disable delayed acknowledgements when
13679          * draining the buffer.  This can occur if the receiver is
13680          * attempting to read more data than can be buffered prior to
13681          * transmitting on the connection.
13682          */
13683         if (th->th_win == 0) {
13684                 tp->t_sndzerowin++;
13685                 tp->t_flags |= TF_RXWIN0SENT;
13686         } else
13687                 tp->t_flags &= ~TF_RXWIN0SENT;
13688         tp->snd_up = tp->snd_una;       /* drag it along, its deprecated  */
13689
13690 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
13691         if (to.to_flags & TOF_SIGNATURE) {
13692                 /*
13693                  * Calculate MD5 signature and put it into the place
13694                  * determined before.
13695                  * NOTE: since TCP options buffer doesn't point into
13696                  * mbuf's data, calculate offset and use it.
13697                  */
13698                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
13699                                                        (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
13700                         /*
13701                          * Do not send segment if the calculation of MD5
13702                          * digest has failed.
13703                          */
13704                         goto out;
13705                 }
13706         }
13707 #endif
13708
13709         /*
13710          * Put TCP length in extended header, and then checksum extended
13711          * header and data.
13712          */
13713         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
13714 #ifdef INET6
13715         if (isipv6) {
13716                 /*
13717                  * ip6_plen does not need to be filled in now; it will be
13718                  * filled in by ip6_output.
13719                  */
13720                 if (tp->t_port) {
13721                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
13722                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13723                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
13724                         th->th_sum = htons(0);
13725                         UDPSTAT_INC(udps_opackets);
13726                 } else {
13727                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
13728                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13729                         th->th_sum = in6_cksum_pseudo(ip6,
13730                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
13731                                                       0);
13732                 }
13733         }
13734 #endif
13735 #if defined(INET6) && defined(INET)
13736         else
13737 #endif
13738 #ifdef INET
13739         {
13740                 if (tp->t_port) {
13741                         m->m_pkthdr.csum_flags = CSUM_UDP;
13742                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
13743                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
13744                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
13745                         th->th_sum = htons(0);
13746                         UDPSTAT_INC(udps_opackets);
13747                 } else {
13748                         m->m_pkthdr.csum_flags = CSUM_TCP;
13749                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
13750                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
13751                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
13752                                                                         IPPROTO_TCP + len + optlen));
13753                 }
13754                 /* IP version must be set here for ipv4/ipv6 checking later */
13755                 KASSERT(ip->ip_v == IPVERSION,
13756                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
13757         }
13758 #endif
13759         /*
13760          * Enable TSO and specify the size of the segments. The TCP pseudo
13761          * header checksum is always provided. XXX: Fixme: This is currently
13762          * not the case for IPv6.
13763          */
13764         if (tso || force_tso) {
13765                 KASSERT(force_tso || len > tp->t_maxseg - optlen,
13766                         ("%s: len <= tso_segsz", __func__));
13767                 m->m_pkthdr.csum_flags |= CSUM_TSO;
13768                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
13769         }
13770         KASSERT(len + hdrlen == m_length(m, NULL),
13771                 ("%s: mbuf chain different than expected: %d + %u != %u",
13772                  __func__, len, hdrlen, m_length(m, NULL)));
13773
13774 #ifdef TCP_HHOOK
13775         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
13776         hhook_run_tcp_est_out(tp, th, &to, len, tso);
13777 #endif
13778 #ifdef TCPDEBUG
13779         /*
13780          * Trace.
13781          */
13782         if (so->so_options & SO_DEBUG) {
13783                 u_short save = 0;
13784
13785 #ifdef INET6
13786                 if (!isipv6)
13787 #endif
13788                 {
13789                         save = ipov->ih_len;
13790                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
13791                                                                  * (th->th_off << 2) */ );
13792                 }
13793                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
13794 #ifdef INET6
13795                 if (!isipv6)
13796 #endif
13797                         ipov->ih_len = save;
13798         }
13799 #endif                          /* TCPDEBUG */
13800
13801         /* We're getting ready to send; log now. */
13802         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13803                 union tcp_log_stackspecific log;
13804                 struct timeval tv;
13805
13806                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13807                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
13808                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
13809                 if (rack->rack_no_prr)
13810                         log.u_bbr.flex1 = 0;
13811                 else
13812                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
13813                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
13814                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
13815                 log.u_bbr.flex4 = orig_len;
13816                 if (filled_all)
13817                         log.u_bbr.flex5 = 0x80000000;
13818                 else
13819                         log.u_bbr.flex5 = 0;
13820                 /* Save off the early/late values */
13821                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
13822                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
13823                 log.u_bbr.bw_inuse = rack_get_bw(rack);
13824                 if (rsm || sack_rxmit) {
13825                         if (doing_tlp)
13826                                 log.u_bbr.flex8 = 2;
13827                         else
13828                                 log.u_bbr.flex8 = 1;
13829                 } else {
13830                         log.u_bbr.flex8 = 0;
13831                 }
13832                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
13833                 log.u_bbr.flex7 = mark;
13834                 log.u_bbr.pkts_out = tp->t_maxseg;
13835                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
13836                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
13837                 log.u_bbr.lt_epoch = cwnd_to_use;
13838                 log.u_bbr.delivered = sendalot;
13839                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
13840                                      len, &log, false, NULL, NULL, 0, &tv);
13841         } else
13842                 lgb = NULL;
13843
13844         /*
13845          * Fill in IP length and desired time to live and send to IP level.
13846          * There should be a better way to handle ttl and tos; we could keep
13847          * them in the template, but need a way to checksum without them.
13848          */
13849         /*
13850          * m->m_pkthdr.len should have been set before cksum calculation,
13851          * because in6_cksum() needs it.
13852          */
13853 #ifdef INET6
13854         if (isipv6) {
13855                 /*
13856                  * we separately set hoplimit for every segment, since the
13857                  * user might want to change the value via setsockopt. Also,
13858                  * desired default hop limit might be changed via Neighbor
13859                  * Discovery.
13860                  */
13861                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
13862
13863                 /*
13864                  * Set the packet size here for the benefit of DTrace
13865                  * probes. ip6_output() will set it properly; it's supposed
13866                  * to include the option header lengths as well.
13867                  */
13868                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
13869
13870                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
13871                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
13872                 else
13873                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
13874
13875                 if (tp->t_state == TCPS_SYN_SENT)
13876                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
13877
13878                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
13879                 /* TODO: IPv6 IP6TOS_ECT bit on */
13880                 error = ip6_output(m, inp->in6p_outputopts,
13881                                    &inp->inp_route6,
13882                                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
13883                                    NULL, NULL, inp);
13884
13885                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
13886                         mtu = inp->inp_route6.ro_nh->nh_mtu;
13887         }
13888 #endif                          /* INET6 */
13889 #if defined(INET) && defined(INET6)
13890         else
13891 #endif
13892 #ifdef INET
13893         {
13894                 ip->ip_len = htons(m->m_pkthdr.len);
13895 #ifdef INET6
13896                 if (inp->inp_vflag & INP_IPV6PROTO)
13897                         ip->ip_ttl = in6_selecthlim(inp, NULL);
13898 #endif                          /* INET6 */
13899                 /*
13900                  * If we do path MTU discovery, then we set DF on every
13901                  * packet. This might not be the best thing to do according
13902                  * to RFC3390 Section 2. However the tcp hostcache mitigates
13903                  * the problem so it affects only the first tcp connection
13904                  * with a host.
13905                  *
13906                  * NB: Don't set DF on small MTU/MSS to have a safe
13907                  * fallback.
13908                  */
13909                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
13910                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
13911                         if (tp->t_port == 0 || len < V_tcp_minmss) {
13912                                 ip->ip_off |= htons(IP_DF);
13913                         }
13914                 } else {
13915                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
13916                 }
13917
13918                 if (tp->t_state == TCPS_SYN_SENT)
13919                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
13920
13921                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
13922
13923                 error = ip_output(m, inp->inp_options, &inp->inp_route,
13924                                   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
13925                                   inp);
13926                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
13927                         mtu = inp->inp_route.ro_nh->nh_mtu;
13928         }
13929 #endif                          /* INET */
13930
13931 out:
13932         if (lgb) {
13933                 lgb->tlb_errno = error;
13934                 lgb = NULL;
13935         }
13936         /*
13937          * In transmit state, time the transmission and arrange for the
13938          * retransmit.  In persist state, just set snd_max.
13939          */
13940         if (error == 0) {
13941                 rack->forced_ack = 0;   /* If we send something zap the FA flag */
13942                 if (rsm && (doing_tlp == 0)) {
13943                         /* Set we retransmitted */
13944                         rack->rc_gp_saw_rec = 1;
13945                 } else {
13946                         if (cwnd_to_use > tp->snd_ssthresh) {
13947                                 /* Set we sent in CA */
13948                                 rack->rc_gp_saw_ca = 1;
13949                         } else {
13950                                 /* Set we sent in SS */
13951                                 rack->rc_gp_saw_ss = 1;
13952                         }
13953                 }
13954                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
13955                     (tp->t_flags & TF_SACK_PERMIT) &&
13956                     tp->rcv_numsacks > 0)
13957                         tcp_clean_dsack_blocks(tp);
13958                 tot_len_this_send += len;
13959                 if (len == 0)
13960                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
13961                 else if (len == 1) {
13962                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
13963                 } else if (len > 1) {
13964                         int idx;
13965
13966                         idx = (len / segsiz) + 3;
13967                         if (idx >= TCP_MSS_ACCT_ATIMER)
13968                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
13969                         else
13970                                 counter_u64_add(rack_out_size[idx], 1);
13971                 }
13972                 if (hw_tls && len > 0) {
13973                         if (filled_all) {
13974                                 counter_u64_add(rack_tls_filled, 1);
13975                                 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1);
13976                         } else {
13977                                 if (rsm) {
13978                                         counter_u64_add(rack_tls_rxt, 1);
13979                                         rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1);
13980                                 } else if (doing_tlp) {
13981                                         counter_u64_add(rack_tls_tlp, 1);
13982                                         rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1);
13983                                 } else if ( (ctf_outstanding(tp) + minseg) > sbavail(sb)) {
13984                                         counter_u64_add(rack_tls_app, 1);
13985                                         rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1);
13986                                 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + minseg) > cwnd_to_use) {
13987                                         counter_u64_add(rack_tls_cwnd, 1);
13988                                         rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1);
13989                                 } else if ((ctf_outstanding(tp) + minseg) > tp->snd_wnd) {
13990                                         counter_u64_add(rack_tls_rwnd, 1);
13991                                         rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1);
13992                                 } else {
13993                                         rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1);
13994                                         counter_u64_add(rack_tls_other, 1);
13995                                 }
13996                         }
13997                 }
13998         }
13999         if (rack->rack_no_prr == 0) {
14000                 if (sub_from_prr && (error == 0)) {
14001                         if (rack->r_ctl.rc_prr_sndcnt >= len)
14002                                 rack->r_ctl.rc_prr_sndcnt -= len;
14003                         else
14004                                 rack->r_ctl.rc_prr_sndcnt = 0;
14005                 }
14006         }
14007         sub_from_prr = 0;
14008         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
14009                         pass, rsm, us_cts);
14010         if ((error == 0) &&
14011             (len > 0) &&
14012             (tp->snd_una == tp->snd_max))
14013                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
14014         /* Now are we in persists? */
14015         if (rack->rc_in_persist == 0) {
14016                 tcp_seq startseq = tp->snd_nxt;
14017
14018                 /* Track our lost count */
14019                 if (rsm && (doing_tlp == 0))
14020                         rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
14021                 /*
14022                  * Advance snd_nxt over sequence space of this segment.
14023                  */
14024                 if (error)
14025                         /* We don't log or do anything with errors */
14026                         goto nomore;
14027                 if (doing_tlp == 0) {
14028                         if (rsm == NULL) {
14029                                 /*
14030                                  * Not a retransmission of some
14031                                  * sort, new data is going out so
14032                                  * clear our TLP count and flag.
14033                                  */
14034                                 rack->rc_tlp_in_progress = 0;
14035                                 rack->r_ctl.rc_tlp_cnt_out = 0;
14036                         }
14037                 } else {
14038                         /*
14039                          * We have just sent a TLP, mark that it is true
14040                          * and make sure our in progress is set so we
14041                          * continue to check the count.
14042                          */
14043                         rack->rc_tlp_in_progress = 1;
14044                         rack->r_ctl.rc_tlp_cnt_out++;
14045                 }
14046                 if (flags & (TH_SYN | TH_FIN)) {
14047                         if (flags & TH_SYN)
14048                                 tp->snd_nxt++;
14049                         if (flags & TH_FIN) {
14050                                 tp->snd_nxt++;
14051                                 tp->t_flags |= TF_SENTFIN;
14052                         }
14053                 }
14054                 /* In the ENOBUFS case we do *not* update snd_max */
14055                 if (sack_rxmit)
14056                         goto nomore;
14057
14058                 tp->snd_nxt += len;
14059                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
14060                         if (tp->snd_una == tp->snd_max) {
14061                                 /*
14062                                  * Update the time we just added data since
14063                                  * none was outstanding.
14064                                  */
14065                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
14066                                 tp->t_acktime = ticks;
14067                         }
14068                         tp->snd_max = tp->snd_nxt;
14069                         /*
14070                          * Time this transmission if not a retransmission and
14071                          * not currently timing anything.
14072                          * This is only relevant in case of switching back to
14073                          * the base stack.
14074                          */
14075                         if (tp->t_rtttime == 0) {
14076                                 tp->t_rtttime = ticks;
14077                                 tp->t_rtseq = startseq;
14078                                 KMOD_TCPSTAT_INC(tcps_segstimed);
14079                         }
14080                         if (len &&
14081                             ((tp->t_flags & TF_GPUTINPROG) == 0))
14082                                 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
14083                 }
14084         } else {
14085                 /*
14086                  * Persist case, update snd_max but since we are in persist
14087                  * mode (no window) we do not update snd_nxt.
14088                  */
14089                 int32_t xlen = len;
14090
14091                 if (error)
14092                         goto nomore;
14093
14094                 if (flags & TH_SYN)
14095                         ++xlen;
14096                 if (flags & TH_FIN) {
14097                         ++xlen;
14098                         tp->t_flags |= TF_SENTFIN;
14099                 }
14100                 /* In the ENOBUFS case we do *not* update snd_max */
14101                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
14102                         if (tp->snd_una == tp->snd_max) {
14103                                 /*
14104                                  * Update the time we just added data since
14105                                  * none was outstanding.
14106                                  */
14107                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
14108                                 tp->t_acktime = ticks;
14109                         }
14110                         tp->snd_max = tp->snd_nxt + len;
14111                 }
14112         }
14113 nomore:
14114         if (error) {
14115                 rack->r_ctl.rc_agg_delayed = 0;
14116                 rack->r_early = 0;
14117                 rack->r_late = 0;
14118                 rack->r_ctl.rc_agg_early = 0;
14119                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
14120                 /*
14121                  * Failures do not advance the seq counter above. For the
14122                  * case of ENOBUFS we will fall out and retry in 1ms with
14123                  * the hpts. Everything else will just have to retransmit
14124                  * with the timer.
14125                  *
14126                  * In any case, we do not want to loop around for another
14127                  * send without a good reason.
14128                  */
14129                 sendalot = 0;
14130                 switch (error) {
14131                 case EPERM:
14132                         tp->t_softerror = error;
14133                         return (error);
14134                 case ENOBUFS:
14135                         if (slot == 0) {
14136                                 /*
14137                                  * Pace us right away to retry after some
14138                                  * time
14139                                  */
14140                                 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
14141                                 if (rack->rc_enobuf < 126)
14142                                         rack->rc_enobuf++;
14143                                 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) {
14144                                         slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC;
14145                                 }
14146                                 if (slot < (10 * HPTS_USEC_IN_MSEC))
14147                                         slot = 10 * HPTS_USEC_IN_MSEC;
14148                         }
14149                         counter_u64_add(rack_saw_enobuf, 1);
14150                         error = 0;
14151                         goto enobufs;
14152                 case EMSGSIZE:
14153                         /*
14154                          * For some reason the interface we used initially
14155                          * to send segments changed to another or lowered
14156                          * its MTU. If TSO was active we either got an
14157                          * interface without TSO capabilities or TSO was
14158                          * turned off. If we obtained mtu from ip_output()
14159                          * then update it and try again.
14160                          */
14161                         if (tso)
14162                                 tp->t_flags &= ~TF_TSO;
14163                         if (mtu != 0) {
14164                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
14165                                 goto again;
14166                         }
14167                         slot = 10 * HPTS_USEC_IN_MSEC;
14168                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14169                         return (error);
14170                 case ENETUNREACH:
14171                         counter_u64_add(rack_saw_enetunreach, 1);
14172                 case EHOSTDOWN:
14173                 case EHOSTUNREACH:
14174                 case ENETDOWN:
14175                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
14176                                 tp->t_softerror = error;
14177                         }
14178                         /* FALLTHROUGH */
14179                 default:
14180                         slot = 10 * HPTS_USEC_IN_MSEC;
14181                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14182                         return (error);
14183                 }
14184         } else {
14185                 rack->rc_enobuf = 0;
14186         }
14187         KMOD_TCPSTAT_INC(tcps_sndtotal);
14188
14189         /*
14190          * Data sent (as far as we can tell). If this advertises a larger
14191          * window than any other segment, then remember the size of the
14192          * advertised window. Any pending ACK has now been sent.
14193          */
14194         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
14195                 tp->rcv_adv = tp->rcv_nxt + recwin;
14196         tp->last_ack_sent = tp->rcv_nxt;
14197         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
14198 enobufs:
14199         /* Assure when we leave that snd_nxt will point to top */
14200         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
14201                 tp->snd_nxt = tp->snd_max;
14202         if (sendalot) {
14203                 /* Do we need to turn off sendalot? */
14204                 if (rack->r_ctl.rc_pace_max_segs &&
14205                     (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
14206                         /* We hit our max. */
14207                         sendalot = 0;
14208                 } else if ((rack->rc_user_set_max_segs) &&
14209                            (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
14210                         /* We hit the user defined max */
14211                         sendalot = 0;
14212                 }
14213         }
14214         if ((error == 0) && (flags & TH_FIN))
14215                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
14216         if (flags & TH_RST) {
14217                 /*
14218                  * We don't send again after sending a RST.
14219                  */
14220                 slot = 0;
14221                 sendalot = 0;
14222                 if (error == 0)
14223                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
14224         } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
14225                 /*
14226                  * Get our pacing rate, if an error
14227                  * occurred in sending (ENOBUF) we would
14228                  * hit the else if with slot preset. Other
14229                  * errors return.
14230                  */
14231                 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
14232         }
14233         if (rsm &&
14234             rack->use_rack_rr) {
14235                 /* Its a retransmit and we use the rack cheat? */
14236                 if ((slot == 0) ||
14237                     (rack->rc_always_pace == 0) ||
14238                     (rack->r_rr_config == 1)) {
14239                         /*
14240                          * We have no pacing set or we
14241                          * are using old-style rack or
14242                          * we are overridden to use the old 1ms pacing.
14243                          */
14244                         slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC;
14245                 }
14246         }
14247         if (slot) {
14248                 /* set the rack tcb into the slot N */
14249                 counter_u64_add(rack_paced_segments, 1);
14250         } else if (sendalot) {
14251                 if (len)
14252                         counter_u64_add(rack_unpaced_segments, 1);
14253                 sack_rxmit = 0;
14254                 goto again;
14255         } else if (len) {
14256                 counter_u64_add(rack_unpaced_segments, 1);
14257         }
14258         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
14259         return (error);
14260 }
14261 /* Re-run rack_set_pace_segments() and log the old value only if rc_pace_max_segs changed. */
14262 static void
14263 rack_update_seg(struct tcp_rack *rack)
14264 {
14265         uint32_t orig_val;      /* rc_pace_max_segs as it was before the recompute */
14266
14267         orig_val = rack->r_ctl.rc_pace_max_segs;
14268         rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
14269         if (orig_val != rack->r_ctl.rc_pace_max_segs)   /* stay quiet when nothing changed */
14270                 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL);
14271 }
14272
14273 /*
14274  * Set one RACK socket option; unlisted names go to tcp_default_ctloutput().
14275  * The inpcb lock is dropped around sooptcopyin() of the int32_t value, so
14276  * after re-locking the connection is revalidated (ECONNRESET if INP_TIMEWAIT
14277  * or INP_DROPPED) and tp/rack are re-read.  Handled options return unlocked.
14278  */
14279 static int
14280 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
14281     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
14282 {
14283         struct epoch_tracker et;        /* only used around rack_output() for TCP_DELACK */
14284         uint64_t val;                   /* scratch for the kbits/s -> bytes/s conversion */
14285         int32_t error = 0, optval;
14286         uint16_t ca, ss;                /* 16-bit copies of optval for gp_inc_ca / gp_inc_ss */
14287
14288         /* Pass 1: claim only the names handled below; nothing is copied in yet. */
14289         switch (sopt->sopt_name) {
14290         case TCP_RACK_PROP_RATE:                /*  URL:prop_rate */
14291         case TCP_RACK_PROP      :               /*  URL:prop */
14292         case TCP_RACK_TLP_REDUCE:               /*  URL:tlp_reduce */
14293         case TCP_RACK_EARLY_RECOV:              /*  URL:early_recov */
14294         case TCP_RACK_PACE_REDUCE:              /*  Not used */
14295         /*  Pacing related ones */
14296         case TCP_RACK_PACE_ALWAYS:              /*  URL:pace_always */
14297         case TCP_BBR_RACK_INIT_RATE:            /*  URL:irate */
14298         case TCP_BBR_IWINTSO:                   /*  URL:tso_iwin */
14299         case TCP_RACK_PACE_MAX_SEG:             /*  URL:pace_max_seg */
14300         case TCP_RACK_FORCE_MSEG:               /*  URL:force_max_seg */
14301         case TCP_RACK_PACE_RATE_CA:             /*  URL:pr_ca */
14302         case TCP_RACK_PACE_RATE_SS:             /*  URL:pr_ss*/
14303         case TCP_RACK_PACE_RATE_REC:            /*  URL:pr_rec */
14304         case TCP_RACK_GP_INCREASE_CA:           /*  URL:gp_inc_ca */
14305         case TCP_RACK_GP_INCREASE_SS:           /*  URL:gp_inc_ss */
14306         case TCP_RACK_GP_INCREASE_REC:          /*  URL:gp_inc_rec */
14307         case TCP_RACK_RR_CONF:                  /*  URL:rrr_conf */
14308         case TCP_BBR_HDWR_PACE:                 /*  URL:hdwrpace */
14309        /* End pacing related */
14310         case TCP_DELACK:
14311         case TCP_RACK_PRR_SENDALOT:             /*  URL:prr_sendalot */
14312         case TCP_RACK_MIN_TO:                   /*  URL:min_to */
14313         case TCP_RACK_EARLY_SEG:                /*  URL:early_seg */
14314         case TCP_RACK_REORD_THRESH:             /*  URL:reord_thresh */
14315         case TCP_RACK_REORD_FADE:               /*  URL:reord_fade */
14316         case TCP_RACK_TLP_THRESH:               /*  URL:tlp_thresh */
14317         case TCP_RACK_PKT_DELAY:                /*  URL:pkt_delay */
14318         case TCP_RACK_TLP_USE:                  /*  URL:tlp_use */
14319         case TCP_RACK_TLP_INC_VAR:              /*  URL:tlp_inc_var */
14320         case TCP_RACK_IDLE_REDUCE_HIGH:         /*  URL:idle_reduce_high */
14321         case TCP_BBR_RACK_RTT_USE:              /*  URL:rttuse */
14322         case TCP_BBR_USE_RACK_RR:               /*  URL:rackrr */
14323         case TCP_RACK_DO_DETECTION:             /*  URL:detect */
14324         case TCP_NO_PRR:                        /*  URL:noprr */
14325         case TCP_TIMELY_DYN_ADJ:                /*  URL:dynamic */
14326         case TCP_DATA_AFTER_CLOSE:
14327         case TCP_RACK_NONRXT_CFG_RATE:          /*  URL:nonrxtcr */
14328         case TCP_SHARED_CWND_ENABLE:            /*  URL:scwnd */
14329         case TCP_RACK_MBUF_QUEUE:               /*  URL:mqueue */
14330         case TCP_RACK_NO_PUSH_AT_MAX:           /*  URL:npush */
14331         case TCP_RACK_PACE_TO_FILL:             /*  URL:fillcw */
14332         case TCP_SHARED_CWND_TIME_LIMIT:        /*  URL:lscwnd */
14333         case TCP_RACK_PROFILE:                  /*  URL:profile */
14334                 break;
14335         default:
14336                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14337                 break;          /* not reached */
14338         }
14339         INP_WUNLOCK(inp);       /* the copyin below runs without the inpcb lock */
14340         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
14341         if (error)
14342                 return (error); /* the lock was already released above */
14343         INP_WLOCK(inp);
14344         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
14345                 INP_WUNLOCK(inp);
14346                 return (ECONNRESET);
14347         }
14348         tp = intotcpcb(inp);    /* re-read tp and rack now that the lock is held again */
14349         rack = (struct tcp_rack *)tp->t_fb_ptr;
14350         switch (sopt->sopt_name) {
14351         case TCP_RACK_PROFILE:
14352                 RACK_OPTS_INC(tcp_profile);
14353                 if (optval == 1) {
14354                         /* pace_always=1 */
14355                         rack->rc_always_pace = 1;
14356                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
14357                         /* scwnd=1 */
14358                         rack->rack_enable_scwnd = 1;
14359                         /* dynamic=100 */
14360                         rack->rc_gp_dyn_mul = 1;
14361                         rack->r_ctl.rack_per_of_gp_ca = 100;
14362                         /* rrr_conf=3 */
14363                         rack->r_rr_config = 3;
14364                         /* npush=2 */
14365                         rack->r_ctl.rc_no_push_at_mrtt = 2;
14366                         /* fillcw=1 */
14367                         rack->rc_pace_to_cwnd = 1;
14368                         rack->rc_pace_fill_if_rttin_range = 0;
14369                         rack->rtt_limit_mul = 0;
14370                         /* noprr=1 */
14371                         rack->rack_no_prr = 1;
14372                         /* lscwnd=1 */
14373                         rack->r_limit_scw = 1;
14374                 } else if (optval == 2) {       /* same settings as profile 1 except lscwnd=0 */
14375                         /* pace_always=1 */
14376                         rack->rc_always_pace = 1;
14377                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
14378                         /* scwnd=1 */
14379                         rack->rack_enable_scwnd = 1;
14380                         /* dynamic=100 */
14381                         rack->rc_gp_dyn_mul = 1;
14382                         rack->r_ctl.rack_per_of_gp_ca = 100;
14383                         /* rrr_conf=3 */
14384                         rack->r_rr_config = 3;
14385                         /* npush=2 */
14386                         rack->r_ctl.rc_no_push_at_mrtt = 2;
14387                         /* fillcw=1 */
14388                         rack->rc_pace_to_cwnd = 1;
14389                         rack->rc_pace_fill_if_rttin_range = 0;
14390                         rack->rtt_limit_mul = 0;
14391                         /* noprr=1 */
14392                         rack->rack_no_prr = 1;
14393                         /* lscwnd=0 */
14394                         rack->r_limit_scw = 0;
14395                 }       /* any other optval changes nothing and still returns 0 */
14396                 break;
14397         case TCP_SHARED_CWND_TIME_LIMIT:
14398                 RACK_OPTS_INC(tcp_lscwnd);
14399                 if (optval)
14400                         rack->r_limit_scw = 1;
14401                 else
14402                         rack->r_limit_scw = 0;
14403                 break;
14404         case TCP_RACK_PACE_TO_FILL:
14405                 RACK_OPTS_INC(tcp_fillcw);
14406                 if (optval == 0)
14407                         rack->rc_pace_to_cwnd = 0;
14408                 else
14409                         rack->rc_pace_to_cwnd = 1;
14410                 if ((optval >= rack_gp_rtt_maxmul) &&
14411                     rack_gp_rtt_maxmul &&
14412                     (optval < 0xf)) {   /* in this range optval is also stored as rtt_limit_mul */
14413                         rack->rc_pace_fill_if_rttin_range = 1;
14414                         rack->rtt_limit_mul = optval;
14415                 } else {
14416                         rack->rc_pace_fill_if_rttin_range = 0;
14417                         rack->rtt_limit_mul = 0;
14418                 }
14419                 break;
14420         case TCP_RACK_NO_PUSH_AT_MAX:
14421                 RACK_OPTS_INC(tcp_npush);
14422                 if (optval == 0)
14423                         rack->r_ctl.rc_no_push_at_mrtt = 0;
14424                 else if (optval < 0xff) /* NOTE(review): a negative optval also passes this test */
14425                         rack->r_ctl.rc_no_push_at_mrtt = optval;
14426                 else
14427                         error = EINVAL;
14428                 break;
14429         case TCP_SHARED_CWND_ENABLE:
14430                 RACK_OPTS_INC(tcp_rack_scwnd);
14431                 if (optval == 0)
14432                         rack->rack_enable_scwnd = 0;
14433                 else
14434                         rack->rack_enable_scwnd = 1;
14435                 break;
14436         case TCP_RACK_MBUF_QUEUE:
14437                 /* Now do we use the LRO mbuf-queue feature */
14438                 RACK_OPTS_INC(tcp_rack_mbufq);
14439                 if (optval)
14440                         rack->r_mbuf_queue = 1;
14441                 else
14442                         rack->r_mbuf_queue = 0;
14443                 if  (rack->r_mbuf_queue || rack->rc_always_pace)        /* same flag rule as TCP_RACK_PACE_ALWAYS */
14444                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
14445                 else
14446                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
14447                 break;
14448         case TCP_RACK_NONRXT_CFG_RATE:
14449                 RACK_OPTS_INC(tcp_rack_cfg_rate);
14450                 if (optval == 0)
14451                         rack->rack_rec_nonrxt_use_cr = 0;
14452                 else
14453                         rack->rack_rec_nonrxt_use_cr = 1;
14454                 break;
14455         case TCP_NO_PRR:
14456                 RACK_OPTS_INC(tcp_rack_noprr);
14457                 if (optval == 0)
14458                         rack->rack_no_prr = 0;
14459                 else
14460                         rack->rack_no_prr = 1;
14461                 break;
14462         case TCP_TIMELY_DYN_ADJ:
14463                 RACK_OPTS_INC(tcp_timely_dyn);
14464                 if (optval == 0)
14465                         rack->rc_gp_dyn_mul = 0;
14466                 else {
14467                         rack->rc_gp_dyn_mul = 1;
14468                         if (optval >= 100) {
14469                                 /*
14470                                  * If the user sets something 100 or more
14471                                  * it is also taken as the gp_ca value.
14472                                  */
14473                                 rack->r_ctl.rack_per_of_gp_ca  = optval;
14474                         }
14475                 }
14476                 break;
14477         case TCP_RACK_DO_DETECTION:
14478                 RACK_OPTS_INC(tcp_rack_do_detection);
14479                 if (optval == 0)
14480                         rack->do_detection = 0;
14481                 else
14482                         rack->do_detection = 1;
14483                 break;
14484         case TCP_RACK_PROP_RATE:
14485                 if ((optval <= 0) || (optval >= 100)) { /* only 1..99 is accepted */
14486                         error = EINVAL;
14487                         break;
14488                 }
14489                 RACK_OPTS_INC(tcp_rack_prop_rate);
14490                 rack->r_ctl.rc_prop_rate = optval;
14491                 break;
14492         case TCP_RACK_TLP_USE:
14493                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
14494                         error = EINVAL;
14495                         break;
14496                 }
14497                 RACK_OPTS_INC(tcp_tlp_use);
14498                 rack->rack_tlp_threshold_use = optval;
14499                 break;
14500         case TCP_RACK_PROP:
14501                 /* RACK proportional rate reduction (bool) */
14502                 RACK_OPTS_INC(tcp_rack_prop);
14503                 rack->r_ctl.rc_prop_reduce = optval;
14504                 break;
14505         case TCP_RACK_TLP_REDUCE:
14506                 /* RACK TLP cwnd reduction (bool) */
14507                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
14508                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
14509                 break;
14510         case TCP_RACK_EARLY_RECOV:
14511                 /* Should recovery happen early (bool) */
14512                 RACK_OPTS_INC(tcp_rack_early_recov);
14513                 rack->r_ctl.rc_early_recovery = optval;
14514                 break;
14515
14516         /*  Pacing related ones */
14517         case TCP_RACK_PACE_ALWAYS:
14518                 /*
14519                  * zero is old rack method, 1 is new
14520                  * method using a pacing rate.
14521                  */
14522                 RACK_OPTS_INC(tcp_rack_pace_always);
14523                 if (optval > 0)
14524                         rack->rc_always_pace = 1;
14525                 else
14526                         rack->rc_always_pace = 0;
14527                 if  (rack->r_mbuf_queue || rack->rc_always_pace)
14528                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
14529                 else
14530                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
14531                 /* A rate may already be set (irate or other); if so, set the seg size */
14532                 rack_update_seg(rack);
14533                 break;
14534         case TCP_BBR_RACK_INIT_RATE:
14535                 RACK_OPTS_INC(tcp_initial_rate);
14536                 val = optval;   /* NOTE(review): int32_t -> uint64_t, so a negative optval sign-extends */
14537                 /* Change from kbits per second to bytes per second */
14538                 val *= 1000;
14539                 val /= 8;
14540                 rack->r_ctl.init_rate = val;
14541                 if (rack->rc_init_win != rack_default_init_window) {
14542                         uint32_t win, snt;
14543
14544                         /*
14545                          * Options don't always get applied
14546                          * in the order you think. So in order
14547                          * to assure we update a cwnd we need
14548                          * to check and see if we are still
14549                          * where we should raise the cwnd.
14550                          */
14551                         win = rc_init_window(rack);
14552                         if (SEQ_GT(tp->snd_max, tp->iss))
14553                                 snt = tp->snd_max - tp->iss;    /* sequence space used since iss */
14554                         else
14555                                 snt = 0;
14556                         if ((snt < win) &&
14557                             (tp->snd_cwnd < win))
14558                                 tp->snd_cwnd = win;
14559                 }
14560                 if (rack->rc_always_pace)
14561                         rack_update_seg(rack);
14562                 break;
14563         case TCP_BBR_IWINTSO:
14564                 RACK_OPTS_INC(tcp_initial_win);
14565                 if (optval && (optval <= 0xff)) {       /* NOTE(review): negative values also satisfy this */
14566                         uint32_t win, snt;
14567
14568                         rack->rc_init_win = optval;
14569                         win = rc_init_window(rack);
14570                         if (SEQ_GT(tp->snd_max, tp->iss))
14571                                 snt = tp->snd_max - tp->iss;
14572                         else
14573                                 snt = 0;
14574                         if ((snt < win) &&
14575                             (tp->t_srtt |
14576 #ifdef NETFLIX_PEAKRATE
14577                              tp->t_maxpeakrate |
14578 #endif
14579                              rack->r_ctl.init_rate)) {
14580                                 /*
14581                                  * We are not past the initial window
14582                                  * and we have some basis for pacing,
14583                                  * so we need to possibly adjust up
14584                                  * the cwnd. Note even if we don't set
14585                                  * the cwnd, it's still ok to raise the rc_init_win
14586                                  * which can be used coming out of idle when we
14587                                  * would have a rate.
14588                                  */
14589                                 if (tp->snd_cwnd < win)
14590                                         tp->snd_cwnd = win;
14591                         }
14592                         if (rack->rc_always_pace)
14593                                 rack_update_seg(rack);
14594                 } else
14595                         error = EINVAL;
14596                 break;
14597         case TCP_RACK_FORCE_MSEG:
14598                 RACK_OPTS_INC(tcp_rack_force_max_seg);
14599                 if (optval)
14600                         rack->rc_force_max_seg = 1;
14601                 else
14602                         rack->rc_force_max_seg = 0;
14603                 break;
14604         case TCP_RACK_PACE_MAX_SEG:
14605                 /* Max segments size in a pace in bytes */
14606                 RACK_OPTS_INC(tcp_rack_max_seg);
14607                 rack->rc_user_set_max_segs = optval;
14608                 rack_set_pace_segments(tp, rack, __LINE__);
14609                 break;
14610         case TCP_RACK_PACE_RATE_REC:
14611                 /* Set the fixed rec pacing rate in Bytes per second; ca/ss inherit it while still 0 */
14612                 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
14613                 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14614                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
14615                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14616                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
14617                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14618                 rack->use_fixed_rate = 1;
14619                 rack_log_pacing_delay_calc(rack,
14620                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
14621                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
14622                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
14623                                            __LINE__, NULL);
14624                 break;
14625
14626         case TCP_RACK_PACE_RATE_SS:
14627                 /* Set the fixed ss pacing rate in Bytes per second; ca/rec inherit it while still 0 */
14628                 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
14629                 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14630                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
14631                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14632                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
14633                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14634                 rack->use_fixed_rate = 1;
14635                 rack_log_pacing_delay_calc(rack,
14636                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
14637                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
14638                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
14639                                            __LINE__, NULL);
14640                 break;
14641
14642         case TCP_RACK_PACE_RATE_CA:
14643                 /* Set the fixed ca pacing rate in Bytes per second; ss/rec inherit it while still 0 */
14644                 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
14645                 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14646                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
14647                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14648                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
14649                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14650                 rack->use_fixed_rate = 1;
14651                 rack_log_pacing_delay_calc(rack,
14652                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
14653                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
14654                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
14655                                            __LINE__, NULL);
14656                 break;
14657         case TCP_RACK_GP_INCREASE_REC:
14658                 RACK_OPTS_INC(tcp_gp_inc_rec);
14659                 rack->r_ctl.rack_per_of_gp_rec = optval;        /* unlike ca/ss, stored without a >= 100 check */
14660                 rack_log_pacing_delay_calc(rack,
14661                                            rack->r_ctl.rack_per_of_gp_ss,
14662                                            rack->r_ctl.rack_per_of_gp_ca,
14663                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
14664                                            __LINE__, NULL);
14665                 break;
14666         case TCP_RACK_GP_INCREASE_CA:
14667                 RACK_OPTS_INC(tcp_gp_inc_ca);
14668                 ca = optval;    /* NOTE(review): truncated to uint16_t before the range check below */
14669                 if (ca < 100) {
14670                         /*
14671                          * We don't allow any reduction
14672                          * over the GP b/w.
14673                          */
14674                         error = EINVAL;
14675                         break;
14676                 }
14677                 rack->r_ctl.rack_per_of_gp_ca = ca;
14678                 rack_log_pacing_delay_calc(rack,
14679                                            rack->r_ctl.rack_per_of_gp_ss,
14680                                            rack->r_ctl.rack_per_of_gp_ca,
14681                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
14682                                            __LINE__, NULL);
14683                 break;
14684         case TCP_RACK_GP_INCREASE_SS:
14685                 RACK_OPTS_INC(tcp_gp_inc_ss);
14686                 ss = optval;    /* NOTE(review): truncated to uint16_t before the range check below */
14687                 if (ss < 100) {
14688                         /*
14689                          * We don't allow any reduction
14690                          * over the GP b/w.
14691                          */
14692                         error = EINVAL;
14693                         break;
14694                 }
14695                 rack->r_ctl.rack_per_of_gp_ss = ss;
14696                 rack_log_pacing_delay_calc(rack,
14697                                            rack->r_ctl.rack_per_of_gp_ss,
14698                                            rack->r_ctl.rack_per_of_gp_ca,
14699                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
14700                                            __LINE__, NULL);
14701                 break;
14702         case TCP_RACK_RR_CONF:
14703                 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
14704                 if (optval && optval <= 3)
14705                         rack->r_rr_config = optval;
14706                 else
14707                         rack->r_rr_config = 0;  /* 0 or anything above 3; no error is returned */
14708                 break;
14709         case TCP_BBR_HDWR_PACE:
14710                 RACK_OPTS_INC(tcp_hdwr_pacing);
14711                 if (optval){
14712                         if (rack->rack_hdrw_pacing == 0) {
14713                                 rack->rack_hdw_pace_ena = 1;
14714                                 rack->rack_attempt_hdwr_pace = 0;
14715                         } else
14716                                 error = EALREADY;       /* rack_hdrw_pacing is already non-zero */
14717                 } else {
14718                         rack->rack_hdw_pace_ena = 0;
14719 #ifdef RATELIMIT
14720                         if (rack->rack_hdrw_pacing) {
14721                                 rack->rack_hdrw_pacing = 0;
14722                                 in_pcbdetach_txrtlmt(rack->rc_inp);
14723                         }
14724 #endif
14725                 }
14726                 break;
14727         /*  End Pacing related ones */
14728         case TCP_RACK_PRR_SENDALOT:
14729                 /* Allow PRR to send more than one seg */
14730                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
14731                 rack->r_ctl.rc_prr_sendalot = optval;
14732                 break;
14733         case TCP_RACK_MIN_TO:
14734                 /* Minimum time between rack t-o's in ms */
14735                 RACK_OPTS_INC(tcp_rack_min_to);
14736                 rack->r_ctl.rc_min_to = optval;
14737                 break;
14738         case TCP_RACK_EARLY_SEG:
14739                 /* If early recovery max segments */
14740                 RACK_OPTS_INC(tcp_rack_early_seg);
14741                 rack->r_ctl.rc_early_recovery_segs = optval;
14742                 break;
14743         case TCP_RACK_REORD_THRESH:
14744                 /* RACK reorder threshold (shift amount) */
14745                 RACK_OPTS_INC(tcp_rack_reord_thresh);
14746                 if ((optval > 0) && (optval < 31))
14747                         rack->r_ctl.rc_reorder_shift = optval;
14748                 else
14749                         error = EINVAL;
14750                 break;
14751         case TCP_RACK_REORD_FADE:
14752                 /* Does reordering fade after ms time */
14753                 RACK_OPTS_INC(tcp_rack_reord_fade);
14754                 rack->r_ctl.rc_reorder_fade = optval;
14755                 break;
14756         case TCP_RACK_TLP_THRESH:
14757                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
14758                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
14759                 if (optval)
14760                         rack->r_ctl.rc_tlp_threshold = optval;
14761                 else
14762                         error = EINVAL;
14763                 break;
14764         case TCP_BBR_USE_RACK_RR:
14765                 RACK_OPTS_INC(tcp_rack_rr);
14766                 if (optval)
14767                         rack->use_rack_rr = 1;
14768                 else
14769                         rack->use_rack_rr = 0;
14770                 break;
14771         case TCP_RACK_PKT_DELAY:
14772                 /* RACK added ms i.e. rack-rtt + reord + N */
14773                 RACK_OPTS_INC(tcp_rack_pkt_delay);
14774                 rack->r_ctl.rc_pkt_delay = optval;
14775                 break;
14776         case TCP_RACK_TLP_INC_VAR:
14777                 /* Does TLP include rtt variance in t-o; setting it is always rejected */
14778                 error = EINVAL;
14779                 break;
14780         case TCP_RACK_IDLE_REDUCE_HIGH:
14781                 error = EINVAL; /* always rejected */
14782                 break;
14783         case TCP_DELACK:
14784                 if (optval == 0)
14785                         tp->t_delayed_ack = 0;
14786                 else
14787                         tp->t_delayed_ack = 1;
14788                 if (tp->t_flags & TF_DELACK) {  /* swap TF_DELACK for TF_ACKNOW and run output in the net epoch */
14789                         tp->t_flags &= ~TF_DELACK;
14790                         tp->t_flags |= TF_ACKNOW;
14791                         NET_EPOCH_ENTER(et);
14792                         rack_output(tp);        /* return value is not checked */
14793                         NET_EPOCH_EXIT(et);
14794                 }
14795                 break;
14796
14797         case TCP_BBR_RACK_RTT_USE:
14798                 if ((optval != USE_RTT_HIGH) &&
14799                     (optval != USE_RTT_LOW) &&
14800                     (optval != USE_RTT_AVG))
14801                         error = EINVAL;
14802                 else
14803                         rack->r_ctl.rc_rate_sample_method = optval;
14804                 break;
14805         case TCP_DATA_AFTER_CLOSE:
14806                 if (optval)
14807                         rack->rc_allow_data_af_clo = 1;
14808                 else
14809                         rack->rc_allow_data_af_clo = 0;
14810                 break;
14811         case TCP_RACK_PACE_REDUCE:
14812                 /* sysctl only now */
14813                 error = EINVAL;
14814                 break;
14815         default:        /* every name admitted by the first switch has a case above */
14816                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14817                 break;  /* not reached */
14818         }
14819 #ifdef NETFLIX_STATS
14820         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
14821 #endif
14822         INP_WUNLOCK(inp);
14823         return (error);
14824 }
14825
14826 static int
14827 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
14828     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
14829 {
14830         int32_t error, optval;
14831         uint64_t val;
14832         /*
14833          * Because all our options are either boolean or an int, we can just
14834          * pull everything into optval and then unlock and copy. If we ever
14835          * add an option that is not an int, then this will have quite an
14836          * impact on this routine.
14837          */
14838         error = 0;
14839         switch (sopt->sopt_name) {
14840         case TCP_RACK_PROFILE:
14841                 /* You cannot retrieve a profile, its write only */
14842                 error = EINVAL;
14843                 break;
14844         case TCP_RACK_PACE_TO_FILL:
14845                 optval = rack->rc_pace_to_cwnd;
14846                 break;
14847         case TCP_RACK_NO_PUSH_AT_MAX:
14848                 optval = rack->r_ctl.rc_no_push_at_mrtt;
14849                 break;
14850         case TCP_SHARED_CWND_ENABLE:
14851                 optval = rack->rack_enable_scwnd;
14852                 break;
14853         case TCP_RACK_NONRXT_CFG_RATE:
14854                 optval = rack->rack_rec_nonrxt_use_cr;
14855                 break;
14856         case TCP_NO_PRR:
14857                 optval = rack->rack_no_prr;
14858                 break;
14859         case TCP_RACK_DO_DETECTION:
14860                 optval = rack->do_detection;
14861                 break;
14862         case TCP_RACK_MBUF_QUEUE:
14863                 /* Now do we use the LRO mbuf-queue feature */
14864                 optval = rack->r_mbuf_queue;
14865                 break;
14866         case TCP_TIMELY_DYN_ADJ:
14867                 optval = rack->rc_gp_dyn_mul;
14868                 break;
14869         case TCP_BBR_IWINTSO:
14870                 optval = rack->rc_init_win;
14871                 break;
14872         case TCP_RACK_PROP_RATE:
14873                 optval = rack->r_ctl.rc_prop_rate;
14874                 break;
14875         case TCP_RACK_PROP:
14876                 /* RACK proportional rate reduction (bool) */
14877                 optval = rack->r_ctl.rc_prop_reduce;
14878                 break;
14879         case TCP_RACK_TLP_REDUCE:
14880                 /* RACK TLP cwnd reduction (bool) */
14881                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
14882                 break;
14883         case TCP_RACK_EARLY_RECOV:
14884                 /* Should recovery happen early (bool) */
14885                 optval = rack->r_ctl.rc_early_recovery;
14886                 break;
14887         case TCP_RACK_PACE_REDUCE:
14888                 /* RACK Hptsi reduction factor (divisor) */
14889                 error = EINVAL;
14890                 break;
14891         case TCP_BBR_RACK_INIT_RATE:
14892                 val = rack->r_ctl.init_rate;
14893                 /* convert to kbits per sec */
14894                 val *= 8;
14895                 val /= 1000;
14896                 optval = (uint32_t)val;
14897                 break;
14898         case TCP_RACK_FORCE_MSEG:
14899                 optval = rack->rc_force_max_seg;
14900                 break;
14901         case TCP_RACK_PACE_MAX_SEG:
14902                 /* Max segments in a pace */
14903                 optval = rack->rc_user_set_max_segs;
14904                 break;
14905         case TCP_RACK_PACE_ALWAYS:
14906                 /* Use the always pace method */
14907                 optval = rack->rc_always_pace;
14908                 break;
14909         case TCP_RACK_PRR_SENDALOT:
14910                 /* Allow PRR to send more than one seg */
14911                 optval = rack->r_ctl.rc_prr_sendalot;
14912                 break;
14913         case TCP_RACK_MIN_TO:
14914                 /* Minimum time between rack t-o's in ms */
14915                 optval = rack->r_ctl.rc_min_to;
14916                 break;
14917         case TCP_RACK_EARLY_SEG:
14918                 /* If early recovery max segments */
14919                 optval = rack->r_ctl.rc_early_recovery_segs;
14920                 break;
14921         case TCP_RACK_REORD_THRESH:
14922                 /* RACK reorder threshold (shift amount) */
14923                 optval = rack->r_ctl.rc_reorder_shift;
14924                 break;
14925         case TCP_RACK_REORD_FADE:
14926                 /* Does reordering fade after ms time */
14927                 optval = rack->r_ctl.rc_reorder_fade;
14928                 break;
14929         case TCP_BBR_USE_RACK_RR:
14930                 /* Do we use the rack cheat for rxt */
14931                 optval = rack->use_rack_rr;
14932                 break;
14933         case TCP_RACK_RR_CONF:
14934                 optval = rack->r_rr_config;
14935                 break;
14936         case TCP_BBR_HDWR_PACE:
14937                 optval = rack->rack_hdw_pace_ena;
14938                 break;
14939         case TCP_RACK_TLP_THRESH:
14940                 /* RACK TLP theshold i.e. srtt+(srtt/N) */
14941                 optval = rack->r_ctl.rc_tlp_threshold;
14942                 break;
14943         case TCP_RACK_PKT_DELAY:
14944                 /* RACK added ms i.e. rack-rtt + reord + N */
14945                 optval = rack->r_ctl.rc_pkt_delay;
14946                 break;
14947         case TCP_RACK_TLP_USE:
14948                 optval = rack->rack_tlp_threshold_use;
14949                 break;
14950         case TCP_RACK_TLP_INC_VAR:
14951                 /* Does TLP include rtt variance in t-o */
14952                 error = EINVAL;
14953                 break;
14954         case TCP_RACK_IDLE_REDUCE_HIGH:
14955                 error = EINVAL;
14956                 break;
14957         case TCP_RACK_PACE_RATE_CA:
14958                 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
14959                 break;
14960         case TCP_RACK_PACE_RATE_SS:
14961                 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
14962                 break;
14963         case TCP_RACK_PACE_RATE_REC:
14964                 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
14965                 break;
14966         case TCP_RACK_GP_INCREASE_SS:
14967                 optval = rack->r_ctl.rack_per_of_gp_ca;
14968                 break;
14969         case TCP_RACK_GP_INCREASE_CA:
14970                 optval = rack->r_ctl.rack_per_of_gp_ss;
14971                 break;
14972         case TCP_BBR_RACK_RTT_USE:
14973                 optval = rack->r_ctl.rc_rate_sample_method;
14974                 break;
14975         case TCP_DELACK:
14976                 optval = tp->t_delayed_ack;
14977                 break;
14978         case TCP_DATA_AFTER_CLOSE:
14979                 optval = rack->rc_allow_data_af_clo;
14980                 break;
14981         case TCP_SHARED_CWND_TIME_LIMIT:
14982                 optval = rack->r_limit_scw;
14983                 break;
14984         default:
14985                 return (tcp_default_ctloutput(so, sopt, inp, tp));
14986                 break;
14987         }
14988         INP_WUNLOCK(inp);
14989         if (error == 0) {
14990                 error = sooptcopyout(sopt, &optval, sizeof optval);
14991         }
14992         return (error);
14993 }
14994
14995 static int
14996 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
14997 {
14998         int32_t error = EINVAL;
14999         struct tcp_rack *rack;
15000
15001         rack = (struct tcp_rack *)tp->t_fb_ptr;
15002         if (rack == NULL) {
15003                 /* Huh? */
15004                 goto out;
15005         }
15006         if (sopt->sopt_dir == SOPT_SET) {
15007                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
15008         } else if (sopt->sopt_dir == SOPT_GET) {
15009                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
15010         }
15011 out:
15012         INP_WUNLOCK(inp);
15013         return (error);
15014 }
15015
15016 static int
15017 rack_pru_options(struct tcpcb *tp, int flags)
15018 {
15019         if (flags & PRUS_OOB)
15020                 return (EOPNOTSUPP);
15021         return (0);
15022 }
15023
15024 static struct tcp_function_block __tcp_rack = {
15025         .tfb_tcp_block_name = __XSTRING(STACKNAME),
15026         .tfb_tcp_output = rack_output,
15027         .tfb_do_queued_segments = ctf_do_queued_segments,
15028         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
15029         .tfb_tcp_do_segment = rack_do_segment,
15030         .tfb_tcp_ctloutput = rack_ctloutput,
15031         .tfb_tcp_fb_init = rack_init,
15032         .tfb_tcp_fb_fini = rack_fini,
15033         .tfb_tcp_timer_stop_all = rack_stopall,
15034         .tfb_tcp_timer_activate = rack_timer_activate,
15035         .tfb_tcp_timer_active = rack_timer_active,
15036         .tfb_tcp_timer_stop = rack_timer_stop,
15037         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
15038         .tfb_tcp_handoff_ok = rack_handoff_ok,
15039         .tfb_pru_options = rack_pru_options,
15040 };
15041
15042 static const char *rack_stack_names[] = {
15043         __XSTRING(STACKNAME),
15044 #ifdef STACKALIAS
15045         __XSTRING(STACKALIAS),
15046 #endif
15047 };
15048
15049 static int
15050 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
15051 {
15052         memset(mem, 0, size);
15053         return (0);
15054 }
15055
15056 static void
15057 rack_dtor(void *mem, int32_t size, void *arg)
15058 {
15059
15060 }
15061
15062 static bool rack_mod_inited = false;
15063
15064 static int
15065 tcp_addrack(module_t mod, int32_t type, void *data)
15066 {
15067         int32_t err = 0;
15068         int num_stacks;
15069
15070         switch (type) {
15071         case MOD_LOAD:
15072                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
15073                     sizeof(struct rack_sendmap),
15074                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
15075
15076                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
15077                     sizeof(struct tcp_rack),
15078                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
15079
15080                 sysctl_ctx_init(&rack_sysctl_ctx);
15081                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
15082                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
15083                     OID_AUTO,
15084 #ifdef STACKALIAS
15085                     __XSTRING(STACKALIAS),
15086 #else
15087                     __XSTRING(STACKNAME),
15088 #endif
15089                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
15090                     "");
15091                 if (rack_sysctl_root == NULL) {
15092                         printf("Failed to add sysctl node\n");
15093                         err = EFAULT;
15094                         goto free_uma;
15095                 }
15096                 rack_init_sysctls();
15097                 num_stacks = nitems(rack_stack_names);
15098                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
15099                     rack_stack_names, &num_stacks);
15100                 if (err) {
15101                         printf("Failed to register %s stack name for "
15102                             "%s module\n", rack_stack_names[num_stacks],
15103                             __XSTRING(MODNAME));
15104                         sysctl_ctx_free(&rack_sysctl_ctx);
15105 free_uma:
15106                         uma_zdestroy(rack_zone);
15107                         uma_zdestroy(rack_pcb_zone);
15108                         rack_counter_destroy();
15109                         printf("Failed to register rack module -- err:%d\n", err);
15110                         return (err);
15111                 }
15112                 tcp_lro_reg_mbufq();
15113                 rack_mod_inited = true;
15114                 break;
15115         case MOD_QUIESCE:
15116                 err = deregister_tcp_functions(&__tcp_rack, true, false);
15117                 break;
15118         case MOD_UNLOAD:
15119                 err = deregister_tcp_functions(&__tcp_rack, false, true);
15120                 if (err == EBUSY)
15121                         break;
15122                 if (rack_mod_inited) {
15123                         uma_zdestroy(rack_zone);
15124                         uma_zdestroy(rack_pcb_zone);
15125                         sysctl_ctx_free(&rack_sysctl_ctx);
15126                         rack_counter_destroy();
15127                         rack_mod_inited = false;
15128                 }
15129                 tcp_lro_dereg_mbufq();
15130                 err = 0;
15131                 break;
15132         default:
15133                 return (EOPNOTSUPP);
15134         }
15135         return (err);
15136 }
15137
15138 static moduledata_t tcp_rack = {
15139         .name = __XSTRING(MODNAME),
15140         .evhand = tcp_addrack,
15141         .priv = 0
15142 };
15143
15144 MODULE_VERSION(MODNAME, 1);
15145 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
15146 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);