sys/netinet/tcp_stacks/rack.c

   1 /*-
   2  * Copyright (c) 2016-2019
   3  *      Netflix Inc.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  */
  27
  28 #include <sys/cdefs.h>
  29 __FBSDID("$FreeBSD$");
  30
  31 #include "opt_inet.h"
  32 #include "opt_inet6.h"
  33 #include "opt_ipsec.h"
  34 #include "opt_tcpdebug.h"
  35
  36 #include <sys/param.h>
  37 #include <sys/module.h>
  38 #include <sys/kernel.h>
  39 #ifdef TCP_HHOOK
  40 #include <sys/hhook.h>
  41 #endif
  42 #include <sys/lock.h>
  43 #include <sys/malloc.h>
  44 #include <sys/lock.h>
  45 #include <sys/mutex.h>
  46 #include <sys/mbuf.h>
  47 #include <sys/proc.h>           /* for proc0 declaration */
  48 #ifdef NETFLIX_STATS
  49 #include <sys/qmath.h>
  50 #endif
  51 #include <sys/socket.h>
  52 #include <sys/socketvar.h>
  53 #include <sys/sysctl.h>
  54 #include <sys/systm.h>
  55 #include <sys/tree.h>
  56 #ifdef NETFLIX_STATS
  57 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
  58 #endif
  59 #include <sys/refcount.h>
  60 #include <sys/queue.h>
  61 #include <sys/smp.h>
  62 #include <sys/kthread.h>
  63 #include <sys/kern_prefetch.h>
  64
  65 #include <vm/uma.h>
  66
  67 #include <net/route.h>
  68 #include <net/vnet.h>
  69
  70 #define TCPSTATES               /* for logging */
  71
  72 #include <netinet/in.h>
  73 #include <netinet/in_kdtrace.h>
  74 #include <netinet/in_pcb.h>
  75 #include <netinet/ip.h>
  76 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
  77 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
  78 #include <netinet/ip_var.h>
  79 #include <netinet/ip6.h>
  80 #include <netinet6/in6_pcb.h>
  81 #include <netinet6/ip6_var.h>
  82 #define TCPOUTFLAGS
  83 #include <netinet/tcp.h>
  84 #include <netinet/tcp_fsm.h>
  85 #include <netinet/tcp_log_buf.h>
  86 #include <netinet/tcp_seq.h>
  87 #include <netinet/tcp_timer.h>
  88 #include <netinet/tcp_var.h>
  89 #include <netinet/tcp_hpts.h>
  90 #include <netinet/tcpip.h>
  91 #include <netinet/cc/cc.h>
  92 #include <netinet/tcp_fastopen.h>
  93 #ifdef TCPDEBUG
  94 #include <netinet/tcp_debug.h>
  95 #endif                          /* TCPDEBUG */
  96 #ifdef TCP_OFFLOAD
  97 #include <netinet/tcp_offload.h>
  98 #endif
  99 #ifdef INET6
 100 #include <netinet6/tcp6_var.h>
 101 #endif
 102
 103 #include <netipsec/ipsec_support.h>
 104
 105 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 106 #include <netipsec/ipsec.h>
 107 #include <netipsec/ipsec6.h>
 108 #endif                          /* IPSEC */
 109
 110 #include <netinet/udp.h>
 111 #include <netinet/udp_var.h>
 112 #include <machine/in_cksum.h>
 113
 114 #ifdef MAC
 115 #include <security/mac/mac_framework.h>
 116 #endif
 117 #include "sack_filter.h"
 118 #include "tcp_rack.h"
 119 #include "rack_bbr_common.h"
 120
 121 uma_zone_t rack_zone;
 122 uma_zone_t rack_pcb_zone;
 123
 124 #ifndef TICKS2SBT
 125 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
 126 #endif
 127
 128 struct sysctl_ctx_list rack_sysctl_ctx;
 129 struct sysctl_oid *rack_sysctl_root;
 130
 131 #define CUM_ACKED 1
 132 #define SACKED 2
 133
 134 /*
 135  * The RACK module incorporates a number of
 136  * TCP ideas that have been put out into the IETF
 137  * over the last few years:
 138  * - Matt Mathis's Rate Halving which slowly drops
 139  *    the congestion window so that the ack clock can
 140  *    be maintained during a recovery.
 141  * - Yuchung Cheng's RACK TCP (for which its named) that
 142  *    will stop us using the number of dup acks and instead
 143  *    use time as the gage of when we retransmit.
 144  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 145  *    of Dukkipati et.al.
 146  * RACK depends on SACK, so if an endpoint arrives that
 147  * cannot do SACK the state machine below will shuttle the
 148  * connection back to using the "default" TCP stack that is
 149  * in FreeBSD.
 150  *
 151  * To implement RACK the original TCP stack was first decomposed
 152  * into a functional state machine with individual states
 153  * for each of the possible TCP connection states. The do_segement
 154  * functions role in life is to mandate the connection supports SACK
 155  * initially and then assure that the RACK state matches the conenction
 156  * state before calling the states do_segment function. Each
 157  * state is simplified due to the fact that the original do_segment
 158  * has been decomposed and we *know* what state we are in (no
 159  * switches on the state) and all tests for SACK are gone. This
 160  * greatly simplifies what each state does.
 161  *
 162  * TCP output is also over-written with a new version since it
 163  * must maintain the new rack scoreboard.
 164  *
 165  */
 166 static int32_t rack_precache = 1;
 167 static int32_t rack_tlp_thresh = 1;
 168 static int32_t rack_reorder_thresh = 2;
 169 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
 170                                                  * - 60 seconds */
 171 static int32_t rack_pkt_delay = 1;
 172 static int32_t rack_inc_var = 0;/* For TLP */
 173 static int32_t rack_reduce_largest_on_idle = 0;
 174 static int32_t rack_min_pace_time = 0;
 175 static int32_t rack_min_pace_time_seg_req=6;
 176 static int32_t rack_early_recovery = 1;
 177 static int32_t rack_early_recovery_max_seg = 6;
 178 static int32_t rack_send_a_lot_in_prr = 1;
 179 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
 180 static int32_t rack_tlp_in_recovery = 1;        /* Can we do TLP in recovery? */
 181 static int32_t rack_verbose_logging = 0;
 182 static int32_t rack_ignore_data_after_close = 1;
 183 static int32_t rack_map_entries_limit = 1024;
 184 static int32_t rack_map_split_limit = 256;
 185
 186 /*
 187  * Currently regular tcp has a rto_min of 30ms
 188  * the backoff goes 12 times so that ends up
 189  * being a total of 122.850 seconds before a
 190  * connection is killed.
 191  */
 192 static int32_t rack_tlp_min = 10;
 193 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
 194 static int32_t rack_rto_max = 30000;    /* 30 seconds */
 195 static const int32_t rack_free_cache = 2;
 196 static int32_t rack_hptsi_segments = 40;
 197 static int32_t rack_rate_sample_method = USE_RTT_LOW;
 198 static int32_t rack_pace_every_seg = 1;
 199 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
 200 static int32_t rack_slot_reduction = 4;
 201 static int32_t rack_lower_cwnd_at_tlp = 0;
 202 static int32_t rack_use_proportional_reduce = 0;
 203 static int32_t rack_proportional_rate = 10;
 204 static int32_t rack_tlp_max_resend = 2;
 205 static int32_t rack_limited_retran = 0;
 206 static int32_t rack_always_send_oldest = 0;
 207 static int32_t rack_sack_block_limit = 128;
 208 static int32_t rack_use_sack_filter = 1;
 209 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
 210
 211 /* Rack specific counters */
 212 counter_u64_t rack_badfr;
 213 counter_u64_t rack_badfr_bytes;
 214 counter_u64_t rack_rtm_prr_retran;
 215 counter_u64_t rack_rtm_prr_newdata;
 216 counter_u64_t rack_timestamp_mismatch;
 217 counter_u64_t rack_reorder_seen;
 218 counter_u64_t rack_paced_segments;
 219 counter_u64_t rack_unpaced_segments;
 220 counter_u64_t rack_saw_enobuf;
 221 counter_u64_t rack_saw_enetunreach;
 222
 223 /* Tail loss probe counters */
 224 counter_u64_t rack_tlp_tot;
 225 counter_u64_t rack_tlp_newdata;
 226 counter_u64_t rack_tlp_retran;
 227 counter_u64_t rack_tlp_retran_bytes;
 228 counter_u64_t rack_tlp_retran_fail;
 229 counter_u64_t rack_to_tot;
 230 counter_u64_t rack_to_arm_rack;
 231 counter_u64_t rack_to_arm_tlp;
 232 counter_u64_t rack_to_alloc;
 233 counter_u64_t rack_to_alloc_hard;
 234 counter_u64_t rack_to_alloc_emerg;
 235 counter_u64_t rack_to_alloc_limited;
 236 counter_u64_t rack_alloc_limited_conns;
 237 counter_u64_t rack_split_limited;
 238
 239 counter_u64_t rack_sack_proc_all;
 240 counter_u64_t rack_sack_proc_short;
 241 counter_u64_t rack_sack_proc_restart;
 242 counter_u64_t rack_runt_sacks;
 243 counter_u64_t rack_used_tlpmethod;
 244 counter_u64_t rack_used_tlpmethod2;
 245 counter_u64_t rack_enter_tlp_calc;
 246 counter_u64_t rack_input_idle_reduces;
 247 counter_u64_t rack_tlp_does_nada;
 248
 249 /* Temp CPU counters */
 250 counter_u64_t rack_find_high;
 251
 252 counter_u64_t rack_progress_drops;
 253 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
 254 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
 255
 256 /*
 257  * This was originally defined in tcp_timer.c, but is now reproduced here given
 258  * the unification of the SYN and non-SYN retransmit timer exponents combined
 259  * with wanting to retain previous behaviour for previously deployed stack
 260  * versions.
 261  */
 262 int     tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
 263     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
 264
 265 static void
 266 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
 267
 268 static int
 269 rack_process_ack(struct mbuf *m, struct tcphdr *th,
 270     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
 271     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
 272 static int
 273 rack_process_data(struct mbuf *m, struct tcphdr *th,
 274     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
 275     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 276 static void
 277 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
 278     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
 279 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
 280 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
 281     uint8_t limit_type);
 282 static struct rack_sendmap *
 283 rack_check_recovery_mode(struct tcpcb *tp,
 284     uint32_t tsused);
 285 static void
 286 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
 287     uint32_t type);
 288 static void rack_counter_destroy(void);
 289 static int
 290 rack_ctloutput(struct socket *so, struct sockopt *sopt,
 291     struct inpcb *inp, struct tcpcb *tp);
 292 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 293 static void
 294 rack_do_segment(struct mbuf *m, struct tcphdr *th,
 295     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
 296     uint8_t iptos);
 297 static void rack_dtor(void *mem, int32_t size, void *arg);
 298 static void
 299 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
 300     uint32_t t, uint32_t cts);
 301 static struct rack_sendmap *
 302 rack_find_high_nonack(struct tcp_rack *rack,
 303     struct rack_sendmap *rsm);
 304 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
 305 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
 306 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
 307 static int
 308 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
 309     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 310 static int32_t rack_handoff_ok(struct tcpcb *tp);
 311 static int32_t rack_init(struct tcpcb *tp);
 312 static void rack_init_sysctls(void);
 313 static void
 314 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
 315     struct tcphdr *th);
 316 static void
 317 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
 318     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
 319     uint8_t pass, struct rack_sendmap *hintrsm);
 320 static void
 321 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
 322     struct rack_sendmap *rsm);
 323 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
 324 static int32_t rack_output(struct tcpcb *tp);
 325 static void
 326 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
 327     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
 328     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
 329
 330 static uint32_t
 331 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
 332     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
 333     uint32_t cts);
 334 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 335 static void rack_remxt_tmr(struct tcpcb *tp);
 336 static int
 337 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
 338     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 339 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 340 static int32_t rack_stopall(struct tcpcb *tp);
 341 static void
 342 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
 343     uint32_t delta);
 344 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 345 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 346 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 347 static uint32_t
 348 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
 349     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
 350 static void
 351 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
 352     struct rack_sendmap *rsm, uint32_t ts);
 353 static int
 354 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
 355     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
 356 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 357 static void
 358 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
 359     struct tcpcb *tp, int32_t * ret_val);
 360 static int
 361 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
 362     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 363     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 364 static int
 365 rack_do_closing(struct mbuf *m, struct tcphdr *th,
 366     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 367     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 368 static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
 369 static void
 370 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
 371     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
 372 static void
 373 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
 374     struct tcphdr *th, int32_t rstreason, int32_t tlen);
 375 static int
 376 rack_do_established(struct mbuf *m, struct tcphdr *th,
 377     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 378     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 379 static int
 380 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
 381     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 382     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
 383 static int
 384 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
 385     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 386     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 387 static int
 388 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
 389     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 390     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 391 static int
 392 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
 393     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 394     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 395 static int
 396 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
 397     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 398     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 399 static int
 400 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
 401     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
 402     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 403 static int
 404 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
 405     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
 406     int32_t * drop_hdrlen, int32_t * ret_val);
 407 static int
 408 rack_process_rst(struct mbuf *m, struct tcphdr *th,
 409     struct socket *so, struct tcpcb *tp);
 410 struct rack_sendmap *
 411 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
 412     uint32_t tsused);
 413 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
 414 static void
 415      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
 416
 417 static int
 418 rack_ts_check(struct mbuf *m, struct tcphdr *th,
 419     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
 420
 421 int32_t rack_clear_counter=0;
 422
 423
 424 static int
 425 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 426 {
 427         uint32_t stat;
 428         int32_t error;
 429
 430         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
 431         if (error || req->newptr == NULL)
 432                 return error;
 433
 434         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
 435         if (error)
 436                 return (error);
 437         if (stat == 1) {
 438 #ifdef INVARIANTS
 439                 printf("Clearing RACK counters\n");
 440 #endif
 441                 counter_u64_zero(rack_badfr);
 442                 counter_u64_zero(rack_badfr_bytes);
 443                 counter_u64_zero(rack_rtm_prr_retran);
 444                 counter_u64_zero(rack_rtm_prr_newdata);
 445                 counter_u64_zero(rack_timestamp_mismatch);
 446                 counter_u64_zero(rack_reorder_seen);
 447                 counter_u64_zero(rack_tlp_tot);
 448                 counter_u64_zero(rack_tlp_newdata);
 449                 counter_u64_zero(rack_tlp_retran);
 450                 counter_u64_zero(rack_tlp_retran_bytes);
 451                 counter_u64_zero(rack_tlp_retran_fail);
 452                 counter_u64_zero(rack_to_tot);
 453                 counter_u64_zero(rack_to_arm_rack);
 454                 counter_u64_zero(rack_to_arm_tlp);
 455                 counter_u64_zero(rack_paced_segments);
 456                 counter_u64_zero(rack_unpaced_segments);
 457                 counter_u64_zero(rack_saw_enobuf);
 458                 counter_u64_zero(rack_saw_enetunreach);
 459                 counter_u64_zero(rack_to_alloc_hard);
 460                 counter_u64_zero(rack_to_alloc_emerg);
 461                 counter_u64_zero(rack_sack_proc_all);
 462                 counter_u64_zero(rack_sack_proc_short);
 463                 counter_u64_zero(rack_sack_proc_restart);
 464                 counter_u64_zero(rack_to_alloc);
 465                 counter_u64_zero(rack_to_alloc_limited);
 466                 counter_u64_zero(rack_alloc_limited_conns);
 467                 counter_u64_zero(rack_split_limited);
 468                 counter_u64_zero(rack_find_high);
 469                 counter_u64_zero(rack_runt_sacks);
 470                 counter_u64_zero(rack_used_tlpmethod);
 471                 counter_u64_zero(rack_used_tlpmethod2);
 472                 counter_u64_zero(rack_enter_tlp_calc);
 473                 counter_u64_zero(rack_progress_drops);
 474                 counter_u64_zero(rack_tlp_does_nada);
 475         }
 476         rack_clear_counter = 0;
 477         return (0);
 478 }
 479
 480
 481
 482 static void
 483 rack_init_sysctls()
 484 {
 485         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 486             SYSCTL_CHILDREN(rack_sysctl_root),
 487             OID_AUTO, "map_limit", CTLFLAG_RW,
 488             &rack_map_entries_limit , 1024,
 489             "Is there a limit on how big the sendmap can grow? ");
 490
 491         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 492             SYSCTL_CHILDREN(rack_sysctl_root),
 493             OID_AUTO, "map_splitlimit", CTLFLAG_RW,
 494             &rack_map_split_limit , 256,
 495             "Is there a limit on how much splitting a peer can do?");
 496
 497         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 498             SYSCTL_CHILDREN(rack_sysctl_root),
 499             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
 500             &rack_rate_sample_method , USE_RTT_LOW,
 501             "What method should we use for rate sampling 0=high, 1=low ");
 502         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 503             SYSCTL_CHILDREN(rack_sysctl_root),
 504             OID_AUTO, "data_after_close", CTLFLAG_RW,
 505             &rack_ignore_data_after_close, 0,
 506             "Do we hold off sending a RST until all pending data is ack'd");
 507         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 508             SYSCTL_CHILDREN(rack_sysctl_root),
 509             OID_AUTO, "tlpmethod", CTLFLAG_RW,
 510             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
 511             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
 512         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 513             SYSCTL_CHILDREN(rack_sysctl_root),
 514             OID_AUTO, "min_pace_time", CTLFLAG_RW,
 515             &rack_min_pace_time, 0,
 516             "Should we enforce a minimum pace time of 1ms");
 517         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 518             SYSCTL_CHILDREN(rack_sysctl_root),
 519             OID_AUTO, "min_pace_segs", CTLFLAG_RW,
 520             &rack_min_pace_time_seg_req, 6,
 521             "How many segments have to be in the len to enforce min-pace-time");
 522         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 523             SYSCTL_CHILDREN(rack_sysctl_root),
 524             OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
 525             &rack_reduce_largest_on_idle, 0,
 526             "Should we reduce the largest cwnd seen to IW on idle reduction");
 527         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 528             SYSCTL_CHILDREN(rack_sysctl_root),
 529             OID_AUTO, "bb_verbose", CTLFLAG_RW,
 530             &rack_verbose_logging, 0,
 531             "Should RACK black box logging be verbose");
 532         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 533             SYSCTL_CHILDREN(rack_sysctl_root),
 534             OID_AUTO, "sackfiltering", CTLFLAG_RW,
 535             &rack_use_sack_filter, 1,
 536             "Do we use sack filtering?");
 537         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 538             SYSCTL_CHILDREN(rack_sysctl_root),
 539             OID_AUTO, "delayed_ack", CTLFLAG_RW,
 540             &rack_delayed_ack_time, 200,
 541             "Delayed ack time (200ms)");
 542         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 543             SYSCTL_CHILDREN(rack_sysctl_root),
 544             OID_AUTO, "tlpminto", CTLFLAG_RW,
 545             &rack_tlp_min, 10,
 546             "TLP minimum timeout per the specification (10ms)");
 547         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 548             SYSCTL_CHILDREN(rack_sysctl_root),
 549             OID_AUTO, "precache", CTLFLAG_RW,
 550             &rack_precache, 0,
 551             "Where should we precache the mcopy (0 is not at all)");
 552         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 553             SYSCTL_CHILDREN(rack_sysctl_root),
 554             OID_AUTO, "sblklimit", CTLFLAG_RW,
 555             &rack_sack_block_limit, 128,
 556             "When do we start paying attention to small sack blocks");
 557         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 558             SYSCTL_CHILDREN(rack_sysctl_root),
 559             OID_AUTO, "send_oldest", CTLFLAG_RW,
 560             &rack_always_send_oldest, 1,
 561             "Should we always send the oldest TLP and RACK-TLP");
 562         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 563             SYSCTL_CHILDREN(rack_sysctl_root),
 564             OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
 565             &rack_tlp_in_recovery, 1,
 566             "Can we do a TLP during recovery?");
 567         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 568             SYSCTL_CHILDREN(rack_sysctl_root),
 569             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
 570             &rack_limited_retran, 0,
 571             "How many times can a rack timeout drive out sends");
 572         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 573             SYSCTL_CHILDREN(rack_sysctl_root),
 574             OID_AUTO, "minrto", CTLFLAG_RW,
 575             &rack_rto_min, 0,
 576             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
 577         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 578             SYSCTL_CHILDREN(rack_sysctl_root),
 579             OID_AUTO, "maxrto", CTLFLAG_RW,
 580             &rack_rto_max, 0,
 581             "Maxiumum RTO in ms -- should be at least as large as min_rto");
 582         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 583             SYSCTL_CHILDREN(rack_sysctl_root),
 584             OID_AUTO, "tlp_retry", CTLFLAG_RW,
 585             &rack_tlp_max_resend, 2,
 586             "How many times does TLP retry a single segment or multiple with no ACK");
 587         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 588             SYSCTL_CHILDREN(rack_sysctl_root),
 589             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
 590             &rack_use_proportional_reduce, 0,
 591             "Should we proportionaly reduce cwnd based on the number of losses ");
 592         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 593             SYSCTL_CHILDREN(rack_sysctl_root),
 594             OID_AUTO, "recovery_prop", CTLFLAG_RW,
 595             &rack_proportional_rate, 10,
 596             "What percent reduction per loss");
 597         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 598             SYSCTL_CHILDREN(rack_sysctl_root),
 599             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
 600             &rack_lower_cwnd_at_tlp, 0,
 601             "When a TLP completes a retran should we enter recovery?");
 602         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 603             SYSCTL_CHILDREN(rack_sysctl_root),
 604             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
 605             &rack_slot_reduction, 4,
 606             "When setting a slot should we reduce by divisor");
 607         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 608             SYSCTL_CHILDREN(rack_sysctl_root),
 609             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
 610             &rack_pace_every_seg, 1,
 611             "Should we pace out every segment hptsi");
 612         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 613             SYSCTL_CHILDREN(rack_sysctl_root),
 614             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
 615             &rack_hptsi_segments, 6,
 616             "Should we pace out only a limited size of segments");
 617         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 618             SYSCTL_CHILDREN(rack_sysctl_root),
 619             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
 620             &rack_send_a_lot_in_prr, 1,
 621             "Send a lot in prr");
 622         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 623             SYSCTL_CHILDREN(rack_sysctl_root),
 624             OID_AUTO, "minto", CTLFLAG_RW,
 625             &rack_min_to, 1,
 626             "Minimum rack timeout in milliseconds");
 627         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 628             SYSCTL_CHILDREN(rack_sysctl_root),
 629             OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
 630             &rack_early_recovery_max_seg, 6,
 631             "Max segments in early recovery");
 632         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 633             SYSCTL_CHILDREN(rack_sysctl_root),
 634             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
 635             &rack_early_recovery, 1,
 636             "Do we do early recovery with rack");
 637         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 638             SYSCTL_CHILDREN(rack_sysctl_root),
 639             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 640             &rack_reorder_thresh, 2,
 641             "What factor for rack will be added when seeing reordering (shift right)");
 642         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 643             SYSCTL_CHILDREN(rack_sysctl_root),
 644             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 645             &rack_tlp_thresh, 1,
 646             "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 647         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 648             SYSCTL_CHILDREN(rack_sysctl_root),
 649             OID_AUTO, "reorder_fade", CTLFLAG_RW,
 650             &rack_reorder_fade, 0,
 651             "Does reorder detection fade, if so how many ms (0 means never)");
 652         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 653             SYSCTL_CHILDREN(rack_sysctl_root),
 654             OID_AUTO, "pktdelay", CTLFLAG_RW,
 655             &rack_pkt_delay, 1,
 656             "Extra RACK time (in ms) besides reordering thresh");
 657         SYSCTL_ADD_S32(&rack_sysctl_ctx,
 658             SYSCTL_CHILDREN(rack_sysctl_root),
 659             OID_AUTO, "inc_var", CTLFLAG_RW,
 660             &rack_inc_var, 0,
 661             "Should rack add to the TLP timer the variance in rtt calculation");
 662         rack_badfr = counter_u64_alloc(M_WAITOK);
 663         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 664             SYSCTL_CHILDREN(rack_sysctl_root),
 665             OID_AUTO, "badfr", CTLFLAG_RD,
 666             &rack_badfr, "Total number of bad FRs");
 667         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
 668         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 669             SYSCTL_CHILDREN(rack_sysctl_root),
 670             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
 671             &rack_badfr_bytes, "Total number of bad FRs");
 672         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
 673         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 674             SYSCTL_CHILDREN(rack_sysctl_root),
 675             OID_AUTO, "prrsndret", CTLFLAG_RD,
 676             &rack_rtm_prr_retran,
 677             "Total number of prr based retransmits");
 678         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
 679         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 680             SYSCTL_CHILDREN(rack_sysctl_root),
 681             OID_AUTO, "prrsndnew", CTLFLAG_RD,
 682             &rack_rtm_prr_newdata,
 683             "Total number of prr based new transmits");
 684         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
 685         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 686             SYSCTL_CHILDREN(rack_sysctl_root),
 687             OID_AUTO, "tsnf", CTLFLAG_RD,
 688             &rack_timestamp_mismatch,
 689             "Total number of timestamps that we could not find the reported ts");
 690         rack_find_high = counter_u64_alloc(M_WAITOK);
 691         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 692             SYSCTL_CHILDREN(rack_sysctl_root),
 693             OID_AUTO, "findhigh", CTLFLAG_RD,
 694             &rack_find_high,
 695             "Total number of FIN causing find-high");
 696         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
 697         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 698             SYSCTL_CHILDREN(rack_sysctl_root),
 699             OID_AUTO, "reordering", CTLFLAG_RD,
 700             &rack_reorder_seen,
 701             "Total number of times we added delay due to reordering");
 702         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
 703         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 704             SYSCTL_CHILDREN(rack_sysctl_root),
 705             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
 706             &rack_tlp_tot,
 707             "Total number of tail loss probe expirations");
 708         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
 709         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 710             SYSCTL_CHILDREN(rack_sysctl_root),
 711             OID_AUTO, "tlp_new", CTLFLAG_RD,
 712             &rack_tlp_newdata,
 713             "Total number of tail loss probe sending new data");
 714
 715         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
 716         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 717             SYSCTL_CHILDREN(rack_sysctl_root),
 718             OID_AUTO, "tlp_retran", CTLFLAG_RD,
 719             &rack_tlp_retran,
 720             "Total number of tail loss probe sending retransmitted data");
 721         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
 722         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 723             SYSCTL_CHILDREN(rack_sysctl_root),
 724             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
 725             &rack_tlp_retran_bytes,
 726             "Total bytes of tail loss probe sending retransmitted data");
 727         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
 728         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 729             SYSCTL_CHILDREN(rack_sysctl_root),
 730             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
 731             &rack_tlp_retran_fail,
 732             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
 733         rack_to_tot = counter_u64_alloc(M_WAITOK);
 734         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 735             SYSCTL_CHILDREN(rack_sysctl_root),
 736             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
 737             &rack_to_tot,
 738             "Total number of times the rack to expired?");
 739         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
 740         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 741             SYSCTL_CHILDREN(rack_sysctl_root),
 742             OID_AUTO, "arm_rack", CTLFLAG_RD,
 743             &rack_to_arm_rack,
 744             "Total number of times the rack timer armed?");
 745         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
 746         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 747             SYSCTL_CHILDREN(rack_sysctl_root),
 748             OID_AUTO, "arm_tlp", CTLFLAG_RD,
 749             &rack_to_arm_tlp,
 750             "Total number of times the tlp timer armed?");
 751         rack_paced_segments = counter_u64_alloc(M_WAITOK);
 752         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 753             SYSCTL_CHILDREN(rack_sysctl_root),
 754             OID_AUTO, "paced", CTLFLAG_RD,
 755             &rack_paced_segments,
 756             "Total number of times a segment send caused hptsi");
 757         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
 758         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 759             SYSCTL_CHILDREN(rack_sysctl_root),
 760             OID_AUTO, "unpaced", CTLFLAG_RD,
 761             &rack_unpaced_segments,
 762             "Total number of times a segment did not cause hptsi");
 763         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
 764         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 765             SYSCTL_CHILDREN(rack_sysctl_root),
 766             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
 767             &rack_saw_enobuf,
 768             "Total number of times a segment did not cause hptsi");
 769         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
 770         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 771             SYSCTL_CHILDREN(rack_sysctl_root),
 772             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
 773             &rack_saw_enetunreach,
 774             "Total number of times a segment did not cause hptsi");
 775         rack_to_alloc = counter_u64_alloc(M_WAITOK);
 776         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 777             SYSCTL_CHILDREN(rack_sysctl_root),
 778             OID_AUTO, "allocs", CTLFLAG_RD,
 779             &rack_to_alloc,
 780             "Total allocations of tracking structures");
 781         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
 782         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 783             SYSCTL_CHILDREN(rack_sysctl_root),
 784             OID_AUTO, "allochard", CTLFLAG_RD,
 785             &rack_to_alloc_hard,
 786             "Total allocations done with sleeping the hard way");
 787         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
 788         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 789             SYSCTL_CHILDREN(rack_sysctl_root),
 790             OID_AUTO, "allocemerg", CTLFLAG_RD,
 791             &rack_to_alloc_emerg,
 792             "Total allocations done from emergency cache");
 793         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
 794         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 795             SYSCTL_CHILDREN(rack_sysctl_root),
 796             OID_AUTO, "alloc_limited", CTLFLAG_RD,
 797             &rack_to_alloc_limited,
 798             "Total allocations dropped due to limit");
 799         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
 800         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 801             SYSCTL_CHILDREN(rack_sysctl_root),
 802             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
 803             &rack_alloc_limited_conns,
 804             "Connections with allocations dropped due to limit");
 805         rack_split_limited = counter_u64_alloc(M_WAITOK);
 806         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 807             SYSCTL_CHILDREN(rack_sysctl_root),
 808             OID_AUTO, "split_limited", CTLFLAG_RD,
 809             &rack_split_limited,
 810             "Split allocations dropped due to limit");
 811         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
 812         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 813             SYSCTL_CHILDREN(rack_sysctl_root),
 814             OID_AUTO, "sack_long", CTLFLAG_RD,
 815             &rack_sack_proc_all,
 816             "Total times we had to walk whole list for sack processing");
 817
 818         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
 819         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 820             SYSCTL_CHILDREN(rack_sysctl_root),
 821             OID_AUTO, "sack_restart", CTLFLAG_RD,
 822             &rack_sack_proc_restart,
 823             "Total times we had to walk whole list due to a restart");
 824         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
 825         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 826             SYSCTL_CHILDREN(rack_sysctl_root),
 827             OID_AUTO, "sack_short", CTLFLAG_RD,
 828             &rack_sack_proc_short,
 829             "Total times we took shortcut for sack processing");
 830         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
 831         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 832             SYSCTL_CHILDREN(rack_sysctl_root),
 833             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
 834             &rack_enter_tlp_calc,
 835             "Total times we called calc-tlp");
 836         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
 837         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 838             SYSCTL_CHILDREN(rack_sysctl_root),
 839             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
 840             &rack_used_tlpmethod,
 841             "Total number of runt sacks");
 842         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
 843         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 844             SYSCTL_CHILDREN(rack_sysctl_root),
 845             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
 846             &rack_used_tlpmethod2,
 847             "Total number of runt sacks 2");
 848         rack_runt_sacks = counter_u64_alloc(M_WAITOK);
 849         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 850             SYSCTL_CHILDREN(rack_sysctl_root),
 851             OID_AUTO, "runtsacks", CTLFLAG_RD,
 852             &rack_runt_sacks,
 853             "Total number of runt sacks");
 854         rack_progress_drops = counter_u64_alloc(M_WAITOK);
 855         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 856             SYSCTL_CHILDREN(rack_sysctl_root),
 857             OID_AUTO, "prog_drops", CTLFLAG_RD,
 858             &rack_progress_drops,
 859             "Total number of progress drops");
 860         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
 861         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 862             SYSCTL_CHILDREN(rack_sysctl_root),
 863             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 864             &rack_input_idle_reduces,
 865             "Total number of idle reductions on input");
 866         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
 867         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 868             SYSCTL_CHILDREN(rack_sysctl_root),
 869             OID_AUTO, "tlp_nada", CTLFLAG_RD,
 870             &rack_tlp_does_nada,
 871             "Total number of nada tlp calls");
 872         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
 873         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 874             OID_AUTO, "outsize", CTLFLAG_RD,
 875             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
 876         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
 877         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 878             OID_AUTO, "opts", CTLFLAG_RD,
 879             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
 880         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
 881             SYSCTL_CHILDREN(rack_sysctl_root),
 882             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 883             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
 884 }
 885
 886 static inline int32_t
 887 rack_progress_timeout_check(struct tcpcb *tp)
 888 {
 889 #ifdef NETFLIX_PROGRESS
 890         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
 891                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
 892                         /*
 893                          * There is an assumption that the caller
 894                          * will drop the connection so we will
 895                          * increment the counters here.
 896                          */
 897                         struct tcp_rack *rack;
 898                         rack = (struct tcp_rack *)tp->t_fb_ptr;
 899                         counter_u64_add(rack_progress_drops, 1);
 900                         TCPSTAT_INC(tcps_progdrops);
 901                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
 902                         return (1);
 903                 }
 904         }
 905 #endif
 906         return (0);
 907 }
 908
 909
 910 static void
 911 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
 912 {
 913         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 914                 union tcp_log_stackspecific log;
 915
 916                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 917                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
 918                 log.u_bbr.flex2 = to;
 919                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 920                 log.u_bbr.flex4 = slot;
 921                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
 922                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 923                 log.u_bbr.flex8 = which;
 924                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 925                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 926                 TCP_LOG_EVENT(rack->rc_tp, NULL,
 927                     &rack->rc_inp->inp_socket->so_rcv,
 928                     &rack->rc_inp->inp_socket->so_snd,
 929                     BBR_LOG_TIMERSTAR, 0,
 930                     0, &log, false);
 931         }
 932 }
 933
 934 static void
 935 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
 936 {
 937         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 938                 union tcp_log_stackspecific log;
 939
 940                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 941                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 942                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 943                 log.u_bbr.flex8 = to_num;
 944                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 945                 log.u_bbr.flex2 = rack->rc_rack_rtt;
 946                 TCP_LOG_EVENT(rack->rc_tp, NULL,
 947                     &rack->rc_inp->inp_socket->so_rcv,
 948                     &rack->rc_inp->inp_socket->so_snd,
 949                     BBR_LOG_RTO, 0,
 950                     0, &log, false);
 951         }
 952 }
 953
 954 static void
 955 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
 956     uint32_t o_srtt, uint32_t o_var)
 957 {
 958         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 959                 union tcp_log_stackspecific log;
 960
 961                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 962                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 963                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 964                 log.u_bbr.flex1 = t;
 965                 log.u_bbr.flex2 = o_srtt;
 966                 log.u_bbr.flex3 = o_var;
 967                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
 968                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
 969                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
 970                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
 971                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
 972                 TCP_LOG_EVENT(tp, NULL,
 973                     &rack->rc_inp->inp_socket->so_rcv,
 974                     &rack->rc_inp->inp_socket->so_snd,
 975                     BBR_LOG_BBRRTT, 0,
 976                     0, &log, false);
 977         }
 978 }
 979
 980 static void
 981 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
 982 {
 983         /*
 984          * Log the rtt sample we are
 985          * applying to the srtt algorithm in
 986          * useconds.
 987          */
 988         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 989                 union tcp_log_stackspecific log;
 990                 struct timeval tv;
 991
 992                 memset(&log, 0, sizeof(log));
 993                 /* Convert our ms to a microsecond */
 994                 log.u_bbr.flex1 = rtt * 1000;
 995                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 996                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
 997                     &rack->rc_inp->inp_socket->so_rcv,
 998                     &rack->rc_inp->inp_socket->so_snd,
 999                     TCP_LOG_RTT, 0,
1000                     0, &log, false, &tv);
1001         }
1002 }
1003
1004
1005 static inline void
1006 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
1007 {
1008         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
1009                 union tcp_log_stackspecific log;
1010
1011                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1012                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1013                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1014                 log.u_bbr.flex1 = line;
1015                 log.u_bbr.flex2 = tick;
1016                 log.u_bbr.flex3 = tp->t_maxunacktime;
1017                 log.u_bbr.flex4 = tp->t_acktime;
1018                 log.u_bbr.flex8 = event;
1019                 TCP_LOG_EVENT(tp, NULL,
1020                     &rack->rc_inp->inp_socket->so_rcv,
1021                     &rack->rc_inp->inp_socket->so_snd,
1022                     BBR_LOG_PROGRESS, 0,
1023                     0, &log, false);
1024         }
1025 }
1026
1027 static void
1028 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1029 {
1030         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1031                 union tcp_log_stackspecific log;
1032
1033                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1034                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1035                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1036                 log.u_bbr.flex1 = slot;
1037                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1038                 log.u_bbr.flex8 = rack->rc_in_persist;
1039                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1040                     &rack->rc_inp->inp_socket->so_rcv,
1041                     &rack->rc_inp->inp_socket->so_snd,
1042                     BBR_LOG_BBRSND, 0,
1043                     0, &log, false);
1044         }
1045 }
1046
1047 static void
1048 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1049 {
1050         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1051                 union tcp_log_stackspecific log;
1052
1053                 memset(&log, 0, sizeof(log));
1054                 log.u_bbr.flex1 = did_out;
1055                 log.u_bbr.flex2 = nxt_pkt;
1056                 log.u_bbr.flex3 = way_out;
1057                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1058                 log.u_bbr.flex7 = rack->r_wanted_output;
1059                 log.u_bbr.flex8 = rack->rc_in_persist;
1060                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1061                     &rack->rc_inp->inp_socket->so_rcv,
1062                     &rack->rc_inp->inp_socket->so_snd,
1063                     BBR_LOG_DOSEG_DONE, 0,
1064                     0, &log, false);
1065         }
1066 }
1067
1068
1069 static void
1070 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1071 {
1072         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1073                 union tcp_log_stackspecific log;
1074
1075                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1076                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1077                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1078                 log.u_bbr.flex1 = slot;
1079                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1080                 log.u_bbr.flex7 = hpts_calling;
1081                 log.u_bbr.flex8 = rack->rc_in_persist;
1082                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1083                     &rack->rc_inp->inp_socket->so_rcv,
1084                     &rack->rc_inp->inp_socket->so_snd,
1085                     BBR_LOG_JUSTRET, 0,
1086                     tlen, &log, false);
1087         }
1088 }
1089
1090 static void
1091 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1092 {
1093         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1094                 union tcp_log_stackspecific log;
1095
1096                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1097                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1098                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1099                 log.u_bbr.flex1 = line;
1100                 log.u_bbr.flex2 = 0;
1101                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1102                 log.u_bbr.flex4 = 0;
1103                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1104                 log.u_bbr.flex8 = hpts_removed;
1105                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1106                     &rack->rc_inp->inp_socket->so_rcv,
1107                     &rack->rc_inp->inp_socket->so_snd,
1108                     BBR_LOG_TIMERCANC, 0,
1109                     0, &log, false);
1110         }
1111 }
1112
1113 static void
1114 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1115 {
1116         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1117                 union tcp_log_stackspecific log;
1118
1119                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1120                 log.u_bbr.flex1 = timers;
1121                 log.u_bbr.flex2 = ret;
1122                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1123                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1124                 log.u_bbr.flex5 = cts;
1125                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1126                     &rack->rc_inp->inp_socket->so_rcv,
1127                     &rack->rc_inp->inp_socket->so_snd,
1128                     BBR_LOG_TO_PROCESS, 0,
1129                     0, &log, false);
1130         }
1131 }
1132
1133 static void
1134 rack_counter_destroy()
1135 {
1136         counter_u64_free(rack_badfr);
1137         counter_u64_free(rack_badfr_bytes);
1138         counter_u64_free(rack_rtm_prr_retran);
1139         counter_u64_free(rack_rtm_prr_newdata);
1140         counter_u64_free(rack_timestamp_mismatch);
1141         counter_u64_free(rack_reorder_seen);
1142         counter_u64_free(rack_tlp_tot);
1143         counter_u64_free(rack_tlp_newdata);
1144         counter_u64_free(rack_tlp_retran);
1145         counter_u64_free(rack_tlp_retran_bytes);
1146         counter_u64_free(rack_tlp_retran_fail);
1147         counter_u64_free(rack_to_tot);
1148         counter_u64_free(rack_to_arm_rack);
1149         counter_u64_free(rack_to_arm_tlp);
1150         counter_u64_free(rack_paced_segments);
1151         counter_u64_free(rack_unpaced_segments);
1152         counter_u64_free(rack_saw_enobuf);
1153         counter_u64_free(rack_saw_enetunreach);
1154         counter_u64_free(rack_to_alloc_hard);
1155         counter_u64_free(rack_to_alloc_emerg);
1156         counter_u64_free(rack_sack_proc_all);
1157         counter_u64_free(rack_sack_proc_short);
1158         counter_u64_free(rack_sack_proc_restart);
1159         counter_u64_free(rack_to_alloc);
1160         counter_u64_free(rack_to_alloc_limited);
1161         counter_u64_free(rack_split_limited);
1162         counter_u64_free(rack_find_high);
1163         counter_u64_free(rack_runt_sacks);
1164         counter_u64_free(rack_enter_tlp_calc);
1165         counter_u64_free(rack_used_tlpmethod);
1166         counter_u64_free(rack_used_tlpmethod2);
1167         counter_u64_free(rack_progress_drops);
1168         counter_u64_free(rack_input_idle_reduces);
1169         counter_u64_free(rack_tlp_does_nada);
1170         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1171         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1172 }
1173
1174 static struct rack_sendmap *
1175 rack_alloc(struct tcp_rack *rack)
1176 {
1177         struct rack_sendmap *rsm;
1178
1179         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1180         if (rsm) {
1181                 rack->r_ctl.rc_num_maps_alloced++;
1182                 counter_u64_add(rack_to_alloc, 1);
1183                 return (rsm);
1184         }
1185         if (rack->rc_free_cnt) {
1186                 counter_u64_add(rack_to_alloc_emerg, 1);
1187                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1188                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
1189                 rack->rc_free_cnt--;
1190                 return (rsm);
1191         }
1192         return (NULL);
1193 }
1194
1195 static struct rack_sendmap *
1196 rack_alloc_full_limit(struct tcp_rack *rack)
1197 {
1198         if ((rack_map_entries_limit > 0) &&
1199             (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
1200                 counter_u64_add(rack_to_alloc_limited, 1);
1201                 if (!rack->alloc_limit_reported) {
1202                         rack->alloc_limit_reported = 1;
1203                         counter_u64_add(rack_alloc_limited_conns, 1);
1204                 }
1205                 return (NULL);
1206         }
1207         return (rack_alloc(rack));
1208 }
1209
1210 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1211 static struct rack_sendmap *
1212 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1213 {
1214         struct rack_sendmap *rsm;
1215
1216         if (limit_type) {
1217                 /* currently there is only one limit type */
1218                 if (rack_map_split_limit > 0 &&
1219                     rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
1220                         counter_u64_add(rack_split_limited, 1);
1221                         if (!rack->alloc_limit_reported) {
1222                                 rack->alloc_limit_reported = 1;
1223                                 counter_u64_add(rack_alloc_limited_conns, 1);
1224                         }
1225                         return (NULL);
1226                 }
1227         }
1228
1229         /* allocate and mark in the limit type, if set */
1230         rsm = rack_alloc(rack);
1231         if (rsm != NULL && limit_type) {
1232                 rsm->r_limit_type = limit_type;
1233                 rack->r_ctl.rc_num_split_allocs++;
1234         }
1235         return (rsm);
1236 }
1237
1238 static void
1239 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1240 {
1241         if (rsm->r_limit_type) {
1242                 /* currently there is only one limit type */
1243                 rack->r_ctl.rc_num_split_allocs--;
1244         }
1245         if (rack->r_ctl.rc_tlpsend == rsm)
1246                 rack->r_ctl.rc_tlpsend = NULL;
1247         if (rack->r_ctl.rc_next == rsm)
1248                 rack->r_ctl.rc_next = NULL;
1249         if (rack->r_ctl.rc_sacklast == rsm)
1250                 rack->r_ctl.rc_sacklast = NULL;
1251         if (rack->rc_free_cnt < rack_free_cache) {
1252                 memset(rsm, 0, sizeof(struct rack_sendmap));
1253                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
1254                 rsm->r_limit_type = 0;
1255                 rack->rc_free_cnt++;
1256                 return;
1257         }
1258         rack->r_ctl.rc_num_maps_alloced--;
1259         uma_zfree(rack_zone, rsm);
1260 }
1261
1262 /*
1263  * CC wrapper hook functions
1264  */
1265 static void
1266 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1267     uint16_t type, int32_t recovery)
1268 {
1269 #ifdef NETFLIX_STATS
1270         int32_t gput;
1271 #endif
1272
1273         INP_WLOCK_ASSERT(tp->t_inpcb);
1274
1275         tp->ccv->nsegs = nsegs;
1276         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1277         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1278                 uint32_t max;
1279
1280                 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
1281                 if (tp->ccv->bytes_this_ack > max) {
1282                         tp->ccv->bytes_this_ack = max;
1283                 }
1284         }
1285         if (tp->snd_cwnd <= tp->snd_wnd)
1286                 tp->ccv->flags |= CCF_CWND_LIMITED;
1287         else
1288                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1289
1290         if (type == CC_ACK) {
1291 #ifdef NETFLIX_STATS
1292                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1293                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1294                 if ((tp->t_flags & TF_GPUTINPROG) &&
1295                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1296                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1297                             max(1, tcp_ts_getticks() - tp->gput_ts);
1298                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1299                             gput);
1300                         /*
1301                          * XXXLAS: This is a temporary hack, and should be
1302                          * chained off VOI_TCP_GPUT when stats(9) grows an
1303                          * API to deal with chained VOIs.
1304                          */
1305                         if (tp->t_stats_gput_prev > 0)
1306                                 stats_voi_update_abs_s32(tp->t_stats,
1307                                     VOI_TCP_GPUT_ND,
1308                                     ((gput - tp->t_stats_gput_prev) * 100) /
1309                                     tp->t_stats_gput_prev);
1310                         tp->t_flags &= ~TF_GPUTINPROG;
1311                         tp->t_stats_gput_prev = gput;
1312                         if (tp->t_maxpeakrate) {
1313                                 /*
1314                                  * We update t_peakrate_thr. This gives us roughly
1315                                  * one update per round trip time.
1316                                  */
1317                                 tcp_update_peakrate_thr(tp);
1318                         }
1319                 }
1320 #endif
1321                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1322                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1323                             nsegs * V_tcp_abc_l_var * tp->t_maxseg);
1324                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1325                                 tp->t_bytes_acked -= tp->snd_cwnd;
1326                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1327                         }
1328                 } else {
1329                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1330                         tp->t_bytes_acked = 0;
1331                 }
1332         }
1333         if (CC_ALGO(tp)->ack_received != NULL) {
1334                 /* XXXLAS: Find a way to live without this */
1335                 tp->ccv->curack = th->th_ack;
1336                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1337         }
1338 #ifdef NETFLIX_STATS
1339         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1340 #endif
1341         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1342                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1343         }
1344         /* we enforce max peak rate if it is set. */
1345         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1346                 tp->snd_cwnd = tp->t_peakrate_thr;
1347         }
1348 }
1349
1350 static void
1351 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1352 {
1353         struct tcp_rack *rack;
1354
1355         rack = (struct tcp_rack *)tp->t_fb_ptr;
1356         INP_WLOCK_ASSERT(tp->t_inpcb);
1357         if (rack->r_ctl.rc_prr_sndcnt > 0)
1358                 rack->r_wanted_output++;
1359 }
1360
1361 static void
1362 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1363 {
1364         struct tcp_rack *rack;
1365
1366         INP_WLOCK_ASSERT(tp->t_inpcb);
1367         rack = (struct tcp_rack *)tp->t_fb_ptr;
1368         if (CC_ALGO(tp)->post_recovery != NULL) {
1369                 tp->ccv->curack = th->th_ack;
1370                 CC_ALGO(tp)->post_recovery(tp->ccv);
1371         }
1372         /*
1373          * Here we can in theory adjust cwnd to be based on the number of
1374          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1375          * based on the rack_use_proportional flag.
1376          */
1377         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1378                 int32_t reduce;
1379
1380                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1381                 if (reduce > 50) {
1382                         reduce = 50;
1383                 }
1384                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1385         } else {
1386                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1387                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1388                         tp->snd_cwnd = tp->snd_ssthresh;
1389                 }
1390         }
1391         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1392                 /* Suck the next prr cnt back into cwnd */
1393                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1394                 rack->r_ctl.rc_prr_sndcnt = 0;
1395         }
1396         tp->snd_recover = tp->snd_una;
1397         EXIT_RECOVERY(tp->t_flags);
1398 }
1399
1400 static void
1401 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1402 {
1403         struct tcp_rack *rack;
1404
1405         INP_WLOCK_ASSERT(tp->t_inpcb);
1406
1407         rack = (struct tcp_rack *)tp->t_fb_ptr;
1408         switch (type) {
1409         case CC_NDUPACK:
1410 /*              rack->r_ctl.rc_ssthresh_set = 1;*/
1411                 if (!IN_FASTRECOVERY(tp->t_flags)) {
1412                         rack->r_ctl.rc_tlp_rtx_out = 0;
1413                         rack->r_ctl.rc_prr_delivered = 0;
1414                         rack->r_ctl.rc_prr_out = 0;
1415                         rack->r_ctl.rc_loss_count = 0;
1416                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
1417                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1418                         tp->snd_recover = tp->snd_max;
1419                         if (tp->t_flags & TF_ECN_PERMIT)
1420                                 tp->t_flags |= TF_ECN_SND_CWR;
1421                 }
1422                 break;
1423         case CC_ECN:
1424                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1425                         TCPSTAT_INC(tcps_ecn_rcwnd);
1426                         tp->snd_recover = tp->snd_max;
1427                         if (tp->t_flags & TF_ECN_PERMIT)
1428                                 tp->t_flags |= TF_ECN_SND_CWR;
1429                 }
1430                 break;
1431         case CC_RTO:
1432                 tp->t_dupacks = 0;
1433                 tp->t_bytes_acked = 0;
1434                 EXIT_RECOVERY(tp->t_flags);
1435                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1436                     tp->t_maxseg) * tp->t_maxseg;
1437                 tp->snd_cwnd = tp->t_maxseg;
1438                 break;
1439         case CC_RTO_ERR:
1440                 TCPSTAT_INC(tcps_sndrexmitbad);
1441                 /* RTO was unnecessary, so reset everything. */
1442                 tp->snd_cwnd = tp->snd_cwnd_prev;
1443                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1444                 tp->snd_recover = tp->snd_recover_prev;
1445                 if (tp->t_flags & TF_WASFRECOVERY)
1446                         ENTER_FASTRECOVERY(tp->t_flags);
1447                 if (tp->t_flags & TF_WASCRECOVERY)
1448                         ENTER_CONGRECOVERY(tp->t_flags);
1449                 tp->snd_nxt = tp->snd_max;
1450                 tp->t_badrxtwin = 0;
1451                 break;
1452         }
1453
1454         if (CC_ALGO(tp)->cong_signal != NULL) {
1455                 if (th != NULL)
1456                         tp->ccv->curack = th->th_ack;
1457                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1458         }
1459 }
1460
1461
1462
1463 static inline void
1464 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
1465 {
1466         uint32_t i_cwnd;
1467
1468         INP_WLOCK_ASSERT(tp->t_inpcb);
1469
1470 #ifdef NETFLIX_STATS
1471         TCPSTAT_INC(tcps_idle_restarts);
1472         if (tp->t_state == TCPS_ESTABLISHED)
1473                 TCPSTAT_INC(tcps_idle_estrestarts);
1474 #endif
1475         if (CC_ALGO(tp)->after_idle != NULL)
1476                 CC_ALGO(tp)->after_idle(tp->ccv);
1477
1478         if (V_tcp_initcwnd_segments)
1479                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
1480                     max(2 * tp->t_maxseg, 14600));
1481         else if (V_tcp_do_rfc3390)
1482                 i_cwnd = min(4 * tp->t_maxseg,
1483                     max(2 * tp->t_maxseg, 4380));
1484         else {
1485                 /* Per RFC5681 Section 3.1 */
1486                 if (tp->t_maxseg > 2190)
1487                         i_cwnd = 2 * tp->t_maxseg;
1488                 else if (tp->t_maxseg > 1095)
1489                         i_cwnd = 3 * tp->t_maxseg;
1490                 else
1491                         i_cwnd = 4 * tp->t_maxseg;
1492         }
1493         if (reduce_largest) {
1494                 /*
1495                  * Do we reduce the largest cwnd to make
1496                  * rack play nice on restart hptsi wise?
1497                  */
1498                 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
1499                         ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
1500         }
1501         /*
1502          * Being idle is no differnt than the initial window. If the cc
1503          * clamps it down below the initial window raise it to the initial
1504          * window.
1505          */
1506         if (tp->snd_cwnd < i_cwnd) {
1507                 tp->snd_cwnd = i_cwnd;
1508         }
1509 }
1510
1511
1512 /*
1513  * Indicate whether this ack should be delayed.  We can delay the ack if
1514  * following conditions are met:
1515  *      - There is no delayed ack timer in progress.
1516  *      - Our last ack wasn't a 0-sized window. We never want to delay
1517  *        the ack that opens up a 0-sized window.
1518  *      - LRO wasn't used for this segment. We make sure by checking that the
1519  *        segment size is not larger than the MSS.
1520  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1521  *        connection.
1522  */
1523 #define DELAY_ACK(tp, tlen)                      \
1524         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1525         ((tp->t_flags & TF_DELACK) == 0) &&      \
1526         (tlen <= tp->t_maxseg) &&                \
1527         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
1528
1529 static inline void
1530 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
1531 {
1532         int32_t win;
1533
1534         /*
1535          * Calculate amount of space in receive window, and then do TCP
1536          * input processing. Receive window is amount of space in rcv queue,
1537          * but not less than advertised window.
1538          */
1539         win = sbspace(&so->so_rcv);
1540         if (win < 0)
1541                 win = 0;
1542         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1543 }
1544
1545 static void
1546 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
1547 {
1548         /*
1549          * Drop space held by incoming segment and return.
1550          */
1551         if (tp != NULL)
1552                 INP_WUNLOCK(tp->t_inpcb);
1553         if (m)
1554                 m_freem(m);
1555 }
1556
1557 static void
1558 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen)
1559 {
1560         if (tp != NULL) {
1561                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1562                 INP_WUNLOCK(tp->t_inpcb);
1563         } else
1564                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1565 }
1566
1567 /*
1568  * The value in ret_val informs the caller
1569  * if we dropped the tcb (and lock) or not.
1570  * 1 = we dropped it, 0 = the TCB is still locked
1571  * and valid.
1572  */
1573 static void
1574 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
1575 {
1576         /*
1577          * Generate an ACK dropping incoming segment if it occupies sequence
1578          * space, where the ACK reflects our state.
1579          *
1580          * We can now skip the test for the RST flag since all paths to this
1581          * code happen after packets containing RST have been dropped.
1582          *
1583          * In the SYN-RECEIVED state, don't send an ACK unless the segment
1584          * we received passes the SYN-RECEIVED ACK test. If it fails send a
1585          * RST.  This breaks the loop in the "LAND" DoS attack, and also
1586          * prevents an ACK storm between two listening ports that have been
1587          * sent forged SYN segments, each with the source address of the
1588          * other.
1589          */
1590         struct tcp_rack *rack;
1591
1592         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1593             (SEQ_GT(tp->snd_una, th->th_ack) ||
1594             SEQ_GT(th->th_ack, tp->snd_max))) {
1595                 *ret_val = 1;
1596                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
1597                 return;
1598         } else
1599                 *ret_val = 0;
1600         rack = (struct tcp_rack *)tp->t_fb_ptr;
1601         rack->r_wanted_output++;
1602         tp->t_flags |= TF_ACKNOW;
1603         if (m)
1604                 m_freem(m);
1605 }
1606
1607
1608 static int
1609 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
1610 {
1611         /*
1612          * RFC5961 Section 3.2
1613          *
1614          * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
1615          * window, we send challenge ACK.
1616          *
1617          * Note: to take into account delayed ACKs, we should test against
1618          * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
1619          * of closed window, not covered by the RFC.
1620          */
1621         int dropped = 0;
1622
1623         if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
1624             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1625             (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1626
1627                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1628                 KASSERT(tp->t_state != TCPS_SYN_SENT,
1629                     ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
1630                     __func__, th, tp));
1631
1632                 if (V_tcp_insecure_rst ||
1633                     (tp->last_ack_sent == th->th_seq) ||
1634                     (tp->rcv_nxt == th->th_seq) ||
1635                     ((tp->last_ack_sent - 1) == th->th_seq)) {
1636                         TCPSTAT_INC(tcps_drops);
1637                         /* Drop the connection. */
1638                         switch (tp->t_state) {
1639                         case TCPS_SYN_RECEIVED:
1640                                 so->so_error = ECONNREFUSED;
1641                                 goto close;
1642                         case TCPS_ESTABLISHED:
1643                         case TCPS_FIN_WAIT_1:
1644                         case TCPS_FIN_WAIT_2:
1645                         case TCPS_CLOSE_WAIT:
1646                         case TCPS_CLOSING:
1647                         case TCPS_LAST_ACK:
1648                                 so->so_error = ECONNRESET;
1649                 close:
1650                                 tcp_state_change(tp, TCPS_CLOSED);
1651                                 /* FALLTHROUGH */
1652                         default:
1653                                 tp = tcp_close(tp);
1654                         }
1655                         dropped = 1;
1656                         rack_do_drop(m, tp);
1657                 } else {
1658                         TCPSTAT_INC(tcps_badrst);
1659                         /* Send challenge ACK. */
1660                         tcp_respond(tp, mtod(m, void *), th, m,
1661                             tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1662                         tp->last_ack_sent = tp->rcv_nxt;
1663                 }
1664         } else {
1665                 m_freem(m);
1666         }
1667         return (dropped);
1668 }
1669
1670 /*
1671  * The value in ret_val informs the caller
1672  * if we dropped the tcb (and lock) or not.
1673  * 1 = we dropped it, 0 = the TCB is still locked
1674  * and valid.
1675  */
1676 static void
1677 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
1678 {
1679         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1680
1681         TCPSTAT_INC(tcps_badsyn);
1682         if (V_tcp_insecure_syn &&
1683             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1684             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1685                 tp = tcp_drop(tp, ECONNRESET);
1686                 *ret_val = 1;
1687                 rack_do_drop(m, tp);
1688         } else {
1689                 /* Send challenge ACK. */
1690                 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
1691                     tp->snd_nxt, TH_ACK);
1692                 tp->last_ack_sent = tp->rcv_nxt;
1693                 m = NULL;
1694                 *ret_val = 0;
1695                 rack_do_drop(m, NULL);
1696         }
1697 }
1698
1699 /*
1700  * rack_ts_check returns 1 for you should not proceed. It places
1701  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1702  * that the TCB is unlocked and probably dropped. The 0 indicates the
1703  * TCB is still valid and locked.
1704  */
1705 static int
1706 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
1707 {
1708
1709         /* Check to see if ts_recent is over 24 days old.  */
1710         if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1711                 /*
1712                  * Invalidate ts_recent.  If this segment updates ts_recent,
1713                  * the age will be reset later and ts_recent will get a
1714                  * valid value.  If it does not, setting ts_recent to zero
1715                  * will at least satisfy the requirement that zero be placed
1716                  * in the timestamp echo reply when ts_recent isn't valid.
1717                  * The age isn't reset until we get a valid ts_recent
1718                  * because we don't want out-of-order segments to be dropped
1719                  * when ts_recent is old.
1720                  */
1721                 tp->ts_recent = 0;
1722         } else {
1723                 TCPSTAT_INC(tcps_rcvduppack);
1724                 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1725                 TCPSTAT_INC(tcps_pawsdrop);
1726                 *ret_val = 0;
1727                 if (tlen) {
1728                         rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1729                 } else {
1730                         rack_do_drop(m, NULL);
1731                 }
1732                 return (1);
1733         }
1734         return (0);
1735 }
1736
1737 /*
1738  * rack_drop_checks returns 1 for you should not proceed. It places
1739  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1740  * that the TCB is unlocked and probably dropped. The 0 indicates the
1741  * TCB is still valid and locked.
1742  */
1743 static int
1744 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
1745 {
1746         int32_t todrop;
1747         int32_t thflags;
1748         int32_t tlen;
1749
1750         thflags = *thf;
1751         tlen = *tlenp;
1752         todrop = tp->rcv_nxt - th->th_seq;
1753         if (todrop > 0) {
1754                 if (thflags & TH_SYN) {
1755                         thflags &= ~TH_SYN;
1756                         th->th_seq++;
1757                         if (th->th_urp > 1)
1758                                 th->th_urp--;
1759                         else
1760                                 thflags &= ~TH_URG;
1761                         todrop--;
1762                 }
1763                 /*
1764                  * Following if statement from Stevens, vol. 2, p. 960.
1765                  */
1766                 if (todrop > tlen
1767                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1768                         /*
1769                          * Any valid FIN must be to the left of the window.
1770                          * At this point the FIN must be a duplicate or out
1771                          * of sequence; drop it.
1772                          */
1773                         thflags &= ~TH_FIN;
1774                         /*
1775                          * Send an ACK to resynchronize and drop any data.
1776                          * But keep on processing for RST or ACK.
1777                          */
1778                         tp->t_flags |= TF_ACKNOW;
1779                         todrop = tlen;
1780                         TCPSTAT_INC(tcps_rcvduppack);
1781                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1782                 } else {
1783                         TCPSTAT_INC(tcps_rcvpartduppack);
1784                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1785                 }
1786                 *drop_hdrlen += todrop; /* drop from the top afterwards */
1787                 th->th_seq += todrop;
1788                 tlen -= todrop;
1789                 if (th->th_urp > todrop)
1790                         th->th_urp -= todrop;
1791                 else {
1792                         thflags &= ~TH_URG;
1793                         th->th_urp = 0;
1794                 }
1795         }
1796         /*
1797          * If segment ends after window, drop trailing data (and PUSH and
1798          * FIN); if nothing left, just ACK.
1799          */
1800         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1801         if (todrop > 0) {
1802                 TCPSTAT_INC(tcps_rcvpackafterwin);
1803                 if (todrop >= tlen) {
1804                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1805                         /*
1806                          * If window is closed can only take segments at
1807                          * window edge, and have to drop data and PUSH from
1808                          * incoming segments.  Continue processing, but
1809                          * remember to ack.  Otherwise, drop segment and
1810                          * ack.
1811                          */
1812                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1813                                 tp->t_flags |= TF_ACKNOW;
1814                                 TCPSTAT_INC(tcps_rcvwinprobe);
1815                         } else {
1816                                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1817                                 return (1);
1818                         }
1819                 } else
1820                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1821                 m_adj(m, -todrop);
1822                 tlen -= todrop;
1823                 thflags &= ~(TH_PUSH | TH_FIN);
1824         }
1825         *thf = thflags;
1826         *tlenp = tlen;
1827         return (0);
1828 }
1829
1830 static struct rack_sendmap *
1831 rack_find_lowest_rsm(struct tcp_rack *rack)
1832 {
1833         struct rack_sendmap *rsm;
1834
1835         /*
1836          * Walk the time-order transmitted list looking for an rsm that is
1837          * not acked. This will be the one that was sent the longest time
1838          * ago that is still outstanding.
1839          */
1840         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1841                 if (rsm->r_flags & RACK_ACKED) {
1842                         continue;
1843                 }
1844                 goto finish;
1845         }
1846 finish:
1847         return (rsm);
1848 }
1849
1850 static struct rack_sendmap *
1851 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1852 {
1853         struct rack_sendmap *prsm;
1854
1855         /*
1856          * Walk the sequence order list backward until we hit and arrive at
1857          * the highest seq not acked. In theory when this is called it
1858          * should be the last segment (which it was not).
1859          */
1860         counter_u64_add(rack_find_high, 1);
1861         prsm = rsm;
1862         TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1863                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1864                         continue;
1865                 }
1866                 return (prsm);
1867         }
1868         return (NULL);
1869 }
1870
1871
1872 static uint32_t
1873 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1874 {
1875         int32_t lro;
1876         uint32_t thresh;
1877
1878         /*
1879          * lro is the flag we use to determine if we have seen reordering.
1880          * If it gets set we have seen reordering. The reorder logic either
1881          * works in one of two ways:
1882          *
1883          * If reorder-fade is configured, then we track the last time we saw
1884          * re-ordering occur. If we reach the point where enough time as
1885          * passed we no longer consider reordering has occuring.
1886          *
1887          * Or if reorder-face is 0, then once we see reordering we consider
1888          * the connection to alway be subject to reordering and just set lro
1889          * to 1.
1890          *
1891          * In the end if lro is non-zero we add the extra time for
1892          * reordering in.
1893          */
1894         if (srtt == 0)
1895                 srtt = 1;
1896         if (rack->r_ctl.rc_reorder_ts) {
1897                 if (rack->r_ctl.rc_reorder_fade) {
1898                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1899                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1900                                 if (lro == 0) {
1901                                         /*
1902                                          * No time as passed since the last
1903                                          * reorder, mark it as reordering.
1904                                          */
1905                                         lro = 1;
1906                                 }
1907                         } else {
1908                                 /* Negative time? */
1909                                 lro = 0;
1910                         }
1911                         if (lro > rack->r_ctl.rc_reorder_fade) {
1912                                 /* Turn off reordering seen too */
1913                                 rack->r_ctl.rc_reorder_ts = 0;
1914                                 lro = 0;
1915                         }
1916                 } else {
1917                         /* Reodering does not fade */
1918                         lro = 1;
1919                 }
1920         } else {
1921                 lro = 0;
1922         }
1923         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1924         if (lro) {
1925                 /* It must be set, if not you get 1/4 rtt */
1926                 if (rack->r_ctl.rc_reorder_shift)
1927                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1928                 else
1929                         thresh += (srtt >> 2);
1930         } else {
1931                 thresh += 1;
1932         }
1933         /* We don't let the rack timeout be above a RTO */
1934
1935         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1936                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1937         }
1938         /* And we don't want it above the RTO max either */
1939         if (thresh > rack_rto_max) {
1940                 thresh = rack_rto_max;
1941         }
1942         return (thresh);
1943 }
1944
1945 static uint32_t
1946 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1947                      struct rack_sendmap *rsm, uint32_t srtt)
1948 {
1949         struct rack_sendmap *prsm;
1950         uint32_t thresh, len;
1951         int maxseg;
1952
1953         if (srtt == 0)
1954                 srtt = 1;
1955         if (rack->r_ctl.rc_tlp_threshold)
1956                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1957         else
1958                 thresh = (srtt * 2);
1959
1960         /* Get the previous sent packet, if any  */
1961         maxseg = tcp_maxseg(tp);
1962         counter_u64_add(rack_enter_tlp_calc, 1);
1963         len = rsm->r_end - rsm->r_start;
1964         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1965                 /* Exactly like the ID */
1966                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1967                         uint32_t alt_thresh;
1968                         /*
1969                          * Compensate for delayed-ack with the d-ack time.
1970                          */
1971                         counter_u64_add(rack_used_tlpmethod, 1);
1972                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1973                         if (alt_thresh > thresh)
1974                                 thresh = alt_thresh;
1975                 }
1976         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1977                 /* 2.1 behavior */
1978                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1979                 if (prsm && (len <= maxseg)) {
1980                         /*
1981                          * Two packets outstanding, thresh should be (2*srtt) +
1982                          * possible inter-packet delay (if any).
1983                          */
1984                         uint32_t inter_gap = 0;
1985                         int idx, nidx;
1986
1987                         counter_u64_add(rack_used_tlpmethod, 1);
1988                         idx = rsm->r_rtr_cnt - 1;
1989                         nidx = prsm->r_rtr_cnt - 1;
1990                         if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
1991                                 /* Yes it was sent later (or at the same time) */
1992                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
1993                         }
1994                         thresh += inter_gap;
1995                 } else  if (len <= maxseg) {
1996                         /*
1997                          * Possibly compensate for delayed-ack.
1998                          */
1999                         uint32_t alt_thresh;
2000
2001                         counter_u64_add(rack_used_tlpmethod2, 1);
2002                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2003                         if (alt_thresh > thresh)
2004                                 thresh = alt_thresh;
2005                 }
2006         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2007                 /* 2.2 behavior */
2008                 if (len <= maxseg) {
2009                         uint32_t alt_thresh;
2010                         /*
2011                          * Compensate for delayed-ack with the d-ack time.
2012                          */
2013                         counter_u64_add(rack_used_tlpmethod, 1);
2014                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2015                         if (alt_thresh > thresh)
2016                                 thresh = alt_thresh;
2017                 }
2018         }
2019         /* Not above an RTO */
2020         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2021                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2022         }
2023         /* Not above a RTO max */
2024         if (thresh > rack_rto_max) {
2025                 thresh = rack_rto_max;
2026         }
2027         /* Apply user supplied min TLP */
2028         if (thresh < rack_tlp_min) {
2029                 thresh = rack_tlp_min;
2030         }
2031         return (thresh);
2032 }
2033
2034 static struct rack_sendmap *
2035 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2036 {
2037         /*
2038          * Check to see that we don't need to fall into recovery. We will
2039          * need to do so if our oldest transmit is past the time we should
2040          * have had an ack.
2041          */
2042         struct tcp_rack *rack;
2043         struct rack_sendmap *rsm;
2044         int32_t idx;
2045         uint32_t srtt_cur, srtt, thresh;
2046
2047         rack = (struct tcp_rack *)tp->t_fb_ptr;
2048         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
2049                 return (NULL);
2050         }
2051         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
2052         srtt = TICKS_2_MSEC(srtt_cur);
2053         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
2054                 srtt = rack->rc_rack_rtt;
2055
2056         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2057         if (rsm == NULL)
2058                 return (NULL);
2059
2060         if (rsm->r_flags & RACK_ACKED) {
2061                 rsm = rack_find_lowest_rsm(rack);
2062                 if (rsm == NULL)
2063                         return (NULL);
2064         }
2065         idx = rsm->r_rtr_cnt - 1;
2066         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2067         if (tsused < rsm->r_tim_lastsent[idx]) {
2068                 return (NULL);
2069         }
2070         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2071                 return (NULL);
2072         }
2073         /* Ok if we reach here we are over-due */
2074         rack->r_ctl.rc_rsm_start = rsm->r_start;
2075         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2076         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2077         rack_cong_signal(tp, NULL, CC_NDUPACK);
2078         return (rsm);
2079 }
2080
2081 static uint32_t
2082 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2083 {
2084         int32_t t;
2085         int32_t tt;
2086         uint32_t ret_val;
2087
2088         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2089         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2090             tcp_persmin, tcp_persmax);
2091         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2092                 tp->t_rxtshift++;
2093         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2094         ret_val = (uint32_t)tt;
2095         return (ret_val);
2096 }
2097
2098 static uint32_t
2099 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2100 {
2101         /*
2102          * Start the FR timer, we do this based on getting the first one in
2103          * the rc_tmap. Note that if its NULL we must stop the timer. in all
2104          * events we need to stop the running timer (if its running) before
2105          * starting the new one.
2106          */
2107         uint32_t thresh, exp, to, srtt, time_since_sent;
2108         uint32_t srtt_cur;
2109         int32_t idx;
2110         int32_t is_tlp_timer = 0;
2111         struct rack_sendmap *rsm;
2112
2113         if (rack->t_timers_stopped) {
2114                 /* All timers have been stopped none are to run */
2115                 return (0);
2116         }
2117         if (rack->rc_in_persist) {
2118                 /* We can't start any timer in persists */
2119                 return (rack_get_persists_timer_val(tp, rack));
2120         }
2121         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2122         if (rsm == NULL) {
2123                 /* Nothing on the send map */
2124 activate_rxt:
2125                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2126                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2127                         to = TICKS_2_MSEC(tp->t_rxtcur);
2128                         if (to == 0)
2129                                 to = 1;
2130                         return (to);
2131                 }
2132                 return (0);
2133         }
2134         if (rsm->r_flags & RACK_ACKED) {
2135                 rsm = rack_find_lowest_rsm(rack);
2136                 if (rsm == NULL) {
2137                         /* No lowest? */
2138                         goto activate_rxt;
2139                 }
2140         }
2141         /* Convert from ms to usecs */
2142         if (rsm->r_flags & RACK_SACK_PASSED) {
2143                 if ((tp->t_flags & TF_SENTFIN) &&
2144                     ((tp->snd_max - tp->snd_una) == 1) &&
2145                     (rsm->r_flags & RACK_HAS_FIN)) {
2146                         /*
2147                          * We don't start a rack timer if all we have is a
2148                          * FIN outstanding.
2149                          */
2150                         goto activate_rxt;
2151                 }
2152                 if (tp->t_srtt) {
2153                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2154                         srtt = TICKS_2_MSEC(srtt_cur);
2155                 } else
2156                         srtt = RACK_INITIAL_RTO;
2157
2158                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2159                 idx = rsm->r_rtr_cnt - 1;
2160                 exp = rsm->r_tim_lastsent[idx] + thresh;
2161                 if (SEQ_GEQ(exp, cts)) {
2162                         to = exp - cts;
2163                         if (to < rack->r_ctl.rc_min_to) {
2164                                 to = rack->r_ctl.rc_min_to;
2165                         }
2166                 } else {
2167                         to = rack->r_ctl.rc_min_to;
2168                 }
2169         } else {
2170                 /* Ok we need to do a TLP not RACK */
2171                 if ((rack->rc_tlp_in_progress != 0) ||
2172                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2173                         /*
2174                          * The previous send was a TLP or a tlp_rtx is in
2175                          * process.
2176                          */
2177                         goto activate_rxt;
2178                 }
2179                 if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) {
2180                         /*
2181                          * Peer collapsed rwnd, don't do TLP.
2182                          */
2183                         goto activate_rxt;
2184                 }
2185                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2186                 if (rsm == NULL) {
2187                         /* We found no rsm to TLP with. */
2188                         goto activate_rxt;
2189                 }
2190                 if (rsm->r_flags & RACK_HAS_FIN) {
2191                         /* If its a FIN we dont do TLP */
2192                         rsm = NULL;
2193                         goto activate_rxt;
2194                 }
2195                 idx = rsm->r_rtr_cnt - 1;
2196                 if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx]))
2197                         time_since_sent = cts - rsm->r_tim_lastsent[idx];
2198                 else
2199                         time_since_sent = 0;
2200                 is_tlp_timer = 1;
2201                 if (tp->t_srtt) {
2202                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2203                         srtt = TICKS_2_MSEC(srtt_cur);
2204                 } else
2205                         srtt = RACK_INITIAL_RTO;
2206                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2207                 if (thresh > time_since_sent)
2208                         to = thresh - time_since_sent;
2209                 else
2210                         to = rack->r_ctl.rc_min_to;
2211                 if (to > TCPTV_REXMTMAX) {
2212                         /*
2213                          * If the TLP time works out to larger than the max
2214                          * RTO lets not do TLP.. just RTO.
2215                          */
2216                         goto activate_rxt;
2217                 }
2218                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2219                         /*
2220                          * The tail is no longer the last one I did a probe
2221                          * on
2222                          */
2223                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2224                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2225                 }
2226         }
2227         if (is_tlp_timer == 0) {
2228                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2229         } else {
2230                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2231                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2232                         /*
2233                          * We have exceeded how many times we can retran the
2234                          * current TLP timer, switch to the RTO timer.
2235                          */
2236                         goto activate_rxt;
2237                 } else {
2238                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2239                 }
2240         }
2241         if (to == 0)
2242                 to = 1;
2243         return (to);
2244 }
2245
2246 static void
2247 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2248 {
2249         if (rack->rc_in_persist == 0) {
2250                 if (((tp->t_flags & TF_SENTFIN) == 0) &&
2251                     (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
2252                         /* Must need to send more data to enter persist */
2253                         return;
2254                 rack->r_ctl.rc_went_idle_time = cts;
2255                 rack_timer_cancel(tp, rack, cts, __LINE__);
2256                 tp->t_rxtshift = 0;
2257                 rack->rc_in_persist = 1;
2258         }
2259 }
2260
2261 static void
2262 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2263 {
2264         if (rack->rc_inp->inp_in_hpts)  {
2265                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2266                 rack->r_ctl.rc_hpts_flags  = 0;
2267         }
2268         rack->rc_in_persist = 0;
2269         rack->r_ctl.rc_went_idle_time = 0;
2270         tp->t_flags &= ~TF_FORCEDATA;
2271         tp->t_rxtshift = 0;
2272 }
2273
2274 static void
2275 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
2276     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
2277 {
2278         struct inpcb *inp;
2279         uint32_t delayed_ack = 0;
2280         uint32_t hpts_timeout;
2281         uint8_t stopped;
2282         uint32_t left = 0;
2283
2284         inp = tp->t_inpcb;
2285         if (inp->inp_in_hpts) {
2286                 /* A previous call is already set up */
2287                 return;
2288         }
2289
2290         if ((tp->t_state == TCPS_CLOSED) ||
2291             (tp->t_state == TCPS_LISTEN)) {
2292                 return;
2293         }
2294         stopped = rack->rc_tmr_stopped;
2295         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2296                 left = rack->r_ctl.rc_timer_exp - cts;
2297         }
2298         rack->r_ctl.rc_timer_exp = 0;
2299         if (rack->rc_inp->inp_in_hpts == 0) {
2300                 rack->r_ctl.rc_hpts_flags = 0;
2301         }
2302         if (slot) {
2303                 /* We are hptsi too */
2304                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2305         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2306                 /*
2307                  * We are still left on the hpts when the to goes
2308                  * it will be for output.
2309                  */
2310                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
2311                         slot = rack->r_ctl.rc_last_output_to - cts;
2312                 else
2313                         slot = 1;
2314         }
2315         if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2316                 /* No send window.. we must enter persist */
2317                 rack_enter_persist(tp, rack, cts);
2318         } else if ((frm_out_sbavail &&
2319                     (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
2320                     (tp->snd_wnd < tp->t_maxseg)) &&
2321             TCPS_HAVEESTABLISHED(tp->t_state)) {
2322                 /*
2323                  * If we have no window or we can't send a segment (and have
2324                  * data to send.. we cheat here and frm_out_sbavail is
2325                  * passed in with the sbavail(sb) only from bbr_output) and
2326                  * we are established, then we must enter persits (if not
2327                  * already in persits).
2328                  */
2329                 rack_enter_persist(tp, rack, cts);
2330         }
2331         hpts_timeout = rack_timer_start(tp, rack, cts);
2332         if (tp->t_flags & TF_DELACK) {
2333                 delayed_ack = tcp_delacktime;
2334                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2335         }
2336         if (delayed_ack && ((hpts_timeout == 0) ||
2337                             (delayed_ack < hpts_timeout)))
2338                 hpts_timeout = delayed_ack;
2339         else
2340                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2341         /*
2342          * If no timers are going to run and we will fall off the hptsi
2343          * wheel, we resort to a keep-alive timer if its configured.
2344          */
2345         if ((hpts_timeout == 0) &&
2346             (slot == 0)) {
2347                 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2348                     (tp->t_state <= TCPS_CLOSING)) {
2349                         /*
2350                          * Ok we have no timer (persists, rack, tlp, rxt  or
2351                          * del-ack), we don't have segments being paced. So
2352                          * all that is left is the keepalive timer.
2353                          */
2354                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2355                                 /* Get the established keep-alive time */
2356                                 hpts_timeout = TP_KEEPIDLE(tp);
2357                         } else {
2358                                 /* Get the initial setup keep-alive time */
2359                                 hpts_timeout = TP_KEEPINIT(tp);
2360                         }
2361                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2362                 }
2363         }
2364         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2365             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2366                 /*
2367                  * RACK, TLP, persists and RXT timers all are restartable
2368                  * based on actions input .. i.e we received a packet (ack
2369                  * or sack) and that changes things (rw, or snd_una etc).
2370                  * Thus we can restart them with a new value. For
2371                  * keep-alive, delayed_ack we keep track of what was left
2372                  * and restart the timer with a smaller value.
2373                  */
2374                 if (left < hpts_timeout)
2375                         hpts_timeout = left;
2376         }
2377         if (hpts_timeout) {
2378                 /*
2379                  * Hack alert for now we can't time-out over 2,147,483
2380                  * seconds (a bit more than 596 hours), which is probably ok
2381                  * :).
2382                  */
2383                 if (hpts_timeout > 0x7ffffffe)
2384                         hpts_timeout = 0x7ffffffe;
2385                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2386         }
2387         if (slot) {
2388                 rack->r_ctl.rc_last_output_to = cts + slot;
2389                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2390                         if (rack->rc_inp->inp_in_hpts == 0)
2391                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2392                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2393                 } else {
2394                         /*
2395                          * Arrange for the hpts to kick back in after the
2396                          * t-o if the t-o does not cause a send.
2397                          */
2398                         if (rack->rc_inp->inp_in_hpts == 0)
2399                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2400                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2401                 }
2402         } else if (hpts_timeout) {
2403                 if (rack->rc_inp->inp_in_hpts == 0)
2404                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2405                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2406         } else {
2407                 /* No timer starting */
2408 #ifdef INVARIANTS
2409                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2410                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2411                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2412                 }
2413 #endif
2414         }
2415         rack->rc_tmr_stopped = 0;
2416         if (slot)
2417                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2418 }
2419
2420 /*
2421  * RACK Timer, here we simply do logging and house keeping.
2422  * the normal rack_output() function will call the
2423  * appropriate thing to check if we need to do a RACK retransmit.
2424  * We return 1, saying don't proceed with rack_output only
2425  * when all timers have been stopped (destroyed PCB?).
2426  */
2427 static int
2428 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2429 {
2430         /*
2431          * This timer simply provides an internal trigger to send out data.
2432          * The check_recovery_mode call will see if there are needed
2433          * retransmissions, if so we will enter fast-recovery. The output
2434          * call may or may not do the same thing depending on sysctl
2435          * settings.
2436          */
2437         struct rack_sendmap *rsm;
2438         int32_t recovery;
2439
2440         if (tp->t_timers->tt_flags & TT_STOPPED) {
2441                 return (1);
2442         }
2443         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2444                 /* Its not time yet */
2445                 return (0);
2446         }
2447         rack_log_to_event(rack, RACK_TO_FRM_RACK);
2448         recovery = IN_RECOVERY(tp->t_flags);
2449         counter_u64_add(rack_to_tot, 1);
2450         if (rack->r_state && (rack->r_state != tp->t_state))
2451                 rack_set_state(tp, rack);
2452         rsm = rack_check_recovery_mode(tp, cts);
2453         if (rsm) {
2454                 uint32_t rtt;
2455
2456                 rtt = rack->rc_rack_rtt;
2457                 if (rtt == 0)
2458                         rtt = 1;
2459                 if ((recovery == 0) &&
2460                     (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
2461                         /*
2462                          * The rack-timeout that enter's us into recovery
2463                          * will force out one MSS and set us up so that we
2464                          * can do one more send in 2*rtt (transitioning the
2465                          * rack timeout into a rack-tlp).
2466                          */
2467                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2468                 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
2469                     ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
2470                         /*
2471                          * When a rack timer goes, we have to send at
2472                          * least one segment. They will be paced a min of 1ms
2473                          * apart via the next rack timer (or further
2474                          * if the rack timer dictates it).
2475                          */
2476                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2477                 }
2478         } else {
2479                 /* This is a case that should happen rarely if ever */
2480                 counter_u64_add(rack_tlp_does_nada, 1);
2481 #ifdef TCP_BLACKBOX
2482                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2483 #endif
2484                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2485         }
2486         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2487         return (0);
2488 }
2489
2490 static struct rack_sendmap *
2491 rack_merge_rsm(struct tcp_rack *rack,
2492                struct rack_sendmap *l_rsm,
2493                struct rack_sendmap *r_rsm)
2494 {
2495         /*
2496          * We are merging two ack'd RSM's,
2497          * the l_rsm is on the left (lower seq
2498          * values) and the r_rsm is on the right
2499          * (higher seq value). The simplest way
2500          * to merge these is to move the right
2501          * one into the left. I don't think there
2502          * is any reason we need to try to find
2503          * the oldest (or last oldest retransmitted).
2504          */
2505         l_rsm->r_end = r_rsm->r_end;
2506         if (r_rsm->r_rtr_bytes)
2507                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
2508         if (r_rsm->r_in_tmap) {
2509                 /* This really should not happen */
2510                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
2511         }
2512         /* Now the flags */
2513         if (r_rsm->r_flags & RACK_HAS_FIN)
2514                 l_rsm->r_flags |= RACK_HAS_FIN;
2515         if (r_rsm->r_flags & RACK_TLP)
2516                 l_rsm->r_flags |= RACK_TLP;
2517         TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next);
2518         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
2519                 /* Transfer the split limit to the map we free */
2520                 r_rsm->r_limit_type = l_rsm->r_limit_type;
2521                 l_rsm->r_limit_type = 0;
2522         }
2523         rack_free(rack, r_rsm);
2524         return(l_rsm);
2525 }
2526
2527 /*
2528  * TLP Timer, here we simply setup what segment we want to
2529  * have the TLP expire on, the normal rack_output() will then
2530  * send it out.
2531  *
2532  * We return 1, saying don't proceed with rack_output only
2533  * when all timers have been stopped (destroyed PCB?).
2534  */
2535 static int
2536 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2537 {
2538         /*
2539          * Tail Loss Probe.
2540          */
2541         struct rack_sendmap *rsm = NULL;
2542         struct socket *so;
2543         uint32_t amm, old_prr_snd = 0;
2544         uint32_t out, avail;
2545
2546         if (tp->t_timers->tt_flags & TT_STOPPED) {
2547                 return (1);
2548         }
2549         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2550                 /* Its not time yet */
2551                 return (0);
2552         }
2553         if (rack_progress_timeout_check(tp)) {
2554                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2555                 return (1);
2556         }
2557         /*
2558          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2559          * need to figure out how to force a full MSS segment out.
2560          */
2561         rack_log_to_event(rack, RACK_TO_FRM_TLP);
2562         counter_u64_add(rack_tlp_tot, 1);
2563         if (rack->r_state && (rack->r_state != tp->t_state))
2564                 rack_set_state(tp, rack);
2565         so = tp->t_inpcb->inp_socket;
2566         avail = sbavail(&so->so_snd);
2567         out = tp->snd_max - tp->snd_una;
2568         rack->rc_timer_up = 1;
2569         /*
2570          * If we are in recovery we can jazz out a segment if new data is
2571          * present simply by setting rc_prr_sndcnt to a segment.
2572          */
2573         if ((avail > out) &&
2574             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2575                 /* New data is available */
2576                 amm = avail - out;
2577                 if (amm > tp->t_maxseg) {
2578                         amm = tp->t_maxseg;
2579                 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
2580                         /* not enough to fill a MTU and no-delay is off */
2581                         goto need_retran;
2582                 }
2583                 if (IN_RECOVERY(tp->t_flags)) {
2584                         /* Unlikely */
2585                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2586                         if (out + amm <= tp->snd_wnd)
2587                                 rack->r_ctl.rc_prr_sndcnt = amm;
2588                         else
2589                                 goto need_retran;
2590                 } else {
2591                         /* Set the send-new override */
2592                         if (out + amm <= tp->snd_wnd)
2593                                 rack->r_ctl.rc_tlp_new_data = amm;
2594                         else
2595                                 goto need_retran;
2596                 }
2597                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2598                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2599                 rack->r_ctl.rc_tlpsend = NULL;
2600                 counter_u64_add(rack_tlp_newdata, 1);
2601                 goto send;
2602         }
2603 need_retran:
2604         /*
2605          * Ok we need to arrange the last un-acked segment to be re-sent, or
2606          * optionally the first un-acked segment.
2607          */
2608         if (rack_always_send_oldest)
2609                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2610         else {
2611                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
2612                 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2613                         rsm = rack_find_high_nonack(rack, rsm);
2614                 }
2615         }
2616         if (rsm == NULL) {
2617                 counter_u64_add(rack_tlp_does_nada, 1);
2618 #ifdef TCP_BLACKBOX
2619                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2620 #endif
2621                 goto out;
2622         }
2623         if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
2624                 /*
2625                  * We need to split this the last segment in two.
2626                  */
2627                 int32_t idx;
2628                 struct rack_sendmap *nrsm;
2629
2630                 nrsm = rack_alloc_full_limit(rack);
2631                 if (nrsm == NULL) {
2632                         /*
2633                          * No memory to split, we will just exit and punt
2634                          * off to the RXT timer.
2635                          */
2636                         counter_u64_add(rack_tlp_does_nada, 1);
2637                         goto out;
2638                 }
2639                 nrsm->r_start = (rsm->r_end - tp->t_maxseg);
2640                 nrsm->r_end = rsm->r_end;
2641                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2642                 nrsm->r_flags = rsm->r_flags;
2643                 nrsm->r_sndcnt = rsm->r_sndcnt;
2644                 nrsm->r_rtr_bytes = 0;
2645                 rsm->r_end = nrsm->r_start;
2646                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2647                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2648                 }
2649                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
2650                 if (rsm->r_in_tmap) {
2651                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2652                         nrsm->r_in_tmap = 1;
2653                 }
2654                 rsm->r_flags &= (~RACK_HAS_FIN);
2655                 rsm = nrsm;
2656         }
2657         rack->r_ctl.rc_tlpsend = rsm;
2658         rack->r_ctl.rc_tlp_rtx_out = 1;
2659         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2660                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2661                 tp->t_rxtshift++;
2662         } else {
2663                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2664                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2665         }
2666 send:
2667         rack->r_ctl.rc_tlp_send_cnt++;
2668         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2669                 /*
2670                  * Can't [re]/transmit a segment we have not heard from the
2671                  * peer in max times. We need the retransmit timer to take
2672                  * over.
2673                  */
2674 restore:
2675                 rack->r_ctl.rc_tlpsend = NULL;
2676                 if (rsm)
2677                         rsm->r_flags &= ~RACK_TLP;
2678                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2679                 counter_u64_add(rack_tlp_retran_fail, 1);
2680                 goto out;
2681         } else if (rsm) {
2682                 rsm->r_flags |= RACK_TLP;
2683         }
2684         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2685             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2686                 /*
2687                  * We don't want to send a single segment more than the max
2688                  * either.
2689                  */
2690                 goto restore;
2691         }
2692         rack->r_timer_override = 1;
2693         rack->r_tlp_running = 1;
2694         rack->rc_tlp_in_progress = 1;
2695         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2696         return (0);
2697 out:
2698         rack->rc_timer_up = 0;
2699         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2700         return (0);
2701 }
2702
2703 /*
2704  * Delayed ack Timer, here we simply need to setup the
2705  * ACK_NOW flag and remove the DELACK flag. From there
2706  * the output routine will send the ack out.
2707  *
2708  * We only return 1, saying don't proceed, if all timers
2709  * are stopped (destroyed PCB?).
2710  */
2711 static int
2712 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2713 {
2714         if (tp->t_timers->tt_flags & TT_STOPPED) {
2715                 return (1);
2716         }
2717         rack_log_to_event(rack, RACK_TO_FRM_DELACK);
2718         tp->t_flags &= ~TF_DELACK;
2719         tp->t_flags |= TF_ACKNOW;
2720         TCPSTAT_INC(tcps_delack);
2721         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2722         return (0);
2723 }
2724
2725 /*
2726  * Persists timer, here we simply need to setup the
2727  * FORCE-DATA flag the output routine will send
2728  * the one byte send.
2729  *
2730  * We only return 1, saying don't proceed, if all timers
2731  * are stopped (destroyed PCB?).
2732  */
2733 static int
2734 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2735 {
2736         struct inpcb *inp;
2737         int32_t retval = 0;
2738
2739         inp = tp->t_inpcb;
2740
2741         if (tp->t_timers->tt_flags & TT_STOPPED) {
2742                 return (1);
2743         }
2744         if (rack->rc_in_persist == 0)
2745                 return (0);
2746         if (rack_progress_timeout_check(tp)) {
2747                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2748                 return (1);
2749         }
2750         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2751         /*
2752          * Persistence timer into zero window. Force a byte to be output, if
2753          * possible.
2754          */
2755         TCPSTAT_INC(tcps_persisttimeo);
2756         /*
2757          * Hack: if the peer is dead/unreachable, we do not time out if the
2758          * window is closed.  After a full backoff, drop the connection if
2759          * the idle time (no responses to probes) reaches the maximum
2760          * backoff that we would use if retransmitting.
2761          */
2762         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2763             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2764             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2765                 TCPSTAT_INC(tcps_persistdrop);
2766                 retval = 1;
2767                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2768                 goto out;
2769         }
2770         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2771             tp->snd_una == tp->snd_max)
2772                 rack_exit_persist(tp, rack);
2773         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2774         /*
2775          * If the user has closed the socket then drop a persisting
2776          * connection after a much reduced timeout.
2777          */
2778         if (tp->t_state > TCPS_CLOSE_WAIT &&
2779             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2780                 retval = 1;
2781                 TCPSTAT_INC(tcps_persistdrop);
2782                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2783                 goto out;
2784         }
2785         tp->t_flags |= TF_FORCEDATA;
2786 out:
2787         rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
2788         return (retval);
2789 }
2790
2791 /*
2792  * If a keepalive goes off, we had no other timers
2793  * happening. We always return 1 here since this
2794  * routine either drops the connection or sends
2795  * out a segment with respond.
2796  */
2797 static int
2798 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2799 {
2800         struct tcptemp *t_template;
2801         struct inpcb *inp;
2802
2803         if (tp->t_timers->tt_flags & TT_STOPPED) {
2804                 return (1);
2805         }
2806         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
2807         inp = tp->t_inpcb;
2808         rack_log_to_event(rack, RACK_TO_FRM_KEEP);
2809         /*
2810          * Keep-alive timer went off; send something or drop connection if
2811          * idle for too long.
2812          */
2813         TCPSTAT_INC(tcps_keeptimeo);
2814         if (tp->t_state < TCPS_ESTABLISHED)
2815                 goto dropit;
2816         if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2817             tp->t_state <= TCPS_CLOSING) {
2818                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
2819                         goto dropit;
2820                 /*
2821                  * Send a packet designed to force a response if the peer is
2822                  * up and reachable: either an ACK if the connection is
2823                  * still alive, or an RST if the peer has closed the
2824                  * connection due to timeout or reboot. Using sequence
2825                  * number tp->snd_una-1 causes the transmitted zero-length
2826                  * segment to lie outside the receive window; by the
2827                  * protocol spec, this requires the correspondent TCP to
2828                  * respond.
2829                  */
2830                 TCPSTAT_INC(tcps_keepprobe);
2831                 t_template = tcpip_maketemplate(inp);
2832                 if (t_template) {
2833                         tcp_respond(tp, t_template->tt_ipgen,
2834                             &t_template->tt_t, (struct mbuf *)NULL,
2835                             tp->rcv_nxt, tp->snd_una - 1, 0);
2836                         free(t_template, M_TEMP);
2837                 }
2838         }
2839         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
2840         return (1);
2841 dropit:
2842         TCPSTAT_INC(tcps_keepdrops);
2843         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2844         return (1);
2845 }
2846
2847 /*
2848  * Retransmit helper function, clear up all the ack
2849  * flags and take care of important book keeping.
2850  */
2851 static void
2852 rack_remxt_tmr(struct tcpcb *tp)
2853 {
2854         /*
2855          * The retransmit timer went off, all sack'd blocks must be
2856          * un-acked.
2857          */
2858         struct rack_sendmap *rsm, *trsm = NULL;
2859         struct tcp_rack *rack;
2860         int32_t cnt = 0;
2861
2862         rack = (struct tcp_rack *)tp->t_fb_ptr;
2863         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
2864         rack_log_to_event(rack, RACK_TO_FRM_TMR);
2865         if (rack->r_state && (rack->r_state != tp->t_state))
2866                 rack_set_state(tp, rack);
2867         /*
2868          * Ideally we would like to be able to
2869          * mark SACK-PASS on anything not acked here.
2870          * However, if we do that we would burst out
2871          * all that data 1ms apart. This would be unwise,
2872          * so for now we will just let the normal rxt timer
2873          * and tlp timer take care of it.
2874          */
2875         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
2876                 if (rsm->r_flags & RACK_ACKED) {
2877                         cnt++;
2878                         rsm->r_sndcnt = 0;
2879                         if (rsm->r_in_tmap == 0) {
2880                                 /* We must re-add it back to the tlist */
2881                                 if (trsm == NULL) {
2882                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
2883                                 } else {
2884                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
2885                                 }
2886                                 rsm->r_in_tmap = 1;
2887                                 trsm = rsm;
2888                         }
2889                 }
2890                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
2891         }
2892         /* Clear the count (we just un-acked them) */
2893         rack->r_ctl.rc_sacked = 0;
2894         /* Clear the tlp rtx mark */
2895         rack->r_ctl.rc_tlp_rtx_out = 0;
2896         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2897         rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
2898         /* Setup so we send one segment */
2899         if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
2900                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2901         rack->r_timer_override = 1;
2902 }
2903
2904 /*
2905  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
2906  * we will setup to retransmit the lowest seq number outstanding.
2907  */
2908 static int
2909 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2910 {
2911         int32_t rexmt;
2912         struct inpcb *inp;
2913         int32_t retval = 0;
2914
2915         inp = tp->t_inpcb;
2916         if (tp->t_timers->tt_flags & TT_STOPPED) {
2917                 return (1);
2918         }
2919         if (rack_progress_timeout_check(tp)) {
2920                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2921                 return (1);
2922         }
2923         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
2924         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2925             (tp->snd_una == tp->snd_max)) {
2926                 /* Nothing outstanding .. nothing to do */
2927                 return (0);
2928         }
2929         /*
2930          * Retransmission timer went off.  Message has not been acked within
2931          * retransmit interval.  Back off to a longer retransmit interval
2932          * and retransmit one segment.
2933          */
2934         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
2935                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
2936                 TCPSTAT_INC(tcps_timeoutdrop);
2937                 retval = 1;
2938                 tcp_set_inp_to_drop(rack->rc_inp,
2939                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
2940                 goto out;
2941         }
2942         rack_remxt_tmr(tp);
2943         if (tp->t_state == TCPS_SYN_SENT) {
2944                 /*
2945                  * If the SYN was retransmitted, indicate CWND to be limited
2946                  * to 1 segment in cc_conn_init().
2947                  */
2948                 tp->snd_cwnd = 1;
2949         } else if (tp->t_rxtshift == 1) {
2950                 /*
2951                  * first retransmit; record ssthresh and cwnd so they can be
2952                  * recovered if this turns out to be a "bad" retransmit. A
2953                  * retransmit is considered "bad" if an ACK for this segment
2954                  * is received within RTT/2 interval; the assumption here is
2955                  * that the ACK was already in flight.  See "On Estimating
2956                  * End-to-End Network Path Properties" by Allman and Paxson
2957                  * for more details.
2958                  */
2959                 tp->snd_cwnd_prev = tp->snd_cwnd;
2960                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
2961                 tp->snd_recover_prev = tp->snd_recover;
2962                 if (IN_FASTRECOVERY(tp->t_flags))
2963                         tp->t_flags |= TF_WASFRECOVERY;
2964                 else
2965                         tp->t_flags &= ~TF_WASFRECOVERY;
2966                 if (IN_CONGRECOVERY(tp->t_flags))
2967                         tp->t_flags |= TF_WASCRECOVERY;
2968                 else
2969                         tp->t_flags &= ~TF_WASCRECOVERY;
2970                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
2971                 tp->t_flags |= TF_PREVVALID;
2972         } else
2973                 tp->t_flags &= ~TF_PREVVALID;
2974         TCPSTAT_INC(tcps_rexmttimeo);
2975         if ((tp->t_state == TCPS_SYN_SENT) ||
2976             (tp->t_state == TCPS_SYN_RECEIVED))
2977                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
2978         else
2979                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
2980         TCPT_RANGESET(tp->t_rxtcur, rexmt,
2981            max(MSEC_2_TICKS(rack_rto_min), rexmt),
2982            MSEC_2_TICKS(rack_rto_max));
2983         /*
2984          * We enter the path for PLMTUD if connection is established or, if
2985          * connection is FIN_WAIT_1 status, reason for the last is that if
2986          * amount of data we send is very small, we could send it in couple
2987          * of packets and process straight to FIN. In that case we won't
2988          * catch ESTABLISHED state.
2989          */
2990         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
2991             || (tp->t_state == TCPS_FIN_WAIT_1))) {
2992 #ifdef INET6
2993                 int32_t isipv6;
2994 #endif
2995
2996                 /*
2997                  * Idea here is that at each stage of mtu probe (usually,
2998                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
2999                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
3000                  * should take care of that.
3001                  */
3002                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
3003                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
3004                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
3005                     tp->t_rxtshift % 2 == 0)) {
3006                         /*
3007                          * Enter Path MTU Black-hole Detection mechanism: -
3008                          * Disable Path MTU Discovery (IP "DF" bit). -
3009                          * Reduce MTU to lower value than what we negotiated
3010                          * with peer.
3011                          */
3012                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
3013                                 /* Record that we may have found a black hole. */
3014                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
3015                                 /* Keep track of previous MSS. */
3016                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
3017                         }
3018
3019                         /*
3020                          * Reduce the MSS to blackhole value or to the
3021                          * default in an attempt to retransmit.
3022                          */
3023 #ifdef INET6
3024                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
3025                         if (isipv6 &&
3026                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3027                                 /* Use the sysctl tuneable blackhole MSS. */
3028                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3029                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3030                         } else if (isipv6) {
3031                                 /* Use the default MSS. */
3032                                 tp->t_maxseg = V_tcp_v6mssdflt;
3033                                 /*
3034                                  * Disable Path MTU Discovery when we switch
3035                                  * to minmss.
3036                                  */
3037                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3038                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3039                         }
3040 #endif
3041 #if defined(INET6) && defined(INET)
3042                         else
3043 #endif
3044 #ifdef INET
3045                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3046                                 /* Use the sysctl tuneable blackhole MSS. */
3047                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3048                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3049                         } else {
3050                                 /* Use the default MSS. */
3051                                 tp->t_maxseg = V_tcp_mssdflt;
3052                                 /*
3053                                  * Disable Path MTU Discovery when we switch
3054                                  * to minmss.
3055                                  */
3056                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3057                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3058                         }
3059 #endif
3060                 } else {
3061                         /*
3062                          * If further retransmissions are still unsuccessful
3063                          * with a lowered MTU, maybe this isn't a blackhole
3064                          * and we restore the previous MSS and blackhole
3065                          * detection flags. The limit '6' is determined by
3066                          * giving each probe stage (1448, 1188, 524) 2
3067                          * chances to recover.
3068                          */
3069                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3070                             (tp->t_rxtshift >= 6)) {
3071                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3072                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3073                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3074                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3075                         }
3076                 }
3077         }
3078         /*
3079          * Disable RFC1323 and SACK if we haven't got any response to our
3080          * third SYN to work-around some broken terminal servers (most of
3081          * which have hopefully been retired) that have bad VJ header
3082          * compression code which trashes TCP segments containing
3083          * unknown-to-them TCP options.
3084          */
3085         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
3086             (tp->t_rxtshift == 3))
3087                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
3088         /*
3089          * If we backed off this far, our srtt estimate is probably bogus.
3090          * Clobber it so we'll take the next rtt measurement as our srtt;
3091          * move the current srtt into rttvar to keep the current retransmit
3092          * times until then.
3093          */
3094         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3095 #ifdef INET6
3096                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3097                         in6_losing(tp->t_inpcb);
3098                 else
3099 #endif
3100                         in_losing(tp->t_inpcb);
3101                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3102                 tp->t_srtt = 0;
3103         }
3104         if (rack_use_sack_filter)
3105                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3106         tp->snd_recover = tp->snd_max;
3107         tp->t_flags |= TF_ACKNOW;
3108         tp->t_rtttime = 0;
3109         rack_cong_signal(tp, NULL, CC_RTO);
3110 out:
3111         return (retval);
3112 }
3113
3114 static int
3115 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3116 {
3117         int32_t ret = 0;
3118         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3119
3120         if (timers == 0) {
3121                 return (0);
3122         }
3123         if (tp->t_state == TCPS_LISTEN) {
3124                 /* no timers on listen sockets */
3125                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3126                         return (0);
3127                 return (1);
3128         }
3129         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3130                 uint32_t left;
3131
3132                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3133                         ret = -1;
3134                         rack_log_to_processing(rack, cts, ret, 0);
3135                         return (0);
3136                 }
3137                 if (hpts_calling == 0) {
3138                         ret = -2;
3139                         rack_log_to_processing(rack, cts, ret, 0);
3140                         return (0);
3141                 }
3142                 /*
3143                  * Ok our timer went off early and we are not paced false
3144                  * alarm, go back to sleep.
3145                  */
3146                 ret = -3;
3147                 left = rack->r_ctl.rc_timer_exp - cts;
3148                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3149                 rack_log_to_processing(rack, cts, ret, left);
3150                 rack->rc_last_pto_set = 0;
3151                 return (1);
3152         }
3153         rack->rc_tmr_stopped = 0;
3154         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3155         if (timers & PACE_TMR_DELACK) {
3156                 ret = rack_timeout_delack(tp, rack, cts);
3157         } else if (timers & PACE_TMR_RACK) {
3158                 ret = rack_timeout_rack(tp, rack, cts);
3159         } else if (timers & PACE_TMR_TLP) {
3160                 ret = rack_timeout_tlp(tp, rack, cts);
3161         } else if (timers & PACE_TMR_RXT) {
3162                 ret = rack_timeout_rxt(tp, rack, cts);
3163         } else if (timers & PACE_TMR_PERSIT) {
3164                 ret = rack_timeout_persist(tp, rack, cts);
3165         } else if (timers & PACE_TMR_KEEP) {
3166                 ret = rack_timeout_keepalive(tp, rack, cts);
3167         }
3168         rack_log_to_processing(rack, cts, ret, timers);
3169         return (ret);
3170 }
3171
3172 static void
3173 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3174 {
3175         uint8_t hpts_removed = 0;
3176
3177         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3178             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3179                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3180                 hpts_removed = 1;
3181         }
3182         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3183                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3184                 if (rack->rc_inp->inp_in_hpts &&
3185                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3186                         /*
3187                          * Canceling timer's when we have no output being
3188                          * paced. We also must remove ourselves from the
3189                          * hpts.
3190                          */
3191                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3192                         hpts_removed = 1;
3193                 }
3194                 rack_log_to_cancel(rack, hpts_removed, line);
3195                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3196         }
3197 }
3198
3199 static void
3200 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3201 {
3202         return;
3203 }
3204
3205 static int
3206 rack_stopall(struct tcpcb *tp)
3207 {
3208         struct tcp_rack *rack;
3209         rack = (struct tcp_rack *)tp->t_fb_ptr;
3210         rack->t_timers_stopped = 1;
3211         return (0);
3212 }
3213
3214 static void
3215 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3216 {
3217         return;
3218 }
3219
3220 static int
3221 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3222 {
3223         return (0);
3224 }
3225
3226 static void
3227 rack_stop_all_timers(struct tcpcb *tp)
3228 {
3229         struct tcp_rack *rack;
3230
3231         /*
3232          * Assure no timers are running.
3233          */
3234         if (tcp_timer_active(tp, TT_PERSIST)) {
3235                 /* We enter in persists, set the flag appropriately */
3236                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3237                 rack->rc_in_persist = 1;
3238         }
3239         tcp_timer_suspend(tp, TT_PERSIST);
3240         tcp_timer_suspend(tp, TT_REXMT);
3241         tcp_timer_suspend(tp, TT_KEEP);
3242         tcp_timer_suspend(tp, TT_DELACK);
3243 }
3244
3245 static void
3246 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3247     struct rack_sendmap *rsm, uint32_t ts)
3248 {
3249         int32_t idx;
3250
3251         rsm->r_rtr_cnt++;
3252         rsm->r_sndcnt++;
3253         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3254                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3255                 rsm->r_flags |= RACK_OVERMAX;
3256         }
3257         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3258                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3259                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3260         }
3261         idx = rsm->r_rtr_cnt - 1;
3262         rsm->r_tim_lastsent[idx] = ts;
3263         if (rsm->r_flags & RACK_ACKED) {
3264                 /* Problably MTU discovery messing with us */
3265                 rsm->r_flags &= ~RACK_ACKED;
3266                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3267         }
3268         if (rsm->r_in_tmap) {
3269                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3270         }
3271         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3272         rsm->r_in_tmap = 1;
3273         if (rsm->r_flags & RACK_SACK_PASSED) {
3274                 /* We have retransmitted due to the SACK pass */
3275                 rsm->r_flags &= ~RACK_SACK_PASSED;
3276                 rsm->r_flags |= RACK_WAS_SACKPASS;
3277         }
3278         /* Update memory for next rtr */
3279         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3280 }
3281
3282
3283 static uint32_t
3284 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3285     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
3286 {
3287         /*
3288          * We (re-)transmitted starting at rsm->r_start for some length
3289          * (possibly less than r_end.
3290          */
3291         struct rack_sendmap *nrsm;
3292         uint32_t c_end;
3293         int32_t len;
3294         int32_t idx;
3295
3296         len = *lenp;
3297         c_end = rsm->r_start + len;
3298         if (SEQ_GEQ(c_end, rsm->r_end)) {
3299                 /*
3300                  * We retransmitted the whole piece or more than the whole
3301                  * slopping into the next rsm.
3302                  */
3303                 rack_update_rsm(tp, rack, rsm, ts);
3304                 if (c_end == rsm->r_end) {
3305                         *lenp = 0;
3306                         return (0);
3307                 } else {
3308                         int32_t act_len;
3309
3310                         /* Hangs over the end return whats left */
3311                         act_len = rsm->r_end - rsm->r_start;
3312                         *lenp = (len - act_len);
3313                         return (rsm->r_end);
3314                 }
3315                 /* We don't get out of this block. */
3316         }
3317         /*
3318          * Here we retransmitted less than the whole thing which means we
3319          * have to split this into what was transmitted and what was not.
3320          */
3321         nrsm = rack_alloc_full_limit(rack);
3322         if (nrsm == NULL) {
3323                 /*
3324                  * We can't get memory, so lets not proceed.
3325                  */
3326                 *lenp = 0;
3327                 return (0);
3328         }
3329         /*
3330          * So here we are going to take the original rsm and make it what we
3331          * retransmitted. nrsm will be the tail portion we did not
3332          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3333          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3334          * 1, 6 and the new piece will be 6, 11.
3335          */
3336         nrsm->r_start = c_end;
3337         nrsm->r_end = rsm->r_end;
3338         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3339         nrsm->r_flags = rsm->r_flags;
3340         nrsm->r_sndcnt = rsm->r_sndcnt;
3341         nrsm->r_rtr_bytes = 0;
3342         rsm->r_end = c_end;
3343         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3344                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3345         }
3346         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3347         if (rsm->r_in_tmap) {
3348                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3349                 nrsm->r_in_tmap = 1;
3350         }
3351         rsm->r_flags &= (~RACK_HAS_FIN);
3352         rack_update_rsm(tp, rack, rsm, ts);
3353         *lenp = 0;
3354         return (0);
3355 }
3356
3357
3358 static void
3359 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3360     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3361     uint8_t pass, struct rack_sendmap *hintrsm)
3362 {
3363         struct tcp_rack *rack;
3364         struct rack_sendmap *rsm, *nrsm;
3365         register uint32_t snd_max, snd_una;
3366         int32_t idx;
3367
3368         /*
3369          * Add to the RACK log of packets in flight or retransmitted. If
3370          * there is a TS option we will use the TS echoed, if not we will
3371          * grab a TS.
3372          *
3373          * Retransmissions will increment the count and move the ts to its
3374          * proper place. Note that if options do not include TS's then we
3375          * won't be able to effectively use the ACK for an RTT on a retran.
3376          *
3377          * Notes about r_start and r_end. Lets consider a send starting at
3378          * sequence 1 for 10 bytes. In such an example the r_start would be
3379          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3380          * This means that r_end is actually the first sequence for the next
3381          * slot (11).
3382          *
3383          */
3384         /*
3385          * If err is set what do we do XXXrrs? should we not add the thing?
3386          * -- i.e. return if err != 0 or should we pretend we sent it? --
3387          * i.e. proceed with add ** do this for now.
3388          */
3389         INP_WLOCK_ASSERT(tp->t_inpcb);
3390         if (err)
3391                 /*
3392                  * We don't log errors -- we could but snd_max does not
3393                  * advance in this case either.
3394                  */
3395                 return;
3396
3397         if (th_flags & TH_RST) {
3398                 /*
3399                  * We don't log resets and we return immediately from
3400                  * sending
3401                  */
3402                 return;
3403         }
3404         rack = (struct tcp_rack *)tp->t_fb_ptr;
3405         snd_una = tp->snd_una;
3406         if (SEQ_LEQ((seq_out + len), snd_una)) {
3407                 /* Are sending an old segment to induce an ack (keep-alive)? */
3408                 return;
3409         }
3410         if (SEQ_LT(seq_out, snd_una)) {
3411                 /* huh? should we panic? */
3412                 uint32_t end;
3413
3414                 end = seq_out + len;
3415                 seq_out = snd_una;
3416                 len = end - seq_out;
3417         }
3418         snd_max = tp->snd_max;
3419         if (th_flags & (TH_SYN | TH_FIN)) {
3420                 /*
3421                  * The call to rack_log_output is made before bumping
3422                  * snd_max. This means we can record one extra byte on a SYN
3423                  * or FIN if seq_out is adding more on and a FIN is present
3424                  * (and we are not resending).
3425                  */
3426                 if (th_flags & TH_SYN)
3427                         len++;
3428                 if (th_flags & TH_FIN)
3429                         len++;
3430                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3431                         /*
3432                          * The add/update as not been done for the FIN/SYN
3433                          * yet.
3434                          */
3435                         snd_max = tp->snd_nxt;
3436                 }
3437         }
3438         if (len == 0) {
3439                 /* We don't log zero window probes */
3440                 return;
3441         }
3442         rack->r_ctl.rc_time_last_sent = ts;
3443         if (IN_RECOVERY(tp->t_flags)) {
3444                 rack->r_ctl.rc_prr_out += len;
3445         }
3446         /* First question is it a retransmission? */
3447         if (seq_out == snd_max) {
3448 again:
3449                 rsm = rack_alloc(rack);
3450                 if (rsm == NULL) {
3451                         /*
3452                          * Hmm out of memory and the tcb got destroyed while
3453                          * we tried to wait.
3454                          */
3455                         return;
3456                 }
3457                 if (th_flags & TH_FIN) {
3458                         rsm->r_flags = RACK_HAS_FIN;
3459                 } else {
3460                         rsm->r_flags = 0;
3461                 }
3462                 rsm->r_tim_lastsent[0] = ts;
3463                 rsm->r_rtr_cnt = 1;
3464                 rsm->r_rtr_bytes = 0;
3465                 rsm->r_start = seq_out;
3466                 rsm->r_end = rsm->r_start + len;
3467                 rsm->r_sndcnt = 0;
3468                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
3469                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3470                 rsm->r_in_tmap = 1;
3471                 return;
3472         }
3473         /*
3474          * If we reach here its a retransmission and we need to find it.
3475          */
3476 more:
3477         if (hintrsm && (hintrsm->r_start == seq_out)) {
3478                 rsm = hintrsm;
3479                 hintrsm = NULL;
3480         } else if (rack->r_ctl.rc_next) {
3481                 /* We have a hint from a previous run */
3482                 rsm = rack->r_ctl.rc_next;
3483         } else {
3484                 /* No hints sorry */
3485                 rsm = NULL;
3486         }
3487         if ((rsm) && (rsm->r_start == seq_out)) {
3488                 /*
3489                  * We used rc_next or hintrsm  to retransmit, hopefully the
3490                  * likely case.
3491                  */
3492                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3493                 if (len == 0) {
3494                         return;
3495                 } else {
3496                         goto more;
3497                 }
3498         }
3499         /* Ok it was not the last pointer go through it the hard way. */
3500         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3501                 if (rsm->r_start == seq_out) {
3502                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3503                         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3504                         if (len == 0) {
3505                                 return;
3506                         } else {
3507                                 continue;
3508                         }
3509                 }
3510                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3511                         /* Transmitted within this piece */
3512                         /*
3513                          * Ok we must split off the front and then let the
3514                          * update do the rest
3515                          */
3516                         nrsm = rack_alloc_full_limit(rack);
3517                         if (nrsm == NULL) {
3518                                 rack_update_rsm(tp, rack, rsm, ts);
3519                                 return;
3520                         }
3521                         /*
3522                          * copy rsm to nrsm and then trim the front of rsm
3523                          * to not include this part.
3524                          */
3525                         nrsm->r_start = seq_out;
3526                         nrsm->r_end = rsm->r_end;
3527                         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3528                         nrsm->r_flags = rsm->r_flags;
3529                         nrsm->r_sndcnt = rsm->r_sndcnt;
3530                         nrsm->r_rtr_bytes = 0;
3531                         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3532                                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3533                         }
3534                         rsm->r_end = nrsm->r_start;
3535                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3536                         if (rsm->r_in_tmap) {
3537                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3538                                 nrsm->r_in_tmap = 1;
3539                         }
3540                         rsm->r_flags &= (~RACK_HAS_FIN);
3541                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3542                         if (len == 0) {
3543                                 return;
3544                         }
3545                 }
3546         }
3547         /*
3548          * Hmm not found in map did they retransmit both old and on into the
3549          * new?
3550          */
3551         if (seq_out == tp->snd_max) {
3552                 goto again;
3553         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3554 #ifdef INVARIANTS
3555                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3556                     seq_out, len, tp->snd_una, tp->snd_max);
3557                 printf("Starting Dump of all rack entries\n");
3558                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3559                         printf("rsm:%p start:%u end:%u\n",
3560                             rsm, rsm->r_start, rsm->r_end);
3561                 }
3562                 printf("Dump complete\n");
3563                 panic("seq_out not found rack:%p tp:%p",
3564                     rack, tp);
3565 #endif
3566         } else {
3567 #ifdef INVARIANTS
3568                 /*
3569                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
3570                  * flag)
3571                  */
3572                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3573                     seq_out, len, tp->snd_max, tp);
3574 #endif
3575         }
3576 }
3577
3578 /*
3579  * Record one of the RTT updates from an ack into
3580  * our sample structure.
3581  */
3582 static void
3583 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3584 {
3585         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3586             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3587                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3588         }
3589         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3590             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3591                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3592         }
3593         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3594         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3595         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3596 }
3597
3598 /*
3599  * Collect new round-trip time estimate
3600  * and update averages and current timeout.
3601  */
3602 static void
3603 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3604 {
3605         int32_t delta;
3606         uint32_t o_srtt, o_var;
3607         int32_t rtt;
3608
3609         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3610                 /* No valid sample */
3611                 return;
3612         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3613                 /* We are to use the lowest RTT seen in a single ack */
3614                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3615         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3616                 /* We are to use the highest RTT seen in a single ack */
3617                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3618         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3619                 /* We are to use the average RTT seen in a single ack */
3620                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3621                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3622         } else {
3623 #ifdef INVARIANTS
3624                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3625 #endif
3626                 return;
3627         }
3628         if (rtt == 0)
3629                 rtt = 1;
3630         rack_log_rtt_sample(rack, rtt);
3631         o_srtt = tp->t_srtt;
3632         o_var = tp->t_rttvar;
3633         rack = (struct tcp_rack *)tp->t_fb_ptr;
3634         if (tp->t_srtt != 0) {
3635                 /*
3636                  * srtt is stored as fixed point with 5 bits after the
3637                  * binary point (i.e., scaled by 8).  The following magic is
3638                  * equivalent to the smoothing algorithm in rfc793 with an
3639                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3640                  * Adjust rtt to origin 0.
3641                  */
3642                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3643                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3644
3645                 tp->t_srtt += delta;
3646                 if (tp->t_srtt <= 0)
3647                         tp->t_srtt = 1;
3648
3649                 /*
3650                  * We accumulate a smoothed rtt variance (actually, a
3651                  * smoothed mean difference), then set the retransmit timer
3652                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3653                  * is stored as fixed point with 4 bits after the binary
3654                  * point (scaled by 16).  The following is equivalent to
3655                  * rfc793 smoothing with an alpha of .75 (rttvar =
3656                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3657                  * wired-in beta.
3658                  */
3659                 if (delta < 0)
3660                         delta = -delta;
3661                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3662                 tp->t_rttvar += delta;
3663                 if (tp->t_rttvar <= 0)
3664                         tp->t_rttvar = 1;
3665                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3666                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3667         } else {
3668                 /*
3669                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3670                  * variance to half the rtt (so our first retransmit happens
3671                  * at 3*rtt).
3672                  */
3673                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3674                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3675                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3676         }
3677         TCPSTAT_INC(tcps_rttupdated);
3678         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3679         tp->t_rttupdated++;
3680 #ifdef NETFLIX_STATS
3681         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3682 #endif
3683         tp->t_rxtshift = 0;
3684
3685         /*
3686          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3687          * way we do the smoothing, srtt and rttvar will each average +1/2
3688          * tick of bias.  When we compute the retransmit timer, we want 1/2
3689          * tick of rounding and 1 extra tick because of +-1/2 tick
3690          * uncertainty in the firing of the timer.  The bias will give us
3691          * exactly the 1.5 tick we need.  But, because the bias is
3692          * statistical, we have to test that we don't drop below the minimum
3693          * feasible timer (which is 2 ticks).
3694          */
3695         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3696            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3697         tp->t_softerror = 0;
3698 }
3699
3700 static void
3701 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3702     uint32_t t, uint32_t cts)
3703 {
3704         /*
3705          * For this RSM, we acknowledged the data from a previous
3706          * transmission, not the last one we made. This means we did a false
3707          * retransmit.
3708          */
3709         struct tcp_rack *rack;
3710
3711         if (rsm->r_flags & RACK_HAS_FIN) {
3712                 /*
3713                  * The sending of the FIN often is multiple sent when we
3714                  * have everything outstanding ack'd. We ignore this case
3715                  * since its over now.
3716                  */
3717                 return;
3718         }
3719         if (rsm->r_flags & RACK_TLP) {
3720                 /*
3721                  * We expect TLP's to have this occur.
3722                  */
3723                 return;
3724         }
3725         rack = (struct tcp_rack *)tp->t_fb_ptr;
3726         /* should we undo cc changes and exit recovery? */
3727         if (IN_RECOVERY(tp->t_flags)) {
3728                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3729                         /*
3730                          * Undo what we ratched down and exit recovery if
3731                          * possible
3732                          */
3733                         EXIT_RECOVERY(tp->t_flags);
3734                         tp->snd_recover = tp->snd_una;
3735                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3736                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3737                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3738                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3739                 }
3740         }
3741         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3742                 /*
3743                  * We retransmitted based on a sack and the earlier
3744                  * retransmission ack'd it - re-ordering is occuring.
3745                  */
3746                 counter_u64_add(rack_reorder_seen, 1);
3747                 rack->r_ctl.rc_reorder_ts = cts;
3748         }
3749         counter_u64_add(rack_badfr, 1);
3750         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3751 }
3752
3753
3754 static int
3755 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3756     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3757 {
3758         int32_t i;
3759         uint32_t t;
3760
3761         if (rsm->r_flags & RACK_ACKED)
3762                 /* Already done */
3763                 return (0);
3764
3765
3766         if ((rsm->r_rtr_cnt == 1) ||
3767             ((ack_type == CUM_ACKED) &&
3768             (to->to_flags & TOF_TS) &&
3769             (to->to_tsecr) &&
3770             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3771             ) {
3772                 /*
3773                  * We will only find a matching timestamp if its cum-acked.
3774                  * But if its only one retransmission its for-sure matching
3775                  * :-)
3776                  */
3777                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3778                 if ((int)t <= 0)
3779                         t = 1;
3780                 if (!tp->t_rttlow || tp->t_rttlow > t)
3781                         tp->t_rttlow = t;
3782                 if (!rack->r_ctl.rc_rack_min_rtt ||
3783                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3784                         rack->r_ctl.rc_rack_min_rtt = t;
3785                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3786                                 rack->r_ctl.rc_rack_min_rtt = 1;
3787                         }
3788                 }
3789                 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
3790                 if ((rsm->r_flags & RACK_TLP) &&
3791                     (!IN_RECOVERY(tp->t_flags))) {
3792                         /* Segment was a TLP and our retrans matched */
3793                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
3794                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
3795                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
3796                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
3797                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
3798                                 /*
3799                                  * When we enter recovery we need to assure
3800                                  * we send one packet.
3801                                  */
3802                                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
3803                         } else
3804                                 rack->r_ctl.rc_tlp_rtx_out = 0;
3805                 }
3806                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3807                         /* New more recent rack_tmit_time */
3808                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3809                         rack->rc_rack_rtt = t;
3810                 }
3811                 return (1);
3812         }
3813         /*
3814          * We clear the soft/rxtshift since we got an ack.
3815          * There is no assurance we will call the commit() function
3816          * so we need to clear these to avoid incorrect handling.
3817          */
3818         tp->t_rxtshift = 0;
3819         tp->t_softerror = 0;
3820         if ((to->to_flags & TOF_TS) &&
3821             (ack_type == CUM_ACKED) &&
3822             (to->to_tsecr) &&
3823             ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
3824                 /*
3825                  * Now which timestamp does it match? In this block the ACK
3826                  * must be coming from a previous transmission.
3827                  */
3828                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
3829                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
3830                                 t = cts - rsm->r_tim_lastsent[i];
3831                                 if ((int)t <= 0)
3832                                         t = 1;
3833                                 if ((i + 1) < rsm->r_rtr_cnt) {
3834                                         /* Likely */
3835                                         rack_earlier_retran(tp, rsm, t, cts);
3836                                 }
3837                                 if (!tp->t_rttlow || tp->t_rttlow > t)
3838                                         tp->t_rttlow = t;
3839                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3840                                         rack->r_ctl.rc_rack_min_rtt = t;
3841                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3842                                                 rack->r_ctl.rc_rack_min_rtt = 1;
3843                                         }
3844                                 }
3845                                 /*
3846                                  * Note the following calls to
3847                                  * tcp_rack_xmit_timer() are being commented
3848                                  * out for now. They give us no more accuracy
3849                                  * and often lead to a wrong choice. We have
3850                                  * enough samples that have not been
3851                                  * retransmitted. I leave the commented out
3852                                  * code in here in case in the future we
3853                                  * decide to add it back (though I can't forsee
3854                                  * doing that). That way we will easily see
3855                                  * where they need to be placed.
3856                                  */
3857                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
3858                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3859                                         /* New more recent rack_tmit_time */
3860                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3861                                         rack->rc_rack_rtt = t;
3862                                 }
3863                                 return (1);
3864                         }
3865                 }
3866                 goto ts_not_found;
3867         } else {
3868                 /*
3869                  * Ok its a SACK block that we retransmitted. or a windows
3870                  * machine without timestamps. We can tell nothing from the
3871                  * time-stamp since its not there or the time the peer last
3872                  * recieved a segment that moved forward its cum-ack point.
3873                  */
3874 ts_not_found:
3875                 i = rsm->r_rtr_cnt - 1;
3876                 t = cts - rsm->r_tim_lastsent[i];
3877                 if ((int)t <= 0)
3878                         t = 1;
3879                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3880                         /*
3881                          * We retransmitted and the ack came back in less
3882                          * than the smallest rtt we have observed. We most
3883                          * likey did an improper retransmit as outlined in
3884                          * 4.2 Step 3 point 2 in the rack-draft.
3885                          */
3886                         i = rsm->r_rtr_cnt - 2;
3887                         t = cts - rsm->r_tim_lastsent[i];
3888                         rack_earlier_retran(tp, rsm, t, cts);
3889                 } else if (rack->r_ctl.rc_rack_min_rtt) {
3890                         /*
3891                          * We retransmitted it and the retransmit did the
3892                          * job.
3893                          */
3894                         if (!rack->r_ctl.rc_rack_min_rtt ||
3895                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3896                                 rack->r_ctl.rc_rack_min_rtt = t;
3897                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
3898                                         rack->r_ctl.rc_rack_min_rtt = 1;
3899                                 }
3900                         }
3901                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
3902                                 /* New more recent rack_tmit_time */
3903                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
3904                                 rack->rc_rack_rtt = t;
3905                         }
3906                         return (1);
3907                 }
3908         }
3909         return (0);
3910 }
3911
3912 /*
3913  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
3914  */
3915 static void
3916 rack_log_sack_passed(struct tcpcb *tp,
3917     struct tcp_rack *rack, struct rack_sendmap *rsm)
3918 {
3919         struct rack_sendmap *nrsm;
3920         uint32_t ts;
3921         int32_t idx;
3922
3923         idx = rsm->r_rtr_cnt - 1;
3924         ts = rsm->r_tim_lastsent[idx];
3925         nrsm = rsm;
3926         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
3927             rack_head, r_tnext) {
3928                 if (nrsm == rsm) {
3929                         /* Skip orginal segment he is acked */
3930                         continue;
3931                 }
3932                 if (nrsm->r_flags & RACK_ACKED) {
3933                         /* Skip ack'd segments */
3934                         continue;
3935                 }
3936                 if (nrsm->r_flags & RACK_SACK_PASSED) {
3937                         /*
3938                          * We found one that is already marked
3939                          * passed, we have been here before and
3940                          * so all others below this are marked.
3941                          */
3942                         break;
3943                 }
3944                 idx = nrsm->r_rtr_cnt - 1;
3945                 if (ts == nrsm->r_tim_lastsent[idx]) {
3946                         /*
3947                          * For this case lets use seq no, if we sent in a
3948                          * big block (TSO) we would have a bunch of segments
3949                          * sent at the same time.
3950                          *
3951                          * We would only get a report if its SEQ is earlier.
3952                          * If we have done multiple retransmits the times
3953                          * would not be equal.
3954                          */
3955                         if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
3956                                 nrsm->r_flags |= RACK_SACK_PASSED;
3957                                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3958                         }
3959                 } else {
3960                         /*
3961                          * Here they were sent at different times, not a big
3962                          * block. Since we transmitted this one later and
3963                          * see it sack'd then this must also be missing (or
3964                          * we would have gotten a sack block for it)
3965                          */
3966                         nrsm->r_flags |= RACK_SACK_PASSED;
3967                         nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3968                 }
3969         }
3970 }
3971
3972 static uint32_t
3973 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
3974     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
3975 {
3976         int32_t idx;
3977         int32_t times = 0;
3978         uint32_t start, end, changed = 0;
3979         struct rack_sendmap *rsm, *nrsm;
3980         int32_t used_ref = 1;
3981
3982         start = sack->start;
3983         end = sack->end;
3984         rsm = *prsm;
3985         if (rsm && SEQ_LT(start, rsm->r_start)) {
3986                 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
3987                         if (SEQ_GEQ(start, rsm->r_start) &&
3988                             SEQ_LT(start, rsm->r_end)) {
3989                                 goto do_rest_ofb;
3990                         }
3991                 }
3992         }
3993         if (rsm == NULL) {
3994 start_at_beginning:
3995                 rsm = NULL;
3996                 used_ref = 0;
3997         }
3998         /* First lets locate the block where this guy is */
3999         TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
4000                 if (SEQ_GEQ(start, rsm->r_start) &&
4001                     SEQ_LT(start, rsm->r_end)) {
4002                         break;
4003                 }
4004         }
4005 do_rest_ofb:
4006         if (rsm == NULL) {
4007                 /*
4008                  * This happens when we get duplicate sack blocks with the
4009                  * same end. For example SACK 4: 100 SACK 3: 100 The sort
4010                  * will not change there location so we would just start at
4011                  * the end of the first one and get lost.
4012                  */
4013                 if (tp->t_flags & TF_SENTFIN) {
4014                         /*
4015                          * Check to see if we have not logged the FIN that
4016                          * went out.
4017                          */
4018                         nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4019                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
4020                                 /*
4021                                  * Ok we did not get the FIN logged.
4022                                  */
4023                                 nrsm->r_end++;
4024                                 rsm = nrsm;
4025                                 goto do_rest_ofb;
4026                         }
4027                 }
4028                 if (times == 1) {
4029 #ifdef INVARIANTS
4030                         panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
4031                             tp, rack, sack, to, prsm);
4032 #else
4033                         goto out;
4034 #endif
4035                 }
4036                 times++;
4037                 counter_u64_add(rack_sack_proc_restart, 1);
4038                 goto start_at_beginning;
4039         }
4040         /* Ok we have an ACK for some piece of rsm */
4041         if (rsm->r_start != start) {
4042                 /*
4043                  * Need to split this in two pieces the before and after.
4044                  */
4045                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4046                 if (nrsm == NULL) {
4047                         /*
4048                          * failed XXXrrs what can we do but loose the sack
4049                          * info?
4050                          */
4051                         goto out;
4052                 }
4053                 nrsm->r_start = start;
4054                 nrsm->r_rtr_bytes = 0;
4055                 nrsm->r_end = rsm->r_end;
4056                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4057                 nrsm->r_flags = rsm->r_flags;
4058                 nrsm->r_sndcnt = rsm->r_sndcnt;
4059                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4060                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4061                 }
4062                 rsm->r_end = nrsm->r_start;
4063                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4064                 if (rsm->r_in_tmap) {
4065                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4066                         nrsm->r_in_tmap = 1;
4067                 }
4068                 rsm->r_flags &= (~RACK_HAS_FIN);
4069                 rsm = nrsm;
4070         }
4071         if (SEQ_GEQ(end, rsm->r_end)) {
4072                 /*
4073                  * The end of this block is either beyond this guy or right
4074                  * at this guy.
4075                  */
4076
4077                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4078                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4079                         changed += (rsm->r_end - rsm->r_start);
4080                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4081                         rack_log_sack_passed(tp, rack, rsm);
4082                         /* Is Reordering occuring? */
4083                         if (rsm->r_flags & RACK_SACK_PASSED) {
4084                                 counter_u64_add(rack_reorder_seen, 1);
4085                                 rack->r_ctl.rc_reorder_ts = cts;
4086                         }
4087                         rsm->r_flags |= RACK_ACKED;
4088                         rsm->r_flags &= ~RACK_TLP;
4089                         if (rsm->r_in_tmap) {
4090                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4091                                 rsm->r_in_tmap = 0;
4092                         }
4093                 }
4094                 if (end == rsm->r_end) {
4095                         /* This block only - done */
4096                         goto out;
4097                 }
4098                 /* There is more not coverend by this rsm move on */
4099                 start = rsm->r_end;
4100                 nrsm = TAILQ_NEXT(rsm, r_next);
4101                 rsm = nrsm;
4102                 times = 0;
4103                 goto do_rest_ofb;
4104         }
4105         /* Ok we need to split off this one at the tail */
4106         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4107         if (nrsm == NULL) {
4108                 /* failed rrs what can we do but loose the sack info? */
4109                 goto out;
4110         }
4111         /* Clone it */
4112         nrsm->r_start = end;
4113         nrsm->r_end = rsm->r_end;
4114         nrsm->r_rtr_bytes = 0;
4115         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4116         nrsm->r_flags = rsm->r_flags;
4117         nrsm->r_sndcnt = rsm->r_sndcnt;
4118         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4119                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4120         }
4121         /* The sack block does not cover this guy fully */
4122         rsm->r_flags &= (~RACK_HAS_FIN);
4123         rsm->r_end = end;
4124         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4125         if (rsm->r_in_tmap) {
4126                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4127                 nrsm->r_in_tmap = 1;
4128         }
4129         if (rsm->r_flags & RACK_ACKED) {
4130                 /* Been here done that */
4131                 goto out;
4132         }
4133         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4134         changed += (rsm->r_end - rsm->r_start);
4135         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4136         rack_log_sack_passed(tp, rack, rsm);
4137         /* Is Reordering occuring? */
4138         if (rsm->r_flags & RACK_SACK_PASSED) {
4139                 counter_u64_add(rack_reorder_seen, 1);
4140                 rack->r_ctl.rc_reorder_ts = cts;
4141         }
4142         rsm->r_flags |= RACK_ACKED;
4143         rsm->r_flags &= ~RACK_TLP;
4144         if (rsm->r_in_tmap) {
4145                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4146                 rsm->r_in_tmap = 0;
4147         }
4148 out:
4149         if (rsm && (rsm->r_flags & RACK_ACKED)) {
4150                 /*
4151                  * Now can we merge this newly acked
4152                  * block with either the previous or
4153                  * next block?
4154                  */
4155                 nrsm = TAILQ_NEXT(rsm, r_next);
4156                 if (nrsm &&
4157                     (nrsm->r_flags & RACK_ACKED)) {
4158                         /* yep this and next can be merged */
4159                         rsm = rack_merge_rsm(rack, rsm, nrsm);
4160                 }
4161                 /* Now what about the previous? */
4162                 nrsm = TAILQ_PREV(rsm, rack_head, r_next);
4163                 if (nrsm &&
4164                     (nrsm->r_flags & RACK_ACKED)) {
4165                         /* yep the previous and this can be merged */
4166                         rsm = rack_merge_rsm(rack, nrsm, rsm);
4167                 }
4168         }
4169         if (used_ref == 0) {
4170                 counter_u64_add(rack_sack_proc_all, 1);
4171         } else {
4172                 counter_u64_add(rack_sack_proc_short, 1);
4173         }
4174         /* Save off where we last were */
4175         if (rsm)
4176                 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
4177         else
4178                 rack->r_ctl.rc_sacklast = NULL;
4179         *prsm = rsm;
4180         return (changed);
4181 }
4182
4183 static void inline
4184 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4185 {
4186         struct rack_sendmap *tmap;
4187
4188         tmap = NULL;
4189         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4190                 /* Its no longer sacked, mark it so */
4191                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4192 #ifdef INVARIANTS
4193                 if (rsm->r_in_tmap) {
4194                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4195                               rack, rsm, rsm->r_flags);
4196                 }
4197 #endif
4198                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4199                 /* Rebuild it into our tmap */
4200                 if (tmap == NULL) {
4201                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4202                         tmap = rsm;
4203                 } else {
4204                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4205                         tmap = rsm;
4206                 }
4207                 tmap->r_in_tmap = 1;
4208                 rsm = TAILQ_NEXT(rsm, r_next);
4209         }
4210         /*
4211          * Now lets possibly clear the sack filter so we start
4212          * recognizing sacks that cover this area.
4213          */
4214         if (rack_use_sack_filter)
4215                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4216
4217 }
4218
4219 static void
4220 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4221 {
4222         uint32_t changed, last_seq, entered_recovery = 0;
4223         struct tcp_rack *rack;
4224         struct rack_sendmap *rsm;
4225         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4226         register uint32_t th_ack;
4227         int32_t i, j, k, num_sack_blks = 0;
4228         uint32_t cts, acked, ack_point, sack_changed = 0;
4229
4230         INP_WLOCK_ASSERT(tp->t_inpcb);
4231         if (th->th_flags & TH_RST) {
4232                 /* We don't log resets */
4233                 return;
4234         }
4235         rack = (struct tcp_rack *)tp->t_fb_ptr;
4236         cts = tcp_ts_getticks();
4237         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4238         changed = 0;
4239         th_ack = th->th_ack;
4240
4241         if (SEQ_GT(th_ack, tp->snd_una)) {
4242                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4243                 tp->t_acktime = ticks;
4244         }
4245         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4246                 changed = th_ack - rsm->r_start;
4247         if (changed) {
4248                 /*
4249                  * The ACK point is advancing to th_ack, we must drop off
4250                  * the packets in the rack log and calculate any eligble
4251                  * RTT's.
4252                  */
4253                 rack->r_wanted_output++;
4254 more:
4255                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4256                 if (rsm == NULL) {
4257                         if ((th_ack - 1) == tp->iss) {
4258                                 /*
4259                                  * For the SYN incoming case we will not
4260                                  * have called tcp_output for the sending of
4261                                  * the SYN, so there will be no map. All
4262                                  * other cases should probably be a panic.
4263                                  */
4264                                 goto proc_sack;
4265                         }
4266                         if (tp->t_flags & TF_SENTFIN) {
4267                                 /* if we send a FIN we will not hav a map */
4268                                 goto proc_sack;
4269                         }
4270 #ifdef INVARIANTS
4271                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4272                             tp,
4273                             th, tp->t_state, rack,
4274                             tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4275 #endif
4276                         goto proc_sack;
4277                 }
4278                 if (SEQ_LT(th_ack, rsm->r_start)) {
4279                         /* Huh map is missing this */
4280 #ifdef INVARIANTS
4281                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4282                             rsm->r_start,
4283                             th_ack, tp->t_state, rack->r_state);
4284 #endif
4285                         goto proc_sack;
4286                 }
4287                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4288                 /* Now do we consume the whole thing? */
4289                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4290                         /* Its all consumed. */
4291                         uint32_t left;
4292
4293                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4294                         rsm->r_rtr_bytes = 0;
4295                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
4296                         if (rsm->r_in_tmap) {
4297                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4298                                 rsm->r_in_tmap = 0;
4299                         }
4300                         if (rack->r_ctl.rc_next == rsm) {
4301                                 /* scoot along the marker */
4302                                 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
4303                         }
4304                         if (rsm->r_flags & RACK_ACKED) {
4305                                 /*
4306                                  * It was acked on the scoreboard -- remove
4307                                  * it from total
4308                                  */
4309                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4310                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4311                                 /*
4312                                  * There are acked segments ACKED on the
4313                                  * scoreboard further up. We are seeing
4314                                  * reordering.
4315                                  */
4316                                 counter_u64_add(rack_reorder_seen, 1);
4317                                 rsm->r_flags |= RACK_ACKED;
4318                                 rack->r_ctl.rc_reorder_ts = cts;
4319                         }
4320                         left = th_ack - rsm->r_end;
4321                         if (rsm->r_rtr_cnt > 1) {
4322                                 /*
4323                                  * Technically we should make r_rtr_cnt be
4324                                  * monotonicly increasing and just mod it to
4325                                  * the timestamp it is replacing.. that way
4326                                  * we would have the last 3 retransmits. Now
4327                                  * rc_loss_count will be wrong if we
4328                                  * retransmit something more than 2 times in
4329                                  * recovery :(
4330                                  */
4331                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4332                         }
4333                         /* Free back to zone */
4334                         rack_free(rack, rsm);
4335                         if (left) {
4336                                 goto more;
4337                         }
4338                         goto proc_sack;
4339                 }
4340                 if (rsm->r_flags & RACK_ACKED) {
4341                         /*
4342                          * It was acked on the scoreboard -- remove it from
4343                          * total for the part being cum-acked.
4344                          */
4345                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4346                 }
4347                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4348                 rsm->r_rtr_bytes = 0;
4349                 rsm->r_start = th_ack;
4350         }
4351 proc_sack:
4352         /* Check for reneging */
4353         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4354         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4355                 /*
4356                  * The peer has moved snd_una up to
4357                  * the edge of this send, i.e. one
4358                  * that it had previously acked. The only
4359                  * way that can be true if the peer threw
4360                  * away data (space issues) that it had
4361                  * previously sacked (else it would have
4362                  * given us snd_una up to (rsm->r_end).
4363                  * We need to undo the acked markings here.
4364                  *
4365                  * Note we have to look to make sure th_ack is
4366                  * our rsm->r_start in case we get an old ack
4367                  * where th_ack is behind snd_una.
4368                  */
4369                 rack_peer_reneges(rack, rsm, th->th_ack);
4370         }
4371         if ((to->to_flags & TOF_SACK) == 0) {
4372                 /* We are done nothing left to log */
4373                 goto out;
4374         }
4375         rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4376         if (rsm) {
4377                 last_seq = rsm->r_end;
4378         } else {
4379                 last_seq = tp->snd_max;
4380         }
4381         /* Sack block processing */
4382         if (SEQ_GT(th_ack, tp->snd_una))
4383                 ack_point = th_ack;
4384         else
4385                 ack_point = tp->snd_una;
4386         for (i = 0; i < to->to_nsacks; i++) {
4387                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4388                     &sack, sizeof(sack));
4389                 sack.start = ntohl(sack.start);
4390                 sack.end = ntohl(sack.end);
4391                 if (SEQ_GT(sack.end, sack.start) &&
4392                     SEQ_GT(sack.start, ack_point) &&
4393                     SEQ_LT(sack.start, tp->snd_max) &&
4394                     SEQ_GT(sack.end, ack_point) &&
4395                     SEQ_LEQ(sack.end, tp->snd_max)) {
4396                         if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
4397                             (SEQ_LT(sack.end, last_seq)) &&
4398                             ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
4399                                 /*
4400                                  * Not the last piece and its smaller than
4401                                  * 1/8th of a MSS. We ignore this.
4402                                  */
4403                                 counter_u64_add(rack_runt_sacks, 1);
4404                                 continue;
4405                         }
4406                         sack_blocks[num_sack_blks] = sack;
4407                         num_sack_blks++;
4408                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4409                            SEQ_LEQ(sack.end, th_ack)) {
4410                         /*
4411                          * Its a D-SACK block.
4412                          */
4413 /*                      tcp_record_dsack(sack.start, sack.end); */
4414                 }
4415         }
4416         if (num_sack_blks == 0)
4417                 goto out;
4418         /*
4419          * Sort the SACK blocks so we can update the rack scoreboard with
4420          * just one pass.
4421          */
4422         if (rack_use_sack_filter) {
4423                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
4424                                                  num_sack_blks, th->th_ack);
4425                 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
4426         }
4427         if (num_sack_blks < 2) {
4428                 goto do_sack_work;
4429         }
4430         /* Sort the sacks */
4431         for (i = 0; i < num_sack_blks; i++) {
4432                 for (j = i + 1; j < num_sack_blks; j++) {
4433                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4434                                 sack = sack_blocks[i];
4435                                 sack_blocks[i] = sack_blocks[j];
4436                                 sack_blocks[j] = sack;
4437                         }
4438                 }
4439         }
4440         /*
4441          * Now are any of the sack block ends the same (yes some
4442          * implememtations send these)?
4443          */
4444 again:
4445         if (num_sack_blks > 1) {
4446                 for (i = 0; i < num_sack_blks; i++) {
4447                         for (j = i + 1; j < num_sack_blks; j++) {
4448                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4449                                         /*
4450                                          * Ok these two have the same end we
4451                                          * want the smallest end and then
4452                                          * throw away the larger and start
4453                                          * again.
4454                                          */
4455                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4456                                                 /*
4457                                                  * The second block covers
4458                                                  * more area use that
4459                                                  */
4460                                                 sack_blocks[i].start = sack_blocks[j].start;
4461                                         }
4462                                         /*
4463                                          * Now collapse out the dup-sack and
4464                                          * lower the count
4465                                          */
4466                                         for (k = (j + 1); k < num_sack_blks; k++) {
4467                                                 sack_blocks[j].start = sack_blocks[k].start;
4468                                                 sack_blocks[j].end = sack_blocks[k].end;
4469                                                 j++;
4470                                         }
4471                                         num_sack_blks--;
4472                                         goto again;
4473                                 }
4474                         }
4475                 }
4476         }
4477 do_sack_work:
4478         rsm = rack->r_ctl.rc_sacklast;
4479         for (i = 0; i < num_sack_blks; i++) {
4480                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
4481                 if (acked) {
4482                         rack->r_wanted_output++;
4483                         changed += acked;
4484                         sack_changed += acked;
4485                 }
4486         }
4487 out:
4488         if (changed) {
4489                 /* Something changed cancel the rack timer */
4490                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4491         }
4492         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
4493                 /*
4494                  * Ok we have a high probability that we need to go in to
4495                  * recovery since we have data sack'd
4496                  */
4497                 struct rack_sendmap *rsm;
4498                 uint32_t tsused;
4499
4500                 tsused = tcp_ts_getticks();
4501                 rsm = tcp_rack_output(tp, rack, tsused);
4502                 if (rsm) {
4503                         /* Enter recovery */
4504                         rack->r_ctl.rc_rsm_start = rsm->r_start;
4505                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4506                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4507                         entered_recovery = 1;
4508                         rack_cong_signal(tp, NULL, CC_NDUPACK);
4509                         /*
4510                          * When we enter recovery we need to assure we send
4511                          * one packet.
4512                          */
4513                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
4514                         rack->r_timer_override = 1;
4515                 }
4516         }
4517         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
4518                 /* Deal with changed an PRR here (in recovery only) */
4519                 uint32_t pipe, snd_una;
4520
4521                 rack->r_ctl.rc_prr_delivered += changed;
4522                 /* Compute prr_sndcnt */
4523                 if (SEQ_GT(tp->snd_una, th_ack)) {
4524                         snd_una = tp->snd_una;
4525                 } else {
4526                         snd_una = th_ack;
4527                 }
4528                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
4529                 if (pipe > tp->snd_ssthresh) {
4530                         long sndcnt;
4531
4532                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
4533                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
4534                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
4535                         else {
4536                                 rack->r_ctl.rc_prr_sndcnt = 0;
4537                                 sndcnt = 0;
4538                         }
4539                         sndcnt++;
4540                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
4541                                 sndcnt -= rack->r_ctl.rc_prr_out;
4542                         else
4543                                 sndcnt = 0;
4544                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
4545                 } else {
4546                         uint32_t limit;
4547
4548                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
4549                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
4550                         else
4551                                 limit = 0;
4552                         if (changed > limit)
4553                                 limit = changed;
4554                         limit += tp->t_maxseg;
4555                         if (tp->snd_ssthresh > pipe) {
4556                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
4557                         } else {
4558                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
4559                         }
4560                 }
4561                 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
4562                         rack->r_timer_override = 1;
4563                 }
4564         }
4565 }
4566
4567 /*
4568  * Return value of 1, we do not need to call rack_process_data().
4569  * return value of 0, rack_process_data can be called.
4570  * For ret_val if its 0 the TCP is locked, if its non-zero
4571  * its unlocked and probably unsafe to touch the TCB.
4572  */
4573 static int
4574 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
4575     struct tcpcb *tp, struct tcpopt *to,
4576     uint32_t tiwin, int32_t tlen,
4577     int32_t * ofia, int32_t thflags, int32_t * ret_val)
4578 {
4579         int32_t ourfinisacked = 0;
4580         int32_t nsegs, acked_amount;
4581         int32_t acked;
4582         struct mbuf *mfree;
4583         struct tcp_rack *rack;
4584         int32_t recovery = 0;
4585
4586         rack = (struct tcp_rack *)tp->t_fb_ptr;
4587         if (SEQ_GT(th->th_ack, tp->snd_max)) {
4588                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
4589                 return (1);
4590         }
4591         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
4592                 rack_log_ack(tp, to, th);
4593         }
4594         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
4595                 /*
4596                  * Old ack, behind (or duplicate to) the last one rcv'd
4597                  * Note: Should mark reordering is occuring! We should also
4598                  * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
4599                  * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no
4600                  * retran and> ack 3
4601                  */
4602                 return (0);
4603         }
4604         /*
4605          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
4606          * something we sent.
4607          */
4608         if (tp->t_flags & TF_NEEDSYN) {
4609                 /*
4610                  * T/TCP: Connection was half-synchronized, and our SYN has
4611                  * been ACK'd (so connection is now fully synchronized).  Go
4612                  * to non-starred state, increment snd_una for ACK of SYN,
4613                  * and check if we can do window scaling.
4614                  */
4615                 tp->t_flags &= ~TF_NEEDSYN;
4616                 tp->snd_una++;
4617                 /* Do window scaling? */
4618                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
4619                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
4620                         tp->rcv_scale = tp->request_r_scale;
4621                         /* Send window already scaled. */
4622                 }
4623         }
4624         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4625         INP_WLOCK_ASSERT(tp->t_inpcb);
4626
4627         acked = BYTES_THIS_ACK(tp, th);
4628         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
4629         TCPSTAT_ADD(tcps_rcvackbyte, acked);
4630
4631         /*
4632          * If we just performed our first retransmit, and the ACK arrives
4633          * within our recovery window, then it was a mistake to do the
4634          * retransmit in the first place.  Recover our original cwnd and
4635          * ssthresh, and proceed to transmit where we left off.
4636          */
4637         if (tp->t_flags & TF_PREVVALID) {
4638                 tp->t_flags &= ~TF_PREVVALID;
4639                 if (tp->t_rxtshift == 1 &&
4640                     (int)(ticks - tp->t_badrxtwin) < 0)
4641                         rack_cong_signal(tp, th, CC_RTO_ERR);
4642         }
4643         /*
4644          * If we have a timestamp reply, update smoothed round trip time. If
4645          * no timestamp is present but transmit timer is running and timed
4646          * sequence number was acked, update smoothed round trip time. Since
4647          * we now have an rtt measurement, cancel the timer backoff (cf.,
4648          * Phil Karn's retransmit alg.). Recompute the initial retransmit
4649          * timer.
4650          *
4651          * Some boxes send broken timestamp replies during the SYN+ACK
4652          * phase, ignore timestamps of 0 or we could calculate a huge RTT
4653          * and blow up the retransmit timer.
4654          */
4655         /*
4656          * If all outstanding data is acked, stop retransmit timer and
4657          * remember to restart (more output or persist). If there is more
4658          * data to be acked, restart retransmit timer, using current
4659          * (possibly backed-off) value.
4660          */
4661         if (th->th_ack == tp->snd_max) {
4662                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4663                 rack->r_wanted_output++;
4664         }
4665         /*
4666          * If no data (only SYN) was ACK'd, skip rest of ACK processing.
4667          */
4668         if (acked == 0) {
4669                 if (ofia)
4670                         *ofia = ourfinisacked;
4671                 return (0);
4672         }
4673         if (rack->r_ctl.rc_early_recovery) {
4674                 if (IN_RECOVERY(tp->t_flags)) {
4675                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
4676                             (SEQ_LT(th->th_ack, tp->snd_max))) {
4677                                 tcp_rack_partialack(tp, th);
4678                         } else {
4679                                 rack_post_recovery(tp, th);
4680                                 recovery = 1;
4681                         }
4682                 }
4683         }
4684         /*
4685          * Let the congestion control algorithm update congestion control
4686          * related information. This typically means increasing the
4687          * congestion window.
4688          */
4689         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
4690         SOCKBUF_LOCK(&so->so_snd);
4691         acked_amount = min(acked, (int)sbavail(&so->so_snd));
4692         tp->snd_wnd -= acked_amount;
4693         mfree = sbcut_locked(&so->so_snd, acked_amount);
4694         if ((sbused(&so->so_snd) == 0) &&
4695             (acked > acked_amount) &&
4696             (tp->t_state >= TCPS_FIN_WAIT_1)) {
4697                 ourfinisacked = 1;
4698         }
4699         /* NB: sowwakeup_locked() does an implicit unlock. */
4700         sowwakeup_locked(so);
4701         m_freem(mfree);
4702         if (rack->r_ctl.rc_early_recovery == 0) {
4703                 if (IN_RECOVERY(tp->t_flags)) {
4704                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
4705                             (SEQ_LT(th->th_ack, tp->snd_max))) {
4706                                 tcp_rack_partialack(tp, th);
4707                         } else {
4708                                 rack_post_recovery(tp, th);
4709                         }
4710                 }
4711         }
4712         tp->snd_una = th->th_ack;
4713         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4714                 tp->snd_recover = tp->snd_una;
4715
4716         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4717                 tp->snd_nxt = tp->snd_una;
4718         }
4719         if (tp->snd_una == tp->snd_max) {
4720                 /* Nothing left outstanding */
4721                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
4722                 tp->t_acktime = 0;
4723                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4724                 /* Set need output so persist might get set */
4725                 rack->r_wanted_output++;
4726                 if (rack_use_sack_filter)
4727                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
4728                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
4729                     (sbavail(&so->so_snd) == 0) &&
4730                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
4731                         /*
4732                          * The socket was gone and the
4733                          * peer sent data, time to
4734                          * reset him.
4735                          */
4736                         *ret_val = 1;
4737                         tp = tcp_close(tp);
4738                         rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
4739                         return (1);
4740                 }
4741         }
4742         if (ofia)
4743                 *ofia = ourfinisacked;
4744         return (0);
4745 }
4746
4747
4748 /*
4749  * Return value of 1, the TCB is unlocked and most
4750  * likely gone, return value of 0, the TCP is still
4751  * locked.
4752  */
4753 static int
4754 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
4755     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
4756     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
4757 {
4758         /*
4759          * Update window information. Don't look at window if no ACK: TAC's
4760          * send garbage on first SYN.
4761          */
4762         int32_t nsegs;
4763 #ifdef TCP_RFC7413
4764         int32_t tfo_syn;
4765 #else
4766 #define tfo_syn (FALSE)
4767 #endif
4768         struct tcp_rack *rack;
4769
4770         rack = (struct tcp_rack *)tp->t_fb_ptr;
4771         INP_WLOCK_ASSERT(tp->t_inpcb);
4772         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4773         if ((thflags & TH_ACK) &&
4774             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4775             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4776             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4777                 /* keep track of pure window updates */
4778                 if (tlen == 0 &&
4779                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4780                         TCPSTAT_INC(tcps_rcvwinupd);
4781                 tp->snd_wnd = tiwin;
4782                 tp->snd_wl1 = th->th_seq;
4783                 tp->snd_wl2 = th->th_ack;
4784                 if (tp->snd_wnd > tp->max_sndwnd)
4785                         tp->max_sndwnd = tp->snd_wnd;
4786                 rack->r_wanted_output++;
4787         } else if (thflags & TH_ACK) {
4788                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
4789                         tp->snd_wnd = tiwin;
4790                         tp->snd_wl1 = th->th_seq;
4791                         tp->snd_wl2 = th->th_ack;
4792                 }
4793         }
4794         /* Was persist timer active and now we have window space? */
4795         if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
4796                 rack_exit_persist(tp, rack);
4797                 tp->snd_nxt = tp->snd_max;
4798                 /* Make sure we output to start the timer */
4799                 rack->r_wanted_output++;
4800         }
4801         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
4802                 m_freem(m);
4803                 return (0);
4804         }
4805         /*
4806          * Process segments with URG.
4807          */
4808         if ((thflags & TH_URG) && th->th_urp &&
4809             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4810                 /*
4811                  * This is a kludge, but if we receive and accept random
4812                  * urgent pointers, we'll crash in soreceive.  It's hard to
4813                  * imagine someone actually wanting to send this much urgent
4814                  * data.
4815                  */
4816                 SOCKBUF_LOCK(&so->so_rcv);
4817                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
4818                         th->th_urp = 0; /* XXX */
4819                         thflags &= ~TH_URG;     /* XXX */
4820                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
4821                         goto dodata;    /* XXX */
4822                 }
4823                 /*
4824                  * If this segment advances the known urgent pointer, then
4825                  * mark the data stream.  This should not happen in
4826                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
4827                  * FIN has been received from the remote side. In these
4828                  * states we ignore the URG.
4829                  *
4830                  * According to RFC961 (Assigned Protocols), the urgent
4831                  * pointer points to the last octet of urgent data.  We
4832                  * continue, however, to consider it to indicate the first
4833                  * octet of data past the urgent section as the original
4834                  * spec states (in one of two places).
4835                  */
4836                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
4837                         tp->rcv_up = th->th_seq + th->th_urp;
4838                         so->so_oobmark = sbavail(&so->so_rcv) +
4839                             (tp->rcv_up - tp->rcv_nxt) - 1;
4840                         if (so->so_oobmark == 0)
4841                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
4842                         sohasoutofband(so);
4843                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4844                 }
4845                 SOCKBUF_UNLOCK(&so->so_rcv);
4846                 /*
4847                  * Remove out of band data so doesn't get presented to user.
4848                  * This can happen independent of advancing the URG pointer,
4849                  * but if two URG's are pending at once, some out-of-band
4850                  * data may creep in... ick.
4851                  */
4852                 if (th->th_urp <= (uint32_t) tlen &&
4853                     !(so->so_options & SO_OOBINLINE)) {
4854                         /* hdr drop is delayed */
4855                         tcp_pulloutofband(so, th, m, drop_hdrlen);
4856                 }
4857         } else {
4858                 /*
4859                  * If no out of band data is expected, pull receive urgent
4860                  * pointer along with the receive window.
4861                  */
4862                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4863                         tp->rcv_up = tp->rcv_nxt;
4864         }
4865 dodata:                         /* XXX */
4866         INP_WLOCK_ASSERT(tp->t_inpcb);
4867
4868         /*
4869          * Process the segment text, merging it into the TCP sequencing
4870          * queue, and arranging for acknowledgment of receipt if necessary.
4871          * This process logically involves adjusting tp->rcv_wnd as data is
4872          * presented to the user (this happens in tcp_usrreq.c, case
4873          * PRU_RCVD).  If a FIN has already been received on this connection
4874          * then we just ignore the text.
4875          */
4876 #ifdef TCP_RFC7413
4877         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
4878             (tp->t_flags & TF_FASTOPEN));
4879 #endif
4880         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
4881             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4882                 tcp_seq save_start = th->th_seq;
4883                 tcp_seq save_rnxt  = tp->rcv_nxt;
4884                 int     save_tlen  = tlen;
4885
4886                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4887                 /*
4888                  * Insert segment which includes th into TCP reassembly
4889                  * queue with control block tp.  Set thflags to whether
4890                  * reassembly now includes a segment with FIN.  This handles
4891                  * the common case inline (segment is the next to be
4892                  * received on an established connection, and the queue is
4893                  * empty), avoiding linkage into and removal from the queue
4894                  * and repetition of various conversions. Set DELACK for
4895                  * segments received in order, but ack immediately when
4896                  * segments are out of order (so fast retransmit can work).
4897                  */
4898                 if (th->th_seq == tp->rcv_nxt &&
4899                     SEGQ_EMPTY(tp) &&
4900                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
4901                     tfo_syn)) {
4902                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
4903                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4904                                 tp->t_flags |= TF_DELACK;
4905                         } else {
4906                                 rack->r_wanted_output++;
4907                                 tp->t_flags |= TF_ACKNOW;
4908                         }
4909                         tp->rcv_nxt += tlen;
4910                         thflags = th->th_flags & TH_FIN;
4911                         TCPSTAT_ADD(tcps_rcvpack, nsegs);
4912                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
4913                         SOCKBUF_LOCK(&so->so_rcv);
4914                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
4915                                 m_freem(m);
4916                         else
4917                                 sbappendstream_locked(&so->so_rcv, m, 0);
4918                         /* NB: sorwakeup_locked() does an implicit unlock. */
4919                         sorwakeup_locked(so);
4920                 } else {
4921                         /*
4922                          * XXX: Due to the header drop above "th" is
4923                          * theoretically invalid by now.  Fortunately
4924                          * m_adj() doesn't actually frees any mbufs when
4925                          * trimming from the head.
4926                          */
4927                         tcp_seq temp = save_start;
4928                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
4929                         tp->t_flags |= TF_ACKNOW;
4930                 }
4931                 if (((tlen == 0) && (save_tlen > 0) &&
4932                     (SEQ_LT(save_start, save_rnxt)))) {
4933                         /*
4934                          * DSACK actually handled in the fastpath
4935                          * above.
4936                          */
4937                         tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4938                 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
4939                         /*
4940                          * Cleaning sackblks by using zero length
4941                          * update.
4942                          */
4943                         tcp_update_sack_list(tp, save_start, save_start);
4944                 } else if ((tlen > 0) && (tlen >= save_tlen)) {
4945                         /* Update of sackblks. */
4946                         tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4947                 } else if (tlen > 0) {
4948                         tcp_update_sack_list(tp, save_start, save_start+tlen);
4949                 }
4950         } else {
4951                 m_freem(m);
4952                 thflags &= ~TH_FIN;
4953         }
4954
4955         /*
4956          * If FIN is received ACK the FIN and let the user know that the
4957          * connection is closing.
4958          */
4959         if (thflags & TH_FIN) {
4960                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4961                         socantrcvmore(so);
4962                         /*
4963                          * If connection is half-synchronized (ie NEEDSYN
4964                          * flag on) then delay ACK, so it may be piggybacked
4965                          * when SYN is sent. Otherwise, since we received a
4966                          * FIN then no more input can be expected, send ACK
4967                          * now.
4968                          */
4969                         if (tp->t_flags & TF_NEEDSYN) {
4970                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4971                                 tp->t_flags |= TF_DELACK;
4972                         } else {
4973                                 tp->t_flags |= TF_ACKNOW;
4974                         }
4975                         tp->rcv_nxt++;
4976                 }
4977                 switch (tp->t_state) {
4978
4979                         /*
4980                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
4981                          * CLOSE_WAIT state.
4982                          */
4983                 case TCPS_SYN_RECEIVED:
4984                         tp->t_starttime = ticks;
4985                         /* FALLTHROUGH */
4986                 case TCPS_ESTABLISHED:
4987                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4988                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
4989                         break;
4990
4991                         /*
4992                          * If still in FIN_WAIT_1 STATE FIN has not been
4993                          * acked so enter the CLOSING state.
4994                          */
4995                 case TCPS_FIN_WAIT_1:
4996                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4997                         tcp_state_change(tp, TCPS_CLOSING);
4998                         break;
4999
5000                         /*
5001                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
5002                          * starting the time-wait timer, turning off the
5003                          * other standard timers.
5004                          */
5005                 case TCPS_FIN_WAIT_2:
5006                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5007                         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5008                         tcp_twstart(tp);
5009                         return (1);
5010                 }
5011         }
5012         /*
5013          * Return any desired output.
5014          */
5015         if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
5016                 rack->r_wanted_output++;
5017         }
5018         INP_WLOCK_ASSERT(tp->t_inpcb);
5019         return (0);
5020 }
5021
5022 /*
5023  * Here nothing is really faster, its just that we
5024  * have broken out the fast-data path also just like
5025  * the fast-ack.
5026  */
5027 static int
5028 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
5029     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5030     uint32_t tiwin, int32_t nxt_pkt)
5031 {
5032         int32_t nsegs;
5033         int32_t newsize = 0;    /* automatic sockbuf scaling */
5034         struct tcp_rack *rack;
5035 #ifdef TCPDEBUG
5036         /*
5037          * The size of tcp_saveipgen must be the size of the max ip header,
5038          * now IPv6.
5039          */
5040         u_char tcp_saveipgen[IP6_HDR_LEN];
5041         struct tcphdr tcp_savetcp;
5042         short ostate = 0;
5043
5044 #endif
5045         /*
5046          * If last ACK falls within this segment's sequence numbers, record
5047          * the timestamp. NOTE that the test is modified according to the
5048          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5049          */
5050         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5051                 return (0);
5052         }
5053         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5054                 return (0);
5055         }
5056         if (tiwin && tiwin != tp->snd_wnd) {
5057                 return (0);
5058         }
5059         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5060                 return (0);
5061         }
5062         if (__predict_false((to->to_flags & TOF_TS) &&
5063             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5064                 return (0);
5065         }
5066         if (__predict_false((th->th_ack != tp->snd_una))) {
5067                 return (0);
5068         }
5069         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5070                 return (0);
5071         }
5072         if ((to->to_flags & TOF_TS) != 0 &&
5073             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5074                 tp->ts_recent_age = tcp_ts_getticks();
5075                 tp->ts_recent = to->to_tsval;
5076         }
5077         rack = (struct tcp_rack *)tp->t_fb_ptr;
5078         /*
5079          * This is a pure, in-sequence data packet with nothing on the
5080          * reassembly queue and we have enough buffer space to take it.
5081          */
5082         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5083
5084
5085         /* Clean receiver SACK report if present */
5086         if (tp->rcv_numsacks)
5087                 tcp_clean_sackreport(tp);
5088         TCPSTAT_INC(tcps_preddat);
5089         tp->rcv_nxt += tlen;
5090         /*
5091          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5092          */
5093         tp->snd_wl1 = th->th_seq;
5094         /*
5095          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5096          */
5097         tp->rcv_up = tp->rcv_nxt;
5098         TCPSTAT_ADD(tcps_rcvpack, nsegs);
5099         TCPSTAT_ADD(tcps_rcvbyte, tlen);
5100 #ifdef TCPDEBUG
5101         if (so->so_options & SO_DEBUG)
5102                 tcp_trace(TA_INPUT, ostate, tp,
5103                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
5104 #endif
5105         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5106
5107         /* Add data to socket buffer. */
5108         SOCKBUF_LOCK(&so->so_rcv);
5109         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5110                 m_freem(m);
5111         } else {
5112                 /*
5113                  * Set new socket buffer size. Give up when limit is
5114                  * reached.
5115                  */
5116                 if (newsize)
5117                         if (!sbreserve_locked(&so->so_rcv,
5118                             newsize, so, NULL))
5119                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5120                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5121                 sbappendstream_locked(&so->so_rcv, m, 0);
5122                 rack_calc_rwin(so, tp);
5123         }
5124         /* NB: sorwakeup_locked() does an implicit unlock. */
5125         sorwakeup_locked(so);
5126         if (DELAY_ACK(tp, tlen)) {
5127                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5128                 tp->t_flags |= TF_DELACK;
5129         } else {
5130                 tp->t_flags |= TF_ACKNOW;
5131                 rack->r_wanted_output++;
5132         }
5133         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
5134                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5135         return (1);
5136 }
5137
5138 /*
5139  * This subfunction is used to try to highly optimize the
5140  * fast path. We again allow window updates that are
5141  * in sequence to remain in the fast-path. We also add
5142  * in the __predict's to attempt to help the compiler.
5143  * Note that if we return a 0, then we can *not* process
5144  * it and the caller should push the packet into the
5145  * slow-path.
5146  */
5147 static int
5148 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5149     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5150     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
5151 {
5152         int32_t acked;
5153         int32_t nsegs;
5154
5155 #ifdef TCPDEBUG
5156         /*
5157          * The size of tcp_saveipgen must be the size of the max ip header,
5158          * now IPv6.
5159          */
5160         u_char tcp_saveipgen[IP6_HDR_LEN];
5161         struct tcphdr tcp_savetcp;
5162         short ostate = 0;
5163
5164 #endif
5165         struct tcp_rack *rack;
5166
5167         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5168                 /* Old ack, behind (or duplicate to) the last one rcv'd */
5169                 return (0);
5170         }
5171         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
5172                 /* Above what we have sent? */
5173                 return (0);
5174         }
5175         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5176                 /* We are retransmitting */
5177                 return (0);
5178         }
5179         if (__predict_false(tiwin == 0)) {
5180                 /* zero window */
5181                 return (0);
5182         }
5183         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
5184                 /* We need a SYN or a FIN, unlikely.. */
5185                 return (0);
5186         }
5187         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
5188                 /* Timestamp is behind .. old ack with seq wrap? */
5189                 return (0);
5190         }
5191         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
5192                 /* Still recovering */
5193                 return (0);
5194         }
5195         rack = (struct tcp_rack *)tp->t_fb_ptr;
5196         if (rack->r_ctl.rc_sacked) {
5197                 /* We have sack holes on our scoreboard */
5198                 return (0);
5199         }
5200         /* Ok if we reach here, we can process a fast-ack */
5201         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5202         rack_log_ack(tp, to, th);
5203         /* Did the window get updated? */
5204         if (tiwin != tp->snd_wnd) {
5205                 tp->snd_wnd = tiwin;
5206                 tp->snd_wl1 = th->th_seq;
5207                 if (tp->snd_wnd > tp->max_sndwnd)
5208                         tp->max_sndwnd = tp->snd_wnd;
5209         }
5210         if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
5211                 rack_exit_persist(tp, rack);
5212         }
5213         /*
5214          * If last ACK falls within this segment's sequence numbers, record
5215          * the timestamp. NOTE that the test is modified according to the
5216          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5217          */
5218         if ((to->to_flags & TOF_TS) != 0 &&
5219             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5220                 tp->ts_recent_age = tcp_ts_getticks();
5221                 tp->ts_recent = to->to_tsval;
5222         }
5223         /*
5224          * This is a pure ack for outstanding data.
5225          */
5226         TCPSTAT_INC(tcps_predack);
5227
5228         /*
5229          * "bad retransmit" recovery.
5230          */
5231         if (tp->t_flags & TF_PREVVALID) {
5232                 tp->t_flags &= ~TF_PREVVALID;
5233                 if (tp->t_rxtshift == 1 &&
5234                     (int)(ticks - tp->t_badrxtwin) < 0)
5235                         rack_cong_signal(tp, th, CC_RTO_ERR);
5236         }
5237         /*
5238          * Recalculate the transmit timer / rtt.
5239          *
5240          * Some boxes send broken timestamp replies during the SYN+ACK
5241          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5242          * and blow up the retransmit timer.
5243          */
5244         acked = BYTES_THIS_ACK(tp, th);
5245
5246 #ifdef TCP_HHOOK
5247         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
5248         hhook_run_tcp_est_in(tp, th, to);
5249 #endif
5250
5251         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5252         TCPSTAT_ADD(tcps_rcvackbyte, acked);
5253         sbdrop(&so->so_snd, acked);
5254         /*
5255          * Let the congestion control algorithm update congestion control
5256          * related information. This typically means increasing the
5257          * congestion window.
5258          */
5259         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
5260
5261         tp->snd_una = th->th_ack;
5262         /*
5263          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
5264          */
5265         tp->snd_wl2 = th->th_ack;
5266         tp->t_dupacks = 0;
5267         m_freem(m);
5268         /* ND6_HINT(tp);         *//* Some progress has been made. */
5269
5270         /*
5271          * If all outstanding data are acked, stop retransmit timer,
5272          * otherwise restart timer using current (possibly backed-off)
5273          * value. If process is waiting for space, wakeup/selwakeup/signal.
5274          * If data are ready to send, let tcp_output decide between more
5275          * output or persist.
5276          */
5277 #ifdef TCPDEBUG
5278         if (so->so_options & SO_DEBUG)
5279                 tcp_trace(TA_INPUT, ostate, tp,
5280                     (void *)tcp_saveipgen,
5281                     &tcp_savetcp, 0);
5282 #endif
5283         if (tp->snd_una == tp->snd_max) {
5284                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5285                 tp->t_acktime = 0;
5286                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5287         }
5288         /* Wake up the socket if we have room to write more */
5289         sowwakeup(so);
5290         if (sbavail(&so->so_snd)) {
5291                 rack->r_wanted_output++;
5292         }
5293         return (1);
5294 }
5295
5296 /*
5297  * Return value of 1, the TCB is unlocked and most
5298  * likely gone, return value of 0, the TCP is still
5299  * locked.
5300  */
5301 static int
5302 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
5303     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5304     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5305 {
5306         int32_t ret_val = 0;
5307         int32_t todrop;
5308         int32_t ourfinisacked = 0;
5309
5310         rack_calc_rwin(so, tp);
5311         /*
5312          * If the state is SYN_SENT: if seg contains an ACK, but not for our
5313          * SYN, drop the input. if seg contains a RST, then drop the
5314          * connection. if seg does not contain SYN, then drop it. Otherwise
5315          * this is an acceptable SYN segment initialize tp->rcv_nxt and
5316          * tp->irs if seg contains ack then advance tp->snd_una if seg
5317          * contains an ECE and ECN support is enabled, the stream is ECN
5318          * capable. if SYN has been acked change to ESTABLISHED else
5319          * SYN_RCVD state arrange for segment to be acked (eventually)
5320          * continue processing rest of data/controls, beginning with URG
5321          */
5322         if ((thflags & TH_ACK) &&
5323             (SEQ_LEQ(th->th_ack, tp->iss) ||
5324             SEQ_GT(th->th_ack, tp->snd_max))) {
5325                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5326                 return (1);
5327         }
5328         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
5329                 TCP_PROBE5(connect__refused, NULL, tp,
5330                     mtod(m, const char *), tp, th);
5331                 tp = tcp_drop(tp, ECONNREFUSED);
5332                 rack_do_drop(m, tp);
5333                 return (1);
5334         }
5335         if (thflags & TH_RST) {
5336                 rack_do_drop(m, tp);
5337                 return (1);
5338         }
5339         if (!(thflags & TH_SYN)) {
5340                 rack_do_drop(m, tp);
5341                 return (1);
5342         }
5343         tp->irs = th->th_seq;
5344         tcp_rcvseqinit(tp);
5345         if (thflags & TH_ACK) {
5346                 TCPSTAT_INC(tcps_connects);
5347                 soisconnected(so);
5348 #ifdef MAC
5349                 mac_socketpeer_set_from_mbuf(m, so);
5350 #endif
5351                 /* Do window scaling on this connection? */
5352                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5353                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5354                         tp->rcv_scale = tp->request_r_scale;
5355                 }
5356                 tp->rcv_adv += min(tp->rcv_wnd,
5357                     TCP_MAXWIN << tp->rcv_scale);
5358                 /*
5359                  * If there's data, delay ACK; if there's also a FIN ACKNOW
5360                  * will be turned on later.
5361                  */
5362                 if (DELAY_ACK(tp, tlen) && tlen != 0) {
5363                         rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
5364                                           ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
5365                         tp->t_flags |= TF_DELACK;
5366                 } else {
5367                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
5368                         tp->t_flags |= TF_ACKNOW;
5369                 }
5370
5371                 if ((thflags & TH_ECE) && V_tcp_do_ecn) {
5372                         tp->t_flags |= TF_ECN_PERMIT;
5373                         TCPSTAT_INC(tcps_ecn_shs);
5374                 }
5375                 /*
5376                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
5377                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
5378                  */
5379                 tp->t_starttime = ticks;
5380                 if (tp->t_flags & TF_NEEDFIN) {
5381                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
5382                         tp->t_flags &= ~TF_NEEDFIN;
5383                         thflags &= ~TH_SYN;
5384                 } else {
5385                         tcp_state_change(tp, TCPS_ESTABLISHED);
5386                         TCP_PROBE5(connect__established, NULL, tp,
5387                             mtod(m, const char *), tp, th);
5388                         cc_conn_init(tp);
5389                 }
5390         } else {
5391                 /*
5392                  * Received initial SYN in SYN-SENT[*] state => simultaneous
5393                  * open.  If segment contains CC option and there is a
5394                  * cached CC, apply TAO test. If it succeeds, connection is *
5395                  * half-synchronized. Otherwise, do 3-way handshake:
5396                  * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
5397                  * there was no CC option, clear cached CC value.
5398                  */
5399                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
5400                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
5401         }
5402         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5403         INP_WLOCK_ASSERT(tp->t_inpcb);
5404         /*
5405          * Advance th->th_seq to correspond to first data byte. If data,
5406          * trim to stay within window, dropping FIN if necessary.
5407          */
5408         th->th_seq++;
5409         if (tlen > tp->rcv_wnd) {
5410                 todrop = tlen - tp->rcv_wnd;
5411                 m_adj(m, -todrop);
5412                 tlen = tp->rcv_wnd;
5413                 thflags &= ~TH_FIN;
5414                 TCPSTAT_INC(tcps_rcvpackafterwin);
5415                 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
5416         }
5417         tp->snd_wl1 = th->th_seq - 1;
5418         tp->rcv_up = th->th_seq;
5419         /*
5420          * Client side of transaction: already sent SYN and data. If the
5421          * remote host used T/TCP to validate the SYN, our data will be
5422          * ACK'd; if so, enter normal data segment processing in the middle
5423          * of step 5, ack processing. Otherwise, goto step 6.
5424          */
5425         if (thflags & TH_ACK) {
5426                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
5427                         return (ret_val);
5428                 /* We may have changed to FIN_WAIT_1 above */
5429                 if (tp->t_state == TCPS_FIN_WAIT_1) {
5430                         /*
5431                          * In FIN_WAIT_1 STATE in addition to the processing
5432                          * for the ESTABLISHED state if our FIN is now
5433                          * acknowledged then enter FIN_WAIT_2.
5434                          */
5435                         if (ourfinisacked) {
5436                                 /*
5437                                  * If we can't receive any more data, then
5438                                  * closing user can proceed. Starting the
5439                                  * timer is contrary to the specification,
5440                                  * but if we don't get a FIN we'll hang
5441                                  * forever.
5442                                  *
5443                                  * XXXjl: we should release the tp also, and
5444                                  * use a compressed state.
5445                                  */
5446                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5447                                         soisdisconnected(so);
5448                                         tcp_timer_activate(tp, TT_2MSL,
5449                                             (tcp_fast_finwait2_recycle ?
5450                                             tcp_finwait2_timeout :
5451                                             TP_MAXIDLE(tp)));
5452                                 }
5453                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5454                         }
5455                 }
5456         }
5457         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5458             tiwin, thflags, nxt_pkt));
5459 }
5460
5461 /*
5462  * Return value of 1, the TCB is unlocked and most
5463  * likely gone, return value of 0, the TCP is still
5464  * locked.
5465  */
5466 static int
5467 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
5468     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5469     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5470 {
5471         int32_t ret_val = 0;
5472         int32_t ourfinisacked = 0;
5473
5474         rack_calc_rwin(so, tp);
5475
5476         if ((thflags & TH_ACK) &&
5477             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
5478             SEQ_GT(th->th_ack, tp->snd_max))) {
5479                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5480                 return (1);
5481         }
5482 #ifdef TCP_RFC7413
5483         if (tp->t_flags & TF_FASTOPEN) {
5484                 /*
5485                  * When a TFO connection is in SYN_RECEIVED, the only valid
5486                  * packets are the initial SYN, a retransmit/copy of the
5487                  * initial SYN (possibly with a subset of the original
5488                  * data), a valid ACK, a FIN, or a RST.
5489                  */
5490                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
5491                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5492                         return (1);
5493                 } else if (thflags & TH_SYN) {
5494                         /* non-initial SYN is ignored */
5495                         struct tcp_rack *rack;
5496
5497                         rack = (struct tcp_rack *)tp->t_fb_ptr;
5498                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
5499                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
5500                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
5501                                 rack_do_drop(m, NULL);
5502                                 return (0);
5503                         }
5504                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
5505                         rack_do_drop(m, NULL);
5506                         return (0);
5507                 }
5508         }
5509 #endif
5510         if (thflags & TH_RST)
5511                 return (rack_process_rst(m, th, so, tp));
5512         /*
5513          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5514          * synchronized state.
5515          */
5516         if (thflags & TH_SYN) {
5517                 rack_challenge_ack(m, th, tp, &ret_val);
5518                 return (ret_val);
5519         }
5520         /*
5521          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5522          * it's less than ts_recent, drop it.
5523          */
5524         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5525             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5526                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5527                         return (ret_val);
5528         }
5529         /*
5530          * In the SYN-RECEIVED state, validate that the packet belongs to
5531          * this connection before trimming the data to fit the receive
5532          * window.  Check the sequence number versus IRS since we know the
5533          * sequence numbers haven't wrapped.  This is a partial fix for the
5534          * "LAND" DoS attack.
5535          */
5536         if (SEQ_LT(th->th_seq, tp->irs)) {
5537                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5538                 return (1);
5539         }
5540         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5541                 return (ret_val);
5542         }
5543         /*
5544          * If last ACK falls within this segment's sequence numbers, record
5545          * its timestamp. NOTE: 1) That the test incorporates suggestions
5546          * from the latest proposal of the tcplw@cray.com list (Braden
5547          * 1993/04/26). 2) That updating only on newer timestamps interferes
5548          * with our earlier PAWS tests, so this check should be solely
5549          * predicated on the sequence space of this segment. 3) That we
5550          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5551          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5552          * SEG.Len, This modified check allows us to overcome RFC1323's
5553          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5554          * p.869. In such cases, we can still calculate the RTT correctly
5555          * when RCV.NXT == Last.ACK.Sent.
5556          */
5557         if ((to->to_flags & TOF_TS) != 0 &&
5558             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5559             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5560             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5561                 tp->ts_recent_age = tcp_ts_getticks();
5562                 tp->ts_recent = to->to_tsval;
5563         }
5564         /*
5565          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5566          * is on (half-synchronized state), then queue data for later
5567          * processing; else drop segment and return.
5568          */
5569         if ((thflags & TH_ACK) == 0) {
5570 #ifdef TCP_RFC7413
5571                 if (tp->t_flags & TF_FASTOPEN) {
5572                         tp->snd_wnd = tiwin;
5573                         cc_conn_init(tp);
5574                 }
5575 #endif
5576                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5577                     tiwin, thflags, nxt_pkt));
5578         }
5579         TCPSTAT_INC(tcps_connects);
5580         soisconnected(so);
5581         /* Do window scaling? */
5582         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5583             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5584                 tp->rcv_scale = tp->request_r_scale;
5585                 tp->snd_wnd = tiwin;
5586         }
5587         /*
5588          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
5589          * FIN-WAIT-1
5590          */
5591         tp->t_starttime = ticks;
5592         if (tp->t_flags & TF_NEEDFIN) {
5593                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
5594                 tp->t_flags &= ~TF_NEEDFIN;
5595         } else {
5596                 tcp_state_change(tp, TCPS_ESTABLISHED);
5597                 TCP_PROBE5(accept__established, NULL, tp,
5598                     mtod(m, const char *), tp, th);
5599 #ifdef TCP_RFC7413
5600                 if (tp->t_tfo_pending) {
5601                         tcp_fastopen_decrement_counter(tp->t_tfo_pending);
5602                         tp->t_tfo_pending = NULL;
5603
5604                         /*
5605                          * Account for the ACK of our SYN prior to regular
5606                          * ACK processing below.
5607                          */
5608                         tp->snd_una++;
5609                 }
5610                 /*
5611                  * TFO connections call cc_conn_init() during SYN
5612                  * processing.  Calling it again here for such connections
5613                  * is not harmless as it would undo the snd_cwnd reduction
5614                  * that occurs when a TFO SYN|ACK is retransmitted.
5615                  */
5616                 if (!(tp->t_flags & TF_FASTOPEN))
5617 #endif
5618                         cc_conn_init(tp);
5619         }
5620         /*
5621          * If segment contains data or ACK, will call tcp_reass() later; if
5622          * not, do so now to pass queued data to user.
5623          */
5624         if (tlen == 0 && (thflags & TH_FIN) == 0)
5625                 (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
5626                     (struct mbuf *)0);
5627         tp->snd_wl1 = th->th_seq - 1;
5628         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5629                 return (ret_val);
5630         }
5631         if (tp->t_state == TCPS_FIN_WAIT_1) {
5632                 /* We could have went to FIN_WAIT_1 (or EST) above */
5633                 /*
5634                  * In FIN_WAIT_1 STATE in addition to the processing for the
5635                  * ESTABLISHED state if our FIN is now acknowledged then
5636                  * enter FIN_WAIT_2.
5637                  */
5638                 if (ourfinisacked) {
5639                         /*
5640                          * If we can't receive any more data, then closing
5641                          * user can proceed. Starting the timer is contrary
5642                          * to the specification, but if we don't get a FIN
5643                          * we'll hang forever.
5644                          *
5645                          * XXXjl: we should release the tp also, and use a
5646                          * compressed state.
5647                          */
5648                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5649                                 soisdisconnected(so);
5650                                 tcp_timer_activate(tp, TT_2MSL,
5651                                     (tcp_fast_finwait2_recycle ?
5652                                     tcp_finwait2_timeout :
5653                                     TP_MAXIDLE(tp)));
5654                         }
5655                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
5656                 }
5657         }
5658         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5659             tiwin, thflags, nxt_pkt));
5660 }
5661
5662 /*
5663  * Return value of 1, the TCB is unlocked and most
5664  * likely gone, return value of 0, the TCP is still
5665  * locked.
5666  */
5667 static int
5668 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5669     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5670     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5671 {
5672         int32_t ret_val = 0;
5673
5674         /*
5675          * Header prediction: check for the two common cases of a
5676          * uni-directional data xfer.  If the packet has no control flags,
5677          * is in-sequence, the window didn't change and we're not
5678          * retransmitting, it's a candidate.  If the length is zero and the
5679          * ack moved forward, we're the sender side of the xfer.  Just free
5680          * the data acked & wake any higher level process that was blocked
5681          * waiting for space.  If the length is non-zero and the ack didn't
5682          * move, we're the receiver side.  If we're getting packets in-order
5683          * (the reassembly queue is empty), add the data toc The socket
5684          * buffer and note that we need a delayed ack. Make sure that the
5685          * hidden state-flags are also off. Since we check for
5686          * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
5687          */
5688         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
5689             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
5690             __predict_true(SEGQ_EMPTY(tp)) &&
5691             __predict_true(th->th_seq == tp->rcv_nxt)) {
5692                 struct tcp_rack *rack;
5693
5694                 rack = (struct tcp_rack *)tp->t_fb_ptr;
5695                 if (tlen == 0) {
5696                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
5697                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
5698                                 return (0);
5699                         }
5700                 } else {
5701                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
5702                             tiwin, nxt_pkt)) {
5703                                 return (0);
5704                         }
5705                 }
5706         }
5707         rack_calc_rwin(so, tp);
5708
5709         if (thflags & TH_RST)
5710                 return (rack_process_rst(m, th, so, tp));
5711
5712         /*
5713          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5714          * synchronized state.
5715          */
5716         if (thflags & TH_SYN) {
5717                 rack_challenge_ack(m, th, tp, &ret_val);
5718                 return (ret_val);
5719         }
5720         /*
5721          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5722          * it's less than ts_recent, drop it.
5723          */
5724         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5725             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5726                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5727                         return (ret_val);
5728         }
5729         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5730                 return (ret_val);
5731         }
5732         /*
5733          * If last ACK falls within this segment's sequence numbers, record
5734          * its timestamp. NOTE: 1) That the test incorporates suggestions
5735          * from the latest proposal of the tcplw@cray.com list (Braden
5736          * 1993/04/26). 2) That updating only on newer timestamps interferes
5737          * with our earlier PAWS tests, so this check should be solely
5738          * predicated on the sequence space of this segment. 3) That we
5739          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5740          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5741          * SEG.Len, This modified check allows us to overcome RFC1323's
5742          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5743          * p.869. In such cases, we can still calculate the RTT correctly
5744          * when RCV.NXT == Last.ACK.Sent.
5745          */
5746         if ((to->to_flags & TOF_TS) != 0 &&
5747             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5748             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5749             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5750                 tp->ts_recent_age = tcp_ts_getticks();
5751                 tp->ts_recent = to->to_tsval;
5752         }
5753         /*
5754          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5755          * is on (half-synchronized state), then queue data for later
5756          * processing; else drop segment and return.
5757          */
5758         if ((thflags & TH_ACK) == 0) {
5759                 if (tp->t_flags & TF_NEEDSYN) {
5760
5761                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5762                             tiwin, thflags, nxt_pkt));
5763
5764                 } else if (tp->t_flags & TF_ACKNOW) {
5765                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5766                         return (ret_val);
5767                 } else {
5768                         rack_do_drop(m, NULL);
5769                         return (0);
5770                 }
5771         }
5772         /*
5773          * Ack processing.
5774          */
5775         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5776                 return (ret_val);
5777         }
5778         if (sbavail(&so->so_snd)) {
5779                 if (rack_progress_timeout_check(tp)) {
5780                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5781                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5782                         return (1);
5783                 }
5784         }
5785         /* State changes only happen in rack_process_data() */
5786         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5787             tiwin, thflags, nxt_pkt));
5788 }
5789
5790 /*
5791  * Return value of 1, the TCB is unlocked and most
5792  * likely gone, return value of 0, the TCP is still
5793  * locked.
5794  */
5795 static int
5796 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
5797     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5798     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5799 {
5800         int32_t ret_val = 0;
5801
5802         rack_calc_rwin(so, tp);
5803         if (thflags & TH_RST)
5804                 return (rack_process_rst(m, th, so, tp));
5805         /*
5806          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5807          * synchronized state.
5808          */
5809         if (thflags & TH_SYN) {
5810                 rack_challenge_ack(m, th, tp, &ret_val);
5811                 return (ret_val);
5812         }
5813         /*
5814          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5815          * it's less than ts_recent, drop it.
5816          */
5817         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5818             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5819                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5820                         return (ret_val);
5821         }
5822         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5823                 return (ret_val);
5824         }
5825         /*
5826          * If last ACK falls within this segment's sequence numbers, record
5827          * its timestamp. NOTE: 1) That the test incorporates suggestions
5828          * from the latest proposal of the tcplw@cray.com list (Braden
5829          * 1993/04/26). 2) That updating only on newer timestamps interferes
5830          * with our earlier PAWS tests, so this check should be solely
5831          * predicated on the sequence space of this segment. 3) That we
5832          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5833          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5834          * SEG.Len, This modified check allows us to overcome RFC1323's
5835          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5836          * p.869. In such cases, we can still calculate the RTT correctly
5837          * when RCV.NXT == Last.ACK.Sent.
5838          */
5839         if ((to->to_flags & TOF_TS) != 0 &&
5840             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5841             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5842             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5843                 tp->ts_recent_age = tcp_ts_getticks();
5844                 tp->ts_recent = to->to_tsval;
5845         }
5846         /*
5847          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5848          * is on (half-synchronized state), then queue data for later
5849          * processing; else drop segment and return.
5850          */
5851         if ((thflags & TH_ACK) == 0) {
5852                 if (tp->t_flags & TF_NEEDSYN) {
5853                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5854                             tiwin, thflags, nxt_pkt));
5855
5856                 } else if (tp->t_flags & TF_ACKNOW) {
5857                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5858                         return (ret_val);
5859                 } else {
5860                         rack_do_drop(m, NULL);
5861                         return (0);
5862                 }
5863         }
5864         /*
5865          * Ack processing.
5866          */
5867         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5868                 return (ret_val);
5869         }
5870         if (sbavail(&so->so_snd)) {
5871                 if (rack_progress_timeout_check(tp)) {
5872                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5873                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5874                         return (1);
5875                 }
5876         }
5877         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5878             tiwin, thflags, nxt_pkt));
5879 }
5880
5881 static int
5882 rack_check_data_after_close(struct mbuf *m,
5883     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
5884 {
5885         struct tcp_rack *rack;
5886
5887         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5888         rack = (struct tcp_rack *)tp->t_fb_ptr;
5889         if (rack->rc_allow_data_af_clo == 0) {
5890         close_now:
5891                 tp = tcp_close(tp);
5892                 TCPSTAT_INC(tcps_rcvafterclose);
5893                 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
5894                 return (1);
5895         }
5896         if (sbavail(&so->so_snd) == 0)
5897                 goto close_now;
5898         /* Ok we allow data that is ignored and a followup reset */
5899         tp->rcv_nxt = th->th_seq + *tlen;
5900         tp->t_flags2 |= TF2_DROP_AF_DATA;
5901         rack->r_wanted_output = 1;
5902         *tlen = 0;
5903         return (0);
5904 }
5905
5906 /*
5907  * Return value of 1, the TCB is unlocked and most
5908  * likely gone, return value of 0, the TCP is still
5909  * locked.
5910  */
5911 static int
5912 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
5913     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5914     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5915 {
5916         int32_t ret_val = 0;
5917         int32_t ourfinisacked = 0;
5918
5919         rack_calc_rwin(so, tp);
5920
5921         if (thflags & TH_RST)
5922                 return (rack_process_rst(m, th, so, tp));
5923         /*
5924          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5925          * synchronized state.
5926          */
5927         if (thflags & TH_SYN) {
5928                 rack_challenge_ack(m, th, tp, &ret_val);
5929                 return (ret_val);
5930         }
5931         /*
5932          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5933          * it's less than ts_recent, drop it.
5934          */
5935         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5936             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5937                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5938                         return (ret_val);
5939         }
5940         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5941                 return (ret_val);
5942         }
5943         /*
5944          * If new data are received on a connection after the user processes
5945          * are gone, then RST the other end.
5946          */
5947         if ((so->so_state & SS_NOFDREF) && tlen) {
5948                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
5949                         return (1);
5950         }
5951         /*
5952          * If last ACK falls within this segment's sequence numbers, record
5953          * its timestamp. NOTE: 1) That the test incorporates suggestions
5954          * from the latest proposal of the tcplw@cray.com list (Braden
5955          * 1993/04/26). 2) That updating only on newer timestamps interferes
5956          * with our earlier PAWS tests, so this check should be solely
5957          * predicated on the sequence space of this segment. 3) That we
5958          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5959          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5960          * SEG.Len, This modified check allows us to overcome RFC1323's
5961          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5962          * p.869. In such cases, we can still calculate the RTT correctly
5963          * when RCV.NXT == Last.ACK.Sent.
5964          */
5965         if ((to->to_flags & TOF_TS) != 0 &&
5966             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5967             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5968             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5969                 tp->ts_recent_age = tcp_ts_getticks();
5970                 tp->ts_recent = to->to_tsval;
5971         }
5972         /*
5973          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5974          * is on (half-synchronized state), then queue data for later
5975          * processing; else drop segment and return.
5976          */
5977         if ((thflags & TH_ACK) == 0) {
5978                 if (tp->t_flags & TF_NEEDSYN) {
5979                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5980                             tiwin, thflags, nxt_pkt));
5981                 } else if (tp->t_flags & TF_ACKNOW) {
5982                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5983                         return (ret_val);
5984                 } else {
5985                         rack_do_drop(m, NULL);
5986                         return (0);
5987                 }
5988         }
5989         /*
5990          * Ack processing.
5991          */
5992         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5993                 return (ret_val);
5994         }
5995         if (ourfinisacked) {
5996                 /*
5997                  * If we can't receive any more data, then closing user can
5998                  * proceed. Starting the timer is contrary to the
5999                  * specification, but if we don't get a FIN we'll hang
6000                  * forever.
6001                  *
6002                  * XXXjl: we should release the tp also, and use a
6003                  * compressed state.
6004                  */
6005                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6006                         soisdisconnected(so);
6007                         tcp_timer_activate(tp, TT_2MSL,
6008                             (tcp_fast_finwait2_recycle ?
6009                             tcp_finwait2_timeout :
6010                             TP_MAXIDLE(tp)));
6011                 }
6012                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6013         }
6014         if (sbavail(&so->so_snd)) {
6015                 if (rack_progress_timeout_check(tp)) {
6016                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6017                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6018                         return (1);
6019                 }
6020         }
6021         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6022             tiwin, thflags, nxt_pkt));
6023 }
6024
6025 /*
6026  * Return value of 1, the TCB is unlocked and most
6027  * likely gone, return value of 0, the TCP is still
6028  * locked.
6029  */
6030 static int
6031 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
6032     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6033     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6034 {
6035         int32_t ret_val = 0;
6036         int32_t ourfinisacked = 0;
6037
6038         rack_calc_rwin(so, tp);
6039
6040         if (thflags & TH_RST)
6041                 return (rack_process_rst(m, th, so, tp));
6042         /*
6043          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6044          * synchronized state.
6045          */
6046         if (thflags & TH_SYN) {
6047                 rack_challenge_ack(m, th, tp, &ret_val);
6048                 return (ret_val);
6049         }
6050         /*
6051          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6052          * it's less than ts_recent, drop it.
6053          */
6054         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6055             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6056                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6057                         return (ret_val);
6058         }
6059         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6060                 return (ret_val);
6061         }
6062         /*
6063          * If new data are received on a connection after the user processes
6064          * are gone, then RST the other end.
6065          */
6066         if ((so->so_state & SS_NOFDREF) && tlen) {
6067                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6068                         return (1);
6069         }
6070         /*
6071          * If last ACK falls within this segment's sequence numbers, record
6072          * its timestamp. NOTE: 1) That the test incorporates suggestions
6073          * from the latest proposal of the tcplw@cray.com list (Braden
6074          * 1993/04/26). 2) That updating only on newer timestamps interferes
6075          * with our earlier PAWS tests, so this check should be solely
6076          * predicated on the sequence space of this segment. 3) That we
6077          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6078          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6079          * SEG.Len, This modified check allows us to overcome RFC1323's
6080          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6081          * p.869. In such cases, we can still calculate the RTT correctly
6082          * when RCV.NXT == Last.ACK.Sent.
6083          */
6084         if ((to->to_flags & TOF_TS) != 0 &&
6085             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6086             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6087             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6088                 tp->ts_recent_age = tcp_ts_getticks();
6089                 tp->ts_recent = to->to_tsval;
6090         }
6091         /*
6092          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6093          * is on (half-synchronized state), then queue data for later
6094          * processing; else drop segment and return.
6095          */
6096         if ((thflags & TH_ACK) == 0) {
6097                 if (tp->t_flags & TF_NEEDSYN) {
6098                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6099                             tiwin, thflags, nxt_pkt));
6100                 } else if (tp->t_flags & TF_ACKNOW) {
6101                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6102                         return (ret_val);
6103                 } else {
6104                         rack_do_drop(m, NULL);
6105                         return (0);
6106                 }
6107         }
6108         /*
6109          * Ack processing.
6110          */
6111         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6112                 return (ret_val);
6113         }
6114         if (ourfinisacked) {
6115                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6116                 tcp_twstart(tp);
6117                 m_freem(m);
6118                 return (1);
6119         }
6120         if (sbavail(&so->so_snd)) {
6121                 if (rack_progress_timeout_check(tp)) {
6122                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6123                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6124                         return (1);
6125                 }
6126         }
6127         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6128             tiwin, thflags, nxt_pkt));
6129 }
6130
6131 /*
6132  * Return value of 1, the TCB is unlocked and most
6133  * likely gone, return value of 0, the TCP is still
6134  * locked.
6135  */
6136 static int
6137 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6138     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6139     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6140 {
6141         int32_t ret_val = 0;
6142         int32_t ourfinisacked = 0;
6143
6144         rack_calc_rwin(so, tp);
6145
6146         if (thflags & TH_RST)
6147                 return (rack_process_rst(m, th, so, tp));
6148         /*
6149          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6150          * synchronized state.
6151          */
6152         if (thflags & TH_SYN) {
6153                 rack_challenge_ack(m, th, tp, &ret_val);
6154                 return (ret_val);
6155         }
6156         /*
6157          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6158          * it's less than ts_recent, drop it.
6159          */
6160         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6161             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6162                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6163                         return (ret_val);
6164         }
6165         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6166                 return (ret_val);
6167         }
6168         /*
6169          * If new data are received on a connection after the user processes
6170          * are gone, then RST the other end.
6171          */
6172         if ((so->so_state & SS_NOFDREF) && tlen) {
6173                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6174                         return (1);
6175         }
6176         /*
6177          * If last ACK falls within this segment's sequence numbers, record
6178          * its timestamp. NOTE: 1) That the test incorporates suggestions
6179          * from the latest proposal of the tcplw@cray.com list (Braden
6180          * 1993/04/26). 2) That updating only on newer timestamps interferes
6181          * with our earlier PAWS tests, so this check should be solely
6182          * predicated on the sequence space of this segment. 3) That we
6183          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6184          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6185          * SEG.Len, This modified check allows us to overcome RFC1323's
6186          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6187          * p.869. In such cases, we can still calculate the RTT correctly
6188          * when RCV.NXT == Last.ACK.Sent.
6189          */
6190         if ((to->to_flags & TOF_TS) != 0 &&
6191             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6192             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6193             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6194                 tp->ts_recent_age = tcp_ts_getticks();
6195                 tp->ts_recent = to->to_tsval;
6196         }
6197         /*
6198          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6199          * is on (half-synchronized state), then queue data for later
6200          * processing; else drop segment and return.
6201          */
6202         if ((thflags & TH_ACK) == 0) {
6203                 if (tp->t_flags & TF_NEEDSYN) {
6204                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6205                             tiwin, thflags, nxt_pkt));
6206                 } else if (tp->t_flags & TF_ACKNOW) {
6207                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6208                         return (ret_val);
6209                 } else {
6210                         rack_do_drop(m, NULL);
6211                         return (0);
6212                 }
6213         }
6214         /*
6215          * case TCPS_LAST_ACK: Ack processing.
6216          */
6217         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6218                 return (ret_val);
6219         }
6220         if (ourfinisacked) {
6221                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6222                 tp = tcp_close(tp);
6223                 rack_do_drop(m, tp);
6224                 return (1);
6225         }
6226         if (sbavail(&so->so_snd)) {
6227                 if (rack_progress_timeout_check(tp)) {
6228                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6229                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6230                         return (1);
6231                 }
6232         }
6233         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6234             tiwin, thflags, nxt_pkt));
6235 }
6236
6237
6238 /*
6239  * Return value of 1, the TCB is unlocked and most
6240  * likely gone, return value of 0, the TCP is still
6241  * locked.
6242  */
6243 static int
6244 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
6245     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6246     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6247 {
6248         int32_t ret_val = 0;
6249         int32_t ourfinisacked = 0;
6250
6251         rack_calc_rwin(so, tp);
6252
6253         /* Reset receive buffer auto scaling when not in bulk receive mode. */
6254         if (thflags & TH_RST)
6255                 return (rack_process_rst(m, th, so, tp));
6256         /*
6257          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6258          * synchronized state.
6259          */
6260         if (thflags & TH_SYN) {
6261                 rack_challenge_ack(m, th, tp, &ret_val);
6262                 return (ret_val);
6263         }
6264         /*
6265          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6266          * it's less than ts_recent, drop it.
6267          */
6268         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6269             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6270                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6271                         return (ret_val);
6272         }
6273         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6274                 return (ret_val);
6275         }
6276         /*
6277          * If new data are received on a connection after the user processes
6278          * are gone, then RST the other end.
6279          */
6280         if ((so->so_state & SS_NOFDREF) &&
6281             tlen) {
6282                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6283                         return (1);
6284         }
6285         /*
6286          * If last ACK falls within this segment's sequence numbers, record
6287          * its timestamp. NOTE: 1) That the test incorporates suggestions
6288          * from the latest proposal of the tcplw@cray.com list (Braden
6289          * 1993/04/26). 2) That updating only on newer timestamps interferes
6290          * with our earlier PAWS tests, so this check should be solely
6291          * predicated on the sequence space of this segment. 3) That we
6292          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6293          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6294          * SEG.Len, This modified check allows us to overcome RFC1323's
6295          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6296          * p.869. In such cases, we can still calculate the RTT correctly
6297          * when RCV.NXT == Last.ACK.Sent.
6298          */
6299         if ((to->to_flags & TOF_TS) != 0 &&
6300             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6301             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6302             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6303                 tp->ts_recent_age = tcp_ts_getticks();
6304                 tp->ts_recent = to->to_tsval;
6305         }
6306         /*
6307          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6308          * is on (half-synchronized state), then queue data for later
6309          * processing; else drop segment and return.
6310          */
6311         if ((thflags & TH_ACK) == 0) {
6312                 if (tp->t_flags & TF_NEEDSYN) {
6313                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6314                             tiwin, thflags, nxt_pkt));
6315                 } else if (tp->t_flags & TF_ACKNOW) {
6316                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6317                         return (ret_val);
6318                 } else {
6319                         rack_do_drop(m, NULL);
6320                         return (0);
6321                 }
6322         }
6323         /*
6324          * Ack processing.
6325          */
6326         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6327                 return (ret_val);
6328         }
6329         if (sbavail(&so->so_snd)) {
6330                 if (rack_progress_timeout_check(tp)) {
6331                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6332                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6333                         return (1);
6334                 }
6335         }
6336         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6337             tiwin, thflags, nxt_pkt));
6338 }
6339
6340
6341 static void inline
6342 rack_clear_rate_sample(struct tcp_rack *rack)
6343 {
6344         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6345         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6346         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6347 }
6348
6349 static int
6350 rack_init(struct tcpcb *tp)
6351 {
6352         struct tcp_rack *rack = NULL;
6353
6354         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6355         if (tp->t_fb_ptr == NULL) {
6356                 /*
6357                  * We need to allocate memory but cant. The INP and INP_INFO
6358                  * locks and they are recusive (happens during setup. So a
6359                  * scheme to drop the locks fails :(
6360                  *
6361                  */
6362                 return (ENOMEM);
6363         }
6364         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6365
6366         rack = (struct tcp_rack *)tp->t_fb_ptr;
6367         TAILQ_INIT(&rack->r_ctl.rc_map);
6368         TAILQ_INIT(&rack->r_ctl.rc_free);
6369         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6370         rack->rc_tp = tp;
6371         if (tp->t_inpcb) {
6372                 rack->rc_inp = tp->t_inpcb;
6373         }
6374         /* Probably not needed but lets be sure */
6375         rack_clear_rate_sample(rack);
6376         rack->r_cpu = 0;
6377         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6378         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6379         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6380         rack->rc_pace_reduce = rack_slot_reduction;
6381         if (V_tcp_delack_enabled)
6382                 tp->t_delayed_ack = 1;
6383         else
6384                 tp->t_delayed_ack = 0;
6385         rack->rc_pace_max_segs = rack_hptsi_segments;
6386         rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6387         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6388         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6389         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6390         rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
6391         rack->r_enforce_min_pace = rack_min_pace_time;
6392         rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6393         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6394         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6395         rack->r_ctl.rc_early_recovery = rack_early_recovery;
6396         rack->rc_always_pace = rack_pace_every_seg;
6397         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6398         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6399         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6400         rack->r_ctl.rc_min_to = rack_min_to;
6401         rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6402         if (tp->snd_una != tp->snd_max) {
6403                 /* Create a send map for the current outstanding data */
6404                 struct rack_sendmap *rsm;
6405
6406                 rsm = rack_alloc(rack);
6407                 if (rsm == NULL) {
6408                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6409                         tp->t_fb_ptr = NULL;
6410                         return (ENOMEM);
6411                 }
6412                 rsm->r_flags = RACK_OVERMAX;
6413                 rsm->r_tim_lastsent[0] = tcp_ts_getticks();
6414                 rsm->r_rtr_cnt = 1;
6415                 rsm->r_rtr_bytes = 0;
6416                 rsm->r_start = tp->snd_una;
6417                 rsm->r_end = tp->snd_max;
6418                 rsm->r_sndcnt = 0;
6419                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
6420                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6421                 rsm->r_in_tmap = 1;
6422         }
6423         rack_stop_all_timers(tp);
6424         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6425         return (0);
6426 }
6427
6428 static int
6429 rack_handoff_ok(struct tcpcb *tp)
6430 {
6431         if ((tp->t_state == TCPS_CLOSED) ||
6432             (tp->t_state == TCPS_LISTEN)) {
6433                 /* Sure no problem though it may not stick */
6434                 return (0);
6435         }
6436         if ((tp->t_state == TCPS_SYN_SENT) ||
6437             (tp->t_state == TCPS_SYN_RECEIVED)) {
6438                 /*
6439                  * We really don't know you have to get to ESTAB or beyond
6440                  * to tell.
6441                  */
6442                 return (EAGAIN);
6443         }
6444         if (tp->t_flags & TF_SACK_PERMIT) {
6445                 return (0);
6446         }
6447         /*
6448          * If we reach here we don't do SACK on this connection so we can
6449          * never do rack.
6450          */
6451         return (EINVAL);
6452 }
6453
6454 static void
6455 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
6456 {
6457         if (tp->t_fb_ptr) {
6458                 struct tcp_rack *rack;
6459                 struct rack_sendmap *rsm;
6460
6461                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6462 #ifdef TCP_BLACKBOX
6463                 tcp_log_flowend(tp);
6464 #endif
6465                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6466                 while (rsm) {
6467                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
6468                         uma_zfree(rack_zone, rsm);
6469                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6470                 }
6471                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6472                 while (rsm) {
6473                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
6474                         uma_zfree(rack_zone, rsm);
6475                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6476                 }
6477                 rack->rc_free_cnt = 0;
6478                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6479                 tp->t_fb_ptr = NULL;
6480         }
6481         /* Make sure snd_nxt is correctly set */
6482         tp->snd_nxt = tp->snd_max;
6483 }
6484
6485 static void
6486 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
6487 {
6488         switch (tp->t_state) {
6489         case TCPS_SYN_SENT:
6490                 rack->r_state = TCPS_SYN_SENT;
6491                 rack->r_substate = rack_do_syn_sent;
6492                 break;
6493         case TCPS_SYN_RECEIVED:
6494                 rack->r_state = TCPS_SYN_RECEIVED;
6495                 rack->r_substate = rack_do_syn_recv;
6496                 break;
6497         case TCPS_ESTABLISHED:
6498                 rack->r_state = TCPS_ESTABLISHED;
6499                 rack->r_substate = rack_do_established;
6500                 break;
6501         case TCPS_CLOSE_WAIT:
6502                 rack->r_state = TCPS_CLOSE_WAIT;
6503                 rack->r_substate = rack_do_close_wait;
6504                 break;
6505         case TCPS_FIN_WAIT_1:
6506                 rack->r_state = TCPS_FIN_WAIT_1;
6507                 rack->r_substate = rack_do_fin_wait_1;
6508                 break;
6509         case TCPS_CLOSING:
6510                 rack->r_state = TCPS_CLOSING;
6511                 rack->r_substate = rack_do_closing;
6512                 break;
6513         case TCPS_LAST_ACK:
6514                 rack->r_state = TCPS_LAST_ACK;
6515                 rack->r_substate = rack_do_lastack;
6516                 break;
6517         case TCPS_FIN_WAIT_2:
6518                 rack->r_state = TCPS_FIN_WAIT_2;
6519                 rack->r_substate = rack_do_fin_wait_2;
6520                 break;
6521         case TCPS_LISTEN:
6522         case TCPS_CLOSED:
6523         case TCPS_TIME_WAIT:
6524         default:
6525                 break;
6526         };
6527 }
6528
6529
6530 static void
6531 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
6532 {
6533         /*
6534          * We received an ack, and then did not
6535          * call send or were bounced out due to the
6536          * hpts was running. Now a timer is up as well, is
6537          * it the right timer?
6538          */
6539         struct rack_sendmap *rsm;
6540         int tmr_up;
6541
6542         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
6543         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
6544                 return;
6545         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6546         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
6547             (tmr_up == PACE_TMR_RXT)) {
6548                 /* Should be an RXT */
6549                 return;
6550         }
6551         if (rsm == NULL) {
6552                 /* Nothing outstanding? */
6553                 if (tp->t_flags & TF_DELACK) {
6554                         if (tmr_up == PACE_TMR_DELACK)
6555                                 /* We are supposed to have delayed ack up and we do */
6556                                 return;
6557                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
6558                         /*
6559                          * if we hit enobufs then we would expect the possiblity
6560                          * of nothing outstanding and the RXT up (and the hptsi timer).
6561                          */
6562                         return;
6563                 } else if (((tcp_always_keepalive ||
6564                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6565                             (tp->t_state <= TCPS_CLOSING)) &&
6566                            (tmr_up == PACE_TMR_KEEP) &&
6567                            (tp->snd_max == tp->snd_una)) {
6568                         /* We should have keep alive up and we do */
6569                         return;
6570                 }
6571         }
6572         if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
6573                 if ((tp->t_flags & TF_SENTFIN) &&
6574                     ((tp->snd_max - tp->snd_una) == 1) &&
6575                     (rsm->r_flags & RACK_HAS_FIN)) {
6576                         /* needs to be a RXT */
6577                         if (tmr_up == PACE_TMR_RXT)
6578                                 return;
6579                 } else if (tmr_up == PACE_TMR_RACK)
6580                         return;
6581         } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
6582                    ((tmr_up == PACE_TMR_TLP) ||
6583                     (tmr_up == PACE_TMR_RXT))) {
6584                 /*
6585                  * Either a TLP or RXT is fine if no sack-passed
6586                  * is in place and data is outstanding.
6587                  */
6588                 return;
6589         } else if (tmr_up == PACE_TMR_DELACK) {
6590                 /*
6591                  * If the delayed ack was going to go off
6592                  * before the rtx/tlp/rack timer were going to
6593                  * expire, then that would be the timer in control.
6594                  * Note we don't check the time here trusting the
6595                  * code is correct.
6596                  */
6597                 return;
6598         }
6599         /*
6600          * Ok the timer originally started is not what we want now.
6601          * We will force the hpts to be stopped if any, and restart
6602          * with the slot set to what was in the saved slot.
6603          */
6604         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6605         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6606 }
6607
6608 static void
6609 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6610     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
6611     int32_t nxt_pkt, struct timeval *tv)
6612 {
6613         int32_t thflags, retval, did_out = 0;
6614         int32_t way_out = 0;
6615         uint32_t cts;
6616         uint32_t tiwin;
6617         struct tcpopt to;
6618         struct tcp_rack *rack;
6619         struct rack_sendmap *rsm;
6620         int32_t prev_state = 0;
6621
6622         cts = tcp_tv_to_mssectick(tv);
6623         rack = (struct tcp_rack *)tp->t_fb_ptr;
6624
6625         kern_prefetch(rack, &prev_state);
6626         prev_state = 0;
6627         thflags = th->th_flags;
6628         /*
6629          * If this is either a state-changing packet or current state isn't
6630          * established, we require a read lock on tcbinfo.  Otherwise, we
6631          * allow the tcbinfo to be in either locked or unlocked, as the
6632          * caller may have unnecessarily acquired a lock due to a race.
6633          */
6634         INP_WLOCK_ASSERT(tp->t_inpcb);
6635         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
6636             __func__));
6637         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
6638             __func__));
6639         {
6640                 union tcp_log_stackspecific log;
6641
6642                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6643                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
6644                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
6645                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
6646                 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
6647                     tlen, &log, true);
6648         }
6649         /*
6650          * Segment received on connection. Reset idle time and keep-alive
6651          * timer. XXX: This should be done after segment validation to
6652          * ignore broken/spoofed segs.
6653          */
6654         if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
6655                 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
6656                         counter_u64_add(rack_input_idle_reduces, 1);
6657                         rack_cc_after_idle(tp,
6658                             (rack->r_idle_reduce_largest ? 1 :0));
6659                 }
6660         }
6661         rack->r_ctl.rc_rcvtime = cts;
6662         tp->t_rcvtime = ticks;
6663
6664         /*
6665          * Unscale the window into a 32-bit value. For the SYN_SENT state
6666          * the scale is zero.
6667          */
6668         tiwin = th->th_win << tp->snd_scale;
6669 #ifdef NETFLIX_STATS
6670         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
6671 #endif
6672         /*
6673          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
6674          * this to occur after we've validated the segment.
6675          */
6676         if (tp->t_flags & TF_ECN_PERMIT) {
6677                 if (thflags & TH_CWR)
6678                         tp->t_flags &= ~TF_ECN_SND_ECE;
6679                 switch (iptos & IPTOS_ECN_MASK) {
6680                 case IPTOS_ECN_CE:
6681                         tp->t_flags |= TF_ECN_SND_ECE;
6682                         TCPSTAT_INC(tcps_ecn_ce);
6683                         break;
6684                 case IPTOS_ECN_ECT0:
6685                         TCPSTAT_INC(tcps_ecn_ect0);
6686                         break;
6687                 case IPTOS_ECN_ECT1:
6688                         TCPSTAT_INC(tcps_ecn_ect1);
6689                         break;
6690                 }
6691                 /* Congestion experienced. */
6692                 if (thflags & TH_ECE) {
6693                         rack_cong_signal(tp, th, CC_ECN);
6694                 }
6695         }
6696         /*
6697          * Parse options on any incoming segment.
6698          */
6699         tcp_dooptions(&to, (u_char *)(th + 1),
6700             (th->th_off << 2) - sizeof(struct tcphdr),
6701             (thflags & TH_SYN) ? TO_SYN : 0);
6702
6703         /*
6704          * If echoed timestamp is later than the current time, fall back to
6705          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
6706          * were used when this connection was established.
6707          */
6708         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
6709                 to.to_tsecr -= tp->ts_offset;
6710                 if (TSTMP_GT(to.to_tsecr, cts))
6711                         to.to_tsecr = 0;
6712         }
6713         /*
6714          * If its the first time in we need to take care of options and
6715          * verify we can do SACK for rack!
6716          */
6717         if (rack->r_state == 0) {
6718                 /* Should be init'd by rack_init() */
6719                 KASSERT(rack->rc_inp != NULL,
6720                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
6721                 if (rack->rc_inp == NULL) {
6722                         rack->rc_inp = tp->t_inpcb;
6723                 }
6724
6725                 /*
6726                  * Process options only when we get SYN/ACK back. The SYN
6727                  * case for incoming connections is handled in tcp_syncache.
6728                  * According to RFC1323 the window field in a SYN (i.e., a
6729                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
6730                  * this is traditional behavior, may need to be cleaned up.
6731                  */
6732                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
6733                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
6734                         if ((to.to_flags & TOF_SCALE) &&
6735                             (tp->t_flags & TF_REQ_SCALE)) {
6736                                 tp->t_flags |= TF_RCVD_SCALE;
6737                                 tp->snd_scale = to.to_wscale;
6738                         }
6739                         /*
6740                          * Initial send window.  It will be updated with the
6741                          * next incoming segment to the scaled value.
6742                          */
6743                         tp->snd_wnd = th->th_win;
6744                         if (to.to_flags & TOF_TS) {
6745                                 tp->t_flags |= TF_RCVD_TSTMP;
6746                                 tp->ts_recent = to.to_tsval;
6747                                 tp->ts_recent_age = cts;
6748                         }
6749                         if (to.to_flags & TOF_MSS)
6750                                 tcp_mss(tp, to.to_mss);
6751                         if ((tp->t_flags & TF_SACK_PERMIT) &&
6752                             (to.to_flags & TOF_SACKPERM) == 0)
6753                                 tp->t_flags &= ~TF_SACK_PERMIT;
6754                 }
6755                 /*
6756                  * At this point we are at the initial call. Here we decide
6757                  * if we are doing RACK or not. We do this by seeing if
6758                  * TF_SACK_PERMIT is set, if not rack is *not* possible and
6759                  * we switch to the default code.
6760                  */
6761                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
6762                         tcp_switch_back_to_default(tp);
6763                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
6764                             tlen, iptos);
6765                         return;
6766                 }
6767                 /* Set the flag */
6768                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
6769                 tcp_set_hpts(tp->t_inpcb);
6770                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
6771         }
6772         /*
6773          * This is the one exception case where we set the rack state
6774          * always. All other times (timers etc) we must have a rack-state
6775          * set (so we assure we have done the checks above for SACK).
6776          */
6777         if (rack->r_state != tp->t_state)
6778                 rack_set_state(tp, rack);
6779         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
6780                 kern_prefetch(rsm, &prev_state);
6781         prev_state = rack->r_state;
6782         rack->r_ctl.rc_tlp_send_cnt = 0;
6783         rack_clear_rate_sample(rack);
6784         retval = (*rack->r_substate) (m, th, so,
6785             tp, &to, drop_hdrlen,
6786             tlen, tiwin, thflags, nxt_pkt);
6787 #ifdef INVARIANTS
6788         if ((retval == 0) &&
6789             (tp->t_inpcb == NULL)) {
6790                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
6791                     retval, tp, prev_state);
6792         }
6793 #endif
6794         if (retval == 0) {
6795                 /*
6796                  * If retval is 1 the tcb is unlocked and most likely the tp
6797                  * is gone.
6798                  */
6799                 INP_WLOCK_ASSERT(tp->t_inpcb);
6800                 tcp_rack_xmit_timer_commit(rack, tp);
6801                 if (nxt_pkt == 0) {
6802                         if (rack->r_wanted_output != 0) {
6803                                 did_out = 1;
6804                                 (void)tp->t_fb->tfb_tcp_output(tp);
6805                         }
6806                         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
6807                 }
6808                 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
6809                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
6810                      (tp->t_flags & TF_DELACK) ||
6811                      ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6812                       (tp->t_state <= TCPS_CLOSING)))) {
6813                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
6814                         if ((tp->snd_max == tp->snd_una) &&
6815                             ((tp->t_flags & TF_DELACK) == 0) &&
6816                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
6817                                 /* keep alive not needed if we are hptsi output yet */
6818                                 ;
6819                         } else {
6820                                 if (rack->rc_inp->inp_in_hpts)
6821                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6822                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6823                         }
6824                         way_out = 1;
6825                 } else {
6826                         /* Do we have the correct timer running? */
6827                         rack_timer_audit(tp, rack, &so->so_snd);
6828                         way_out = 2;
6829                 }
6830                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
6831                 if (did_out)
6832                         rack->r_wanted_output = 0;
6833 #ifdef INVARIANTS
6834                 if (tp->t_inpcb == NULL) {
6835                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
6836                               did_out,
6837                               retval, tp, prev_state);
6838                 }
6839 #endif
6840                 INP_WUNLOCK(tp->t_inpcb);
6841         }
6842 }
6843
6844 void
6845 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6846     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
6847 {
6848         struct timeval tv;
6849 #ifdef RSS
6850         struct tcp_function_block *tfb;
6851         struct tcp_rack *rack;
6852         struct inpcb *inp;
6853
6854         rack = (struct tcp_rack *)tp->t_fb_ptr;
6855         if (rack->r_state == 0) {
6856                 /*
6857                  * Initial input (ACK to SYN-ACK etc)lets go ahead and get
6858                  * it processed
6859                  */
6860                 tcp_get_usecs(&tv);
6861                 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6862                     tlen, iptos, 0, &tv);
6863                 return;
6864         }
6865         tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
6866         INP_WUNLOCK(tp->t_inpcb);
6867 #else
6868         tcp_get_usecs(&tv);
6869         rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6870             tlen, iptos, 0, &tv);
6871 #endif
6872 }
6873
6874 struct rack_sendmap *
6875 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
6876 {
6877         struct rack_sendmap *rsm = NULL;
6878         int32_t idx;
6879         uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
6880
6881         /* Return the next guy to be re-transmitted */
6882         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
6883                 return (NULL);
6884         }
6885         if (tp->t_flags & TF_SENTFIN) {
6886                 /* retran the end FIN? */
6887                 return (NULL);
6888         }
6889         /* ok lets look at this one */
6890         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6891         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
6892                 goto check_it;
6893         }
6894         rsm = rack_find_lowest_rsm(rack);
6895         if (rsm == NULL) {
6896                 return (NULL);
6897         }
6898 check_it:
6899         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
6900         srtt = TICKS_2_MSEC(srtt_cur);
6901         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
6902                 srtt = rack->rc_rack_rtt;
6903         if (rsm->r_flags & RACK_ACKED) {
6904                 return (NULL);
6905         }
6906         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
6907                 /* Its not yet ready */
6908                 return (NULL);
6909         }
6910         idx = rsm->r_rtr_cnt - 1;
6911         ts_low = rsm->r_tim_lastsent[idx];
6912         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
6913         if (tsused <= ts_low) {
6914                 return (NULL);
6915         }
6916         if ((tsused - ts_low) >= thresh) {
6917                 return (rsm);
6918         }
6919         return (NULL);
6920 }
6921
6922 static int
6923 rack_output(struct tcpcb *tp)
6924 {
6925         struct socket *so;
6926         uint32_t recwin, sendwin;
6927         uint32_t sb_offset;
6928         int32_t len, flags, error = 0;
6929         struct mbuf *m;
6930         struct mbuf *mb;
6931         uint32_t if_hw_tsomaxsegcount = 0;
6932         uint32_t if_hw_tsomaxsegsize;
6933         long tot_len_this_send = 0;
6934         struct ip *ip = NULL;
6935 #ifdef TCPDEBUG
6936         struct ipovly *ipov = NULL;
6937 #endif
6938 #ifdef NETFLIX_TCP_O_UDP
6939         struct udphdr *udp = NULL;
6940 #endif
6941         struct tcp_rack *rack;
6942         struct tcphdr *th;
6943         uint8_t pass = 0;
6944         u_char opt[TCP_MAXOLEN];
6945         unsigned ipoptlen, optlen, hdrlen;
6946 #ifdef NETFLIX_TCP_O_UDP
6947         unsigned ulen;
6948 #endif
6949         uint32_t rack_seq;
6950
6951 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
6952         unsigned ipsec_optlen = 0;
6953
6954 #endif
6955         int32_t idle, sendalot;
6956         int32_t sub_from_prr = 0;
6957         volatile int32_t sack_rxmit;
6958         struct rack_sendmap *rsm = NULL;
6959         int32_t tso, mtu, would_have_fin = 0;
6960         struct tcpopt to;
6961         int32_t slot = 0;
6962         uint32_t cts;
6963         uint8_t hpts_calling, doing_tlp = 0;
6964         int32_t do_a_prefetch;
6965         int32_t prefetch_rsm = 0;
6966         int32_t prefetch_so_done = 0;
6967         struct tcp_log_buffer *lgb = NULL;
6968         struct inpcb *inp;
6969         struct sockbuf *sb;
6970 #ifdef INET6
6971         struct ip6_hdr *ip6 = NULL;
6972         int32_t isipv6;
6973 #endif
6974         /* setup and take the cache hits here */
6975         rack = (struct tcp_rack *)tp->t_fb_ptr;
6976         inp = rack->rc_inp;
6977         so = inp->inp_socket;
6978         sb = &so->so_snd;
6979         kern_prefetch(sb, &do_a_prefetch);
6980         do_a_prefetch = 1;
6981
6982         INP_WLOCK_ASSERT(inp);
6983 #ifdef TCP_OFFLOAD
6984         if (tp->t_flags & TF_TOE)
6985                 return (tcp_offload_output(tp));
6986 #endif
6987
6988 #ifdef TCP_RFC7413
6989         /*
6990          * For TFO connections in SYN_RECEIVED, only allow the initial
6991          * SYN|ACK and those sent by the retransmit timer.
6992          */
6993         if ((tp->t_flags & TF_FASTOPEN) &&
6994             (tp->t_state == TCPS_SYN_RECEIVED) &&
6995             SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */
6996             (tp->snd_nxt != tp->snd_una))       /* not a retransmit */
6997                 return (0);
6998 #endif
6999 #ifdef INET6
7000         if (rack->r_state) {
7001                 /* Use the cache line loaded if possible */
7002                 isipv6 = rack->r_is_v6;
7003         } else {
7004                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
7005         }
7006 #endif
7007         cts = tcp_ts_getticks();
7008         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
7009             inp->inp_in_hpts) {
7010                 /*
7011                  * We are on the hpts for some timer but not hptsi output.
7012                  * Remove from the hpts unconditionally.
7013                  */
7014                 rack_timer_cancel(tp, rack, cts, __LINE__);
7015         }
7016         /* Mark that we have called rack_output(). */
7017         if ((rack->r_timer_override) ||
7018             (tp->t_flags & TF_FORCEDATA) ||
7019             (tp->t_state < TCPS_ESTABLISHED)) {
7020                 if (tp->t_inpcb->inp_in_hpts)
7021                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
7022         } else if (tp->t_inpcb->inp_in_hpts) {
7023                 /*
7024                  * On the hpts you can't pass even if ACKNOW is on, we will
7025                  * when the hpts fires.
7026                  */
7027                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
7028                 return (0);
7029         }
7030         hpts_calling = inp->inp_hpts_calls;
7031         inp->inp_hpts_calls = 0;
7032         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7033                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
7034                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
7035                         return (0);
7036                 }
7037         }
7038         rack->r_wanted_output = 0;
7039         rack->r_timer_override = 0;
7040         /*
7041          * Determine length of data that should be transmitted, and flags
7042          * that will be used. If there is some data or critical controls
7043          * (SYN, RST) to send, then transmit; otherwise, investigate
7044          * further.
7045          */
7046         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
7047         if (tp->t_idle_reduce) {
7048                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
7049                         rack_cc_after_idle(tp,
7050                             (rack->r_idle_reduce_largest ? 1 :0));
7051         }
7052         tp->t_flags &= ~TF_LASTIDLE;
7053         if (idle) {
7054                 if (tp->t_flags & TF_MORETOCOME) {
7055                         tp->t_flags |= TF_LASTIDLE;
7056                         idle = 0;
7057                 }
7058         }
7059 again:
7060         /*
7061          * If we've recently taken a timeout, snd_max will be greater than
7062          * snd_nxt.  There may be SACK information that allows us to avoid
7063          * resending already delivered data.  Adjust snd_nxt accordingly.
7064          */
7065         sendalot = 0;
7066         cts = tcp_ts_getticks();
7067         tso = 0;
7068         mtu = 0;
7069         sb_offset = tp->snd_max - tp->snd_una;
7070         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
7071
7072         flags = tcp_outflags[tp->t_state];
7073         /*
7074          * Send any SACK-generated retransmissions.  If we're explicitly
7075          * trying to send out new data (when sendalot is 1), bypass this
7076          * function. If we retransmit in fast recovery mode, decrement
7077          * snd_cwnd, since we're replacing a (future) new transmission with
7078          * a retransmission now, and we previously incremented snd_cwnd in
7079          * tcp_input().
7080          */
7081         /*
7082          * Still in sack recovery , reset rxmit flag to zero.
7083          */
7084         while (rack->rc_free_cnt < rack_free_cache) {
7085                 rsm = rack_alloc(rack);
7086                 if (rsm == NULL) {
7087                         if (inp->inp_hpts_calls)
7088                                 /* Retry in a ms */
7089                                 slot = 1;
7090                         goto just_return_nolock;
7091                 }
7092                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
7093                 rack->rc_free_cnt++;
7094                 rsm = NULL;
7095         }
7096         if (inp->inp_hpts_calls)
7097                 inp->inp_hpts_calls = 0;
7098         sack_rxmit = 0;
7099         len = 0;
7100         rsm = NULL;
7101         if (flags & TH_RST) {
7102                 SOCKBUF_LOCK(sb);
7103                 goto send;
7104         }
7105         if (rack->r_ctl.rc_tlpsend) {
7106                 /* Tail loss probe */
7107                 long cwin;
7108                 long tlen;
7109
7110                 doing_tlp = 1;
7111                 rsm = rack->r_ctl.rc_tlpsend;
7112                 rack->r_ctl.rc_tlpsend = NULL;
7113                 sack_rxmit = 1;
7114                 tlen = rsm->r_end - rsm->r_start;
7115                 if (tlen > tp->t_maxseg)
7116                         tlen = tp->t_maxseg;
7117 #ifdef INVARIANTS
7118                 if (SEQ_GT(tp->snd_una, rsm->r_start)) {
7119                         panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
7120                             tp, rack, tp->snd_una, rsm, rsm->r_start);
7121                 }
7122 #endif
7123                 sb_offset = rsm->r_start - tp->snd_una;
7124                 cwin = min(tp->snd_wnd, tlen);
7125                 len = cwin;
7126         } else if (rack->r_ctl.rc_resend) {
7127                 /* Retransmit timer */
7128                 rsm = rack->r_ctl.rc_resend;
7129                 rack->r_ctl.rc_resend = NULL;
7130                 len = rsm->r_end - rsm->r_start;
7131                 sack_rxmit = 1;
7132                 sendalot = 0;
7133                 sb_offset = rsm->r_start - tp->snd_una;
7134                 if (len >= tp->t_maxseg) {
7135                         len = tp->t_maxseg;
7136                 }
7137                 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
7138                     __func__, sb_offset));
7139         } else if ((rack->rc_in_persist == 0) &&
7140             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
7141                 long tlen;
7142
7143                 if ((!IN_RECOVERY(tp->t_flags)) &&
7144                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
7145                         /* Enter recovery if not induced by a time-out */
7146                         rack->r_ctl.rc_rsm_start = rsm->r_start;
7147                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
7148                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
7149                         rack_cong_signal(tp, NULL, CC_NDUPACK);
7150                         /*
7151                          * When we enter recovery we need to assure we send
7152                          * one packet.
7153                          */
7154                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
7155                 }
7156 #ifdef INVARIANTS
7157                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
7158                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
7159                             tp, rack, rsm, rsm->r_start, tp->snd_una);
7160                 }
7161 #endif
7162                 tlen = rsm->r_end - rsm->r_start;
7163                 sb_offset = rsm->r_start - tp->snd_una;
7164                 if (tlen > rack->r_ctl.rc_prr_sndcnt) {
7165                         len = rack->r_ctl.rc_prr_sndcnt;
7166                 } else {
7167                         len = tlen;
7168                 }
7169                 if (len >= tp->t_maxseg) {
7170                         sendalot = 1;
7171                         len = tp->t_maxseg;
7172                 } else {
7173                         sendalot = 0;
7174                         if ((rack->rc_timer_up == 0) &&
7175                             (len < tlen)) {
7176                                 /*
7177                                  * If its not a timer don't send a partial
7178                                  * segment.
7179                                  */
7180                                 len = 0;
7181                                 goto just_return_nolock;
7182                         }
7183                 }
7184                 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
7185                     __func__, sb_offset));
7186                 if (len > 0) {
7187                         sub_from_prr = 1;
7188                         sack_rxmit = 1;
7189                         TCPSTAT_INC(tcps_sack_rexmits);
7190                         TCPSTAT_ADD(tcps_sack_rexmit_bytes,
7191                             min(len, tp->t_maxseg));
7192                         counter_u64_add(rack_rtm_prr_retran, 1);
7193                 }
7194         }
7195         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
7196                 /* we are retransmitting the fin */
7197                 len--;
7198                 if (len) {
7199                         /*
7200                          * When retransmitting data do *not* include the
7201                          * FIN. This could happen from a TLP probe.
7202                          */
7203                         flags &= ~TH_FIN;
7204                 }
7205         }
7206 #ifdef INVARIANTS
7207         /* For debugging */
7208         rack->r_ctl.rc_rsm_at_retran = rsm;
7209 #endif
7210         /*
7211          * Enforce a connection sendmap count limit if set
7212          * as long as we are not retransmiting.
7213          */
7214         if ((rsm == NULL) &&
7215             (rack_map_entries_limit > 0) &&
7216             (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
7217                 counter_u64_add(rack_to_alloc_limited, 1);
7218                 if (!rack->alloc_limit_reported) {
7219                         rack->alloc_limit_reported = 1;
7220                         counter_u64_add(rack_alloc_limited_conns, 1);
7221                 }
7222                 goto just_return_nolock;
7223         }
7224         /*
7225          * Get standard flags, and add SYN or FIN if requested by 'hidden'
7226          * state flags.
7227          */
7228         if (tp->t_flags & TF_NEEDFIN)
7229                 flags |= TH_FIN;
7230         if (tp->t_flags & TF_NEEDSYN)
7231                 flags |= TH_SYN;
7232         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
7233                 void *end_rsm;
7234                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
7235                 if (end_rsm)
7236                         kern_prefetch(end_rsm, &prefetch_rsm);
7237                 prefetch_rsm = 1;
7238         }
7239         SOCKBUF_LOCK(sb);
7240         /*
7241          * If in persist timeout with window of 0, send 1 byte. Otherwise,
7242          * if window is small but nonzero and time TF_SENTFIN expired, we
7243          * will send what we can and go to transmit state.
7244          */
7245         if (tp->t_flags & TF_FORCEDATA) {
7246                 if (sendwin == 0) {
7247                         /*
7248                          * If we still have some data to send, then clear
7249                          * the FIN bit.  Usually this would happen below
7250                          * when it realizes that we aren't sending all the
7251                          * data.  However, if we have exactly 1 byte of
7252                          * unsent data, then it won't clear the FIN bit
7253                          * below, and if we are in persist state, we wind up
7254                          * sending the packet without recording that we sent
7255                          * the FIN bit.
7256                          *
7257                          * We can't just blindly clear the FIN bit, because
7258                          * if we don't have any more data to send then the
7259                          * probe will be the FIN itself.
7260                          */
7261                         if (sb_offset < sbused(sb))
7262                                 flags &= ~TH_FIN;
7263                         sendwin = 1;
7264                 } else {
7265                         if (rack->rc_in_persist)
7266                                 rack_exit_persist(tp, rack);
7267                         /*
7268                          * If we are dropping persist mode then we need to
7269                          * correct snd_nxt/snd_max and off.
7270                          */
7271                         tp->snd_nxt = tp->snd_max;
7272                         sb_offset = tp->snd_nxt - tp->snd_una;
7273                 }
7274         }
7275         /*
7276          * If snd_nxt == snd_max and we have transmitted a FIN, the
7277          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
7278          * negative length.  This can also occur when TCP opens up its
7279          * congestion window while receiving additional duplicate acks after
7280          * fast-retransmit because TCP will reset snd_nxt to snd_max after
7281          * the fast-retransmit.
7282          *
7283          * In the normal retransmit-FIN-only case, however, snd_nxt will be
7284          * set to snd_una, the sb_offset will be 0, and the length may wind
7285          * up 0.
7286          *
7287          * If sack_rxmit is true we are retransmitting from the scoreboard
7288          * in which case len is already set.
7289          */
7290         if (sack_rxmit == 0) {
7291                 uint32_t avail;
7292
7293                 avail = sbavail(sb);
7294                 if (SEQ_GT(tp->snd_nxt, tp->snd_una))
7295                         sb_offset = tp->snd_nxt - tp->snd_una;
7296                 else
7297                         sb_offset = 0;
7298                 if (IN_RECOVERY(tp->t_flags) == 0) {
7299                         if (rack->r_ctl.rc_tlp_new_data) {
7300                                 /* TLP is forcing out new data */
7301                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
7302                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
7303                                 }
7304                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
7305                                         len = tp->snd_wnd;
7306                                 else
7307                                         len = rack->r_ctl.rc_tlp_new_data;
7308                                 rack->r_ctl.rc_tlp_new_data = 0;
7309                                 doing_tlp = 1;
7310                         } else {
7311                                 if (sendwin > avail) {
7312                                         /* use the available */
7313                                         if (avail > sb_offset) {
7314                                                 len = (int32_t)(avail - sb_offset);
7315                                         } else {
7316                                                 len = 0;
7317                                         }
7318                                 } else {
7319                                         if (sendwin > sb_offset) {
7320                                                 len = (int32_t)(sendwin - sb_offset);
7321                                         } else {
7322                                                 len = 0;
7323                                         }
7324                                 }
7325                         }
7326                 } else {
7327                         uint32_t outstanding;
7328
7329                         /*
7330                          * We are inside of a SACK recovery episode and are
7331                          * sending new data, having retransmitted all the
7332                          * data possible so far in the scoreboard.
7333                          */
7334                         outstanding = tp->snd_max - tp->snd_una;
7335                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
7336                                 if (tp->snd_wnd > outstanding) {
7337                                         len = tp->snd_wnd - outstanding;
7338                                         /* Check to see if we have the data */
7339                                         if (((sb_offset + len) > avail) &&
7340                                             (avail > sb_offset))
7341                                                 len = avail - sb_offset;
7342                                         else
7343                                                 len = 0;
7344                                 } else
7345                                         len = 0;
7346                         } else if (avail > sb_offset)
7347                                 len = avail - sb_offset;
7348                         else
7349                                 len = 0;
7350                         if (len > 0) {
7351                                 if (len > rack->r_ctl.rc_prr_sndcnt)
7352                                         len = rack->r_ctl.rc_prr_sndcnt;
7353
7354                                 if (len > 0) {
7355                                         sub_from_prr = 1;
7356                                         counter_u64_add(rack_rtm_prr_newdata, 1);
7357                                 }
7358                         }
7359                         if (len > tp->t_maxseg) {
7360                                 /*
7361                                  * We should never send more than a MSS when
7362                                  * retransmitting or sending new data in prr
7363                                  * mode unless the override flag is on. Most
7364                                  * likely the PRR algorithm is not going to
7365                                  * let us send a lot as well :-)
7366                                  */
7367                                 if (rack->r_ctl.rc_prr_sendalot == 0)
7368                                         len = tp->t_maxseg;
7369                         } else if (len < tp->t_maxseg) {
7370                                 /*
7371                                  * Do we send any? The idea here is if the
7372                                  * send empty's the socket buffer we want to
7373                                  * do it. However if not then lets just wait
7374                                  * for our prr_sndcnt to get bigger.
7375                                  */
7376                                 long leftinsb;
7377
7378                                 leftinsb = sbavail(sb) - sb_offset;
7379                                 if (leftinsb > len) {
7380                                         /* This send does not empty the sb */
7381                                         len = 0;
7382                                 }
7383                         }
7384                 }
7385         }
7386         if (prefetch_so_done == 0) {
7387                 kern_prefetch(so, &prefetch_so_done);
7388                 prefetch_so_done = 1;
7389         }
7390         /*
7391          * Lop off SYN bit if it has already been sent.  However, if this is
7392          * SYN-SENT state and if segment contains data and if we don't know
7393          * that foreign host supports TAO, suppress sending segment.
7394          */
7395         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
7396                 if ((tp->t_state != TCPS_SYN_RECEIVED) &&
7397                     (tp->t_state != TCPS_SYN_SENT))
7398                         flags &= ~TH_SYN;
7399 #ifdef TCP_RFC7413
7400                 /*
7401                  * When sending additional segments following a TFO SYN|ACK,
7402                  * do not include the SYN bit.
7403                  */
7404                 if ((tp->t_flags & TF_FASTOPEN) &&
7405                     (tp->t_state == TCPS_SYN_RECEIVED))
7406                         flags &= ~TH_SYN;
7407 #endif
7408                 sb_offset--, len++;
7409                 if (sbavail(sb) == 0)
7410                         len = 0;
7411         }
7412         /*
7413          * Be careful not to send data and/or FIN on SYN segments. This
7414          * measure is needed to prevent interoperability problems with not
7415          * fully conformant TCP implementations.
7416          */
7417         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
7418                 len = 0;
7419                 flags &= ~TH_FIN;
7420         }
7421 #ifdef TCP_RFC7413
7422         /*
7423          * When retransmitting SYN|ACK on a passively-created TFO socket,
7424          * don't include data, as the presence of data may have caused the
7425          * original SYN|ACK to have been dropped by a middlebox.
7426          */
7427         if ((tp->t_flags & TF_FASTOPEN) &&
7428             ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
7429                 len = 0;
7430 #endif
7431         if (len <= 0) {
7432                 /*
7433                  * If FIN has been sent but not acked, but we haven't been
7434                  * called to retransmit, len will be < 0.  Otherwise, window
7435                  * shrank after we sent into it.  If window shrank to 0,
7436                  * cancel pending retransmit, pull snd_nxt back to (closed)
7437                  * window, and set the persist timer if it isn't already
7438                  * going.  If the window didn't close completely, just wait
7439                  * for an ACK.
7440                  *
7441                  * We also do a general check here to ensure that we will
7442                  * set the persist timer when we have data to send, but a
7443                  * 0-byte window. This makes sure the persist timer is set
7444                  * even if the packet hits one of the "goto send" lines
7445                  * below.
7446                  */
7447                 len = 0;
7448                 if ((tp->snd_wnd == 0) &&
7449                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
7450                     (sb_offset < (int)sbavail(sb))) {
7451                         tp->snd_nxt = tp->snd_una;
7452                         rack_enter_persist(tp, rack, cts);
7453                 }
7454         }
7455         /* len will be >= 0 after this point. */
7456         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7457         tcp_sndbuf_autoscale(tp, so, sendwin);
7458         /*
7459          * Decide if we can use TCP Segmentation Offloading (if supported by
7460          * hardware).
7461          *
7462          * TSO may only be used if we are in a pure bulk sending state.  The
7463          * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
7464          * options prevent using TSO.  With TSO the TCP header is the same
7465          * (except for the sequence number) for all generated packets.  This
7466          * makes it impossible to transmit any options which vary per
7467          * generated segment or packet.
7468          *
7469          * IPv4 handling has a clear separation of ip options and ip header
7470          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
7471          * the right thing below to provide length of just ip options and thus
7472          * checking for ipoptlen is enough to decide if ip options are present.
7473          */
7474
7475 #ifdef INET6
7476         if (isipv6)
7477                 ipoptlen = ip6_optlen(tp->t_inpcb);
7478         else
7479 #endif
7480                 if (tp->t_inpcb->inp_options)
7481                         ipoptlen = tp->t_inpcb->inp_options->m_len -
7482                             offsetof(struct ipoption, ipopt_list);
7483                 else
7484                         ipoptlen = 0;
7485 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7486         /*
7487          * Pre-calculate here as we save another lookup into the darknesses
7488          * of IPsec that way and can actually decide if TSO is ok.
7489          */
7490 #ifdef INET6
7491         if (isipv6 && IPSEC_ENABLED(ipv6))
7492                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
7493 #ifdef INET
7494         else
7495 #endif
7496 #endif                          /* INET6 */
7497 #ifdef INET
7498         if (IPSEC_ENABLED(ipv4))
7499                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
7500 #endif                          /* INET */
7501 #endif
7502
7503 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7504         ipoptlen += ipsec_optlen;
7505 #endif
7506         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
7507 #ifdef NETFLIX_TCP_O_UDP
7508             (tp->t_port == 0) &&
7509 #endif
7510             ((tp->t_flags & TF_SIGNATURE) == 0) &&
7511             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
7512             ipoptlen == 0)
7513                 tso = 1;
7514         {
7515                 uint32_t outstanding;
7516
7517                 outstanding = tp->snd_max - tp->snd_una;
7518                 if (tp->t_flags & TF_SENTFIN) {
7519                         /*
7520                          * If we sent a fin, snd_max is 1 higher than
7521                          * snd_una
7522                          */
7523                         outstanding--;
7524                 }
7525                 if (outstanding > 0) {
7526                         /*
7527                          * This is sub-optimal. We only send a stand alone
7528                          * FIN on its own segment.
7529                          */
7530                         if (flags & TH_FIN) {
7531                                 flags &= ~TH_FIN;
7532                                 would_have_fin = 1;
7533                         }
7534                 } else if (sack_rxmit) {
7535                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
7536                                 flags &= ~TH_FIN;
7537                 } else {
7538                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
7539                             sbused(sb)))
7540                                 flags &= ~TH_FIN;
7541                 }
7542         }
7543         recwin = sbspace(&so->so_rcv);
7544
7545         /*
7546          * Sender silly window avoidance.   We transmit under the following
7547          * conditions when len is non-zero:
7548          *
7549          * - We have a full segment (or more with TSO) - This is the last
7550          * buffer in a write()/send() and we are either idle or running
7551          * NODELAY - we've timed out (e.g. persist timer) - we have more
7552          * then 1/2 the maximum send window's worth of data (receiver may be
7553          * limited the window size) - we need to retransmit
7554          */
7555         if (len) {
7556                 if (len >= tp->t_maxseg) {
7557                         pass = 1;
7558                         goto send;
7559                 }
7560                 /*
7561                  * NOTE! on localhost connections an 'ack' from the remote
7562                  * end may occur synchronously with the output and cause us
7563                  * to flush a buffer queued with moretocome.  XXX
7564                  *
7565                  */
7566                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
7567                     (idle || (tp->t_flags & TF_NODELAY)) &&
7568                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
7569                     (tp->t_flags & TF_NOPUSH) == 0) {
7570                         pass = 2;
7571                         goto send;
7572                 }
7573                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
7574                         pass = 3;
7575                         goto send;
7576                 }
7577                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
7578                         goto send;
7579                 }
7580                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
7581                         pass = 4;
7582                         goto send;
7583                 }
7584                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
7585                         pass = 5;
7586                         goto send;
7587                 }
7588                 if (sack_rxmit) {
7589                         pass = 6;
7590                         goto send;
7591                 }
7592         }
7593         /*
7594          * Sending of standalone window updates.
7595          *
7596          * Window updates are important when we close our window due to a
7597          * full socket buffer and are opening it again after the application
7598          * reads data from it.  Once the window has opened again and the
7599          * remote end starts to send again the ACK clock takes over and
7600          * provides the most current window information.
7601          *
7602          * We must avoid the silly window syndrome whereas every read from
7603          * the receive buffer, no matter how small, causes a window update
7604          * to be sent.  We also should avoid sending a flurry of window
7605          * updates when the socket buffer had queued a lot of data and the
7606          * application is doing small reads.
7607          *
7608          * Prevent a flurry of pointless window updates by only sending an
7609          * update when we can increase the advertized window by more than
7610          * 1/4th of the socket buffer capacity.  When the buffer is getting
7611          * full or is very small be more aggressive and send an update
7612          * whenever we can increase by two mss sized segments. In all other
7613          * situations the ACK's to new incoming data will carry further
7614          * window increases.
7615          *
7616          * Don't send an independent window update if a delayed ACK is
7617          * pending (it will get piggy-backed on it) or the remote side
7618          * already has done a half-close and won't send more data.  Skip
7619          * this if the connection is in T/TCP half-open state.
7620          */
7621         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
7622             !(tp->t_flags & TF_DELACK) &&
7623             !TCPS_HAVERCVDFIN(tp->t_state)) {
7624                 /*
7625                  * "adv" is the amount we could increase the window, taking
7626                  * into account that we are limited by TCP_MAXWIN <<
7627                  * tp->rcv_scale.
7628                  */
7629                 int32_t adv;
7630                 int oldwin;
7631
7632                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
7633                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
7634                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
7635                         adv -= oldwin;
7636                 } else
7637                         oldwin = 0;
7638
7639                 /*
7640                  * If the new window size ends up being the same as the old
7641                  * size when it is scaled, then don't force a window update.
7642                  */
7643                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
7644                         goto dontupdate;
7645
7646                 if (adv >= (int32_t)(2 * tp->t_maxseg) &&
7647                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
7648                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
7649                     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
7650                         pass = 7;
7651                         goto send;
7652                 }
7653                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
7654                         goto send;
7655         }
7656 dontupdate:
7657
7658         /*
7659          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
7660          * is also a catch-all for the retransmit timer timeout case.
7661          */
7662         if (tp->t_flags & TF_ACKNOW) {
7663                 pass = 8;
7664                 goto send;
7665         }
7666         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
7667                 pass = 9;
7668                 goto send;
7669         }
7670         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
7671                 pass = 10;
7672                 goto send;
7673         }
7674         /*
7675          * If our state indicates that FIN should be sent and we have not
7676          * yet done so, then we need to send.
7677          */
7678         if (flags & TH_FIN) {
7679                 if ((tp->t_flags & TF_SENTFIN) ||
7680                     (((tp->t_flags & TF_SENTFIN) == 0) &&
7681                      (tp->snd_nxt == tp->snd_una))) {
7682                         pass = 11;
7683                         goto send;
7684                 }
7685         }
7686         /*
7687          * No reason to send a segment, just return.
7688          */
7689 just_return:
7690         SOCKBUF_UNLOCK(sb);
7691 just_return_nolock:
7692         if (tot_len_this_send == 0)
7693                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
7694         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
7695         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
7696         tp->t_flags &= ~TF_FORCEDATA;
7697         return (0);
7698
7699 send:
7700         if (doing_tlp == 0) {
7701                 /*
7702                  * Data not a TLP, and its not the rxt firing. If it is the
7703                  * rxt firing, we want to leave the tlp_in_progress flag on
7704                  * so we don't send another TLP. It has to be a rack timer
7705                  * or normal send (response to acked data) to clear the tlp
7706                  * in progress flag.
7707                  */
7708                 rack->rc_tlp_in_progress = 0;
7709         }
7710         SOCKBUF_LOCK_ASSERT(sb);
7711         if (len > 0) {
7712                 if (len >= tp->t_maxseg)
7713                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
7714                 else
7715                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
7716         }
7717         /*
7718          * Before ESTABLISHED, force sending of initial options unless TCP
7719          * set not to do any options. NOTE: we assume that the IP/TCP header
7720          * plus TCP options always fit in a single mbuf, leaving room for a
7721          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
7722          * + optlen <= MCLBYTES
7723          */
7724         optlen = 0;
7725 #ifdef INET6
7726         if (isipv6)
7727                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
7728         else
7729 #endif
7730                 hdrlen = sizeof(struct tcpiphdr);
7731
7732         /*
7733          * Compute options for segment. We only have to care about SYN and
7734          * established connection segments.  Options for SYN-ACK segments
7735          * are handled in TCP syncache.
7736          */
7737         to.to_flags = 0;
7738         if ((tp->t_flags & TF_NOOPT) == 0) {
7739                 /* Maximum segment size. */
7740                 if (flags & TH_SYN) {
7741                         tp->snd_nxt = tp->iss;
7742                         to.to_mss = tcp_mssopt(&inp->inp_inc);
7743 #ifdef NETFLIX_TCP_O_UDP
7744                         if (tp->t_port)
7745                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
7746 #endif
7747                         to.to_flags |= TOF_MSS;
7748 #ifdef TCP_RFC7413
7749                         /*
7750                          * Only include the TFO option on the first
7751                          * transmission of the SYN|ACK on a
7752                          * passively-created TFO socket, as the presence of
7753                          * the TFO option may have caused the original
7754                          * SYN|ACK to have been dropped by a middlebox.
7755                          */
7756                         if ((tp->t_flags & TF_FASTOPEN) &&
7757                             (tp->t_state == TCPS_SYN_RECEIVED) &&
7758                             (tp->t_rxtshift == 0)) {
7759                                 to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
7760                                 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
7761                                 to.to_flags |= TOF_FASTOPEN;
7762                         }
7763 #endif
7764                 }
7765                 /* Window scaling. */
7766                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
7767                         to.to_wscale = tp->request_r_scale;
7768                         to.to_flags |= TOF_SCALE;
7769                 }
7770                 /* Timestamps. */
7771                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
7772                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
7773                         to.to_tsval = cts + tp->ts_offset;
7774                         to.to_tsecr = tp->ts_recent;
7775                         to.to_flags |= TOF_TS;
7776                 }
7777                 /* Set receive buffer autosizing timestamp. */
7778                 if (tp->rfbuf_ts == 0 &&
7779                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
7780                         tp->rfbuf_ts = tcp_ts_getticks();
7781                 /* Selective ACK's. */
7782                 if (flags & TH_SYN)
7783                         to.to_flags |= TOF_SACKPERM;
7784                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7785                     tp->rcv_numsacks > 0) {
7786                         to.to_flags |= TOF_SACK;
7787                         to.to_nsacks = tp->rcv_numsacks;
7788                         to.to_sacks = (u_char *)tp->sackblks;
7789                 }
7790 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
7791                 /* TCP-MD5 (RFC2385). */
7792                 if (tp->t_flags & TF_SIGNATURE)
7793                         to.to_flags |= TOF_SIGNATURE;
7794 #endif                          /* TCP_SIGNATURE */
7795
7796                 /* Processing the options. */
7797                 hdrlen += optlen = tcp_addoptions(&to, opt);
7798         }
7799 #ifdef NETFLIX_TCP_O_UDP
7800         if (tp->t_port) {
7801                 if (V_tcp_udp_tunneling_port == 0) {
7802                         /* The port was removed?? */
7803                         SOCKBUF_UNLOCK(&so->so_snd);
7804                         return (EHOSTUNREACH);
7805                 }
7806                 hdrlen += sizeof(struct udphdr);
7807         }
7808 #endif
7809         ipoptlen = 0;
7810 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7811         ipoptlen += ipsec_optlen;
7812 #endif
7813
7814         /*
7815          * Adjust data length if insertion of options will bump the packet
7816          * length beyond the t_maxseg length. Clear the FIN bit because we
7817          * cut off the tail of the segment.
7818          */
7819         if (len + optlen + ipoptlen > tp->t_maxseg) {
7820                 if (flags & TH_FIN) {
7821                         would_have_fin = 1;
7822                         flags &= ~TH_FIN;
7823                 }
7824                 if (tso) {
7825                         uint32_t if_hw_tsomax;
7826                         uint32_t moff;
7827                         int32_t max_len;
7828
7829                         /* extract TSO information */
7830                         if_hw_tsomax = tp->t_tsomax;
7831                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
7832                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
7833                         KASSERT(ipoptlen == 0,
7834                             ("%s: TSO can't do IP options", __func__));
7835
7836                         /*
7837                          * Check if we should limit by maximum payload
7838                          * length:
7839                          */
7840                         if (if_hw_tsomax != 0) {
7841                                 /* compute maximum TSO length */
7842                                 max_len = (if_hw_tsomax - hdrlen -
7843                                     max_linkhdr);
7844                                 if (max_len <= 0) {
7845                                         len = 0;
7846                                 } else if (len > max_len) {
7847                                         sendalot = 1;
7848                                         len = max_len;
7849                                 }
7850                         }
7851                         /*
7852                          * Prevent the last segment from being fractional
7853                          * unless the send sockbuf can be emptied:
7854                          */
7855                         max_len = (tp->t_maxseg - optlen);
7856                         if ((sb_offset + len) < sbavail(sb)) {
7857                                 moff = len % (u_int)max_len;
7858                                 if (moff != 0) {
7859                                         len -= moff;
7860                                         sendalot = 1;
7861                                 }
7862                         }
7863                         /*
7864                          * In case there are too many small fragments don't
7865                          * use TSO:
7866                          */
7867                         if (len <= max_len) {
7868                                 len = max_len;
7869                                 sendalot = 1;
7870                                 tso = 0;
7871                         }
7872                         /*
7873                          * Send the FIN in a separate segment after the bulk
7874                          * sending is done. We don't trust the TSO
7875                          * implementations to clear the FIN flag on all but
7876                          * the last segment.
7877                          */
7878                         if (tp->t_flags & TF_NEEDFIN)
7879                                 sendalot = 1;
7880
7881                 } else {
7882                         len = tp->t_maxseg - optlen - ipoptlen;
7883                         sendalot = 1;
7884                 }
7885         } else
7886                 tso = 0;
7887         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
7888             ("%s: len > IP_MAXPACKET", __func__));
7889 #ifdef DIAGNOSTIC
7890 #ifdef INET6
7891         if (max_linkhdr + hdrlen > MCLBYTES)
7892 #else
7893         if (max_linkhdr + hdrlen > MHLEN)
7894 #endif
7895                 panic("tcphdr too big");
7896 #endif
7897
7898         /*
7899          * This KASSERT is here to catch edge cases at a well defined place.
7900          * Before, those had triggered (random) panic conditions further
7901          * down.
7902          */
7903         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7904         if ((len == 0) &&
7905             (flags & TH_FIN) &&
7906             (sbused(sb))) {
7907                 /*
7908                  * We have outstanding data, don't send a fin by itself!.
7909                  */
7910                 goto just_return;
7911         }
7912         /*
7913          * Grab a header mbuf, attaching a copy of data to be transmitted,
7914          * and initialize the header from the template for sends on this
7915          * connection.
7916          */
7917         if (len) {
7918                 uint32_t max_val;
7919                 uint32_t moff;
7920
7921                 if (rack->rc_pace_max_segs)
7922                         max_val = rack->rc_pace_max_segs * tp->t_maxseg;
7923                 else
7924                         max_val = len;
7925                 /*
7926                  * We allow a limit on sending with hptsi.
7927                  */
7928                 if (len > max_val) {
7929                         len = max_val;
7930                 }
7931 #ifdef INET6
7932                 if (MHLEN < hdrlen + max_linkhdr)
7933                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
7934                 else
7935 #endif
7936                         m = m_gethdr(M_NOWAIT, MT_DATA);
7937
7938                 if (m == NULL) {
7939                         SOCKBUF_UNLOCK(sb);
7940                         error = ENOBUFS;
7941                         sack_rxmit = 0;
7942                         goto out;
7943                 }
7944                 m->m_data += max_linkhdr;
7945                 m->m_len = hdrlen;
7946
7947                 /*
7948                  * Start the m_copy functions from the closest mbuf to the
7949                  * sb_offset in the socket buffer chain.
7950                  */
7951                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
7952                 if (len <= MHLEN - hdrlen - max_linkhdr) {
7953                         m_copydata(mb, moff, (int)len,
7954                             mtod(m, caddr_t)+hdrlen);
7955                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7956                                 sbsndptr_adv(sb, mb, len);
7957                         m->m_len += len;
7958                 } else {
7959                         struct sockbuf *msb;
7960
7961                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7962                                 msb = NULL;
7963                         else
7964                                 msb = sb;
7965                         m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len,
7966                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/);
7967                         if (len <= (tp->t_maxseg - optlen)) {
7968                                 /*
7969                                  * Must have ran out of mbufs for the copy
7970                                  * shorten it to no longer need tso. Lets
7971                                  * not put on sendalot since we are low on
7972                                  * mbufs.
7973                                  */
7974                                 tso = 0;
7975                         }
7976                         if (m->m_next == NULL) {
7977                                 SOCKBUF_UNLOCK(sb);
7978                                 (void)m_free(m);
7979                                 error = ENOBUFS;
7980                                 sack_rxmit = 0;
7981                                 goto out;
7982                         }
7983                 }
7984                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
7985                         TCPSTAT_INC(tcps_sndprobe);
7986 #ifdef NETFLIX_STATS
7987                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7988                                 stats_voi_update_abs_u32(tp->t_stats,
7989                                     VOI_TCP_RETXPB, len);
7990                         else
7991                                 stats_voi_update_abs_u64(tp->t_stats,
7992                                     VOI_TCP_TXPB, len);
7993 #endif
7994                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
7995                         if (rsm && (rsm->r_flags & RACK_TLP)) {
7996                                 /*
7997                                  * TLP should not count in retran count, but
7998                                  * in its own bin
7999                                  */
8000 /*                              tp->t_sndtlppack++;*/
8001 /*                              tp->t_sndtlpbyte += len;*/
8002                                 counter_u64_add(rack_tlp_retran, 1);
8003                                 counter_u64_add(rack_tlp_retran_bytes, len);
8004                         } else {
8005                                 tp->t_sndrexmitpack++;
8006                                 TCPSTAT_INC(tcps_sndrexmitpack);
8007                                 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
8008                         }
8009 #ifdef NETFLIX_STATS
8010                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
8011                             len);
8012 #endif
8013                 } else {
8014                         TCPSTAT_INC(tcps_sndpack);
8015                         TCPSTAT_ADD(tcps_sndbyte, len);
8016 #ifdef NETFLIX_STATS
8017                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
8018                             len);
8019 #endif
8020                 }
8021                 /*
8022                  * If we're sending everything we've got, set PUSH. (This
8023                  * will keep happy those implementations which only give
8024                  * data to the user when a buffer fills or a PUSH comes in.)
8025                  */
8026                 if (sb_offset + len == sbused(sb) &&
8027                     sbused(sb) &&
8028                     !(flags & TH_SYN))
8029                         flags |= TH_PUSH;
8030
8031                 /*
8032                  * Are we doing hptsi, if so we must calculate the slot. We
8033                  * only do hptsi in ESTABLISHED and with no RESET being
8034                  * sent where we have data to send.
8035                  */
8036                 if (((tp->t_state == TCPS_ESTABLISHED) ||
8037                     (tp->t_state == TCPS_CLOSE_WAIT) ||
8038                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
8039                     ((tp->t_flags & TF_SENTFIN) == 0) &&
8040                     ((flags & TH_FIN) == 0))) &&
8041                     ((flags & TH_RST) == 0) &&
8042                     (rack->rc_always_pace)) {
8043                         /*
8044                          * We use the most optimistic possible cwnd/srtt for
8045                          * sending calculations. This will make our
8046                          * calculation anticipate getting more through
8047                          * quicker then possible. But thats ok we don't want
8048                          * the peer to have a gap in data sending.
8049                          */
8050                         uint32_t srtt, cwnd, tr_perms = 0;
8051
8052                         if (rack->r_ctl.rc_rack_min_rtt)
8053                                 srtt = rack->r_ctl.rc_rack_min_rtt;
8054                         else
8055                                 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8056                         if (rack->r_ctl.rc_rack_largest_cwnd)
8057                                 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8058                         else
8059                                 cwnd = tp->snd_cwnd;
8060                         tr_perms = cwnd / srtt;
8061                         if (tr_perms == 0) {
8062                                 tr_perms = tp->t_maxseg;
8063                         }
8064                         tot_len_this_send += len;
8065                         /*
8066                          * Calculate how long this will take to drain, if
8067                          * the calculation comes out to zero, thats ok we
8068                          * will use send_a_lot to possibly spin around for
8069                          * more increasing tot_len_this_send to the point
8070                          * that its going to require a pace, or we hit the
8071                          * cwnd. Which in that case we are just waiting for
8072                          * a ACK.
8073                          */
8074                         slot = tot_len_this_send / tr_perms;
8075                         /* Now do we reduce the time so we don't run dry? */
8076                         if (slot && rack->rc_pace_reduce) {
8077                                 int32_t reduce;
8078
8079                                 reduce = (slot / rack->rc_pace_reduce);
8080                                 if (reduce < slot) {
8081                                         slot -= reduce;
8082                                 } else
8083                                         slot = 0;
8084                         }
8085                         if (rack->r_enforce_min_pace &&
8086                             (slot == 0) &&
8087                             (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
8088                                 /* We are enforcing a minimum pace time of 1ms */
8089                                 slot = rack->r_enforce_min_pace;
8090                         }
8091                 }
8092                 SOCKBUF_UNLOCK(sb);
8093         } else {
8094                 SOCKBUF_UNLOCK(sb);
8095                 if (tp->t_flags & TF_ACKNOW)
8096                         TCPSTAT_INC(tcps_sndacks);
8097                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
8098                         TCPSTAT_INC(tcps_sndctrl);
8099                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
8100                         TCPSTAT_INC(tcps_sndurg);
8101                 else
8102                         TCPSTAT_INC(tcps_sndwinup);
8103
8104                 m = m_gethdr(M_NOWAIT, MT_DATA);
8105                 if (m == NULL) {
8106                         error = ENOBUFS;
8107                         sack_rxmit = 0;
8108                         goto out;
8109                 }
8110 #ifdef INET6
8111                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
8112                     MHLEN >= hdrlen) {
8113                         M_ALIGN(m, hdrlen);
8114                 } else
8115 #endif
8116                         m->m_data += max_linkhdr;
8117                 m->m_len = hdrlen;
8118         }
8119         SOCKBUF_UNLOCK_ASSERT(sb);
8120         m->m_pkthdr.rcvif = (struct ifnet *)0;
8121 #ifdef MAC
8122         mac_inpcb_create_mbuf(inp, m);
8123 #endif
8124 #ifdef INET6
8125         if (isipv6) {
8126                 ip6 = mtod(m, struct ip6_hdr *);
8127 #ifdef NETFLIX_TCP_O_UDP
8128                 if (tp->t_port) {
8129                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
8130                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8131                         udp->uh_dport = tp->t_port;
8132                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
8133                         udp->uh_ulen = htons(ulen);
8134                         th = (struct tcphdr *)(udp + 1);
8135                 } else
8136 #endif
8137                         th = (struct tcphdr *)(ip6 + 1);
8138                 tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th);
8139         } else
8140 #endif                          /* INET6 */
8141         {
8142                 ip = mtod(m, struct ip *);
8143 #ifdef TCPDEBUG
8144                 ipov = (struct ipovly *)ip;
8145 #endif
8146 #ifdef NETFLIX_TCP_O_UDP
8147                 if (tp->t_port) {
8148                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
8149                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8150                         udp->uh_dport = tp->t_port;
8151                         ulen = hdrlen + len - sizeof(struct ip);
8152                         udp->uh_ulen = htons(ulen);
8153                         th = (struct tcphdr *)(udp + 1);
8154                 } else
8155 #endif
8156                         th = (struct tcphdr *)(ip + 1);
8157                 tcpip_fillheaders(inp,/*tp->t_port, */ ip, th);
8158         }
8159         /*
8160          * Fill in fields, remembering maximum advertised window for use in
8161          * delaying messages about window sizes. If resending a FIN, be sure
8162          * not to use a new sequence number.
8163          */
8164         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
8165             tp->snd_nxt == tp->snd_max)
8166                 tp->snd_nxt--;
8167         /*
8168          * If we are starting a connection, send ECN setup SYN packet. If we
8169          * are on a retransmit, we may resend those bits a number of times
8170          * as per RFC 3168.
8171          */
8172         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
8173                 if (tp->t_rxtshift >= 1) {
8174                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
8175                                 flags |= TH_ECE | TH_CWR;
8176                 } else
8177                         flags |= TH_ECE | TH_CWR;
8178         }
8179         if (tp->t_state == TCPS_ESTABLISHED &&
8180             (tp->t_flags & TF_ECN_PERMIT)) {
8181                 /*
8182                  * If the peer has ECN, mark data packets with ECN capable
8183                  * transmission (ECT). Ignore pure ack packets,
8184                  * retransmissions and window probes.
8185                  */
8186                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
8187                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
8188 #ifdef INET6
8189                         if (isipv6)
8190                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
8191                         else
8192 #endif
8193                                 ip->ip_tos |= IPTOS_ECN_ECT0;
8194                         TCPSTAT_INC(tcps_ecn_ect0);
8195                 }
8196                 /*
8197                  * Reply with proper ECN notifications.
8198                  */
8199                 if (tp->t_flags & TF_ECN_SND_CWR) {
8200                         flags |= TH_CWR;
8201                         tp->t_flags &= ~TF_ECN_SND_CWR;
8202                 }
8203                 if (tp->t_flags & TF_ECN_SND_ECE)
8204                         flags |= TH_ECE;
8205         }
8206         /*
8207          * If we are doing retransmissions, then snd_nxt will not reflect
8208          * the first unsent octet.  For ACK only packets, we do not want the
8209          * sequence number of the retransmitted packet, we want the sequence
8210          * number of the next unsent octet.  So, if there is no data (and no
8211          * SYN or FIN), use snd_max instead of snd_nxt when filling in
8212          * ti_seq.  But if we are in persist state, snd_max might reflect
8213          * one byte beyond the right edge of the window, so use snd_nxt in
8214          * that case, since we know we aren't doing a retransmission.
8215          * (retransmit and persist are mutually exclusive...)
8216          */
8217         if (sack_rxmit == 0) {
8218                 if (len || (flags & (TH_SYN | TH_FIN)) ||
8219                     rack->rc_in_persist) {
8220                         th->th_seq = htonl(tp->snd_nxt);
8221                         rack_seq = tp->snd_nxt;
8222                 } else if (flags & TH_RST) {
8223                         /*
8224                          * For a Reset send the last cum ack in sequence
8225                          * (this like any other choice may still generate a
8226                          * challenge ack, if a ack-update packet is in
8227                          * flight).
8228                          */
8229                         th->th_seq = htonl(tp->snd_una);
8230                         rack_seq = tp->snd_una;
8231                 } else {
8232                         th->th_seq = htonl(tp->snd_max);
8233                         rack_seq = tp->snd_max;
8234                 }
8235         } else {
8236                 th->th_seq = htonl(rsm->r_start);
8237                 rack_seq = rsm->r_start;
8238         }
8239         th->th_ack = htonl(tp->rcv_nxt);
8240         if (optlen) {
8241                 bcopy(opt, th + 1, optlen);
8242                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
8243         }
8244         th->th_flags = flags;
8245         /*
8246          * Calculate receive window.  Don't shrink window, but avoid silly
8247          * window syndrome.
8248          */
8249         if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
8250             recwin < (long)tp->t_maxseg)
8251                 recwin = 0;
8252         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
8253             recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
8254                 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
8255         if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
8256                 recwin = (long)TCP_MAXWIN << tp->rcv_scale;
8257
8258         /*
8259          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
8260          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
8261          * handled in syncache.
8262          */
8263         if (flags & TH_SYN)
8264                 th->th_win = htons((u_short)
8265                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
8266         else
8267                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
8268         /*
8269          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
8270          * window.  This may cause the remote transmitter to stall.  This
8271          * flag tells soreceive() to disable delayed acknowledgements when
8272          * draining the buffer.  This can occur if the receiver is
8273          * attempting to read more data than can be buffered prior to
8274          * transmitting on the connection.
8275          */
8276         if (th->th_win == 0) {
8277                 tp->t_sndzerowin++;
8278                 tp->t_flags |= TF_RXWIN0SENT;
8279         } else
8280                 tp->t_flags &= ~TF_RXWIN0SENT;
8281         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8282                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
8283                 th->th_flags |= TH_URG;
8284         } else
8285                 /*
8286                  * If no urgent pointer to send, then we pull the urgent
8287                  * pointer to the left edge of the send window so that it
8288                  * doesn't drift into the send window on sequence number
8289                  * wraparound.
8290                  */
8291                 tp->snd_up = tp->snd_una;       /* drag it along */
8292
8293 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
8294         if (to.to_flags & TOF_SIGNATURE) {
8295                 /*
8296                  * Calculate MD5 signature and put it into the place
8297                  * determined before.
8298                  * NOTE: since TCP options buffer doesn't point into
8299                  * mbuf's data, calculate offset and use it.
8300                  */
8301                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
8302                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
8303                         /*
8304                          * Do not send segment if the calculation of MD5
8305                          * digest has failed.
8306                          */
8307                         goto out;
8308                 }
8309         }
8310 #endif
8311
8312         /*
8313          * Put TCP length in extended header, and then checksum extended
8314          * header and data.
8315          */
8316         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
8317 #ifdef INET6
8318         if (isipv6) {
8319                 /*
8320                  * ip6_plen is not need to be filled now, and will be filled
8321                  * in ip6_output.
8322                  */
8323 #ifdef NETFLIX_TCP_O_UDP
8324                 if (tp->t_port) {
8325                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
8326                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8327                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
8328                         th->th_sum = htons(0);
8329                         UDPSTAT_INC(udps_opackets);
8330                 } else {
8331 #endif
8332                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
8333                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8334                         th->th_sum = in6_cksum_pseudo(ip6,
8335                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
8336                             0);
8337 #ifdef NETFLIX_TCP_O_UDP
8338                 }
8339 #endif
8340         }
8341 #endif
8342 #if defined(INET6) && defined(INET)
8343         else
8344 #endif
8345 #ifdef INET
8346         {
8347 #ifdef NETFLIX_TCP_O_UDP
8348                 if (tp->t_port) {
8349                         m->m_pkthdr.csum_flags = CSUM_UDP;
8350                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8351                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
8352                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
8353                         th->th_sum = htons(0);
8354                         UDPSTAT_INC(udps_opackets);
8355                 } else {
8356 #endif
8357                         m->m_pkthdr.csum_flags = CSUM_TCP;
8358                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8359                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
8360                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
8361                             IPPROTO_TCP + len + optlen));
8362 #ifdef NETFLIX_TCP_O_UDP
8363                 }
8364 #endif
8365                 /* IP version must be set here for ipv4/ipv6 checking later */
8366                 KASSERT(ip->ip_v == IPVERSION,
8367                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
8368         }
8369 #endif
8370
8371         /*
8372          * Enable TSO and specify the size of the segments. The TCP pseudo
8373          * header checksum is always provided. XXX: Fixme: This is currently
8374          * not the case for IPv6.
8375          */
8376         if (tso) {
8377                 KASSERT(len > tp->t_maxseg - optlen,
8378                     ("%s: len <= tso_segsz", __func__));
8379                 m->m_pkthdr.csum_flags |= CSUM_TSO;
8380                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
8381         }
8382 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8383         KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
8384             ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
8385             __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
8386 #else
8387         KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
8388             ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
8389             __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
8390 #endif
8391
8392 #ifdef TCP_HHOOK
8393         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
8394         hhook_run_tcp_est_out(tp, th, &to, len, tso);
8395 #endif
8396
8397 #ifdef TCPDEBUG
8398         /*
8399          * Trace.
8400          */
8401         if (so->so_options & SO_DEBUG) {
8402                 u_short save = 0;
8403
8404 #ifdef INET6
8405                 if (!isipv6)
8406 #endif
8407                 {
8408                         save = ipov->ih_len;
8409                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
8410                               * (th->th_off << 2) */ );
8411                 }
8412                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
8413 #ifdef INET6
8414                 if (!isipv6)
8415 #endif
8416                         ipov->ih_len = save;
8417         }
8418 #endif                          /* TCPDEBUG */
8419
8420         /* We're getting ready to send; log now. */
8421         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
8422                 union tcp_log_stackspecific log;
8423
8424                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
8425                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
8426                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
8427                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
8428                 if (rsm || sack_rxmit) {
8429                         log.u_bbr.flex8 = 1;
8430                 } else {
8431                         log.u_bbr.flex8 = 0;
8432                 }
8433                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
8434                     len, &log, false, NULL, NULL, 0, NULL);
8435         } else
8436                 lgb = NULL;
8437
8438         /*
8439          * Fill in IP length and desired time to live and send to IP level.
8440          * There should be a better way to handle ttl and tos; we could keep
8441          * them in the template, but need a way to checksum without them.
8442          */
8443         /*
8444          * m->m_pkthdr.len should have been set before cksum calcuration,
8445          * because in6_cksum() need it.
8446          */
8447 #ifdef INET6
8448         if (isipv6) {
8449                 /*
8450                  * we separately set hoplimit for every segment, since the
8451                  * user might want to change the value via setsockopt. Also,
8452                  * desired default hop limit might be changed via Neighbor
8453                  * Discovery.
8454                  */
8455                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
8456
8457                 /*
8458                  * Set the packet size here for the benefit of DTrace
8459                  * probes. ip6_output() will set it properly; it's supposed
8460                  * to include the option header lengths as well.
8461                  */
8462                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
8463
8464                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
8465                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8466                 else
8467                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8468
8469                 if (tp->t_state == TCPS_SYN_SENT)
8470                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
8471
8472                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
8473                 /* TODO: IPv6 IP6TOS_ECT bit on */
8474                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
8475                     &inp->inp_route6,
8476                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
8477                     NULL, NULL, inp);
8478
8479                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
8480                         mtu = inp->inp_route6.ro_rt->rt_mtu;
8481         }
8482 #endif                          /* INET6 */
8483 #if defined(INET) && defined(INET6)
8484         else
8485 #endif
8486 #ifdef INET
8487         {
8488                 ip->ip_len = htons(m->m_pkthdr.len);
8489 #ifdef INET6
8490                 if (inp->inp_vflag & INP_IPV6PROTO)
8491                         ip->ip_ttl = in6_selecthlim(inp, NULL);
8492 #endif                          /* INET6 */
8493                 /*
8494                  * If we do path MTU discovery, then we set DF on every
8495                  * packet. This might not be the best thing to do according
8496                  * to RFC3390 Section 2. However the tcp hostcache migitates
8497                  * the problem so it affects only the first tcp connection
8498                  * with a host.
8499                  *
8500                  * NB: Don't set DF on small MTU/MSS to have a safe
8501                  * fallback.
8502                  */
8503                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
8504                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8505                         if (tp->t_port == 0 || len < V_tcp_minmss) {
8506                                 ip->ip_off |= htons(IP_DF);
8507                         }
8508                 } else {
8509                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8510                 }
8511
8512                 if (tp->t_state == TCPS_SYN_SENT)
8513                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
8514
8515                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
8516
8517                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
8518                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
8519                     inp);
8520                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
8521                         mtu = inp->inp_route.ro_rt->rt_mtu;
8522         }
8523 #endif                          /* INET */
8524
8525 out:
8526         if (lgb) {
8527                 lgb->tlb_errno = error;
8528                 lgb = NULL;
8529         }
8530         /*
8531          * In transmit state, time the transmission and arrange for the
8532          * retransmit.  In persist state, just set snd_max.
8533          */
8534         if (error == 0) {
8535                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
8536                     (tp->t_flags & TF_SACK_PERMIT) &&
8537                     tp->rcv_numsacks > 0)
8538                     tcp_clean_dsack_blocks(tp);
8539                 if (len == 0)
8540                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
8541                 else if (len == 1) {
8542                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
8543                 } else if (len > 1) {
8544                         int idx;
8545
8546                         idx = (len / tp->t_maxseg) + 3;
8547                         if (idx >= TCP_MSS_ACCT_ATIMER)
8548                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
8549                         else
8550                                 counter_u64_add(rack_out_size[idx], 1);
8551                 }
8552         }
8553         if (sub_from_prr && (error == 0)) {
8554                 if (rack->r_ctl.rc_prr_sndcnt >= len)
8555                         rack->r_ctl.rc_prr_sndcnt -= len;
8556                 else
8557                         rack->r_ctl.rc_prr_sndcnt = 0;
8558         }
8559         sub_from_prr = 0;
8560         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
8561             pass, rsm);
8562         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
8563             (rack->rc_in_persist == 0)) {
8564 #ifdef NETFLIX_STATS
8565                 tcp_seq startseq = tp->snd_nxt;
8566 #endif
8567                 /*
8568                  * Advance snd_nxt over sequence space of this segment.
8569                  */
8570                 if (error)
8571                         /* We don't log or do anything with errors */
8572                         goto timer;
8573
8574                 if (flags & (TH_SYN | TH_FIN)) {
8575                         if (flags & TH_SYN)
8576                                 tp->snd_nxt++;
8577                         if (flags & TH_FIN) {
8578                                 tp->snd_nxt++;
8579                                 tp->t_flags |= TF_SENTFIN;
8580                         }
8581                 }
8582                 /* In the ENOBUFS case we do *not* update snd_max */
8583                 if (sack_rxmit)
8584                         goto timer;
8585
8586                 tp->snd_nxt += len;
8587                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
8588                         if (tp->snd_una == tp->snd_max) {
8589                                 /*
8590                                  * Update the time we just added data since
8591                                  * none was outstanding.
8592                                  */
8593                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8594                                 tp->t_acktime = ticks;
8595                         }
8596                         tp->snd_max = tp->snd_nxt;
8597 #ifdef NETFLIX_STATS
8598                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
8599                                 tp->t_flags |= TF_GPUTINPROG;
8600                                 tp->gput_seq = startseq;
8601                                 tp->gput_ack = startseq +
8602                                     ulmin(sbavail(sb) - sb_offset, sendwin);
8603                                 tp->gput_ts = tcp_ts_getticks();
8604                         }
8605 #endif
8606                 }
8607                 /*
8608                  * Set retransmit timer if not currently set, and not doing
8609                  * a pure ack or a keep-alive probe. Initial value for
8610                  * retransmit timer is smoothed round-trip time + 2 *
8611                  * round-trip time variance. Initialize shift counter which
8612                  * is used for backoff of retransmit time.
8613                  */
8614 timer:
8615                 if ((tp->snd_wnd == 0) &&
8616                     TCPS_HAVEESTABLISHED(tp->t_state)) {
8617                         /*
8618                          * If the persists timer was set above (right before
8619                          * the goto send), and still needs to be on. Lets
8620                          * make sure all is canceled. If the persist timer
8621                          * is not running, we want to get it up.
8622                          */
8623                         if (rack->rc_in_persist == 0) {
8624                                 rack_enter_persist(tp, rack, cts);
8625                         }
8626                 }
8627         } else {
8628                 /*
8629                  * Persist case, update snd_max but since we are in persist
8630                  * mode (no window) we do not update snd_nxt.
8631                  */
8632                 int32_t xlen = len;
8633
8634                 if (error)
8635                         goto nomore;
8636
8637                 if (flags & TH_SYN)
8638                         ++xlen;
8639                 if (flags & TH_FIN) {
8640                         ++xlen;
8641                         tp->t_flags |= TF_SENTFIN;
8642                 }
8643                 /* In the ENOBUFS case we do *not* update snd_max */
8644                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
8645                         if (tp->snd_una == tp->snd_max) {
8646                                 /*
8647                                  * Update the time we just added data since
8648                                  * none was outstanding.
8649                                  */
8650                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8651                                 tp->t_acktime = ticks;
8652                         }
8653                         tp->snd_max = tp->snd_nxt + len;
8654                 }
8655         }
8656 nomore:
8657         if (error) {
8658                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
8659                 /*
8660                  * Failures do not advance the seq counter above. For the
8661                  * case of ENOBUFS we will fall out and retry in 1ms with
8662                  * the hpts. Everything else will just have to retransmit
8663                  * with the timer.
8664                  *
8665                  * In any case, we do not want to loop around for another
8666                  * send without a good reason.
8667                  */
8668                 sendalot = 0;
8669                 switch (error) {
8670                 case EPERM:
8671                         tp->t_flags &= ~TF_FORCEDATA;
8672                         tp->t_softerror = error;
8673                         return (error);
8674                 case ENOBUFS:
8675                         if (slot == 0) {
8676                                 /*
8677                                  * Pace us right away to retry in a some
8678                                  * time
8679                                  */
8680                                 slot = 1 + rack->rc_enobuf;
8681                                 if (rack->rc_enobuf < 255)
8682                                         rack->rc_enobuf++;
8683                                 if (slot > (rack->rc_rack_rtt / 2)) {
8684                                         slot = rack->rc_rack_rtt / 2;
8685                                 }
8686                                 if (slot < 10)
8687                                         slot = 10;
8688                         }
8689                         counter_u64_add(rack_saw_enobuf, 1);
8690                         error = 0;
8691                         goto enobufs;
8692                 case EMSGSIZE:
8693                         /*
8694                          * For some reason the interface we used initially
8695                          * to send segments changed to another or lowered
8696                          * its MTU. If TSO was active we either got an
8697                          * interface without TSO capabilits or TSO was
8698                          * turned off. If we obtained mtu from ip_output()
8699                          * then update it and try again.
8700                          */
8701                         if (tso)
8702                                 tp->t_flags &= ~TF_TSO;
8703                         if (mtu != 0) {
8704                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
8705                                 goto again;
8706                         }
8707                         slot = 10;
8708                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8709                         tp->t_flags &= ~TF_FORCEDATA;
8710                         return (error);
8711                 case ENETUNREACH:
8712                         counter_u64_add(rack_saw_enetunreach, 1);
8713                 case EHOSTDOWN:
8714                 case EHOSTUNREACH:
8715                 case ENETDOWN:
8716                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
8717                                 tp->t_softerror = error;
8718                         }
8719                         /* FALLTHROUGH */
8720                 default:
8721                         slot = 10;
8722                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8723                         tp->t_flags &= ~TF_FORCEDATA;
8724                         return (error);
8725                 }
8726         } else {
8727                 rack->rc_enobuf = 0;
8728         }
8729         TCPSTAT_INC(tcps_sndtotal);
8730
8731         /*
8732          * Data sent (as far as we can tell). If this advertises a larger
8733          * window than any other segment, then remember the size of the
8734          * advertised window. Any pending ACK has now been sent.
8735          */
8736         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8737                 tp->rcv_adv = tp->rcv_nxt + recwin;
8738         tp->last_ack_sent = tp->rcv_nxt;
8739         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
8740 enobufs:
8741         rack->r_tlp_running = 0;
8742         if ((flags & TH_RST) || (would_have_fin == 1)) {
8743                 /*
8744                  * We don't send again after a RST. We also do *not* send
8745                  * again if we would have had a find, but now have
8746                  * outstanding data.
8747                  */
8748                 slot = 0;
8749                 sendalot = 0;
8750         }
8751         if (slot) {
8752                 /* set the rack tcb into the slot N */
8753                 counter_u64_add(rack_paced_segments, 1);
8754         } else if (sendalot) {
8755                 if (len)
8756                         counter_u64_add(rack_unpaced_segments, 1);
8757                 sack_rxmit = 0;
8758                 tp->t_flags &= ~TF_FORCEDATA;
8759                 goto again;
8760         } else if (len) {
8761                 counter_u64_add(rack_unpaced_segments, 1);
8762         }
8763         tp->t_flags &= ~TF_FORCEDATA;
8764         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
8765         return (error);
8766 }
8767
8768 /*
8769  * rack_ctloutput() must drop the inpcb lock before performing copyin on
8770  * socket option arguments.  When it re-acquires the lock after the copy, it
8771  * has to revalidate that the connection is still valid for the socket
8772  * option.
8773  */
8774 static int
8775 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
8776     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8777 {
8778         int32_t error = 0, optval;
8779
8780         switch (sopt->sopt_name) {
8781         case TCP_RACK_PROP_RATE:
8782         case TCP_RACK_PROP:
8783         case TCP_RACK_TLP_REDUCE:
8784         case TCP_RACK_EARLY_RECOV:
8785         case TCP_RACK_PACE_ALWAYS:
8786         case TCP_DELACK:
8787         case TCP_RACK_PACE_REDUCE:
8788         case TCP_RACK_PACE_MAX_SEG:
8789         case TCP_RACK_PRR_SENDALOT:
8790         case TCP_RACK_MIN_TO:
8791         case TCP_RACK_EARLY_SEG:
8792         case TCP_RACK_REORD_THRESH:
8793         case TCP_RACK_REORD_FADE:
8794         case TCP_RACK_TLP_THRESH:
8795         case TCP_RACK_PKT_DELAY:
8796         case TCP_RACK_TLP_USE:
8797         case TCP_RACK_TLP_INC_VAR:
8798         case TCP_RACK_IDLE_REDUCE_HIGH:
8799         case TCP_RACK_MIN_PACE:
8800         case TCP_RACK_MIN_PACE_SEG:
8801         case TCP_BBR_RACK_RTT_USE:
8802         case TCP_DATA_AFTER_CLOSE:
8803                 break;
8804         default:
8805                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8806                 break;
8807         }
8808         INP_WUNLOCK(inp);
8809         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
8810         if (error)
8811                 return (error);
8812         INP_WLOCK(inp);
8813         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
8814                 INP_WUNLOCK(inp);
8815                 return (ECONNRESET);
8816         }
8817         tp = intotcpcb(inp);
8818         rack = (struct tcp_rack *)tp->t_fb_ptr;
8819         switch (sopt->sopt_name) {
8820         case TCP_RACK_PROP_RATE:
8821                 if ((optval <= 0) || (optval >= 100)) {
8822                         error = EINVAL;
8823                         break;
8824                 }
8825                 RACK_OPTS_INC(tcp_rack_prop_rate);
8826                 rack->r_ctl.rc_prop_rate = optval;
8827                 break;
8828         case TCP_RACK_TLP_USE:
8829                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
8830                         error = EINVAL;
8831                         break;
8832                 }
8833                 RACK_OPTS_INC(tcp_tlp_use);
8834                 rack->rack_tlp_threshold_use = optval;
8835                 break;
8836         case TCP_RACK_PROP:
8837                 /* RACK proportional rate reduction (bool) */
8838                 RACK_OPTS_INC(tcp_rack_prop);
8839                 rack->r_ctl.rc_prop_reduce = optval;
8840                 break;
8841         case TCP_RACK_TLP_REDUCE:
8842                 /* RACK TLP cwnd reduction (bool) */
8843                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
8844                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
8845                 break;
8846         case TCP_RACK_EARLY_RECOV:
8847                 /* Should recovery happen early (bool) */
8848                 RACK_OPTS_INC(tcp_rack_early_recov);
8849                 rack->r_ctl.rc_early_recovery = optval;
8850                 break;
8851         case TCP_RACK_PACE_ALWAYS:
8852                 /* Use the always pace method (bool)  */
8853                 RACK_OPTS_INC(tcp_rack_pace_always);
8854                 if (optval > 0)
8855                         rack->rc_always_pace = 1;
8856                 else
8857                         rack->rc_always_pace = 0;
8858                 break;
8859         case TCP_RACK_PACE_REDUCE:
8860                 /* RACK Hptsi reduction factor (divisor) */
8861                 RACK_OPTS_INC(tcp_rack_pace_reduce);
8862                 if (optval)
8863                         /* Must be non-zero */
8864                         rack->rc_pace_reduce = optval;
8865                 else
8866                         error = EINVAL;
8867                 break;
8868         case TCP_RACK_PACE_MAX_SEG:
8869                 /* Max segments in a pace */
8870                 RACK_OPTS_INC(tcp_rack_max_seg);
8871                 rack->rc_pace_max_segs = optval;
8872                 break;
8873         case TCP_RACK_PRR_SENDALOT:
8874                 /* Allow PRR to send more than one seg */
8875                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
8876                 rack->r_ctl.rc_prr_sendalot = optval;
8877                 break;
8878         case TCP_RACK_MIN_TO:
8879                 /* Minimum time between rack t-o's in ms */
8880                 RACK_OPTS_INC(tcp_rack_min_to);
8881                 rack->r_ctl.rc_min_to = optval;
8882                 break;
8883         case TCP_RACK_EARLY_SEG:
8884                 /* If early recovery max segments */
8885                 RACK_OPTS_INC(tcp_rack_early_seg);
8886                 rack->r_ctl.rc_early_recovery_segs = optval;
8887                 break;
8888         case TCP_RACK_REORD_THRESH:
8889                 /* RACK reorder threshold (shift amount) */
8890                 RACK_OPTS_INC(tcp_rack_reord_thresh);
8891                 if ((optval > 0) && (optval < 31))
8892                         rack->r_ctl.rc_reorder_shift = optval;
8893                 else
8894                         error = EINVAL;
8895                 break;
8896         case TCP_RACK_REORD_FADE:
8897                 /* Does reordering fade after ms time */
8898                 RACK_OPTS_INC(tcp_rack_reord_fade);
8899                 rack->r_ctl.rc_reorder_fade = optval;
8900                 break;
8901         case TCP_RACK_TLP_THRESH:
8902                 /* RACK TLP theshold i.e. srtt+(srtt/N) */
8903                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
8904                 if (optval)
8905                         rack->r_ctl.rc_tlp_threshold = optval;
8906                 else
8907                         error = EINVAL;
8908                 break;
8909         case TCP_RACK_PKT_DELAY:
8910                 /* RACK added ms i.e. rack-rtt + reord + N */
8911                 RACK_OPTS_INC(tcp_rack_pkt_delay);
8912                 rack->r_ctl.rc_pkt_delay = optval;
8913                 break;
8914         case TCP_RACK_TLP_INC_VAR:
8915                 /* Does TLP include rtt variance in t-o */
8916                 RACK_OPTS_INC(tcp_rack_tlp_inc_var);
8917                 rack->r_ctl.rc_prr_inc_var = optval;
8918                 break;
8919         case TCP_RACK_IDLE_REDUCE_HIGH:
8920                 RACK_OPTS_INC(tcp_rack_idle_reduce_high);
8921                 if (optval)
8922                         rack->r_idle_reduce_largest = 1;
8923                 else
8924                         rack->r_idle_reduce_largest = 0;
8925                 break;
8926         case TCP_DELACK:
8927                 if (optval == 0)
8928                         tp->t_delayed_ack = 0;
8929                 else
8930                         tp->t_delayed_ack = 1;
8931                 if (tp->t_flags & TF_DELACK) {
8932                         tp->t_flags &= ~TF_DELACK;
8933                         tp->t_flags |= TF_ACKNOW;
8934                         rack_output(tp);
8935                 }
8936                 break;
8937         case TCP_RACK_MIN_PACE:
8938                 RACK_OPTS_INC(tcp_rack_min_pace);
8939                 if (optval > 3)
8940                         rack->r_enforce_min_pace = 3;
8941                 else
8942                         rack->r_enforce_min_pace = optval;
8943                 break;
8944         case TCP_RACK_MIN_PACE_SEG:
8945                 RACK_OPTS_INC(tcp_rack_min_pace_seg);
8946                 if (optval >= 16)
8947                         rack->r_min_pace_seg_thresh = 15;
8948                 else
8949                         rack->r_min_pace_seg_thresh = optval;
8950                 break;
8951         case TCP_BBR_RACK_RTT_USE:
8952                 if ((optval != USE_RTT_HIGH) &&
8953                     (optval != USE_RTT_LOW) &&
8954                     (optval != USE_RTT_AVG))
8955                         error = EINVAL;
8956                 else
8957                         rack->r_ctl.rc_rate_sample_method = optval;
8958                 break;
8959         case TCP_DATA_AFTER_CLOSE:
8960                 if (optval)
8961                         rack->rc_allow_data_af_clo = 1;
8962                 else
8963                         rack->rc_allow_data_af_clo = 0;
8964                 break;
8965         default:
8966                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8967                 break;
8968         }
8969 /*      tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/
8970         INP_WUNLOCK(inp);
8971         return (error);
8972 }
8973
8974 static int
8975 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
8976     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8977 {
8978         int32_t error, optval;
8979
8980         /*
8981          * Because all our options are either boolean or an int, we can just
8982          * pull everything into optval and then unlock and copy. If we ever
8983          * add a option that is not a int, then this will have quite an
8984          * impact to this routine.
8985          */
8986         switch (sopt->sopt_name) {
8987         case TCP_RACK_PROP_RATE:
8988                 optval = rack->r_ctl.rc_prop_rate;
8989                 break;
8990         case TCP_RACK_PROP:
8991                 /* RACK proportional rate reduction (bool) */
8992                 optval = rack->r_ctl.rc_prop_reduce;
8993                 break;
8994         case TCP_RACK_TLP_REDUCE:
8995                 /* RACK TLP cwnd reduction (bool) */
8996                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
8997                 break;
8998         case TCP_RACK_EARLY_RECOV:
8999                 /* Should recovery happen early (bool) */
9000                 optval = rack->r_ctl.rc_early_recovery;
9001                 break;
9002         case TCP_RACK_PACE_REDUCE:
9003                 /* RACK Hptsi reduction factor (divisor) */
9004                 optval = rack->rc_pace_reduce;
9005                 break;
9006         case TCP_RACK_PACE_MAX_SEG:
9007                 /* Max segments in a pace */
9008                 optval = rack->rc_pace_max_segs;
9009                 break;
9010         case TCP_RACK_PACE_ALWAYS:
9011                 /* Use the always pace method */
9012                 optval = rack->rc_always_pace;
9013                 break;
9014         case TCP_RACK_PRR_SENDALOT:
9015                 /* Allow PRR to send more than one seg */
9016                 optval = rack->r_ctl.rc_prr_sendalot;
9017                 break;
9018         case TCP_RACK_MIN_TO:
9019                 /* Minimum time between rack t-o's in ms */
9020                 optval = rack->r_ctl.rc_min_to;
9021                 break;
9022         case TCP_RACK_EARLY_SEG:
9023                 /* If early recovery max segments */
9024                 optval = rack->r_ctl.rc_early_recovery_segs;
9025                 break;
9026         case TCP_RACK_REORD_THRESH:
9027                 /* RACK reorder threshold (shift amount) */
9028                 optval = rack->r_ctl.rc_reorder_shift;
9029                 break;
9030         case TCP_RACK_REORD_FADE:
9031                 /* Does reordering fade after ms time */
9032                 optval = rack->r_ctl.rc_reorder_fade;
9033                 break;
9034         case TCP_RACK_TLP_THRESH:
9035                 /* RACK TLP theshold i.e. srtt+(srtt/N) */
9036                 optval = rack->r_ctl.rc_tlp_threshold;
9037                 break;
9038         case TCP_RACK_PKT_DELAY:
9039                 /* RACK added ms i.e. rack-rtt + reord + N */
9040                 optval = rack->r_ctl.rc_pkt_delay;
9041                 break;
9042         case TCP_RACK_TLP_USE:
9043                 optval = rack->rack_tlp_threshold_use;
9044                 break;
9045         case TCP_RACK_TLP_INC_VAR:
9046                 /* Does TLP include rtt variance in t-o */
9047                 optval = rack->r_ctl.rc_prr_inc_var;
9048                 break;
9049         case TCP_RACK_IDLE_REDUCE_HIGH:
9050                 optval = rack->r_idle_reduce_largest;
9051                 break;
9052         case TCP_RACK_MIN_PACE:
9053                 optval = rack->r_enforce_min_pace;
9054                 break;
9055         case TCP_RACK_MIN_PACE_SEG:
9056                 optval = rack->r_min_pace_seg_thresh;
9057                 break;
9058         case TCP_BBR_RACK_RTT_USE:
9059                 optval = rack->r_ctl.rc_rate_sample_method;
9060                 break;
9061         case TCP_DELACK:
9062                 optval = tp->t_delayed_ack;
9063                 break;
9064         case TCP_DATA_AFTER_CLOSE:
9065                 optval = rack->rc_allow_data_af_clo;
9066                 break;
9067         default:
9068                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9069                 break;
9070         }
9071         INP_WUNLOCK(inp);
9072         error = sooptcopyout(sopt, &optval, sizeof optval);
9073         return (error);
9074 }
9075
9076 static int
9077 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
9078 {
9079         int32_t error = EINVAL;
9080         struct tcp_rack *rack;
9081
9082         rack = (struct tcp_rack *)tp->t_fb_ptr;
9083         if (rack == NULL) {
9084                 /* Huh? */
9085                 goto out;
9086         }
9087         if (sopt->sopt_dir == SOPT_SET) {
9088                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
9089         } else if (sopt->sopt_dir == SOPT_GET) {
9090                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
9091         }
9092 out:
9093         INP_WUNLOCK(inp);
9094         return (error);
9095 }
9096
9097
9098 struct tcp_function_block __tcp_rack = {
9099         .tfb_tcp_block_name = __XSTRING(STACKNAME),
9100         .tfb_tcp_output = rack_output,
9101         .tfb_tcp_do_segment = rack_do_segment,
9102         .tfb_tcp_ctloutput = rack_ctloutput,
9103         .tfb_tcp_fb_init = rack_init,
9104         .tfb_tcp_fb_fini = rack_fini,
9105         .tfb_tcp_timer_stop_all = rack_stopall,
9106         .tfb_tcp_timer_activate = rack_timer_activate,
9107         .tfb_tcp_timer_active = rack_timer_active,
9108         .tfb_tcp_timer_stop = rack_timer_stop,
9109         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
9110         .tfb_tcp_handoff_ok = rack_handoff_ok
9111 };
9112
9113 static const char *rack_stack_names[] = {
9114         __XSTRING(STACKNAME),
9115 #ifdef STACKALIAS
9116         __XSTRING(STACKALIAS),
9117 #endif
9118 };
9119
9120 static int
9121 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
9122 {
9123         memset(mem, 0, size);
9124         return (0);
9125 }
9126
9127 static void
9128 rack_dtor(void *mem, int32_t size, void *arg)
9129 {
9130
9131 }
9132
9133 static bool rack_mod_inited = false;
9134
9135 static int
9136 tcp_addrack(module_t mod, int32_t type, void *data)
9137 {
9138         int32_t err = 0;
9139         int num_stacks;
9140
9141         switch (type) {
9142         case MOD_LOAD:
9143                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
9144                     sizeof(struct rack_sendmap),
9145                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
9146
9147                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
9148                     sizeof(struct tcp_rack),
9149                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
9150
9151                 sysctl_ctx_init(&rack_sysctl_ctx);
9152                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
9153                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
9154                     OID_AUTO,
9155                     __XSTRING(STACKNAME),
9156                     CTLFLAG_RW, 0,
9157                     "");
9158                 if (rack_sysctl_root == NULL) {
9159                         printf("Failed to add sysctl node\n");
9160                         err = EFAULT;
9161                         goto free_uma;
9162                 }
9163                 rack_init_sysctls();
9164                 num_stacks = nitems(rack_stack_names);
9165                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
9166                     rack_stack_names, &num_stacks);
9167                 if (err) {
9168                         printf("Failed to register %s stack name for "
9169                             "%s module\n", rack_stack_names[num_stacks],
9170                             __XSTRING(MODNAME));
9171                         sysctl_ctx_free(&rack_sysctl_ctx);
9172 free_uma:
9173                         uma_zdestroy(rack_zone);
9174                         uma_zdestroy(rack_pcb_zone);
9175                         rack_counter_destroy();
9176                         printf("Failed to register rack module -- err:%d\n", err);
9177                         return (err);
9178                 }
9179                 rack_mod_inited = true;
9180                 break;
9181         case MOD_QUIESCE:
9182                 err = deregister_tcp_functions(&__tcp_rack, true, false);
9183                 break;
9184         case MOD_UNLOAD:
9185                 err = deregister_tcp_functions(&__tcp_rack, false, true);
9186                 if (err == EBUSY)
9187                         break;
9188                 if (rack_mod_inited) {
9189                         uma_zdestroy(rack_zone);
9190                         uma_zdestroy(rack_pcb_zone);
9191                         sysctl_ctx_free(&rack_sysctl_ctx);
9192                         rack_counter_destroy();
9193                         rack_mod_inited = false;
9194                 }
9195                 err = 0;
9196                 break;
9197         default:
9198                 return (EOPNOTSUPP);
9199         }
9200         return (err);
9201 }
9202
9203 static moduledata_t tcp_rack = {
9204         .name = __XSTRING(MODNAME),
9205         .evhand = tcp_addrack,
9206         .priv = 0
9207 };
9208
9209 MODULE_VERSION(MODNAME, 1);
9210 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
9211 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);