1 /*-
2  * Copyright (c) 2016-2018
3  *      Netflix Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_ipsec.h"
34 #include "opt_tcpdebug.h"
35
36 #include <sys/param.h>
37 #include <sys/module.h>
38 #include <sys/kernel.h>
39 #ifdef TCP_HHOOK
40 #include <sys/hhook.h>
41 #endif
42 #include <sys/lock.h>
43 #include <sys/malloc.h>
44 #include <sys/mutex.h>
46 #include <sys/mbuf.h>
47 #include <sys/proc.h>           /* for proc0 declaration */
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #ifdef NETFLIX_STATS
53 #include <sys/stats.h>
54 #endif
55 #include <sys/refcount.h>
56 #include <sys/queue.h>
57 #include <sys/smp.h>
58 #include <sys/kthread.h>
59 #include <sys/kern_prefetch.h>
60
61 #include <vm/uma.h>
62
63 #include <net/route.h>
64 #include <net/vnet.h>
65
66 #define TCPSTATES               /* for logging */
67
68 #include <netinet/in.h>
69 #include <netinet/in_kdtrace.h>
70 #include <netinet/in_pcb.h>
71 #include <netinet/ip.h>
72 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
73 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
74 #include <netinet/ip_var.h>
75 #include <netinet/ip6.h>
76 #include <netinet6/in6_pcb.h>
77 #include <netinet6/ip6_var.h>
78 #include <netinet/tcp.h>
79 #define TCPOUTFLAGS
80 #include <netinet/tcp_fsm.h>
81 #include <netinet/tcp_log_buf.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_timer.h>
84 #include <netinet/tcp_var.h>
85 #include <netinet/tcp_hpts.h>
86 #include <netinet/tcpip.h>
87 #include <netinet/cc/cc.h>
88 #ifdef NETFLIX_CWV
89 #include <netinet/tcp_newcwv.h>
90 #endif
91 #include <netinet/tcp_fastopen.h>
92 #ifdef TCPDEBUG
93 #include <netinet/tcp_debug.h>
94 #endif                          /* TCPDEBUG */
95 #ifdef TCP_OFFLOAD
96 #include <netinet/tcp_offload.h>
97 #endif
98 #ifdef INET6
99 #include <netinet6/tcp6_var.h>
100 #endif
101
102 #include <netipsec/ipsec_support.h>
103
104 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
105 #include <netipsec/ipsec.h>
106 #include <netipsec/ipsec6.h>
107 #endif                          /* IPSEC */
108
109 #include <netinet/udp.h>
110 #include <netinet/udp_var.h>
111 #include <machine/in_cksum.h>
112
113 #ifdef MAC
114 #include <security/mac/mac_framework.h>
115 #endif
116 #include "sack_filter.h"
117 #include "tcp_rack.h"
118 #include "rack_bbr_common.h"
119
120 uma_zone_t rack_zone;
121 uma_zone_t rack_pcb_zone;
122
123 #ifndef TICKS2SBT
124 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
125 #endif
126
127 struct sysctl_ctx_list rack_sysctl_ctx;
128 struct sysctl_oid *rack_sysctl_root;
129
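/*
 * Flag values passed as the ack_type argument to rack_update_rtt() to
 * record how a tracked range was acknowledged.
 */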
130 #define CUM_ACKED 1
131 #define SACKED 2
132
133 /*
134  * The RACK module incorporates a number of
135  * TCP ideas that have been put out into the IETF
136  * over the last few years:
137  * - Matt Mathis's Rate Halving which slowly drops
138  *    the congestion window so that the ack clock can
139  *    be maintained during a recovery.
140  * - Yuchung Cheng's RACK TCP (for which it is named) that
141  *    will stop us using the number of dup acks and instead
142  *    use time as the gauge of when we retransmit.
143  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
144  *    of Dukkipati et al.
145  * RACK depends on SACK, so if an endpoint arrives that
146  * cannot do SACK the state machine below will shuttle the
147  * connection back to using the "default" TCP stack that is
148  * in FreeBSD.
149  *
150  * To implement RACK the original TCP stack was first decomposed
151  * into a functional state machine with individual states
152  * for each of the possible TCP connection states. The do_segment
153  * function's role in life is to mandate that the connection supports
154  * SACK initially and then assure that the RACK state matches the
155  * connection state before calling the state's do_segment function. Each
156  * state is simplified due to the fact that the original do_segment
157  * has been decomposed and we *know* what state we are in (no
158  * switches on the state) and all tests for SACK are gone. This
159  * greatly simplifies what each state does.
160  *
161  * TCP output is also overwritten with a new version since it
162  * must maintain the new rack scoreboard.
163  *
164  */
165 static int32_t rack_precache = 1;
166 static int32_t rack_tlp_thresh = 1;
167 static int32_t rack_reorder_thresh = 2;
168 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
169                                                  * - 60 seconds */
170 static int32_t rack_pkt_delay = 1;
171 static int32_t rack_inc_var = 0;        /* For TLP */
172 static int32_t rack_reduce_largest_on_idle = 0;
173 static int32_t rack_min_pace_time = 0;
174 static int32_t rack_min_pace_time_seg_req = 6;
175 static int32_t rack_early_recovery = 1;
176 static int32_t rack_early_recovery_max_seg = 6;
177 static int32_t rack_send_a_lot_in_prr = 1;
178 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
179 static int32_t rack_tlp_in_recovery = 1;        /* Can we do TLP in recovery? */
180 static int32_t rack_verbose_logging = 0;
181 static int32_t rack_ignore_data_after_close = 1;
182 /*
183  * Currently regular TCP has an rto_min of 30ms; the
184  * backoff runs 12 times, doubling each time, so that
185  * ends up being a total of 30 * (2^12 - 1) = 122,850 ms
186  * (122.850 seconds) before a connection is killed.
187  */
188 static int32_t rack_tlp_min = 10;
189 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
190 static int32_t rack_rto_max = 30000;    /* 30 seconds */
191 static const int32_t rack_free_cache = 2;
192 static int32_t rack_hptsi_segments = 40;
193 static int32_t rack_rate_sample_method = USE_RTT_LOW;
194 static int32_t rack_pace_every_seg = 1;
195 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
196 static int32_t rack_slot_reduction = 4;
197 static int32_t rack_lower_cwnd_at_tlp = 0;
198 static int32_t rack_use_proportional_reduce = 0;
199 static int32_t rack_proportional_rate = 10;
200 static int32_t rack_tlp_max_resend = 2;
201 static int32_t rack_limited_retran = 0;
202 static int32_t rack_always_send_oldest = 0;
203 static int32_t rack_sack_block_limit = 128;
204 static int32_t rack_use_sack_filter = 1;
205 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
206
207 /* Rack specific counters */
208 counter_u64_t rack_badfr;
209 counter_u64_t rack_badfr_bytes;
210 counter_u64_t rack_rtm_prr_retran;
211 counter_u64_t rack_rtm_prr_newdata;
212 counter_u64_t rack_timestamp_mismatch;
213 counter_u64_t rack_reorder_seen;
214 counter_u64_t rack_paced_segments;
215 counter_u64_t rack_unpaced_segments;
216 counter_u64_t rack_saw_enobuf;
217 counter_u64_t rack_saw_enetunreach;
218
219 /* Tail loss probe counters */
220 counter_u64_t rack_tlp_tot;
221 counter_u64_t rack_tlp_newdata;
222 counter_u64_t rack_tlp_retran;
223 counter_u64_t rack_tlp_retran_bytes;
224 counter_u64_t rack_tlp_retran_fail;
225 counter_u64_t rack_to_tot;
226 counter_u64_t rack_to_arm_rack;
227 counter_u64_t rack_to_arm_tlp;
228 counter_u64_t rack_to_alloc;
229 counter_u64_t rack_to_alloc_hard;
230 counter_u64_t rack_to_alloc_emerg;
231
232 counter_u64_t rack_sack_proc_all;
233 counter_u64_t rack_sack_proc_short;
234 counter_u64_t rack_sack_proc_restart;
235 counter_u64_t rack_runt_sacks;
236 counter_u64_t rack_used_tlpmethod;
237 counter_u64_t rack_used_tlpmethod2;
238 counter_u64_t rack_enter_tlp_calc;
239 counter_u64_t rack_input_idle_reduces;
240 counter_u64_t rack_tlp_does_nada;
241
242 /* Temp CPU counters */
243 counter_u64_t rack_find_high;
244
245 counter_u64_t rack_progress_drops;
246 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
247 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
248
249 static void
250 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
251
252 static int
253 rack_process_ack(struct mbuf *m, struct tcphdr *th,
254     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
255     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
256 static int
257 rack_process_data(struct mbuf *m, struct tcphdr *th,
258     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
259     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
260 static void
261 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
262     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
263 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
264 static struct rack_sendmap *
265 rack_check_recovery_mode(struct tcpcb *tp,
266     uint32_t tsused);
267 static void
268 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
269     uint32_t type);
270 static void rack_counter_destroy(void);
271 static int
272 rack_ctloutput(struct socket *so, struct sockopt *sopt,
273     struct inpcb *inp, struct tcpcb *tp);
274 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
275 static void
276 rack_do_segment(struct mbuf *m, struct tcphdr *th,
277     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
278     uint8_t iptos);
279 static void rack_dtor(void *mem, int32_t size, void *arg);
280 static void
281 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
282     uint32_t t, uint32_t cts);
283 static struct rack_sendmap *
284 rack_find_high_nonack(struct tcp_rack *rack,
285     struct rack_sendmap *rsm);
286 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
287 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
288 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
289 static int
290 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
291     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
292 static int32_t rack_handoff_ok(struct tcpcb *tp);
293 static int32_t rack_init(struct tcpcb *tp);
294 static void rack_init_sysctls(void);
295 static void
296 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
297     struct tcphdr *th);
298 static void
299 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
300     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
301     uint8_t pass, struct rack_sendmap *hintrsm);
302 static void
303 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
304     struct rack_sendmap *rsm);
305 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
306 static int32_t rack_output(struct tcpcb *tp);
307 static void
308 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
309     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
310     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
311
312 static uint32_t
313 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
314     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
315     uint32_t cts);
316 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
317 static void rack_remxt_tmr(struct tcpcb *tp);
318 static int
319 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
320     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
321 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
322 static int32_t rack_stopall(struct tcpcb *tp);
323 static void
324 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
325     uint32_t delta);
326 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
327 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
328 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
329 static uint32_t
330 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
331     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
332 static void
333 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
334     struct rack_sendmap *rsm, uint32_t ts);
335 static int
336 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
337     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
338 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
339 static void
340 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
341     struct tcpcb *tp, int32_t * ret_val);
342 static int
343 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
344     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
345     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
346 static int
347 rack_do_closing(struct mbuf *m, struct tcphdr *th,
348     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
349     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
350 static void
351 rack_do_drop(struct mbuf *m, struct tcpcb *tp);
352 static void
353 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
354     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
355 static void
356 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
357         struct tcphdr *th, int32_t rstreason, int32_t tlen);
358 static int
359 rack_do_established(struct mbuf *m, struct tcphdr *th,
360     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
361     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
362 static int
363 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
364     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
365     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
366 static int
367 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
368     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
369     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
370 static int
371 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
372     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
373     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
374 static int
375 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
376     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
377     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
378 static int
379 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
380     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
381     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
382 static int
383 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
384     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
385     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
386 static int
387 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
388     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
389     int32_t * drop_hdrlen, int32_t * ret_val);
390 static int
391 rack_process_rst(struct mbuf *m, struct tcphdr *th,
392     struct socket *so, struct tcpcb *tp);
393 struct rack_sendmap *
394 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
395     uint32_t tsused);
396 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
397 static void
398      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
399
400 static int
401 rack_ts_check(struct mbuf *m, struct tcphdr *th,
402     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
403
404 int32_t rack_clear_counter = 0;
405
406
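/*
 * Handler for the "clear" sysctl below: reading returns rack_clear_counter,
 * and writing a value of 1 zeroes all of the RACK debug counters (with
 * INVARIANTS a console message notes the clear).  For example, assuming the
 * stack's tree is rooted at net.inet.tcp.rack (rack_sysctl_root is created
 * elsewhere), an administrator could run:
 *
 *	sysctl net.inet.tcp.rack.clear=1
 */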
407 static int
408 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
409 {
410         uint32_t stat;
411         int32_t error;
412
413         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
414         if (error || req->newptr == NULL)
415                 return error;
416
417         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
418         if (error)
419                 return (error);
420         if (stat == 1) {
421 #ifdef INVARIANTS
422                 printf("Clearing RACK counters\n");
423 #endif
424                 counter_u64_zero(rack_badfr);
425                 counter_u64_zero(rack_badfr_bytes);
426                 counter_u64_zero(rack_rtm_prr_retran);
427                 counter_u64_zero(rack_rtm_prr_newdata);
428                 counter_u64_zero(rack_timestamp_mismatch);
429                 counter_u64_zero(rack_reorder_seen);
430                 counter_u64_zero(rack_tlp_tot);
431                 counter_u64_zero(rack_tlp_newdata);
432                 counter_u64_zero(rack_tlp_retran);
433                 counter_u64_zero(rack_tlp_retran_bytes);
434                 counter_u64_zero(rack_tlp_retran_fail);
435                 counter_u64_zero(rack_to_tot);
436                 counter_u64_zero(rack_to_arm_rack);
437                 counter_u64_zero(rack_to_arm_tlp);
438                 counter_u64_zero(rack_paced_segments);
439                 counter_u64_zero(rack_unpaced_segments);
440                 counter_u64_zero(rack_saw_enobuf);
441                 counter_u64_zero(rack_saw_enetunreach);
442                 counter_u64_zero(rack_to_alloc_hard);
443                 counter_u64_zero(rack_to_alloc_emerg);
444                 counter_u64_zero(rack_sack_proc_all);
445                 counter_u64_zero(rack_sack_proc_short);
446                 counter_u64_zero(rack_sack_proc_restart);
447                 counter_u64_zero(rack_to_alloc);
448                 counter_u64_zero(rack_find_high);
449                 counter_u64_zero(rack_runt_sacks);
450                 counter_u64_zero(rack_used_tlpmethod);
451                 counter_u64_zero(rack_used_tlpmethod2);
452                 counter_u64_zero(rack_enter_tlp_calc);
453                 counter_u64_zero(rack_progress_drops);
454                 counter_u64_zero(rack_tlp_does_nada);
455         }
456         rack_clear_counter = 0;
457         return (0);
458 }
459
460
461
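/*
 * Register the RACK tunables and statistics OIDs under rack_sysctl_root.
 * The S32 entries export the module-wide defaults declared at the top of
 * this file; the counters are allocated here with counter_u64_alloc(M_WAITOK)
 * and released again by rack_counter_destroy().
 */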
462 static void
463 rack_init_sysctls()
464 {
465         SYSCTL_ADD_S32(&rack_sysctl_ctx,
466             SYSCTL_CHILDREN(rack_sysctl_root),
467             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
468             &rack_rate_sample_method , USE_RTT_LOW,
469             "What method should we use for rate sampling 0=high, 1=low ");
470         SYSCTL_ADD_S32(&rack_sysctl_ctx,
471             SYSCTL_CHILDREN(rack_sysctl_root),
472             OID_AUTO, "data_after_close", CTLFLAG_RW,
473             &rack_ignore_data_after_close, 0,
474             "Do we hold off sending a RST until all pending data is ack'd");
475         SYSCTL_ADD_S32(&rack_sysctl_ctx,
476             SYSCTL_CHILDREN(rack_sysctl_root),
477             OID_AUTO, "tlpmethod", CTLFLAG_RW,
478             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
479             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
480         SYSCTL_ADD_S32(&rack_sysctl_ctx,
481             SYSCTL_CHILDREN(rack_sysctl_root),
482             OID_AUTO, "min_pace_time", CTLFLAG_RW,
483             &rack_min_pace_time, 0,
484             "Should we enforce a minimum pace time of 1ms");
485         SYSCTL_ADD_S32(&rack_sysctl_ctx,
486             SYSCTL_CHILDREN(rack_sysctl_root),
487             OID_AUTO, "min_pace_segs", CTLFLAG_RW,
488             &rack_min_pace_time_seg_req, 6,
489             "How many segments have to be in the len to enforce min-pace-time");
490         SYSCTL_ADD_S32(&rack_sysctl_ctx,
491             SYSCTL_CHILDREN(rack_sysctl_root),
492             OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
493             &rack_reduce_largest_on_idle, 0,
494             "Should we reduce the largest cwnd seen to IW on idle reduction");
495         SYSCTL_ADD_S32(&rack_sysctl_ctx,
496             SYSCTL_CHILDREN(rack_sysctl_root),
497             OID_AUTO, "bb_verbose", CTLFLAG_RW,
498             &rack_verbose_logging, 0,
499             "Should RACK black box logging be verbose");
500         SYSCTL_ADD_S32(&rack_sysctl_ctx,
501             SYSCTL_CHILDREN(rack_sysctl_root),
502             OID_AUTO, "sackfiltering", CTLFLAG_RW,
503             &rack_use_sack_filter, 1,
504             "Do we use sack filtering?");
505         SYSCTL_ADD_S32(&rack_sysctl_ctx,
506             SYSCTL_CHILDREN(rack_sysctl_root),
507             OID_AUTO, "delayed_ack", CTLFLAG_RW,
508             &rack_delayed_ack_time, 200,
509             "Delayed ack time (200ms)");
510         SYSCTL_ADD_S32(&rack_sysctl_ctx,
511             SYSCTL_CHILDREN(rack_sysctl_root),
512             OID_AUTO, "tlpminto", CTLFLAG_RW,
513             &rack_tlp_min, 10,
514             "TLP minimum timeout per the specification (10ms)");
515         SYSCTL_ADD_S32(&rack_sysctl_ctx,
516             SYSCTL_CHILDREN(rack_sysctl_root),
517             OID_AUTO, "precache", CTLFLAG_RW,
518             &rack_precache, 0,
519             "Where should we precache the mcopy (0 is not at all)");
520         SYSCTL_ADD_S32(&rack_sysctl_ctx,
521             SYSCTL_CHILDREN(rack_sysctl_root),
522             OID_AUTO, "sblklimit", CTLFLAG_RW,
523             &rack_sack_block_limit, 128,
524             "When do we start paying attention to small sack blocks");
525         SYSCTL_ADD_S32(&rack_sysctl_ctx,
526             SYSCTL_CHILDREN(rack_sysctl_root),
527             OID_AUTO, "send_oldest", CTLFLAG_RW,
528             &rack_always_send_oldest, 1,
529             "Should we always send the oldest TLP and RACK-TLP");
530         SYSCTL_ADD_S32(&rack_sysctl_ctx,
531             SYSCTL_CHILDREN(rack_sysctl_root),
532             OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
533             &rack_tlp_in_recovery, 1,
534             "Can we do a TLP during recovery?");
535         SYSCTL_ADD_S32(&rack_sysctl_ctx,
536             SYSCTL_CHILDREN(rack_sysctl_root),
537             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
538             &rack_limited_retran, 0,
539             "How many times can a rack timeout drive out sends");
540         SYSCTL_ADD_S32(&rack_sysctl_ctx,
541             SYSCTL_CHILDREN(rack_sysctl_root),
542             OID_AUTO, "minrto", CTLFLAG_RW,
543             &rack_rto_min, 0,
544             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
545         SYSCTL_ADD_S32(&rack_sysctl_ctx,
546             SYSCTL_CHILDREN(rack_sysctl_root),
547             OID_AUTO, "maxrto", CTLFLAG_RW,
548             &rack_rto_max, 0,
549             "Maximum RTO in ms -- should be at least as large as min_rto");
550         SYSCTL_ADD_S32(&rack_sysctl_ctx,
551             SYSCTL_CHILDREN(rack_sysctl_root),
552             OID_AUTO, "tlp_retry", CTLFLAG_RW,
553             &rack_tlp_max_resend, 2,
554             "How many times does TLP retry a single segment or multiple with no ACK");
555         SYSCTL_ADD_S32(&rack_sysctl_ctx,
556             SYSCTL_CHILDREN(rack_sysctl_root),
557             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
558             &rack_use_proportional_reduce, 0,
559             "Should we proportionally reduce cwnd based on the number of losses");
560         SYSCTL_ADD_S32(&rack_sysctl_ctx,
561             SYSCTL_CHILDREN(rack_sysctl_root),
562             OID_AUTO, "recovery_prop", CTLFLAG_RW,
563             &rack_proportional_rate, 10,
564             "What percent reduction per loss");
565         SYSCTL_ADD_S32(&rack_sysctl_ctx,
566             SYSCTL_CHILDREN(rack_sysctl_root),
567             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
568             &rack_lower_cwnd_at_tlp, 0,
569             "When a TLP completes a retran should we enter recovery?");
570         SYSCTL_ADD_S32(&rack_sysctl_ctx,
571             SYSCTL_CHILDREN(rack_sysctl_root),
572             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
573             &rack_slot_reduction, 4,
574             "When setting a slot should we reduce by divisor");
575         SYSCTL_ADD_S32(&rack_sysctl_ctx,
576             SYSCTL_CHILDREN(rack_sysctl_root),
577             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
578             &rack_pace_every_seg, 1,
579             "Should we pace out every segment hptsi");
580         SYSCTL_ADD_S32(&rack_sysctl_ctx,
581             SYSCTL_CHILDREN(rack_sysctl_root),
582             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
583             &rack_hptsi_segments, 6,
584             "Should we pace out only a limited size of segments");
585         SYSCTL_ADD_S32(&rack_sysctl_ctx,
586             SYSCTL_CHILDREN(rack_sysctl_root),
587             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
588             &rack_send_a_lot_in_prr, 1,
589             "Send a lot in prr");
590         SYSCTL_ADD_S32(&rack_sysctl_ctx,
591             SYSCTL_CHILDREN(rack_sysctl_root),
592             OID_AUTO, "minto", CTLFLAG_RW,
593             &rack_min_to, 1,
594             "Minimum rack timeout in milliseconds");
595         SYSCTL_ADD_S32(&rack_sysctl_ctx,
596             SYSCTL_CHILDREN(rack_sysctl_root),
597             OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
598             &rack_early_recovery_max_seg, 6,
599             "Max segments in early recovery");
600         SYSCTL_ADD_S32(&rack_sysctl_ctx,
601             SYSCTL_CHILDREN(rack_sysctl_root),
602             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
603             &rack_early_recovery, 1,
604             "Do we do early recovery with rack");
605         SYSCTL_ADD_S32(&rack_sysctl_ctx,
606             SYSCTL_CHILDREN(rack_sysctl_root),
607             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
608             &rack_reorder_thresh, 2,
609             "What factor for rack will be added when seeing reordering (shift right)");
610         SYSCTL_ADD_S32(&rack_sysctl_ctx,
611             SYSCTL_CHILDREN(rack_sysctl_root),
612             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
613             &rack_tlp_thresh, 1,
614             "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
615         SYSCTL_ADD_S32(&rack_sysctl_ctx,
616             SYSCTL_CHILDREN(rack_sysctl_root),
617             OID_AUTO, "reorder_fade", CTLFLAG_RW,
618             &rack_reorder_fade, 0,
619             "Does reorder detection fade, if so how many ms (0 means never)");
620         SYSCTL_ADD_S32(&rack_sysctl_ctx,
621             SYSCTL_CHILDREN(rack_sysctl_root),
622             OID_AUTO, "pktdelay", CTLFLAG_RW,
623             &rack_pkt_delay, 1,
624             "Extra RACK time (in ms) besides reordering thresh");
625         SYSCTL_ADD_S32(&rack_sysctl_ctx,
626             SYSCTL_CHILDREN(rack_sysctl_root),
627             OID_AUTO, "inc_var", CTLFLAG_RW,
628             &rack_inc_var, 0,
629             "Should rack add to the TLP timer the variance in rtt calculation");
630         rack_badfr = counter_u64_alloc(M_WAITOK);
631         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
632             SYSCTL_CHILDREN(rack_sysctl_root),
633             OID_AUTO, "badfr", CTLFLAG_RD,
634             &rack_badfr, "Total number of bad FRs");
635         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
636         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
637             SYSCTL_CHILDREN(rack_sysctl_root),
638             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
639             &rack_badfr_bytes, "Total bytes of bad FRs");
640         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
641         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
642             SYSCTL_CHILDREN(rack_sysctl_root),
643             OID_AUTO, "prrsndret", CTLFLAG_RD,
644             &rack_rtm_prr_retran,
645             "Total number of prr based retransmits");
646         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
647         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
648             SYSCTL_CHILDREN(rack_sysctl_root),
649             OID_AUTO, "prrsndnew", CTLFLAG_RD,
650             &rack_rtm_prr_newdata,
651             "Total number of prr based new transmits");
652         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
653         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
654             SYSCTL_CHILDREN(rack_sysctl_root),
655             OID_AUTO, "tsnf", CTLFLAG_RD,
656             &rack_timestamp_mismatch,
657             "Total number of times we could not find the reported timestamp");
658         rack_find_high = counter_u64_alloc(M_WAITOK);
659         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
660             SYSCTL_CHILDREN(rack_sysctl_root),
661             OID_AUTO, "findhigh", CTLFLAG_RD,
662             &rack_find_high,
663             "Total number of FIN causing find-high");
664         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
665         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
666             SYSCTL_CHILDREN(rack_sysctl_root),
667             OID_AUTO, "reordering", CTLFLAG_RD,
668             &rack_reorder_seen,
669             "Total number of times we added delay due to reordering");
670         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
671         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
672             SYSCTL_CHILDREN(rack_sysctl_root),
673             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
674             &rack_tlp_tot,
675             "Total number of tail loss probe expirations");
676         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
677         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
678             SYSCTL_CHILDREN(rack_sysctl_root),
679             OID_AUTO, "tlp_new", CTLFLAG_RD,
680             &rack_tlp_newdata,
681             "Total number of tail loss probe sending new data");
682
683         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
684         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
685             SYSCTL_CHILDREN(rack_sysctl_root),
686             OID_AUTO, "tlp_retran", CTLFLAG_RD,
687             &rack_tlp_retran,
688             "Total number of tail loss probe sending retransmitted data");
689         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
690         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
691             SYSCTL_CHILDREN(rack_sysctl_root),
692             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
693             &rack_tlp_retran_bytes,
694             "Total bytes of tail loss probe sending retransmitted data");
695         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
696         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
697             SYSCTL_CHILDREN(rack_sysctl_root),
698             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
699             &rack_tlp_retran_fail,
700             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
701         rack_to_tot = counter_u64_alloc(M_WAITOK);
702         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
703             SYSCTL_CHILDREN(rack_sysctl_root),
704             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
705             &rack_to_tot,
706             "Total number of times the rack timeout expired");
707         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
708         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
709             SYSCTL_CHILDREN(rack_sysctl_root),
710             OID_AUTO, "arm_rack", CTLFLAG_RD,
711             &rack_to_arm_rack,
712             "Total number of times the rack timer was armed");
713         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
714         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
715             SYSCTL_CHILDREN(rack_sysctl_root),
716             OID_AUTO, "arm_tlp", CTLFLAG_RD,
717             &rack_to_arm_tlp,
718             "Total number of times the tlp timer was armed");
719         rack_paced_segments = counter_u64_alloc(M_WAITOK);
720         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
721             SYSCTL_CHILDREN(rack_sysctl_root),
722             OID_AUTO, "paced", CTLFLAG_RD,
723             &rack_paced_segments,
724             "Total number of times a segment send caused hptsi");
725         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
726         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
727             SYSCTL_CHILDREN(rack_sysctl_root),
728             OID_AUTO, "unpaced", CTLFLAG_RD,
729             &rack_unpaced_segments,
730             "Total number of times a segment did not cause hptsi");
731         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
732         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
733             SYSCTL_CHILDREN(rack_sysctl_root),
734             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
735             &rack_saw_enobuf,
736             "Total number of times a send returned ENOBUFS");
737         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
738         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
739             SYSCTL_CHILDREN(rack_sysctl_root),
740             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
741             &rack_saw_enetunreach,
742             "Total number of times a send returned ENETUNREACH");
743         rack_to_alloc = counter_u64_alloc(M_WAITOK);
744         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
745             SYSCTL_CHILDREN(rack_sysctl_root),
746             OID_AUTO, "allocs", CTLFLAG_RD,
747             &rack_to_alloc,
748             "Total allocations of tracking structures");
749         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
750         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
751             SYSCTL_CHILDREN(rack_sysctl_root),
752             OID_AUTO, "allochard", CTLFLAG_RD,
753             &rack_to_alloc_hard,
754             "Total allocations done with sleeping the hard way");
755         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
756         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
757             SYSCTL_CHILDREN(rack_sysctl_root),
758             OID_AUTO, "allocemerg", CTLFLAG_RD,
759             &rack_to_alloc_emerg,
760             "Total allocations done from emergency cache");
761         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
762         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
763             SYSCTL_CHILDREN(rack_sysctl_root),
764             OID_AUTO, "sack_long", CTLFLAG_RD,
765             &rack_sack_proc_all,
766             "Total times we had to walk whole list for sack processing");
767
768         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
769         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
770             SYSCTL_CHILDREN(rack_sysctl_root),
771             OID_AUTO, "sack_restart", CTLFLAG_RD,
772             &rack_sack_proc_restart,
773             "Total times we had to walk whole list due to a restart");
774         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
775         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
776             SYSCTL_CHILDREN(rack_sysctl_root),
777             OID_AUTO, "sack_short", CTLFLAG_RD,
778             &rack_sack_proc_short,
779             "Total times we took shortcut for sack processing");
780         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
781         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
782             SYSCTL_CHILDREN(rack_sysctl_root),
783             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
784             &rack_enter_tlp_calc,
785             "Total times we called calc-tlp");
786         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
787         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
788             SYSCTL_CHILDREN(rack_sysctl_root),
789             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
790             &rack_used_tlpmethod,
791             "Total number of times we hit the TLP method");
792         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
793         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
794             SYSCTL_CHILDREN(rack_sysctl_root),
795             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
796             &rack_used_tlpmethod2,
797             "Total number of times we hit the TLP method 2");
798         rack_runt_sacks = counter_u64_alloc(M_WAITOK);
799         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
800             SYSCTL_CHILDREN(rack_sysctl_root),
801             OID_AUTO, "runtsacks", CTLFLAG_RD,
802             &rack_runt_sacks,
803             "Total number of runt sacks");
804         rack_progress_drops = counter_u64_alloc(M_WAITOK);
805         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
806             SYSCTL_CHILDREN(rack_sysctl_root),
807             OID_AUTO, "prog_drops", CTLFLAG_RD,
808             &rack_progress_drops,
809             "Total number of progress drops");
810         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
811         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
812             SYSCTL_CHILDREN(rack_sysctl_root),
813             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
814             &rack_input_idle_reduces,
815             "Total number of idle reductions on input");
816         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
817         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
818             SYSCTL_CHILDREN(rack_sysctl_root),
819             OID_AUTO, "tlp_nada", CTLFLAG_RD,
820             &rack_tlp_does_nada,
821             "Total number of nada tlp calls");
822         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
823         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
824             OID_AUTO, "outsize", CTLFLAG_RD,
825             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
826         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
827         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
828             OID_AUTO, "opts", CTLFLAG_RD,
829             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
830         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
831             SYSCTL_CHILDREN(rack_sysctl_root),
832             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
833             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
834 }
835
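/*
 * Check whether the connection has failed to make forward progress.  If
 * more than t_maxunacktime ticks have passed since t_acktime (the last
 * time progress was made), bump the progress-drop counters, log the event
 * and return 1 so the caller can drop the connection; otherwise return 0.
 */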
836 static inline int32_t
837 rack_progress_timeout_check(struct tcpcb *tp)
838 {
839         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
840                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
841                         /*
842                          * There is an assumption that the caller
843                          * will drop the connection so we will
844                          * increment the counters here.
845                          */
846                         struct tcp_rack *rack;
847                         rack = (struct tcp_rack *)tp->t_fb_ptr;
848                         counter_u64_add(rack_progress_drops, 1);
849 #ifdef NETFLIX_STATS
850                         TCPSTAT_INC(tcps_progdrops);
851 #endif
852                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
853                         return (1);
854                 }
855         }
856         return (0);
857 }
858
859
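/*
 * The rack_log_* helpers below record stack-specific BBR_LOG_* events in
 * the TCP black-box log (tcp_log_buf) when logging is enabled on the
 * connection; each one fills in only the u_bbr fields that are meaningful
 * for that event.
 */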
860 static void
861 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
862 {
863         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
864                 union tcp_log_stackspecific log;
865
866                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
867                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
868                 log.u_bbr.flex2 = to;
869                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
870                 log.u_bbr.flex4 = slot;
871                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
872                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
873                 log.u_bbr.flex8 = which;
874                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
875                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
876                 TCP_LOG_EVENT(rack->rc_tp, NULL,
877                     &rack->rc_inp->inp_socket->so_rcv,
878                     &rack->rc_inp->inp_socket->so_snd,
879                     BBR_LOG_TIMERSTAR, 0,
880                     0, &log, false);
881         }
882 }
883
884 static void
885 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
886 {
887         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
888                 union tcp_log_stackspecific log;
889
890                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
891                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
892                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
893                 log.u_bbr.flex8 = to_num;
894                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
895                 log.u_bbr.flex2 = rack->rc_rack_rtt;
896                 TCP_LOG_EVENT(rack->rc_tp, NULL,
897                     &rack->rc_inp->inp_socket->so_rcv,
898                     &rack->rc_inp->inp_socket->so_snd,
899                     BBR_LOG_RTO, 0,
900                     0, &log, false);
901         }
902 }
903
904 static void
905 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
906     uint32_t o_srtt, uint32_t o_var)
907 {
908         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
909                 union tcp_log_stackspecific log;
910
911                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
912                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
913                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
914                 log.u_bbr.flex1 = t;
915                 log.u_bbr.flex2 = o_srtt;
916                 log.u_bbr.flex3 = o_var;
917                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
918                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
919                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
920                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
921                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
922                 TCP_LOG_EVENT(tp, NULL,
923                     &rack->rc_inp->inp_socket->so_rcv,
924                     &rack->rc_inp->inp_socket->so_snd,
925                     BBR_LOG_BBRRTT, 0,
926                     0, &log, false);
927         }
928 }
929
930 static void
931 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
932 {
933         /* 
934          * Log the rtt sample we are
935          * applying to the srtt algorithm in
936          * useconds.
937          */
938         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
939                 union tcp_log_stackspecific log;
940                 struct timeval tv;
941                 
942                 /* Convert our ms to a microsecond */
943                 log.u_bbr.flex1 = rtt * 1000;
944                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
945                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
946                     &rack->rc_inp->inp_socket->so_rcv,
947                     &rack->rc_inp->inp_socket->so_snd,
948                     TCP_LOG_RTT, 0,
949                     0, &log, false, &tv);
950         }
951 }
952
953
954 static inline void
955 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
956 {
957         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
958                 union tcp_log_stackspecific log;
959
960                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
961                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
962                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
963                 log.u_bbr.flex1 = line;
964                 log.u_bbr.flex2 = tick;
965                 log.u_bbr.flex3 = tp->t_maxunacktime;
966                 log.u_bbr.flex4 = tp->t_acktime;
967                 log.u_bbr.flex8 = event;
968                 TCP_LOG_EVENT(tp, NULL,
969                     &rack->rc_inp->inp_socket->so_rcv,
970                     &rack->rc_inp->inp_socket->so_snd,
971                     BBR_LOG_PROGRESS, 0,
972                     0, &log, false);
973         }
974 }
975
976 static void
977 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
978 {
979         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
980                 union tcp_log_stackspecific log;
981
982                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
983                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
984                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
985                 log.u_bbr.flex1 = slot;
986                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
987                 log.u_bbr.flex8 = rack->rc_in_persist;
988                 TCP_LOG_EVENT(rack->rc_tp, NULL,
989                     &rack->rc_inp->inp_socket->so_rcv,
990                     &rack->rc_inp->inp_socket->so_snd,
991                     BBR_LOG_BBRSND, 0,
992                     0, &log, false);
993         }
994 }
995
996 static void
997 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
998 {
999         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1000                 union tcp_log_stackspecific log;

                     memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1001                 log.u_bbr.flex1 = did_out;
1002                 log.u_bbr.flex2 = nxt_pkt;
1003                 log.u_bbr.flex3 = way_out;
1004                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1005                 log.u_bbr.flex7 = rack->r_wanted_output;
1006                 log.u_bbr.flex8 = rack->rc_in_persist;
1007                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1008                     &rack->rc_inp->inp_socket->so_rcv,
1009                     &rack->rc_inp->inp_socket->so_snd,
1010                     BBR_LOG_DOSEG_DONE, 0,
1011                     0, &log, false);
1012         }
1013 }
1014
1015
1016 static void
1017 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1018 {
1019         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1020                 union tcp_log_stackspecific log;
1021
1022                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1023                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1024                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1025                 log.u_bbr.flex1 = slot;
1026                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1027                 log.u_bbr.flex7 = hpts_calling;
1028                 log.u_bbr.flex8 = rack->rc_in_persist;
1029                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1030                     &rack->rc_inp->inp_socket->so_rcv,
1031                     &rack->rc_inp->inp_socket->so_snd,
1032                     BBR_LOG_JUSTRET, 0,
1033                     tlen, &log, false);
1034         }
1035 }
1036
1037 static void
1038 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1039 {
1040         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1041                 union tcp_log_stackspecific log;
1042
1043                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1044                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1045                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1046                 log.u_bbr.flex1 = line;
1047                 log.u_bbr.flex2 = 0;
1048                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1049                 log.u_bbr.flex4 = 0;
1050                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1051                 log.u_bbr.flex8 = hpts_removed;
1052                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1053                     &rack->rc_inp->inp_socket->so_rcv,
1054                     &rack->rc_inp->inp_socket->so_snd,
1055                     BBR_LOG_TIMERCANC, 0,
1056                     0, &log, false);
1057         }
1058 }
1059
1060 static void
1061 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1062 {
1063         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1064                 union tcp_log_stackspecific log;
1065
1066                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1067                 log.u_bbr.flex1 = timers;
1068                 log.u_bbr.flex2 = ret;
1069                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1070                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1071                 log.u_bbr.flex5 = cts;
1072                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1073                     &rack->rc_inp->inp_socket->so_rcv,
1074                     &rack->rc_inp->inp_socket->so_snd,
1075                     BBR_LOG_TO_PROCESS, 0,
1076                     0, &log, false);
1077         }
1078 }
1079
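/*
 * Release every counter and counter array allocated by rack_init_sysctls().
 */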
1080 static void
1081 rack_counter_destroy()
1082 {
1083         counter_u64_free(rack_badfr);
1084         counter_u64_free(rack_badfr_bytes);
1085         counter_u64_free(rack_rtm_prr_retran);
1086         counter_u64_free(rack_rtm_prr_newdata);
1087         counter_u64_free(rack_timestamp_mismatch);
1088         counter_u64_free(rack_reorder_seen);
1089         counter_u64_free(rack_tlp_tot);
1090         counter_u64_free(rack_tlp_newdata);
1091         counter_u64_free(rack_tlp_retran);
1092         counter_u64_free(rack_tlp_retran_bytes);
1093         counter_u64_free(rack_tlp_retran_fail);
1094         counter_u64_free(rack_to_tot);
1095         counter_u64_free(rack_to_arm_rack);
1096         counter_u64_free(rack_to_arm_tlp);
1097         counter_u64_free(rack_paced_segments);
1098         counter_u64_free(rack_unpaced_segments);
1099         counter_u64_free(rack_saw_enobuf);
1100         counter_u64_free(rack_saw_enetunreach);
1101         counter_u64_free(rack_to_alloc_hard);
1102         counter_u64_free(rack_to_alloc_emerg);
1103         counter_u64_free(rack_sack_proc_all);
1104         counter_u64_free(rack_sack_proc_short);
1105         counter_u64_free(rack_sack_proc_restart);
1106         counter_u64_free(rack_to_alloc);
1107         counter_u64_free(rack_find_high);
1108         counter_u64_free(rack_runt_sacks);
1109         counter_u64_free(rack_enter_tlp_calc);
1110         counter_u64_free(rack_used_tlpmethod);
1111         counter_u64_free(rack_used_tlpmethod2);
1112         counter_u64_free(rack_progress_drops);
1113         counter_u64_free(rack_input_idle_reduces);
1114         counter_u64_free(rack_tlp_does_nada);
1115         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1116         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1117 }
1118
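/*
 * Allocate a rack_sendmap tracking entry.  The UMA zone is tried first
 * (M_NOWAIT); if that fails we fall back to the small per-connection free
 * cache maintained by rack_free() and count it as an emergency allocation.
 * NULL is returned only when both sources are exhausted.
 */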
1119 static struct rack_sendmap *
1120 rack_alloc(struct tcp_rack *rack)
1121 {
1122         struct rack_sendmap *rsm;
1123
1124         counter_u64_add(rack_to_alloc, 1);
1125         rack->r_ctl.rc_num_maps_alloced++;
1126         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1127         if (rsm) {
1128                 return (rsm);
1129         }
1130         if (rack->rc_free_cnt) {
1131                 counter_u64_add(rack_to_alloc_emerg, 1);
1132                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1133                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
1134                 rack->rc_free_cnt--;
1135                 return (rsm);
1136         }
1137         return (NULL);
1138 }
1139
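/*
 * Return a rack_sendmap entry.  Any cached pointers to it (rc_tlpsend,
 * rc_next, rc_sacklast) are cleared first; the entry is then either parked
 * on the per-connection free list (up to rack_free_cache entries) or handed
 * back to the UMA zone.
 */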
1140 static void
1141 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1142 {
1143         rack->r_ctl.rc_num_maps_alloced--;
1144         if (rack->r_ctl.rc_tlpsend == rsm)
1145                 rack->r_ctl.rc_tlpsend = NULL;
1146         if (rack->r_ctl.rc_next == rsm)
1147                 rack->r_ctl.rc_next = NULL;
1148         if (rack->r_ctl.rc_sacklast == rsm)
1149                 rack->r_ctl.rc_sacklast = NULL;
1150         if (rack->rc_free_cnt < rack_free_cache) {
1151                 memset(rsm, 0, sizeof(struct rack_sendmap));
1152                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
1153                 rack->rc_free_cnt++;
1154                 return;
1155         }
1156         uma_zfree(rack_zone, rsm);
1157 }
1158
1159 /*
1160  * CC wrapper hook functions
1161  */
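/*
 * rack_ack_received() is essentially RACK's version of the base stack's
 * cc_ack_received() wrapper: it is called for each ACK to drive the
 * congestion-control module.  While in recovery the bytes credited per
 * ACK are clamped to rc_early_recovery_segs * t_maxseg, ABC byte counting
 * is applied once cwnd is above ssthresh, and the largest cwnd observed
 * is recorded in rc_rack_largest_cwnd.
 */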
1162 static void
1163 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1164     uint16_t type, int32_t recovery)
1165 {
1166 #ifdef NETFLIX_STATS
1167         int32_t gput;
1168 #endif
1169 #ifdef NETFLIX_CWV
1170         u_long old_cwnd = tp->snd_cwnd;
1171 #endif
1172
1173         INP_WLOCK_ASSERT(tp->t_inpcb);
1174         tp->ccv->nsegs = nsegs;
1175         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1176         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1177                 uint32_t max;
1178
1179                 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
1180                 if (tp->ccv->bytes_this_ack > max) {
1181                         tp->ccv->bytes_this_ack = max;
1182                 }
1183         }
1184         if (tp->snd_cwnd <= tp->snd_wnd)
1185                 tp->ccv->flags |= CCF_CWND_LIMITED;
1186         else
1187                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1188
1189         if (type == CC_ACK) {
1190 #ifdef NETFLIX_STATS
1191                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1192                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1193                 if ((tp->t_flags & TF_GPUTINPROG) &&
1194                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1195                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1196                             max(1, tcp_ts_getticks() - tp->gput_ts);
1197                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1198                             gput);
1199                         /*
1200                          * XXXLAS: This is a temporary hack, and should be
1201                          * chained off VOI_TCP_GPUT when stats(9) grows an
1202                          * API to deal with chained VOIs.
1203                          */
1204                         if (tp->t_stats_gput_prev > 0)
1205                                 stats_voi_update_abs_s32(tp->t_stats,
1206                                     VOI_TCP_GPUT_ND,
1207                                     ((gput - tp->t_stats_gput_prev) * 100) /
1208                                     tp->t_stats_gput_prev);
1209                         tp->t_flags &= ~TF_GPUTINPROG;
1210                         tp->t_stats_gput_prev = gput;
1211 #ifdef NETFLIX_CWV
1212                         if (tp->t_maxpeakrate) {
1213                                 /*
1214                                  * We update t_peakrate_thr. This gives us roughly
1215                                  * one update per round trip time.
1216                                  */
1217                                 tcp_update_peakrate_thr(tp);
1218                         }
1219 #endif
1220                 }
1221 #endif
1222                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1223                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1224                             nsegs * V_tcp_abc_l_var * tp->t_maxseg);
1225                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1226                                 tp->t_bytes_acked -= tp->snd_cwnd;
1227                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1228                         }
1229                 } else {
1230                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1231                         tp->t_bytes_acked = 0;
1232                 }
1233         }
1234         if (CC_ALGO(tp)->ack_received != NULL) {
1235                 /* XXXLAS: Find a way to live without this */
1236                 tp->ccv->curack = th->th_ack;
1237                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1238         }
1239 #ifdef NETFLIX_STATS
1240         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1241 #endif
1242         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1243                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1244         }
1245 #ifdef NETFLIX_CWV
1246         if (tp->cwv_enabled) {
1247                 /*
1248                  * Per RFC 7661: The behaviour in the non-validated phase is
1249                  * specified as:
1250                  * o  A sender determines whether to increase the cwnd
1251                  *    based upon whether it is cwnd-limited (see Section 4.5.3):
1252                  *    - A sender that is cwnd-limited MAY use the standard TCP
1253                  *      method to increase cwnd (i.e., the standard method permits
1254                  *      a TCP sender that fully utilises the cwnd to increase the
1255                  *      cwnd each time it receives an ACK).
1256                  *    - A sender that is not cwnd-limited MUST NOT increase the cwnd
1257                  *      when ACK packets are received in this phase (i.e., needs to
1258                  *      avoid growing the cwnd when it has not recently sent using the current size of cwnd).
1259                  */
1260                 if ((tp->snd_cwnd > old_cwnd) &&
1261                     (tp->cwv_cwnd_valid == 0) &&
1262                     (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
1263                         tp->snd_cwnd = old_cwnd;
1264                 }
1265                 /* Try to update pipeAck and NCWV state */
1266                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1267                     !IN_RECOVERY(tp->t_flags)) {
1268                         uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
1269
1270                         tcp_newcwv_update_pipeack(tp, data);
1271                 }
1272         }
1273         /* We enforce the max peak rate if it is set. */
1274         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1275                 tp->snd_cwnd = tp->t_peakrate_thr;
1276         }
1277 #endif
1278 }
1279
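/*
 * A partial ack arrived while we are in recovery. If PRR still has
 * send credit queued up (rc_prr_sndcnt), flag that we want to call
 * the output path so it can be used.
 */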
1280 static void
1281 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1282 {
1283         struct tcp_rack *rack;
1284
1285         rack = (struct tcp_rack *)tp->t_fb_ptr;
1286         INP_WLOCK_ASSERT(tp->t_inpcb);
1287         if (rack->r_ctl.rc_prr_sndcnt > 0)
1288                 rack->r_wanted_output++;
1289 }
1290
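/*
 * Recovery has ended. Run the congestion control module's
 * post_recovery hook, apply any configured proportional cwnd
 * reduction (or fall back to ssthresh), fold any remaining PRR
 * credit back into cwnd and exit the recovery state.
 */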
1291 static void
1292 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1293 {
1294         struct tcp_rack *rack;
1295
1296         INP_WLOCK_ASSERT(tp->t_inpcb);
1297         rack = (struct tcp_rack *)tp->t_fb_ptr;
1298         if (CC_ALGO(tp)->post_recovery != NULL) {
1299                 tp->ccv->curack = th->th_ack;
1300                 CC_ALGO(tp)->post_recovery(tp->ccv);
1301         }
1302         /*
1303          * Here we can in theory adjust cwnd to be based on the number of
1304          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1305          * based on the rack_use_proportional flag.
1306          */
1307         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1308                 int32_t reduce;
1309
1310                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1311                 if (reduce > 50) {
1312                         reduce = 50;
1313                 }
1314                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1315         } else {
1316                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1317                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1318                         tp->snd_cwnd = tp->snd_ssthresh;
1319                 }
1320         }
1321         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1322                 /* Suck the next prr cnt back into cwnd */
1323                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1324                 rack->r_ctl.rc_prr_sndcnt = 0;
1325         }
1326         EXIT_RECOVERY(tp->t_flags);
1327
1328
1329 #ifdef NETFLIX_CWV
1330         if (tp->cwv_enabled) {
1331                 if ((tp->cwv_cwnd_valid == 0) &&
1332                     (tp->snd_cwv.in_recovery))
1333                         tcp_newcwv_end_recovery(tp);
1334         }
1335 #endif
1336 }
1337
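/*
 * A congestion signal has arrived: dup-ack threshold (CC_NDUPACK),
 * ECN mark (CC_ECN), retransmission timeout (CC_RTO) or a bad
 * retransmit discovery (CC_RTO_ERR). Set up the per-signal state
 * and then pass the event on to the congestion control module.
 */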
1338 static void
1339 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1340 {
1341         struct tcp_rack *rack;
1342
1343         INP_WLOCK_ASSERT(tp->t_inpcb);
1344
1345         rack = (struct tcp_rack *)tp->t_fb_ptr;
1346         switch (type) {
1347         case CC_NDUPACK:
1348 /*              rack->r_ctl.rc_ssthresh_set = 1;*/
1349                 if (!IN_FASTRECOVERY(tp->t_flags)) {
1350                         rack->r_ctl.rc_tlp_rtx_out = 0;
1351                         rack->r_ctl.rc_prr_delivered = 0;
1352                         rack->r_ctl.rc_prr_out = 0;
1353                         rack->r_ctl.rc_loss_count = 0;
1354                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
1355                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1356                         tp->snd_recover = tp->snd_max;
1357                         if (tp->t_flags & TF_ECN_PERMIT)
1358                                 tp->t_flags |= TF_ECN_SND_CWR;
1359                 }
1360                 break;
1361         case CC_ECN:
1362                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1363                         TCPSTAT_INC(tcps_ecn_rcwnd);
1364                         tp->snd_recover = tp->snd_max;
1365                         if (tp->t_flags & TF_ECN_PERMIT)
1366                                 tp->t_flags |= TF_ECN_SND_CWR;
1367                 }
1368                 break;
1369         case CC_RTO:
1370                 tp->t_dupacks = 0;
1371                 tp->t_bytes_acked = 0;
1372                 EXIT_RECOVERY(tp->t_flags);
1373                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1374                     tp->t_maxseg) * tp->t_maxseg;
1375                 tp->snd_cwnd = tp->t_maxseg;
1376                 break;
1377         case CC_RTO_ERR:
1378                 TCPSTAT_INC(tcps_sndrexmitbad);
1379                 /* RTO was unnecessary, so reset everything. */
1380                 tp->snd_cwnd = tp->snd_cwnd_prev;
1381                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1382                 tp->snd_recover = tp->snd_recover_prev;
1383                 if (tp->t_flags & TF_WASFRECOVERY)
1384                         ENTER_FASTRECOVERY(tp->t_flags);
1385                 if (tp->t_flags & TF_WASCRECOVERY)
1386                         ENTER_CONGRECOVERY(tp->t_flags);
1387                 tp->snd_nxt = tp->snd_max;
1388                 tp->t_badrxtwin = 0;
1389                 break;
1390         }
1391
1392         if (CC_ALGO(tp)->cong_signal != NULL) {
1393                 if (th != NULL)
1394                         tp->ccv->curack = th->th_ack;
1395                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1396         }
1397 #ifdef NETFLIX_CWV
1398         if (tp->cwv_enabled) {
1399                 if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
1400                         tcp_newcwv_enter_recovery(tp);
1401                 }
1402                 if (type == CC_RTO) {
1403                         tcp_newcwv_reset(tp);
1404                 }
1405         }
1406 #endif
1407 }
1408
1409
1410
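/*
 * The connection is restarting output after an idle period. Let the
 * congestion control module run its after_idle hook, then make sure
 * cwnd is at least an initial window; optionally pull the largest
 * recorded cwnd down to that value as well (reduce_largest).
 */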
1411 static inline void
1412 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
1413 {
1414         uint32_t i_cwnd;
1415
1416         INP_WLOCK_ASSERT(tp->t_inpcb);
1417
1418 #ifdef NETFLIX_STATS
1419         TCPSTAT_INC(tcps_idle_restarts);
1420         if (tp->t_state == TCPS_ESTABLISHED)
1421                 TCPSTAT_INC(tcps_idle_estrestarts);
1422 #endif
1423         if (CC_ALGO(tp)->after_idle != NULL)
1424                 CC_ALGO(tp)->after_idle(tp->ccv);
1425
1426         if (tp->snd_cwnd == 1)
1427                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
1428         else if (V_tcp_initcwnd_segments)
1429                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
1430                     max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
1431         else if (V_tcp_do_rfc3390)
1432                 i_cwnd = min(4 * tp->t_maxseg,
1433                     max(2 * tp->t_maxseg, 4380));
1434         else {
1435                 /* Per RFC5681 Section 3.1 */
1436                 if (tp->t_maxseg > 2190)
1437                         i_cwnd = 2 * tp->t_maxseg;
1438                 else if (tp->t_maxseg > 1095)
1439                         i_cwnd = 3 * tp->t_maxseg;
1440                 else
1441                         i_cwnd = 4 * tp->t_maxseg;
1442         }
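        /*
         * E.g. with a 1448 byte maxseg and tcp.initcwnd_segments at its
         * usual default of 10, the second branch above would give
         * i_cwnd = min(10 * 1448, max(2 * 1448, 14600)) = 14480 bytes.
         */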
1443         if (reduce_largest) {
1444                 /*
1445                  * Do we reduce the largest cwnd to make
1446                  * rack play nice on restart, hptsi-wise?
1447                  */
1448                 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
1449                         ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
1450         }
1451         /*
1452          * Being idle is no different than the initial window. If the cc
1453          * clamps it down below the initial window, raise it to the
1454          * initial window.
1455          */
1456         if (tp->snd_cwnd < i_cwnd) {
1457                 tp->snd_cwnd = i_cwnd;
1458         }
1459 }
1460
1461
1462 /*
1463  * Indicate whether this ack should be delayed.  We can delay the ack if
1464  * the following conditions are met:
1465  *      - There is no delayed ack timer in progress.
1466  *      - Our last ack wasn't a 0-sized window. We never want to delay
1467  *        the ack that opens up a 0-sized window.
1468  *      - LRO wasn't used for this segment. We make sure by checking that the
1469  *        segment size is not larger than the MSS.
1470  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1471  *        connection.
1472  */
1473 #define DELAY_ACK(tp, tlen)                      \
1474         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1475         ((tp->t_flags & TF_DELACK) == 0) &&      \
1476         (tlen <= tp->t_maxseg) &&                \
1477         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
1478
1479 static inline void
1480 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
1481 {
1482         int32_t win;
1483
1484         /*
1485          * Calculate amount of space in receive window, and then do TCP
1486          * input processing. Receive window is amount of space in rcv queue,
1487          * but not less than advertised window.
1488          */
1489         win = sbspace(&so->so_rcv);
1490         if (win < 0)
1491                 win = 0;
1492         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1493 }
1494
1495 static void
1496 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
1497 {
1498         /*
1499          * Drop space held by incoming segment and return.
1500          */
1501         if (tp != NULL)
1502                 INP_WUNLOCK(tp->t_inpcb);
1503         if (m)
1504                 m_freem(m);
1505 }
1506
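/*
 * Drop the segment, answering it with a RST where appropriate, and
 * release the tcb lock if we still hold one.
 */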
1507 static void
1508 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
1509     int32_t rstreason, int32_t tlen)
1510 {
1511         if (tp != NULL) {
1512                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1513                 INP_WUNLOCK(tp->t_inpcb);
1514         } else
1515                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1516 }
1517
1518 /*
1519  * The value in ret_val informs the caller
1520  * if we dropped the tcb (and lock) or not.
1521  * 1 = we dropped it, 0 = the TCB is still locked
1522  * and valid.
1523  */
1524 static void
1525 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
1526 {
1527         /*
1528          * Generate an ACK dropping incoming segment if it occupies sequence
1529          * space, where the ACK reflects our state.
1530          *
1531          * We can now skip the test for the RST flag since all paths to this
1532          * code happen after packets containing RST have been dropped.
1533          *
1534          * In the SYN-RECEIVED state, don't send an ACK unless the segment
1535          * we received passes the SYN-RECEIVED ACK test. If it fails send a
1536          * RST.  This breaks the loop in the "LAND" DoS attack, and also
1537          * prevents an ACK storm between two listening ports that have been
1538          * sent forged SYN segments, each with the source address of the
1539          * other.
1540          */
1541         struct tcp_rack *rack;
1542
1543         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1544             (SEQ_GT(tp->snd_una, th->th_ack) ||
1545             SEQ_GT(th->th_ack, tp->snd_max))) {
1546                 *ret_val = 1;
1547                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
1548                 return;
1549         } else
1550                 *ret_val = 0;
1551         rack = (struct tcp_rack *)tp->t_fb_ptr;
1552         rack->r_wanted_output++;
1553         tp->t_flags |= TF_ACKNOW;
1554         if (m)
1555                 m_freem(m);
1556 }
1557
1558
1559 static int
1560 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
1561 {
1562         /*
1563          * RFC5961 Section 3.2
1564          *
1565          * - RST drops connection only if SEG.SEQ == RCV.NXT.
1566          * - If RST is in window, we send challenge ACK.
1567          *
1568          * Note: to take into account delayed ACKs, we should test against
1569          * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
1570          * of closed window, not covered by the RFC.
1571          */
1572         int dropped = 0;
1573
1574         if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
1575             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1576             (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1577
1578                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1579                 KASSERT(tp->t_state != TCPS_SYN_SENT,
1580                     ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
1581                     __func__, th, tp));
1582
1583                 if (V_tcp_insecure_rst ||
1584                     (tp->last_ack_sent == th->th_seq) ||
1585                     (tp->rcv_nxt == th->th_seq) ||
1586                     ((tp->last_ack_sent - 1) == th->th_seq)) {
1587                         TCPSTAT_INC(tcps_drops);
1588                         /* Drop the connection. */
1589                         switch (tp->t_state) {
1590                         case TCPS_SYN_RECEIVED:
1591                                 so->so_error = ECONNREFUSED;
1592                                 goto close;
1593                         case TCPS_ESTABLISHED:
1594                         case TCPS_FIN_WAIT_1:
1595                         case TCPS_FIN_WAIT_2:
1596                         case TCPS_CLOSE_WAIT:
1597                         case TCPS_CLOSING:
1598                         case TCPS_LAST_ACK:
1599                                 so->so_error = ECONNRESET;
1600                 close:
1601                                 tcp_state_change(tp, TCPS_CLOSED);
1602                                 /* FALLTHROUGH */
1603                         default:
1604                                 tp = tcp_close(tp);
1605                         }
1606                         dropped = 1;
1607                         rack_do_drop(m, tp);
1608                 } else {
1609                         TCPSTAT_INC(tcps_badrst);
1610                         /* Send challenge ACK. */
1611                         tcp_respond(tp, mtod(m, void *), th, m,
1612                             tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1613                         tp->last_ack_sent = tp->rcv_nxt;
1614                 }
1615         } else {
1616                 m_freem(m);
1617         }
1618         return (dropped);
1619 }
1620
1621 /*
1622  * The value in ret_val informs the caller
1623  * if we dropped the tcb (and lock) or not.
1624  * 1 = we dropped it, 0 = the TCB is still locked
1625  * and valid.
1626  */
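/*
 * Handle a SYN seen on an existing connection per RFC 5961: send a
 * challenge ACK, or reset the connection when insecure SYN handling
 * is enabled and the sequence number falls inside the window.
 */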
1627 static void
1628 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
1629 {
1630         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1631
1632         TCPSTAT_INC(tcps_badsyn);
1633         if (V_tcp_insecure_syn &&
1634             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1635             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1636                 tp = tcp_drop(tp, ECONNRESET);
1637                 *ret_val = 1;
1638                 rack_do_drop(m, tp);
1639         } else {
1640                 /* Send challenge ACK. */
1641                 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
1642                     tp->snd_nxt, TH_ACK);
1643                 tp->last_ack_sent = tp->rcv_nxt;
1644                 m = NULL;
1645                 *ret_val = 0;
1646                 rack_do_drop(m, NULL);
1647         }
1648 }
1649
1650 /*
1651  * rack_ts_check returns 1 when you should not proceed. It places
1652  * in ret_val the value (1/0) that the caller should return. A 1 indicates
1653  * that the TCB is unlocked and probably dropped. A 0 indicates the
1654  * TCB is still valid and locked.
1655  */
1656 static int
1657 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
1658 {
1659
1660         /* Check to see if ts_recent is over 24 days old.  */
1661         if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1662                 /*
1663                  * Invalidate ts_recent.  If this segment updates ts_recent,
1664                  * the age will be reset later and ts_recent will get a
1665                  * valid value.  If it does not, setting ts_recent to zero
1666                  * will at least satisfy the requirement that zero be placed
1667                  * in the timestamp echo reply when ts_recent isn't valid.
1668                  * The age isn't reset until we get a valid ts_recent
1669                  * because we don't want out-of-order segments to be dropped
1670                  * when ts_recent is old.
1671                  */
1672                 tp->ts_recent = 0;
1673         } else {
1674                 TCPSTAT_INC(tcps_rcvduppack);
1675                 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1676                 TCPSTAT_INC(tcps_pawsdrop);
1677                 *ret_val = 0;
1678                 if (tlen) {
1679                         rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1680                 } else {
1681                         rack_do_drop(m, NULL);
1682                 }
1683                 return (1);
1684         }
1685         return (0);
1686 }
1687
1688 /*
1689  * rack_drop_checks returns 1 when you should not proceed. It places
1690  * in ret_val the value (1/0) that the caller should return. A 1 indicates
1691  * that the TCB is unlocked and probably dropped. A 0 indicates the
1692  * TCB is still valid and locked.
1693  */
1694 static int
1695 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
1696 {
1697         int32_t todrop;
1698         int32_t thflags;
1699         int32_t tlen;
1700
1701         thflags = *thf;
1702         tlen = *tlenp;
1703         todrop = tp->rcv_nxt - th->th_seq;
1704         if (todrop > 0) {
1705                 if (thflags & TH_SYN) {
1706                         thflags &= ~TH_SYN;
1707                         th->th_seq++;
1708                         if (th->th_urp > 1)
1709                                 th->th_urp--;
1710                         else
1711                                 thflags &= ~TH_URG;
1712                         todrop--;
1713                 }
1714                 /*
1715                  * Following if statement from Stevens, vol. 2, p. 960.
1716                  */
1717                 if (todrop > tlen
1718                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1719                         /*
1720                          * Any valid FIN must be to the left of the window.
1721                          * At this point the FIN must be a duplicate or out
1722                          * of sequence; drop it.
1723                          */
1724                         thflags &= ~TH_FIN;
1725                         /*
1726                          * Send an ACK to resynchronize and drop any data.
1727                          * But keep on processing for RST or ACK.
1728                          */
1729                         tp->t_flags |= TF_ACKNOW;
1730                         todrop = tlen;
1731                         TCPSTAT_INC(tcps_rcvduppack);
1732                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1733                 } else {
1734                         TCPSTAT_INC(tcps_rcvpartduppack);
1735                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1736                 }
1737                 *drop_hdrlen += todrop; /* drop from the top afterwards */
1738                 th->th_seq += todrop;
1739                 tlen -= todrop;
1740                 if (th->th_urp > todrop)
1741                         th->th_urp -= todrop;
1742                 else {
1743                         thflags &= ~TH_URG;
1744                         th->th_urp = 0;
1745                 }
1746         }
1747         /*
1748          * If segment ends after window, drop trailing data (and PUSH and
1749          * FIN); if nothing left, just ACK.
1750          */
1751         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1752         if (todrop > 0) {
1753                 TCPSTAT_INC(tcps_rcvpackafterwin);
1754                 if (todrop >= tlen) {
1755                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1756                         /*
1757                          * If window is closed can only take segments at
1758                          * window edge, and have to drop data and PUSH from
1759                          * incoming segments.  Continue processing, but
1760                          * remember to ack.  Otherwise, drop segment and
1761                          * ack.
1762                          */
1763                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1764                                 tp->t_flags |= TF_ACKNOW;
1765                                 TCPSTAT_INC(tcps_rcvwinprobe);
1766                         } else {
1767                                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1768                                 return (1);
1769                         }
1770                 } else
1771                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1772                 m_adj(m, -todrop);
1773                 tlen -= todrop;
1774                 thflags &= ~(TH_PUSH | TH_FIN);
1775         }
1776         *thf = thflags;
1777         *tlenp = tlen;
1778         return (0);
1779 }
1780
1781 static struct rack_sendmap *
1782 rack_find_lowest_rsm(struct tcp_rack *rack)
1783 {
1784         struct rack_sendmap *rsm;
1785
1786         /*
1787          * Walk the time-order transmitted list looking for an rsm that is
1788          * not acked. This will be the one that was sent the longest time
1789          * ago that is still outstanding.
1790          */
1791         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1792                 if (rsm->r_flags & RACK_ACKED) {
1793                         continue;
1794                 }
1795                 goto finish;
1796         }
1797 finish:
1798         return (rsm);
1799 }
1800
1801 static struct rack_sendmap *
1802 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1803 {
1804         struct rack_sendmap *prsm;
1805
1806         /*
1807          * Walk the sequence order list backward until we arrive at
1808          * the highest seq not acked. In theory when this is called it
1809          * should be the last segment (which it was not).
1810          */
1811         counter_u64_add(rack_find_high, 1);
1812         prsm = rsm;
1813         TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1814                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1815                         continue;
1816                 }
1817                 return (prsm);
1818         }
1819         return (NULL);
1820 }
1821
1822
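/*
 * Compute the RACK reordering/retransmit threshold: srtt plus the
 * configured per-packet delay, plus an extra reordering allowance
 * (srtt >> reorder_shift) while reordering is considered present.
 * The result is never allowed to exceed the RTO or rack_rto_max.
 */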
1823 static uint32_t
1824 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1825 {
1826         int32_t lro;
1827         uint32_t thresh;
1828
1829         /*
1830          * lro is the flag we use to determine if we have seen reordering.
1831          * If it gets set we have seen reordering. The reorder logic
1832          * works in one of two ways:
1833          *
1834          * If reorder-fade is configured, then we track the last time we saw
1835          * re-ordering occur. If we reach the point where enough time has
1836          * passed we no longer consider reordering to be occurring.
1837          *
1838          * Or if reorder-fade is 0, then once we see reordering we consider
1839          * the connection to always be subject to reordering and just set lro
1840          * to 1.
1841          *
1842          * In the end if lro is non-zero we add the extra time for
1843          * reordering in.
1844          */
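        /*
         * Rough example (all values in ms): with srtt = 40,
         * rc_pkt_delay = 1 and reordering seen with a reorder_shift
         * of 2, thresh = 40 + 1 + (40 >> 2) = 51, subject to the
         * RTO and rack_rto_max clamps below.
         */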
1845         if (srtt == 0)
1846                 srtt = 1;
1847         if (rack->r_ctl.rc_reorder_ts) {
1848                 if (rack->r_ctl.rc_reorder_fade) {
1849                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1850                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1851                                 if (lro == 0) {
1852                                         /*
1853                                          * No time has passed since the last
1854                                          * reorder, mark it as reordering.
1855                                          */
1856                                         lro = 1;
1857                                 }
1858                         } else {
1859                                 /* Negative time? */
1860                                 lro = 0;
1861                         }
1862                         if (lro > rack->r_ctl.rc_reorder_fade) {
1863                                 /* Turn off reordering seen too */
1864                                 rack->r_ctl.rc_reorder_ts = 0;
1865                                 lro = 0;
1866                         }
1867                 } else {
1868                         /* Reordering does not fade */
1869                         lro = 1;
1870                 }
1871         } else {
1872                 lro = 0;
1873         }
1874         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1875         if (lro) {
1876                 /* It must be set, if not you get 1/4 rtt */
1877                 if (rack->r_ctl.rc_reorder_shift)
1878                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1879                 else
1880                         thresh += (srtt >> 2);
1881         } else {
1882                 thresh += 1;
1883         }
1884         /* We don't let the rack timeout be above an RTO */
1885         
1886         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1887                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1888         }
1889         /* And we don't want it above the RTO max either */
1890         if (thresh > rack_rto_max) {
1891                 thresh = rack_rto_max;
1892         }
1893         return (thresh);
1894 }
1895
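/*
 * Compute the Tail Loss Probe timeout: nominally 2 * srtt (or
 * srtt + srtt / rc_tlp_threshold when that is configured), widened
 * for delayed-ack and inter-packet gaps depending on the selected
 * TLP method, and bounded by the RTO, rack_rto_max and rack_tlp_min.
 */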
1896 static uint32_t
1897 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1898                      struct rack_sendmap *rsm, uint32_t srtt)
1899 {
1900         struct rack_sendmap *prsm;
1901         uint32_t thresh, len;
1902         int maxseg;
1903         
1904         if (srtt == 0)
1905                 srtt = 1;
1906         if (rack->r_ctl.rc_tlp_threshold)
1907                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1908         else
1909                 thresh = (srtt * 2);
1910         
1911         /* Get the previously sent packet, if any */
1912         maxseg = tcp_maxseg(tp);
1913         counter_u64_add(rack_enter_tlp_calc, 1);
1914         len = rsm->r_end - rsm->r_start;
1915         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1916                 /* Exactly like the ID */
1917                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1918                         uint32_t alt_thresh;
1919                         /*
1920                          * Compensate for delayed-ack with the d-ack time.
1921                          */
1922                         counter_u64_add(rack_used_tlpmethod, 1);
1923                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1924                         if (alt_thresh > thresh)
1925                                 thresh = alt_thresh;
1926                 }
1927         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1928                 /* 2.1 behavior */
1929                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1930                 if (prsm && (len <= maxseg)) {
1931                         /*
1932                          * Two packets outstanding, thresh should be (2*srtt) +
1933                          * possible inter-packet delay (if any).
1934                          */
1935                         uint32_t inter_gap = 0;
1936                         int idx, nidx;
1937                         
1938                         counter_u64_add(rack_used_tlpmethod, 1);
1939                         idx = rsm->r_rtr_cnt - 1;
1940                         nidx = prsm->r_rtr_cnt - 1;
1941                         if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
1942                                 /* Yes it was sent later (or at the same time) */
1943                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
1944                         }
1945                         thresh += inter_gap;
1946                 } else  if (len <= maxseg) {
1947                         /*
1948                          * Possibly compensate for delayed-ack.
1949                          */
1950                         uint32_t alt_thresh;
1951                         
1952                         counter_u64_add(rack_used_tlpmethod2, 1);
1953                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1954                         if (alt_thresh > thresh)
1955                                 thresh = alt_thresh;
1956                 }
1957         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
1958                 /* 2.2 behavior */
1959                 if (len <= maxseg) {
1960                         uint32_t alt_thresh;
1961                         /*
1962                          * Compensate for delayed-ack with the d-ack time.
1963                          */
1964                         counter_u64_add(rack_used_tlpmethod, 1);
1965                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1966                         if (alt_thresh > thresh)
1967                                 thresh = alt_thresh;
1968                 }
1969         }
1970         /* Not above an RTO */
1971         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
1972                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
1973         }
1974         /* Not above an RTO max */
1975         if (thresh > rack_rto_max) {
1976                 thresh = rack_rto_max;
1977         }
1978         /* Apply user supplied min TLP */
1979         if (thresh < rack_tlp_min) {
1980                 thresh = rack_tlp_min;
1981         }
1982         return (thresh);
1983 }
1984
1985 static struct rack_sendmap *
1986 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
1987 {
1988         /*
1989          * Check to see whether we need to fall into recovery. We will
1990          * need to do so if our oldest transmit is past the time we should
1991          * have had an ack.
1992          */
1993         struct tcp_rack *rack;
1994         struct rack_sendmap *rsm;
1995         int32_t idx;
1996         uint32_t srtt_cur, srtt, thresh;
1997
1998         rack = (struct tcp_rack *)tp->t_fb_ptr;
1999         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
2000                 return (NULL);
2001         }
2002         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
2003         srtt = TICKS_2_MSEC(srtt_cur);
2004         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
2005                 srtt = rack->rc_rack_rtt;
2006
2007         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2008         if (rsm == NULL)
2009                 return (NULL);
2010
2011         if (rsm->r_flags & RACK_ACKED) {
2012                 rsm = rack_find_lowest_rsm(rack);
2013                 if (rsm == NULL)
2014                         return (NULL);
2015         }
2016         idx = rsm->r_rtr_cnt - 1;
2017         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2018         if (tsused < rsm->r_tim_lastsent[idx]) {
2019                 return (NULL);
2020         }
2021         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2022                 return (NULL);
2023         }
2024         /* Ok if we reach here we are over-due */
2025         rack->r_ctl.rc_rsm_start = rsm->r_start;
2026         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2027         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2028         rack_cong_signal(tp, NULL, CC_NDUPACK);
2029         return (rsm);
2030 }
2031
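/*
 * Return the persist timer value: the smoothed RTT plus variance,
 * scaled by the current backoff shift and clamped to the range
 * [tcp_persmin, tcp_persmax]. The backoff shift is advanced here.
 */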
2032 static uint32_t
2033 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2034 {
2035         int32_t t;
2036         int32_t tt;
2037         uint32_t ret_val;
2038
2039         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2040         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2041             tcp_persmin, tcp_persmax);
2042         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2043                 tp->t_rxtshift++;
2044         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2045         ret_val = (uint32_t)tt;
2046         return (ret_val);
2047 }
2048
2049 static uint32_t
2050 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2051 {
2052         /*
2053          * Start the FR timer; we do this based on getting the first one in
2054          * the rc_tmap. Note that if it is NULL we must stop the timer. In all
2055          * events we need to stop the running timer (if it is running) before
2056          * starting the new one.
2057          */
2058         uint32_t thresh, exp, to, srtt, time_since_sent;
2059         uint32_t srtt_cur;
2060         int32_t idx;
2061         int32_t is_tlp_timer = 0;
2062         struct rack_sendmap *rsm;
2063         
2064         if (rack->t_timers_stopped) {
2065                 /* All timers have been stopped none are to run */
2066                 return (0);
2067         }
2068         if (rack->rc_in_persist) {
2069                 /* We can't start any timer in persists */
2070                 return (rack_get_persists_timer_val(tp, rack));
2071         }
2072         if (tp->t_state < TCPS_ESTABLISHED)
2073                 goto activate_rxt;
2074         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2075         if (rsm == NULL) {
2076                 /* Nothing on the send map */
2077 activate_rxt:
2078                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2079                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2080                         to = TICKS_2_MSEC(tp->t_rxtcur);
2081                         if (to == 0)
2082                                 to = 1;
2083                         return (to);
2084                 }
2085                 return (0);
2086         }
2087         if (rsm->r_flags & RACK_ACKED) {
2088                 rsm = rack_find_lowest_rsm(rack);
2089                 if (rsm == NULL) {
2090                         /* No lowest? */
2091                         goto activate_rxt;
2092                 }
2093         }
2094         /* Convert from ms to usecs */
2095         if (rsm->r_flags & RACK_SACK_PASSED) {
2096                 if ((tp->t_flags & TF_SENTFIN) &&
2097                     ((tp->snd_max - tp->snd_una) == 1) &&
2098                     (rsm->r_flags & RACK_HAS_FIN)) {
2099                         /*
2100                          * We don't start a rack timer if all we have is a
2101                          * FIN outstanding.
2102                          */
2103                         goto activate_rxt;
2104                 }
2105                 if (tp->t_srtt) {
2106                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2107                         srtt = TICKS_2_MSEC(srtt_cur);
2108                 } else
2109                         srtt = RACK_INITIAL_RTO;
2110
2111                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2112                 idx = rsm->r_rtr_cnt - 1;
2113                 exp = rsm->r_tim_lastsent[idx] + thresh;
2114                 if (SEQ_GEQ(exp, cts)) {
2115                         to = exp - cts;
2116                         if (to < rack->r_ctl.rc_min_to) {
2117                                 to = rack->r_ctl.rc_min_to;
2118                         }
2119                 } else {
2120                         to = rack->r_ctl.rc_min_to;
2121                 }
2122         } else {
2123                 /* Ok we need to do a TLP not RACK */
2124                 if ((rack->rc_tlp_in_progress != 0) ||
2125                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2126                         /*
2127                          * The previous send was a TLP or a tlp_rtx is in
2128                          * progress.
2129                          */
2130                         goto activate_rxt;
2131                 }
2132                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2133                 if (rsm == NULL) {
2134                         /* We found no rsm to TLP with. */
2135                         goto activate_rxt;
2136                 }
2137                 if (rsm->r_flags & RACK_HAS_FIN) {
2138                         /* If it's a FIN we don't do TLP */
2139                         rsm = NULL;
2140                         goto activate_rxt;
2141                 }
2142                 idx = rsm->r_rtr_cnt - 1;
2143                 if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx])) 
2144                         time_since_sent = cts - rsm->r_tim_lastsent[idx];
2145                 else
2146                         time_since_sent = 0;
2147                 is_tlp_timer = 1;
2148                 if (tp->t_srtt) {
2149                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2150                         srtt = TICKS_2_MSEC(srtt_cur);
2151                 } else
2152                         srtt = RACK_INITIAL_RTO;
2153                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2154                 if (thresh > time_since_sent)
2155                         to = thresh - time_since_sent;
2156                 else
2157                         to = rack->r_ctl.rc_min_to;
2158                 if (to > TCPTV_REXMTMAX) {
2159                         /*
2160                          * If the TLP time works out to be larger than the max
2161                          * RTO, let's not do TLP.. just RTO.
2162                          */
2163                         goto activate_rxt;
2164                 }
2165                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2166                         /*
2167                          * The tail is no longer the last one I did a probe
2168                          * on
2169                          */
2170                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2171                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2172                 }
2173         }
2174         if (is_tlp_timer == 0) {
2175                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2176         } else {
2177                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2178                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2179                         /*
2180                          * We have exceeded how many times we can retransmit under
2181                          * the current TLP timer, switch to the RTO timer.
2182                          */
2183                         goto activate_rxt;
2184                 } else {
2185                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2186                 }
2187         }
2188         if (to == 0)
2189                 to = 1;
2190         return (to);
2191 }
2192
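/*
 * Enter persist mode: cancel any pending rack timer, remember when
 * we went idle and reset the retransmit backoff. We only do this
 * when a FIN has been sent or there is unsent data beyond what is
 * already outstanding.
 */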
2193 static void
2194 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2195 {
2196         if (rack->rc_in_persist == 0) {
2197                 if (((tp->t_flags & TF_SENTFIN) == 0) &&
2198                     (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
2199                         /* Must have more data to send to enter persist */
2200                         return;
2201                 rack->r_ctl.rc_went_idle_time = cts;
2202                 rack_timer_cancel(tp, rack, cts, __LINE__);
2203                 tp->t_rxtshift = 0;
2204                 rack->rc_in_persist = 1;
2205         }
2206 }
2207
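/*
 * Leave persist mode: pull ourselves off the hpts wheel if we are
 * queued there, clear the persist bookkeeping and the forced-data
 * flag, and reset the retransmit backoff.
 */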
2208 static void
2209 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2210 {
2211         if (rack->rc_inp->inp_in_hpts)  {
2212                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2213                 rack->r_ctl.rc_hpts_flags  = 0;
2214         }
2215         rack->rc_in_persist = 0;
2216         rack->r_ctl.rc_went_idle_time = 0;
2217         tp->t_flags &= ~TF_FORCEDATA;
2218         tp->t_rxtshift = 0;
2219 }
2220
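/*
 * (Re)arm the hpts for this connection: compute the pacing slot (if
 * we are pacing output) and the next timer to expire (rack, tlp,
 * persist, rxt, delayed-ack or keep-alive), then insert ourselves
 * on the hpts wheel for whichever of the two comes due first.
 */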
2221 static void
2222 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
2223     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
2224 {
2225         struct inpcb *inp;
2226         uint32_t delayed_ack = 0;
2227         uint32_t hpts_timeout;
2228         uint8_t stopped;
2229         uint32_t left = 0;
2230
2231         inp = tp->t_inpcb;
2232         if (inp->inp_in_hpts) {
2233                 /* A previous call is already set up */
2234                 return;
2235         }
2236         if (tp->t_state == TCPS_CLOSED) {
2237                 return;
2238         }
2239         stopped = rack->rc_tmr_stopped;
2240         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2241                 left = rack->r_ctl.rc_timer_exp - cts;
2242         }
2243         rack->r_ctl.rc_timer_exp = 0;
2244         if (rack->rc_inp->inp_in_hpts == 0) {
2245                 rack->r_ctl.rc_hpts_flags = 0;
2246         } 
2247         if (slot) {
2248                 /* We are hptsi too */
2249                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2250         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2251                 /*
2252                  * We are still left on the hpts; when the timeout
2253                  * fires it will be for output.
2254                  */
2255                 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
2256                         slot = cts - rack->r_ctl.rc_last_output_to;
2257                 else
2258                         slot = 1;
2259         }
2260         if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2261                 /* No send window.. we must enter persist */
2262                 rack_enter_persist(tp, rack, cts);
2263         } else if ((frm_out_sbavail &&
2264                     (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
2265                     (tp->snd_wnd < tp->t_maxseg)) &&
2266             TCPS_HAVEESTABLISHED(tp->t_state)) {
2267                 /*
2268                  * If we have no window or we can't send a segment (and have
2269                  * data to send.. we cheat here and frm_out_sbavail is
2270                  * passed in with the sbavail(sb) only from bbr_output) and
2271                  * we are established, then we must enter persist (if not
2272                  * already in persist).
2273                  */
2274                 rack_enter_persist(tp, rack, cts);
2275         }
2276         hpts_timeout = rack_timer_start(tp, rack, cts);
2277         if (tp->t_flags & TF_DELACK) {
2278                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2279                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2280         }
2281         if (delayed_ack && ((hpts_timeout == 0) ||
2282                             (delayed_ack < hpts_timeout)))
2283                 hpts_timeout = delayed_ack;
2284         else 
2285                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2286         /*
2287          * If no timers are going to run and we will fall off the hptsi
2288          * wheel, we resort to a keep-alive timer if it is configured.
2289          */
2290         if ((hpts_timeout == 0) &&
2291             (slot == 0)) {
2292                 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2293                     (tp->t_state <= TCPS_CLOSING)) {
2294                         /*
2295                          * Ok we have no timer (persists, rack, tlp, rxt  or
2296                          * del-ack), we don't have segments being paced. So
2297                          * all that is left is the keepalive timer.
2298                          */
2299                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2300                                 /* Get the established keep-alive time */
2301                                 hpts_timeout = TP_KEEPIDLE(tp);
2302                         } else {
2303                                 /* Get the initial setup keep-alive time */
2304                                 hpts_timeout = TP_KEEPINIT(tp);
2305                         }
2306                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2307                 }
2308         }
2309         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2310             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2311                 /*
2312                  * RACK, TLP, persists and RXT timers all are restartable
2313                  * based on actions input .. i.e. we received a packet (ack
2314                  * or sack) and that changes things (rw, or snd_una etc).
2315                  * Thus we can restart them with a new value. For
2316                  * keep-alive and delayed_ack we keep track of what was left
2317                  * and restart the timer with a smaller value.
2318                  */
2319                 if (left < hpts_timeout)
2320                         hpts_timeout = left;
2321         }
2322         if (hpts_timeout) {
2323                 /*
2324                  * Hack alert for now we can't time-out over 2,147,483
2325                  * seconds (a bit more than 596 hours), which is probably ok
2326                  * :).
2327                  */
2328                 if (hpts_timeout > 0x7ffffffe)
2329                         hpts_timeout = 0x7ffffffe;
2330                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2331         }
2332         if (slot) {
2333                 rack->r_ctl.rc_last_output_to = cts + slot;
2334                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2335                         if (rack->rc_inp->inp_in_hpts == 0)
2336                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2337                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2338                 } else {
2339                         /*
2340                          * Arrange for the hpts to kick back in after the
2341                          * t-o if the t-o does not cause a send.
2342                          */
2343                         if (rack->rc_inp->inp_in_hpts == 0)
2344                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2345                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2346                 }
2347         } else if (hpts_timeout) {
2348                 if (rack->rc_inp->inp_in_hpts == 0)
2349                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2350                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2351         } else {
2352                 /* No timer starting */
2353 #ifdef INVARIANTS
2354                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2355                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2356                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2357                 }
2358 #endif
2359         }
2360         rack->rc_tmr_stopped = 0;
2361         if (slot)
2362                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2363 }
2364
2365 /*
2366  * RACK Timer, here we simply do logging and housekeeping.
2367  * The normal rack_output() function will call the
2368  * appropriate thing to check if we need to do a RACK retransmit.
2369  * We return 1 (don't proceed with rack_output) only
2370  * when all timers have been stopped (destroyed PCB?).
2371  */
2372 static int
2373 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2374 {
2375         /*
2376          * This timer simply provides an internal trigger to send out data.
2377          * The check_recovery_mode call will see if there are needed
2378          * retransmissions, if so we will enter fast-recovery. The output
2379          * call may or may not do the same thing depending on sysctl
2380          * settings.
2381          */
2382         struct rack_sendmap *rsm;
2383         int32_t recovery;
2384
2385         if (tp->t_timers->tt_flags & TT_STOPPED) {
2386                 return (1);
2387         }
2388         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2389                 /* It's not time yet */
2390                 return (0);
2391         }
2392         rack_log_to_event(rack, RACK_TO_FRM_RACK);
2393         recovery = IN_RECOVERY(tp->t_flags);
2394         counter_u64_add(rack_to_tot, 1);
2395         if (rack->r_state && (rack->r_state != tp->t_state))
2396                 rack_set_state(tp, rack);
2397         rsm = rack_check_recovery_mode(tp, cts);
2398         if (rsm) {
2399                 uint32_t rtt;
2400
2401                 rtt = rack->rc_rack_rtt;
2402                 if (rtt == 0)
2403                         rtt = 1;
2404                 if ((recovery == 0) &&
2405                     (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
2406                         /*
2407                          * The rack-timeout that enters us into recovery
2408                          * will force out one MSS and set us up so that we
2409                          * can do one more send in 2*rtt (transitioning the
2410                          * rack timeout into a rack-tlp).
2411                          */
2412                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2413                 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
2414                     ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
2415                         /*
2416                          * When a rack timer goes, we have to send at 
2417                          * least one segment. They will be paced a min of 1ms
2418                          * apart via the next rack timer (or further
2419                          * if the rack timer dictates it).
2420                          */
2421                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2422                 }
2423         } else {
2424                 /* This is a case that should happen rarely if ever */
2425                 counter_u64_add(rack_tlp_does_nada, 1);
2426 #ifdef TCP_BLACKBOX
2427                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2428 #endif
2429                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2430         }
2431         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2432         return (0);
2433 }
2434
2435 /*
2436  * TLP Timer, here we simply setup what segment we want to
2437  * have the TLP expire on, the normal rack_output() will then
2438  * send it out.
2439  *
2440  * We return 1 (don't proceed with rack_output) only
2441  * when all timers have been stopped (destroyed PCB?).
2442  */
2443 static int
2444 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2445 {
2446         /*
2447          * Tail Loss Probe.
2448          */
2449         struct rack_sendmap *rsm = NULL;
2450         struct socket *so;
2451         uint32_t amm, old_prr_snd = 0;
2452         uint32_t out, avail;
2453
2454         if (tp->t_timers->tt_flags & TT_STOPPED) {
2455                 return (1);
2456         }
2457         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2458                 /* It's not time yet */
2459                 return (0);
2460         }
2461         if (rack_progress_timeout_check(tp)) {
2462                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2463                 return (1);
2464         }
2465         /*
2466          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2467          * need to figure out how to force a full MSS segment out.
2468          */
2469         rack_log_to_event(rack, RACK_TO_FRM_TLP);
2470         counter_u64_add(rack_tlp_tot, 1);
2471         if (rack->r_state && (rack->r_state != tp->t_state))
2472                 rack_set_state(tp, rack);
2473         so = tp->t_inpcb->inp_socket;
2474         avail = sbavail(&so->so_snd);
2475         out = tp->snd_max - tp->snd_una;
2476         rack->rc_timer_up = 1;
2477         /*
2478          * If we are in recovery we can jazz out a segment if new data is
2479          * present simply by setting rc_prr_sndcnt to a segment.
2480          */
2481         if ((avail > out) &&
2482             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2483                 /* New data is available */
2484                 amm = avail - out;
2485                 if (amm > tp->t_maxseg) {
2486                         amm = tp->t_maxseg;
2487                 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
2488                         /* not enough to fill a MTU and no-delay is off */
2489                         goto need_retran;
2490                 }
2491                 if (IN_RECOVERY(tp->t_flags)) {
2492                         /* Unlikely */
2493                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2494                         if (out + amm <= tp->snd_wnd)
2495                                 rack->r_ctl.rc_prr_sndcnt = amm;
2496                         else
2497                                 goto need_retran;
2498                 } else {
2499                         /* Set the send-new override */
2500                         if (out + amm <= tp->snd_wnd)
2501                                 rack->r_ctl.rc_tlp_new_data = amm;
2502                         else
2503                                 goto need_retran;
2504                 }
2505                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2506                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2507                 rack->r_ctl.rc_tlpsend = NULL;
2508                 counter_u64_add(rack_tlp_newdata, 1);
2509                 goto send;
2510         }
2511 need_retran:
2512         /*
2513          * Ok we need to arrange the last un-acked segment to be re-sent, or
2514          * optionally the first un-acked segment.
2515          */
2516         if (rack_always_send_oldest)
2517                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2518         else {
2519                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
2520                 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2521                         rsm = rack_find_high_nonack(rack, rsm);
2522                 }
2523         }
2524         if (rsm == NULL) {
2525                 counter_u64_add(rack_tlp_does_nada, 1);
2526 #ifdef TCP_BLACKBOX
2527                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2528 #endif
2529                 goto out;
2530         }
2531         if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
2532                 /*
2533                  * We need to split this, the last segment, in two.
2534                  */
2535                 int32_t idx;
2536                 struct rack_sendmap *nrsm;
2537
2538                 nrsm = rack_alloc(rack);
2539                 if (nrsm == NULL) {
2540                         /*
2541                          * No memory to split, we will just exit and punt
2542                          * off to the RXT timer.
2543                          */
2544                         counter_u64_add(rack_tlp_does_nada, 1);
2545                         goto out;
2546                 }
2547                 nrsm->r_start = (rsm->r_end - tp->t_maxseg);
2548                 nrsm->r_end = rsm->r_end;
2549                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2550                 nrsm->r_flags = rsm->r_flags;
2551                 nrsm->r_sndcnt = rsm->r_sndcnt;
2552                 nrsm->r_rtr_bytes = 0;
2553                 rsm->r_end = nrsm->r_start;
2554                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2555                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2556                 }
2557                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
2558                 if (rsm->r_in_tmap) {
2559                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2560                         nrsm->r_in_tmap = 1;
2561                 }
2562                 rsm->r_flags &= (~RACK_HAS_FIN);
2563                 rsm = nrsm;
2564         }
2565         rack->r_ctl.rc_tlpsend = rsm;
2566         rack->r_ctl.rc_tlp_rtx_out = 1;
2567         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2568                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2569                 tp->t_rxtshift++;
2570         } else {
2571                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2572                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2573         }
2574 send:
2575         rack->r_ctl.rc_tlp_send_cnt++;
2576         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2577                 /*
2578                  * Can't [re]transmit a segment we have already sent the
2579                  * maximum number of times without hearing from the peer.
2580                  * We need the retransmit timer to take over.
2581                  */
2582 restore:
2583                 rack->r_ctl.rc_tlpsend = NULL;
2584                 if (rsm)
2585                         rsm->r_flags &= ~RACK_TLP;
2586                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2587                 counter_u64_add(rack_tlp_retran_fail, 1);
2588                 goto out;
2589         } else if (rsm) {
2590                 rsm->r_flags |= RACK_TLP;
2591         }
2592         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2593             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2594                 /*
2595                  * We don't want to send a single segment more than the max
2596                  * number of times either.
2597                  */
2598                 goto restore;
2599         }
2600         rack->r_timer_override = 1;
2601         rack->r_tlp_running = 1;
2602         rack->rc_tlp_in_progress = 1;
2603         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2604         return (0);
2605 out:
2606         rack->rc_timer_up = 0;
2607         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2608         return (0);
2609 }
2610
2611 /*
2612  * Delayed ack timer. Here we simply need to set the
2613  * ACK_NOW flag and clear the DELACK flag. From there
2614  * the output routine will send the ack out.
2615  *
2616  * We only return 1, saying don't proceed, if all timers
2617  * are stopped (destroyed PCB?).
2618  */
2619 static int
2620 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2621 {
2622         if (tp->t_timers->tt_flags & TT_STOPPED) {
2623                 return (1);
2624         }
2625         rack_log_to_event(rack, RACK_TO_FRM_DELACK);
2626         tp->t_flags &= ~TF_DELACK;
2627         tp->t_flags |= TF_ACKNOW;
2628         TCPSTAT_INC(tcps_delack);
2629         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2630         return (0);
2631 }
2632
2633 /*
2634  * Persist timer. Here we simply need to set the
2635  * FORCE-DATA flag; the output routine will then send
2636  * the one-byte probe.
2637  *
2638  * We only return 1, saying don't proceed, if all timers
2639  * are stopped (destroyed PCB?).
2640  */
2641 static int
2642 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2643 {
2644         struct inpcb *inp;
2645         int32_t retval = 0;
2646
2647         inp = tp->t_inpcb;
2648
2649         if (tp->t_timers->tt_flags & TT_STOPPED) {
2650                 return (1);
2651         }
2652         if (rack->rc_in_persist == 0)
2653                 return (0);
2654         if (rack_progress_timeout_check(tp)) {
2655                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2656                 return (1);
2657         }
2658         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2659         /*
2660          * Persistence timer into zero window. Force a byte to be output, if
2661          * possible.
2662          */
2663         TCPSTAT_INC(tcps_persisttimeo);
2664         /*
2665          * Hack: if the peer is dead/unreachable, we do not time out if the
2666          * window is closed.  After a full backoff, drop the connection if
2667          * the idle time (no responses to probes) reaches the maximum
2668          * backoff that we would use if retransmitting.
2669          */
2670         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2671             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2672             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2673                 TCPSTAT_INC(tcps_persistdrop);
2674                 retval = 1;
2675                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2676                 goto out;
2677         }
2678         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2679             tp->snd_una == tp->snd_max)
2680                 rack_exit_persist(tp, rack);
2681         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2682         /*
2683          * If the user has closed the socket then drop a persisting
2684          * connection after a much reduced timeout.
2685          */
2686         if (tp->t_state > TCPS_CLOSE_WAIT &&
2687             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2688                 retval = 1;
2689                 TCPSTAT_INC(tcps_persistdrop);
2690                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2691                 goto out;
2692         }
2693         tp->t_flags |= TF_FORCEDATA;
2694 out:
2695         rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
2696         return (retval);
2697 }
2698
2699 /*
2700  * If a keepalive goes off, we had no other timers
2701  * happening. We always return 1 here since this
2702  * routine either drops the connection or sends
2703  * out a probe segment via tcp_respond().
2704  */
2705 static int
2706 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2707 {
2708         struct tcptemp *t_template;
2709         struct inpcb *inp;
2710
2711         if (tp->t_timers->tt_flags & TT_STOPPED) {
2712                 return (1);
2713         }
2714         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
2715         inp = tp->t_inpcb;
2716         rack_log_to_event(rack, RACK_TO_FRM_KEEP);
2717         /*
2718          * Keep-alive timer went off; send something or drop connection if
2719          * idle for too long.
2720          */
2721         TCPSTAT_INC(tcps_keeptimeo);
2722         if (tp->t_state < TCPS_ESTABLISHED)
2723                 goto dropit;
2724         if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2725             tp->t_state <= TCPS_CLOSING) {
2726                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
2727                         goto dropit;
2728                 /*
2729                  * Send a packet designed to force a response if the peer is
2730                  * up and reachable: either an ACK if the connection is
2731                  * still alive, or an RST if the peer has closed the
2732                  * connection due to timeout or reboot. Using sequence
2733                  * number tp->snd_una-1 causes the transmitted zero-length
2734                  * segment to lie outside the receive window; by the
2735                  * protocol spec, this requires the correspondent TCP to
2736                  * respond.
2737                  */
2738                 TCPSTAT_INC(tcps_keepprobe);
2739                 t_template = tcpip_maketemplate(inp);
2740                 if (t_template) {
2741                         tcp_respond(tp, t_template->tt_ipgen,
2742                             &t_template->tt_t, (struct mbuf *)NULL,
2743                             tp->rcv_nxt, tp->snd_una - 1, 0);
2744                         free(t_template, M_TEMP);
2745                 }
2746         }
2747         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
2748         return (1);
2749 dropit:
2750         TCPSTAT_INC(tcps_keepdrops);
2751         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2752         return (1);
2753 }
2754
2755 /*
2756  * Retransmit helper function: clear all the ack
2757  * flags and take care of important bookkeeping.
2758  */
2759 static void
2760 rack_remxt_tmr(struct tcpcb *tp)
2761 {
2762         /*
2763          * The retransmit timer went off, all sack'd blocks must be
2764          * un-acked.
2765          */
2766         struct rack_sendmap *rsm, *trsm = NULL;
2767         struct tcp_rack *rack;
2768         int32_t cnt = 0;
2769
2770         rack = (struct tcp_rack *)tp->t_fb_ptr;
2771         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
2772         rack_log_to_event(rack, RACK_TO_FRM_TMR);
2773         if (rack->r_state && (rack->r_state != tp->t_state))
2774                 rack_set_state(tp, rack);
2775         /*
2776          * Ideally we would like to be able to
2777          * mark SACK-PASS on anything not acked here.
2778          * However, if we do that we would burst out
2779          * all that data 1ms apart. This would be unwise,
2780          * so for now we will just let the normal rxt timer
2781          * and tlp timer take care of it.
2782          */
2783         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
2784                 if (rsm->r_flags & RACK_ACKED) {
2785                         cnt++;
2786                         rsm->r_sndcnt = 0;
2787                         if (rsm->r_in_tmap == 0) {
2788                                 /* We must re-add it back to the tlist */
2789                                 if (trsm == NULL) {
2790                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
2791                                 } else {
2792                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
2793                                 }
2794                                 rsm->r_in_tmap = 1;
2795                                 trsm = rsm;
2796                         }
2797                 }
2798                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
2799         }
2800         /* Clear the count (we just un-acked them) */
2801         rack->r_ctl.rc_sacked = 0;
2802         /* Clear the tlp rtx mark */
2803         rack->r_ctl.rc_tlp_rtx_out = 0;
2804         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2805         rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
2806         /* Setup so we send one segment */
2807         if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
2808                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2809         rack->r_timer_override = 1;
2810 }
2811
2812 /*
2813  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
2814  * we will setup to retransmit the lowest seq number outstanding.
2815  * we will set up to retransmit the lowest seq number outstanding.
2816 static int
2817 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2818 {
2819         int32_t rexmt;
2820         struct inpcb *inp;
2821         int32_t retval = 0;
2822
2823         inp = tp->t_inpcb;
2824         if (tp->t_timers->tt_flags & TT_STOPPED) {
2825                 return (1);
2826         }
2827         if (rack_progress_timeout_check(tp)) {
2828                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2829                 return (1);
2830         }
2831         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
2832         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2833             (tp->snd_una == tp->snd_max)) {
2834                 /* Nothing outstanding .. nothing to do */
2835                 return (0);
2836         }
2837         /*
2838          * Retransmission timer went off.  Message has not been acked within
2839          * retransmit interval.  Back off to a longer retransmit interval
2840          * and retransmit one segment.
2841          */
2842         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
2843                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
2844                 TCPSTAT_INC(tcps_timeoutdrop);
2845                 retval = 1;
2846                 tcp_set_inp_to_drop(rack->rc_inp,
2847                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
2848                 goto out;
2849         }
2850         rack_remxt_tmr(tp);
2851         if (tp->t_state == TCPS_SYN_SENT) {
2852                 /*
2853                  * If the SYN was retransmitted, indicate CWND to be limited
2854                  * to 1 segment in cc_conn_init().
2855                  */
2856                 tp->snd_cwnd = 1;
2857         } else if (tp->t_rxtshift == 1) {
2858                 /*
2859                  * first retransmit; record ssthresh and cwnd so they can be
2860                  * recovered if this turns out to be a "bad" retransmit. A
2861                  * retransmit is considered "bad" if an ACK for this segment
2862                  * is received within RTT/2 interval; the assumption here is
2863                  * that the ACK was already in flight.  See "On Estimating
2864                  * End-to-End Network Path Properties" by Allman and Paxson
2865                  * for more details.
2866                  */
2867                 tp->snd_cwnd_prev = tp->snd_cwnd;
2868                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
2869                 tp->snd_recover_prev = tp->snd_recover;
2870                 if (IN_FASTRECOVERY(tp->t_flags))
2871                         tp->t_flags |= TF_WASFRECOVERY;
2872                 else
2873                         tp->t_flags &= ~TF_WASFRECOVERY;
2874                 if (IN_CONGRECOVERY(tp->t_flags))
2875                         tp->t_flags |= TF_WASCRECOVERY;
2876                 else
2877                         tp->t_flags &= ~TF_WASCRECOVERY;
2878                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
2879                 tp->t_flags |= TF_PREVVALID;
2880         } else
2881                 tp->t_flags &= ~TF_PREVVALID;
2882         TCPSTAT_INC(tcps_rexmttimeo);
2883         if ((tp->t_state == TCPS_SYN_SENT) ||
2884             (tp->t_state == TCPS_SYN_RECEIVED))
2885                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
2886         else
2887                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
2888         TCPT_RANGESET(tp->t_rxtcur, rexmt,
2889            max(MSEC_2_TICKS(rack_rto_min), rexmt),
2890            MSEC_2_TICKS(rack_rto_max));
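        /*
         * Illustrative sketch, not authoritative: with the stock tcp_backoff[]
         * table the timeout roughly doubles on each shift, so a base
         * TCP_REXMTVAL() of ~200ms would progress ~200ms, 400ms, 800ms, ...
         * The TCPT_RANGESET() above clamps the result between rack_rto_min
         * and rack_rto_max.
         */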
2891         /*
2892          * We enter the PLMTUD path if the connection is established or in
2893          * FIN_WAIT_1 state; the reason for the latter is that if the
2894          * amount of data we send is very small, we could send it in a couple
2895          * of packets and proceed straight to FIN. In that case we won't
2896          * catch the ESTABLISHED state.
2897          */
2898         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
2899             || (tp->t_state == TCPS_FIN_WAIT_1))) {
2900 #ifdef INET6
2901                 int32_t isipv6;
2902 #endif
2903
2904                 /*
2905                  * The idea here is that each stage of the MTU probe (usually
2906                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
2907                  * before clamping down further. 'tp->t_rxtshift % 2 == 0'
2908                  * should take care of that.
2909                  */
2910                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
2911                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
2912                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
2913                     tp->t_rxtshift % 2 == 0)) {
2914                         /*
2915                          * Enter Path MTU Black-hole Detection mechanism:
2916                          * - Disable Path MTU Discovery (IP "DF" bit).
2917                          * - Reduce MTU to a lower value than what we
2918                          *   negotiated with the peer.
2919                          */
2920                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
2921                                 /* Record that we may have found a black hole. */
2922                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
2923                                 /* Keep track of previous MSS. */
2924                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
2925                         }
2926
2927                         /*
2928                          * Reduce the MSS to blackhole value or to the
2929                          * default in an attempt to retransmit.
2930                          */
2931 #ifdef INET6
2932                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
2933                         if (isipv6 &&
2934                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
2935                                 /* Use the sysctl tuneable blackhole MSS. */
2936                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
2937                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
2938                         } else if (isipv6) {
2939                                 /* Use the default MSS. */
2940                                 tp->t_maxseg = V_tcp_v6mssdflt;
2941                                 /*
2942                                  * Disable Path MTU Discovery when we switch
2943                                  * to minmss.
2944                                  */
2945                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
2946                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
2947                         }
2948 #endif
2949 #if defined(INET6) && defined(INET)
2950                         else
2951 #endif
2952 #ifdef INET
2953                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
2954                                 /* Use the sysctl tuneable blackhole MSS. */
2955                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
2956                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
2957                         } else {
2958                                 /* Use the default MSS. */
2959                                 tp->t_maxseg = V_tcp_mssdflt;
2960                                 /*
2961                                  * Disable Path MTU Discovery when we switch
2962                                  * to minmss.
2963                                  */
2964                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
2965                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
2966                         }
2967 #endif
2968                 } else {
2969                         /*
2970                          * If further retransmissions are still unsuccessful
2971                          * with a lowered MTU, maybe this isn't a blackhole
2972                          * and we restore the previous MSS and blackhole
2973                          * detection flags. The limit '6' is determined by
2974                          * giving each probe stage (1448, 1188, 524) 2
2975                          * chances to recover.
2976                          */
2977                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
2978                             (tp->t_rxtshift >= 6)) {
2979                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
2980                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
2981                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
2982                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
2983                         }
2984                 }
2985         }
2986         /*
2987          * Disable RFC1323 and SACK if we haven't got any response to our
2988          * third SYN to work-around some broken terminal servers (most of
2989          * which have hopefully been retired) that have bad VJ header
2990          * compression code which trashes TCP segments containing
2991          * unknown-to-them TCP options.
2992          */
2993         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
2994             (tp->t_rxtshift == 3))
2995                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
2996         /*
2997          * If we backed off this far, our srtt estimate is probably bogus.
2998          * Clobber it so we'll take the next rtt measurement as our srtt;
2999          * move the current srtt into rttvar to keep the current retransmit
3000          * times until then.
3001          */
3002         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3003 #ifdef INET6
3004                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3005                         in6_losing(tp->t_inpcb);
3006                 else
3007 #endif
3008                         in_losing(tp->t_inpcb);
3009                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3010                 tp->t_srtt = 0;
3011         }
3012         if (rack_use_sack_filter)
3013                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3014         tp->snd_recover = tp->snd_max;
3015         tp->t_flags |= TF_ACKNOW;
3016         tp->t_rtttime = 0;
3017         rack_cong_signal(tp, NULL, CC_RTO);
3018 out:
3019         return (retval);
3020 }
3021
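/*
 * Timer dispatch: determine which (if any) of the pacing timers has
 * expired and invoke the matching handler above. If the timer fired
 * early, or output is currently being paced, we either ignore the wakeup
 * or re-insert ourselves into the hpts wheel for the time remaining.
 */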
3022 static int
3023 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3024 {
3025         int32_t ret = 0;
3026         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3027
3028         if (timers == 0) {
3029                 return (0);
3030         }
3031         if (tp->t_state == TCPS_LISTEN) {
3032                 /* no timers on listen sockets */
3033                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3034                         return (0);
3035                 return (1);
3036         }
3037         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3038                 uint32_t left;
3039
3040                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3041                         ret = -1;
3042                         rack_log_to_processing(rack, cts, ret, 0);
3043                         return (0);
3044                 }
3045                 if (hpts_calling == 0) {
3046                         ret = -2;
3047                         rack_log_to_processing(rack, cts, ret, 0);
3048                         return (0);
3049                 }
3050                 /*
3051                  * Ok, our timer went off early and we are not being paced;
3052                  * false alarm, go back to sleep.
3053                  */
3054                 ret = -3;
3055                 left = rack->r_ctl.rc_timer_exp - cts;
3056                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3057                 rack_log_to_processing(rack, cts, ret, left);
3058                 rack->rc_last_pto_set = 0;
3059                 return (1);
3060         }
3061         rack->rc_tmr_stopped = 0;
3062         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3063         if (timers & PACE_TMR_DELACK) {
3064                 ret = rack_timeout_delack(tp, rack, cts);
3065         } else if (timers & PACE_TMR_RACK) {
3066                 ret = rack_timeout_rack(tp, rack, cts);
3067         } else if (timers & PACE_TMR_TLP) {
3068                 ret = rack_timeout_tlp(tp, rack, cts);
3069         } else if (timers & PACE_TMR_RXT) {
3070                 ret = rack_timeout_rxt(tp, rack, cts);
3071         } else if (timers & PACE_TMR_PERSIT) {
3072                 ret = rack_timeout_persist(tp, rack, cts);
3073         } else if (timers & PACE_TMR_KEEP) {
3074                 ret = rack_timeout_keepalive(tp, rack, cts);
3075         }
3076         rack_log_to_processing(rack, cts, ret, timers);
3077         return (ret);
3078 }
3079
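/*
 * Cancel any pending rack timer and, when no output pacing is in
 * progress, pull the connection off the hpts wheel as well. The timer
 * type that was stopped is remembered in rc_tmr_stopped.
 */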
3080 static void
3081 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3082 {
3083         uint8_t hpts_removed = 0;
3084
3085         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3086             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3087                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3088                 hpts_removed = 1;
3089         }
3090         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3091                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3092                 if (rack->rc_inp->inp_in_hpts &&
3093                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3094                         /*
3095                          * Canceling timers when we have no output being
3096                          * paced. We also must remove ourselves from the
3097                          * hpts.
3098                          */
3099                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3100                         hpts_removed = 1;
3101                 }
3102                 rack_log_to_cancel(rack, hpts_removed, line);
3103                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3104         }
3105 }
3106
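/*
 * The tcp_timer entry points below are (near) no-ops for rack; the
 * stack appears to drive all of its timers through the hpts machinery
 * above, so there is nothing for the default callout hooks to do.
 */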
3107 static void
3108 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3109 {
3110         return;
3111 }
3112
3113 static int
3114 rack_stopall(struct tcpcb *tp)
3115 {
3116         struct tcp_rack *rack;
3117         rack = (struct tcp_rack *)tp->t_fb_ptr;
3118         rack->t_timers_stopped = 1;
3119         return (0);
3120 }
3121
3122 static void
3123 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3124 {
3125         return;
3126 }
3127
3128 static int
3129 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3130 {
3131         return (0);
3132 }
3133
3134 static void
3135 rack_stop_all_timers(struct tcpcb *tp)
3136 {
3137         struct tcp_rack *rack;
3138
3139         /*
3140          * Assure no timers are running.
3141          */
3142         if (tcp_timer_active(tp, TT_PERSIST)) {
3143                 /* We enter in persists, set the flag appropriately */
3144                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3145                 rack->rc_in_persist = 1;
3146         }
3147         tcp_timer_suspend(tp, TT_PERSIST);
3148         tcp_timer_suspend(tp, TT_REXMT);
3149         tcp_timer_suspend(tp, TT_KEEP);
3150         tcp_timer_suspend(tp, TT_DELACK);
3151 }
3152
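/*
 * Bookkeeping for a retransmitted sendmap entry: bump the retransmit
 * counters, record the new send time, clear any stale ACKED/SACK state
 * and move the entry to the tail of the transmit-order list (tmap).
 */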
3153 static void
3154 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3155     struct rack_sendmap *rsm, uint32_t ts)
3156 {
3157         int32_t idx;
3158
3159         rsm->r_rtr_cnt++;
3160         rsm->r_sndcnt++;
3161         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3162                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3163                 rsm->r_flags |= RACK_OVERMAX;
3164         }
3165         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3166                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3167                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3168         }
3169         idx = rsm->r_rtr_cnt - 1;
3170         rsm->r_tim_lastsent[idx] = ts;
3171         if (rsm->r_flags & RACK_ACKED) {
3172                 /* Probably MTU discovery messing with us */
3173                 rsm->r_flags &= ~RACK_ACKED;
3174                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3175         }
3176         if (rsm->r_in_tmap) {
3177                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3178         }
3179         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3180         rsm->r_in_tmap = 1;
3181         if (rsm->r_flags & RACK_SACK_PASSED) {
3182                 /* We have retransmitted due to the SACK pass */
3183                 rsm->r_flags &= ~RACK_SACK_PASSED;
3184                 rsm->r_flags |= RACK_WAS_SACKPASS;
3185         }
3186         /* Update memory for next rtr */
3187         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3188 }
3189
3190
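/*
 * Account for a retransmission beginning at rsm->r_start. If only part
 * of the entry was resent, the entry is split so the retransmitted
 * portion is tracked separately; *lenp is reduced by what was consumed
 * and, when the send spills past this entry, the sequence where the
 * next entry begins is returned so the caller can continue.
 */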
3191 static uint32_t
3192 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3193     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
3194 {
3195         /*
3196          * We (re-)transmitted starting at rsm->r_start for some length
3197          * (possibly ending before r_end).
3198          */
3199         struct rack_sendmap *nrsm;
3200         uint32_t c_end;
3201         int32_t len;
3202         int32_t idx;
3203
3204         len = *lenp;
3205         c_end = rsm->r_start + len;
3206         if (SEQ_GEQ(c_end, rsm->r_end)) {
3207                 /*
3208                  * We retransmitted the whole piece, or more than the whole
3209                  * piece, slopping over into the next rsm.
3210                  */
3211                 rack_update_rsm(tp, rack, rsm, ts);
3212                 if (c_end == rsm->r_end) {
3213                         *lenp = 0;
3214                         return (0);
3215                 } else {
3216                         int32_t act_len;
3217
3218                         /* Hangs over the end; return what's left */
3219                         act_len = rsm->r_end - rsm->r_start;
3220                         *lenp = (len - act_len);
3221                         return (rsm->r_end);
3222                 }
3223                 /* We don't get out of this block. */
3224         }
3225         /*
3226          * Here we retransmitted less than the whole thing which means we
3227          * have to split this into what was transmitted and what was not.
3228          */
3229         nrsm = rack_alloc(rack);
3230         if (nrsm == NULL) {
3231                 /*
3232                  * We can't get memory, so lets not proceed.
3233                  */
3234                 *lenp = 0;
3235                 return (0);
3236         }
3237         /*
3238          * So here we are going to take the original rsm and make it what we
3239          * retransmitted. nrsm will be the tail portion we did not
3240          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3241          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3242          * 1, 6 and the new piece will be 6, 11.
3243          */
3244         nrsm->r_start = c_end;
3245         nrsm->r_end = rsm->r_end;
3246         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3247         nrsm->r_flags = rsm->r_flags;
3248         nrsm->r_sndcnt = rsm->r_sndcnt;
3249         nrsm->r_rtr_bytes = 0;
3250         rsm->r_end = c_end;
3251         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3252                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3253         }
3254         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3255         if (rsm->r_in_tmap) {
3256                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3257                 nrsm->r_in_tmap = 1;
3258         }
3259         rsm->r_flags &= (~RACK_HAS_FIN);
3260         rack_update_rsm(tp, rack, rsm, ts);
3261         *lenp = 0;
3262         return (0);
3263 }
3264
3265
3266 static void
3267 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3268     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3269     uint8_t pass, struct rack_sendmap *hintrsm)
3270 {
3271         struct tcp_rack *rack;
3272         struct rack_sendmap *rsm, *nrsm;
3273         register uint32_t snd_max, snd_una;
3274         int32_t idx;
3275
3276         /*
3277          * Add to the RACK log of packets in flight or retransmitted. If
3278          * there is a TS option we will use the TS echoed, if not we will
3279          * grab a TS.
3280          *
3281          * Retransmissions will increment the count and move the ts to its
3282          * proper place. Note that if options do not include TS's then we
3283          * won't be able to effectively use the ACK for an RTT on a retran.
3284          *
3285          * Notes about r_start and r_end. Lets consider a send starting at
3286          * sequence 1 for 10 bytes. In such an example the r_start would be
3287          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3288          * This means that r_end is actually the first sequence for the next
3289          * slot (11).
3290          *
3291          */
3292         /*
3293          * If err is set what do we do XXXrrs? should we not add the thing?
3294          * -- i.e. return if err != 0 or should we pretend we sent it? --
3295          * i.e. proceed with add ** do this for now.
3296          */
3297         INP_WLOCK_ASSERT(tp->t_inpcb);
3298         if (err)
3299                 /*
3300                  * We don't log errors -- we could but snd_max does not
3301                  * advance in this case either.
3302                  */
3303                 return;
3304
3305         if (th_flags & TH_RST) {
3306                 /*
3307                  * We don't log resets and we return immediately from
3308                  * sending
3309                  */
3310                 return;
3311         }
3312         rack = (struct tcp_rack *)tp->t_fb_ptr;
3313         snd_una = tp->snd_una;
3314         if (SEQ_LEQ((seq_out + len), snd_una)) {
3315                 /* Are we sending an old segment to induce an ack (keep-alive)? */
3316                 return;
3317         }
3318         if (SEQ_LT(seq_out, snd_una)) {
3319                 /* huh? should we panic? */
3320                 uint32_t end;
3321
3322                 end = seq_out + len;
3323                 seq_out = snd_una;
3324                 len = end - seq_out;
3325         }
3326         snd_max = tp->snd_max;
3327         if (th_flags & (TH_SYN | TH_FIN)) {
3328                 /*
3329                  * The call to rack_log_output is made before bumping
3330                  * snd_max. This means we can record one extra byte on a SYN
3331                  * or FIN if seq_out is adding more on and a FIN is present
3332                  * (and we are not resending).
3333                  */
3334                 if (th_flags & TH_SYN)
3335                         len++;
3336                 if (th_flags & TH_FIN)
3337                         len++;
3338                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3339                         /*
3340                          * The add/update has not been done for the FIN/SYN
3341                          * yet.
3342                          */
3343                         snd_max = tp->snd_nxt;
3344                 }
3345         }
3346         if (len == 0) {
3347                 /* We don't log zero window probes */
3348                 return;
3349         }
3350         rack->r_ctl.rc_time_last_sent = ts;
3351         if (IN_RECOVERY(tp->t_flags)) {
3352                 rack->r_ctl.rc_prr_out += len;
3353         }
3354         /* First question is it a retransmission? */
3355         if (seq_out == snd_max) {
3356 again:
3357                 rsm = rack_alloc(rack);
3358                 if (rsm == NULL) {
3359                         /*
3360                          * Hmm out of memory and the tcb got destroyed while
3361                          * we tried to wait.
3362                          */
3363 #ifdef INVARIANTS
3364                         panic("Out of memory when we should not be rack:%p", rack);
3365 #endif
3366                         return;
3367                 }
3368                 if (th_flags & TH_FIN) {
3369                         rsm->r_flags = RACK_HAS_FIN;
3370                 } else {
3371                         rsm->r_flags = 0;
3372                 }
3373                 rsm->r_tim_lastsent[0] = ts;
3374                 rsm->r_rtr_cnt = 1;
3375                 rsm->r_rtr_bytes = 0;
3376                 if (th_flags & TH_SYN) {
3377                         /* The data space is one beyond snd_una */
3378                         rsm->r_start = seq_out + 1;
3379                         rsm->r_end = rsm->r_start + (len - 1);
3380                 } else {
3381                         /* Normal case */
3382                         rsm->r_start = seq_out;
3383                         rsm->r_end = rsm->r_start + len;
3384                 }
3385                 rsm->r_sndcnt = 0;
3386                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
3387                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3388                 rsm->r_in_tmap = 1;
3389                 return;
3390         }
3391         /*
3392          * If we reach here it's a retransmission and we need to find it.
3393          */
3394 more:
3395         if (hintrsm && (hintrsm->r_start == seq_out)) {
3396                 rsm = hintrsm;
3397                 hintrsm = NULL;
3398         } else if (rack->r_ctl.rc_next) {
3399                 /* We have a hint from a previous run */
3400                 rsm = rack->r_ctl.rc_next;
3401         } else {
3402                 /* No hints sorry */
3403                 rsm = NULL;
3404         }
3405         if ((rsm) && (rsm->r_start == seq_out)) {
3406                 /*
3407                  * We used rc_next or hintrsm to retransmit; hopefully the
3408                  * likely case.
3409                  */
3410                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3411                 if (len == 0) {
3412                         return;
3413                 } else {
3414                         goto more;
3415                 }
3416         }
3417         /* Ok, it was not the last pointer; go through the map the hard way. */
3418         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3419                 if (rsm->r_start == seq_out) {
3420                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3421                         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3422                         if (len == 0) {
3423                                 return;
3424                         } else {
3425                                 continue;
3426                         }
3427                 }
3428                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3429                         /* Transmitted within this piece */
3430                         /*
3431                          * Ok we must split off the front and then let the
3432                          * update do the rest
3433                          */
3434                         nrsm = rack_alloc(rack);
3435                         if (nrsm == NULL) {
3436 #ifdef INVARIANTS
3437                                 panic("Ran out of memory that was preallocated? rack:%p", rack);
3438 #endif
3439                                 rack_update_rsm(tp, rack, rsm, ts);
3440                                 return;
3441                         }
3442                         /*
3443                          * Copy rsm into nrsm and then trim rsm so it no longer
3444                          * includes this part; nrsm covers seq_out onward.
3445                          */
3446                         nrsm->r_start = seq_out;
3447                         nrsm->r_end = rsm->r_end;
3448                         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3449                         nrsm->r_flags = rsm->r_flags;
3450                         nrsm->r_sndcnt = rsm->r_sndcnt;
3451                         nrsm->r_rtr_bytes = 0;
3452                         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3453                                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3454                         }
3455                         rsm->r_end = nrsm->r_start;
3456                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3457                         if (rsm->r_in_tmap) {
3458                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3459                                 nrsm->r_in_tmap = 1;
3460                         }
3461                         rsm->r_flags &= (~RACK_HAS_FIN);
3462                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3463                         if (len == 0) {
3464                                 return;
3465                         }
3466                 }
3467         }
3468         /*
3469          * Hmm, not found in the map; did they retransmit both old data and on
3470          * into the new?
3471          */
3472         if (seq_out == tp->snd_max) {
3473                 goto again;
3474         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3475 #ifdef INVARIANTS
3476                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3477                     seq_out, len, tp->snd_una, tp->snd_max);
3478                 printf("Starting Dump of all rack entries\n");
3479                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3480                         printf("rsm:%p start:%u end:%u\n",
3481                             rsm, rsm->r_start, rsm->r_end);
3482                 }
3483                 printf("Dump complete\n");
3484                 panic("seq_out not found rack:%p tp:%p",
3485                     rack, tp);
3486 #endif
3487         } else {
3488 #ifdef INVARIANTS
3489                 /*
3490                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
3491                  * flag)
3492                  */
3493                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3494                     seq_out, len, tp->snd_max, tp);
3495 #endif
3496         }
3497 }
3498
3499 /*
3500  * Record one of the RTT updates from an ack into
3501  * our sample structure.
3502  */
3503 static void
3504 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3505 {
3506         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3507             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3508                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3509         }
3510         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3511             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3512                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3513         }
3514         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3515         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3516         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3517 }
3518
3519 /*
3520  * Collect new round-trip time estimate
3521  * and update averages and current timeout.
3522  */
3523 static void
3524 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3525 {
3526         int32_t delta;
3527         uint32_t o_srtt, o_var;
3528         int32_t rtt;
3529
3530         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3531                 /* No valid sample */
3532                 return;
3533         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3534                 /* We are to use the lowest RTT seen in a single ack */
3535                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3536         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3537                 /* We are to use the highest RTT seen in a single ack */
3538                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3539         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3540                 /* We are to use the average RTT seen in a single ack */
3541                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3542                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3543         } else {
3544 #ifdef INVARIANTS
3545                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3546 #endif          
3547                 return;
3548         }
3549         if (rtt == 0)
3550                 rtt = 1;
3551         rack_log_rtt_sample(rack, rtt);
3552         o_srtt = tp->t_srtt;
3553         o_var = tp->t_rttvar;
3554         rack = (struct tcp_rack *)tp->t_fb_ptr;
3555         if (tp->t_srtt != 0) {
3556                 /*
3557                  * srtt is stored as fixed point with 5 bits after the
3558                  * binary point (i.e., scaled by 8).  The following magic is
3559                  * equivalent to the smoothing algorithm in rfc793 with an
3560                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3561                  * Adjust rtt to origin 0.
3562                  */
3563                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3564                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3565
3566                 tp->t_srtt += delta;
3567                 if (tp->t_srtt <= 0)
3568                         tp->t_srtt = 1;
3569
3570                 /*
3571                  * We accumulate a smoothed rtt variance (actually, a
3572                  * smoothed mean difference), then set the retransmit timer
3573                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3574                  * is stored as fixed point with 4 bits after the binary
3575                  * point (scaled by 16).  The following is equivalent to
3576                  * rfc793 smoothing with an alpha of .75 (rttvar =
3577                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3578                  * wired-in beta.
3579                  */
3580                 if (delta < 0)
3581                         delta = -delta;
3582                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3583                 tp->t_rttvar += delta;
3584                 if (tp->t_rttvar <= 0)
3585                         tp->t_rttvar = 1;
3586                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3587                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3588         } else {
3589                 /*
3590                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3591                  * variance to half the rtt (so our first retransmit happens
3592                  * at 3*rtt).
3593                  */
3594                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3595                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3596                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3597         }
3598         TCPSTAT_INC(tcps_rttupdated);
3599         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3600         tp->t_rttupdated++;
3601 #ifdef NETFLIX_STATS
3602         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3603 #endif
3604         tp->t_rxtshift = 0;
3605
3606         /*
3607          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3608          * way we do the smoothing, srtt and rttvar will each average +1/2
3609          * tick of bias.  When we compute the retransmit timer, we want 1/2
3610          * tick of rounding and 1 extra tick because of +-1/2 tick
3611          * uncertainty in the firing of the timer.  The bias will give us
3612          * exactly the 1.5 tick we need.  But, because the bias is
3613          * statistical, we have to test that we don't drop below the minimum
3614          * feasible timer (which is 2 ticks).
3615          */
3616         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3617            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3618         tp->t_softerror = 0;
3619 }
3620
3621 static void
3622 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3623     uint32_t t, uint32_t cts)
3624 {
3625         /*
3626          * For this RSM, we acknowledged the data from a previous
3627          * transmission, not the last one we made. This means we did a false
3628          * retransmit.
3629          */
3630         struct tcp_rack *rack;
3631
3632         if (rsm->r_flags & RACK_HAS_FIN) {
3633                 /*
3634                  * The FIN is often sent multiple times when we have
3635                  * everything outstanding ack'd. We ignore this case
3636                  * since it's over now.
3637                  */
3638                 return;
3639         }
3640         if (rsm->r_flags & RACK_TLP) {
3641                 /*
3642                  * We expect TLPs to have this occur.
3643                  */
3644                 return;
3645         }
3646         rack = (struct tcp_rack *)tp->t_fb_ptr;
3647         /* should we undo cc changes and exit recovery? */
3648         if (IN_RECOVERY(tp->t_flags)) {
3649                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3650                         /*
3651                          * Undo what we ratcheted down and exit recovery if
3652                          * possible
3653                          */
3654                         EXIT_RECOVERY(tp->t_flags);
3655                         tp->snd_recover = tp->snd_una;
3656                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3657                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3658                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3659                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3660                 }
3661         }
3662         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3663                 /*
3664                  * We retransmitted based on a SACK and the earlier
3665                  * transmission was the one ack'd - reordering is occurring.
3666                  */
3667                 counter_u64_add(rack_reorder_seen, 1);
3668                 rack->r_ctl.rc_reorder_ts = cts;
3669         }
3670         counter_u64_add(rack_badfr, 1);
3671         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3672 }
3673
3674
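/*
 * Attempt to derive an RTT sample for this sendmap entry from the ack
 * that just arrived. If the entry was sent only once, or the echoed
 * timestamp identifies which transmission was acked, the sample is fed
 * to tcp_rack_xmit_timer(); otherwise only the rack minimum-RTT and
 * transmit-time bookkeeping can be updated. Returns 1 if a usable time
 * was found, 0 otherwise.
 */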
3675 static int
3676 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3677     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3678 {
3679         int32_t i;
3680         uint32_t t;
3681
3682         if (rsm->r_flags & RACK_ACKED)
3683                 /* Already done */
3684                 return (0);
3685
3686
3687         if ((rsm->r_rtr_cnt == 1) ||
3688             ((ack_type == CUM_ACKED) &&
3689             (to->to_flags & TOF_TS) &&
3690             (to->to_tsecr) &&
3691             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3692             ) {
3693                 /*
3694                  * We will only find a matching timestamp if it's cum-acked.
3695                  * But if it's only one transmission it's for-sure matching
3696                  * :-)
3697                  */
3698                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3699                 if ((int)t <= 0)
3700                         t = 1;
3701                 if (!tp->t_rttlow || tp->t_rttlow > t)
3702                         tp->t_rttlow = t;
3703                 if (!rack->r_ctl.rc_rack_min_rtt ||
3704                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3705                         rack->r_ctl.rc_rack_min_rtt = t;
3706                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3707                                 rack->r_ctl.rc_rack_min_rtt = 1;
3708                         }
3709                 }
3710                 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
3711                 if ((rsm->r_flags & RACK_TLP) &&
3712                     (!IN_RECOVERY(tp->t_flags))) {
3713                         /* Segment was a TLP and our retrans matched */
3714                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
3715                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
3716                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
3717                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
3718                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
3719                                 /*
3720                                  * When we enter recovery we need to assure
3721                                  * we send one packet.
3722                                  */
3723                                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
3724                         } else
3725                                 rack->r_ctl.rc_tlp_rtx_out = 0;
3726                 }
3727                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3728                         /* New more recent rack_tmit_time */
3729                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3730                         rack->rc_rack_rtt = t;
3731                 }
3732                 return (1);
3733         }
3734         /* 
3735          * We clear the soft/rxtshift since we got an ack. 
3736          * There is no assurance we will call the commit() function
3737          * so we need to clear these to avoid incorrect handling.
3738          */
3739         tp->t_rxtshift = 0;
3740         tp->t_softerror = 0;
3741         if ((to->to_flags & TOF_TS) &&
3742             (ack_type == CUM_ACKED) &&
3743             (to->to_tsecr) &&
3744             ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
3745                 /*
3746                  * Now which timestamp does it match? In this block the ACK
3747                  * must be coming from a previous transmission.
3748                  */
3749                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
3750                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
3751                                 t = cts - rsm->r_tim_lastsent[i];
3752                                 if ((int)t <= 0)
3753                                         t = 1;
3754                                 if ((i + 1) < rsm->r_rtr_cnt) {
3755                                         /* Likely */
3756                                         rack_earlier_retran(tp, rsm, t, cts);
3757                                 }
3758                                 if (!tp->t_rttlow || tp->t_rttlow > t)
3759                                         tp->t_rttlow = t;
3760                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3761                                         rack->r_ctl.rc_rack_min_rtt = t;
3762                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3763                                                 rack->r_ctl.rc_rack_min_rtt = 1;
3764                                         }
3765                                 }
3766                                 /*
3767                                  * Note the following calls to
3768                                  * tcp_rack_xmit_timer() are being commented
3769                                  * out for now. They give us no more accuracy
3770                                  * and often lead to a wrong choice. We have
3771                                  * enough samples that have not been 
3772                                  * retransmitted. I leave the commented out
3773                                  * code in here in case in the future we
3774                                  * decide to add it back (though I can't foresee
3775                                  * doing that). That way we will easily see
3776                                  * where they need to be placed.
3777                                  */
3778                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
3779                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3780                                         /* New more recent rack_tmit_time */
3781                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3782                                         rack->rc_rack_rtt = t;
3783                                 }
3784                                 return (1);
3785                         }
3786                 }
3787                 goto ts_not_found;
3788         } else {
3789                 /*
3790                  * Ok it's a SACK block that we retransmitted, or a Windows
3791                  * machine without timestamps. We can tell nothing from the
3792                  * time-stamp since it's not there, or the time the peer last
3793                  * received a segment that moved forward its cum-ack point.
3794                  */
3795 ts_not_found:
3796                 i = rsm->r_rtr_cnt - 1;
3797                 t = cts - rsm->r_tim_lastsent[i];
3798                 if ((int)t <= 0)
3799                         t = 1;
3800                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3801                         /*
3802                          * We retransmitted and the ack came back in less
3803                          * than the smallest rtt we have observed. We most
3804                          * likely did an improper retransmit as outlined in
3805                          * 4.2 Step 3 point 2 in the rack-draft.
3806                          */
3807                         i = rsm->r_rtr_cnt - 2;
3808                         t = cts - rsm->r_tim_lastsent[i];
3809                         rack_earlier_retran(tp, rsm, t, cts);
3810                 } else if (rack->r_ctl.rc_rack_min_rtt) {
3811                         /*
3812                          * We retransmitted it and the retransmit did the
3813                          * job.
3814                          */
3815                         if (!rack->r_ctl.rc_rack_min_rtt ||
3816                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3817                                 rack->r_ctl.rc_rack_min_rtt = t;
3818                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
3819                                         rack->r_ctl.rc_rack_min_rtt = 1;
3820                                 }
3821                         }
3822                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
3823                                 /* New more recent rack_tmit_time */
3824                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
3825                                 rack->rc_rack_rtt = t;
3826                         }
3827                         return (1);
3828                 }
3829         }
3830         return (0);
3831 }
3832
3833 /*
3834  * Mark the SACK_PASSED flag on all entries sent prior to rsm (in send order).
3835  */
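/*
 * Illustrative example: if segments A (1:100), B (100:200) and C (200:300)
 * were sent in that order and a SACK newly covers C, walking the transmit
 * order map backwards from C marks A and B (when not already acked) with
 * RACK_SACK_PASSED, making them candidates that loss detection may later
 * declare lost once the reorder window passes.
 */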
3836 static void
3837 rack_log_sack_passed(struct tcpcb *tp,
3838     struct tcp_rack *rack, struct rack_sendmap *rsm)
3839 {
3840         struct rack_sendmap *nrsm;
3841         uint32_t ts;
3842         int32_t idx;
3843
3844         idx = rsm->r_rtr_cnt - 1;
3845         ts = rsm->r_tim_lastsent[idx];
3846         nrsm = rsm;
3847         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
3848             rack_head, r_tnext) {
3849                 if (nrsm == rsm) {
3850                         /* Skip original segment; it is acked */
3851                         continue;
3852                 }
3853                 if (nrsm->r_flags & RACK_ACKED) {
3854                         /* Skip ack'd segments */
3855                         continue;
3856                 }
3857                 idx = nrsm->r_rtr_cnt - 1;
3858                 if (ts == nrsm->r_tim_lastsent[idx]) {
3859                         /*
3860                          * For this case let's use the seq no; if we sent in a
3861                          * big block (TSO) we would have a bunch of segments
3862                          * sent at the same time.
3863                          *
3864                          * We would only get a report if its SEQ is earlier.
3865                          * If we have done multiple retransmits the times
3866                          * would not be equal.
3867                          */
3868                         if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
3869                                 nrsm->r_flags |= RACK_SACK_PASSED;
3870                                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3871                         }
3872                 } else {
3873                         /*
3874                          * Here they were sent at different times, not a big
3875                          * block. Since we transmitted this one later and
3876                          * see it sack'd then this must also be missing (or
3877                          * we would have gotten a sack block for it)
3878                          */
3879                         nrsm->r_flags |= RACK_SACK_PASSED;
3880                         nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3881                 }
3882         }
3883 }
3884
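/*
 * Process one SACK block against the send map.  Roughly: locate the map
 * entry containing sack->start (splitting an entry when the block begins
 * or ends inside it), mark each fully covered entry RACK_ACKED and pull
 * it off the transmit-order map, and return the number of newly SACKed
 * bytes via the 'changed' count.
 */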
3885 static uint32_t
3886 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
3887     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
3888 {
3889         int32_t idx;
3890         int32_t times = 0;
3891         uint32_t start, end, changed = 0;
3892         struct rack_sendmap *rsm, *nrsm;
3893         int32_t used_ref = 1;
3894
3895         start = sack->start;
3896         end = sack->end;
3897         rsm = *prsm;
3898         if (rsm && SEQ_LT(start, rsm->r_start)) {
3899                 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
3900                         if (SEQ_GEQ(start, rsm->r_start) &&
3901                             SEQ_LT(start, rsm->r_end)) {
3902                                 goto do_rest_ofb;
3903                         }
3904                 }
3905         }
3906         if (rsm == NULL) {
3907 start_at_beginning:
3908                 rsm = NULL;
3909                 used_ref = 0;
3910         }
3911         /* First let's locate the block where this guy is */
3912         TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
3913                 if (SEQ_GEQ(start, rsm->r_start) &&
3914                     SEQ_LT(start, rsm->r_end)) {
3915                         break;
3916                 }
3917         }
3918 do_rest_ofb:
3919         if (rsm == NULL) {
3920                 /*
3921                  * This happens when we get duplicate sack blocks with the
3922                  * same end. For example SACK 4: 100, SACK 3: 100. The sort
3923                  * will not change their location so we would just start at
3924                  * the end of the first one and get lost.
3925                  */
3926                 if (tp->t_flags & TF_SENTFIN) {
3927                         /*
3928                          * Check to see if we have not logged the FIN that
3929                          * went out.
3930                          */
3931                         nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
3932                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
3933                                 /*
3934                                  * Ok we did not get the FIN logged.
3935                                  */
3936                                 nrsm->r_end++;
3937                                 rsm = nrsm;
3938                                 goto do_rest_ofb;
3939                         }
3940                 }
3941                 if (times == 1) {
3942 #ifdef INVARIANTS
3943                         panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
3944                             tp, rack, sack, to, prsm);
3945 #else
3946                         goto out;
3947 #endif
3948                 }
3949                 times++;
3950                 counter_u64_add(rack_sack_proc_restart, 1);
3951                 goto start_at_beginning;
3952         }
3953         /* Ok we have an ACK for some piece of rsm */
3954         if (rsm->r_start != start) {
3955                 /*
3956                  * Need to split this into two pieces: the before and after.
3957                  */
3958                 nrsm = rack_alloc(rack);
3959                 if (nrsm == NULL) {
3960                         /*
3961                          * failed, XXXrrs: what can we do but lose the sack
3962                          * info?
3963                          */
3964                         goto out;
3965                 }
3966                 nrsm->r_start = start;
3967                 nrsm->r_rtr_bytes = 0;
3968                 nrsm->r_end = rsm->r_end;
3969                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3970                 nrsm->r_flags = rsm->r_flags;
3971                 nrsm->r_sndcnt = rsm->r_sndcnt;
3972                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3973                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3974                 }
3975                 rsm->r_end = nrsm->r_start;
3976                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3977                 if (rsm->r_in_tmap) {
3978                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3979                         nrsm->r_in_tmap = 1;
3980                 }
3981                 rsm->r_flags &= (~RACK_HAS_FIN);
3982                 rsm = nrsm;
3983         }
3984         if (SEQ_GEQ(end, rsm->r_end)) {
3985                 /*
3986                  * The end of this block is either beyond this guy or right
3987                  * at this guy.
3988                  */
3989
3990                 if ((rsm->r_flags & RACK_ACKED) == 0) {
3991                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
3992                         changed += (rsm->r_end - rsm->r_start);
3993                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
3994                         rack_log_sack_passed(tp, rack, rsm);
3995                         /* Is reordering occurring? */
3996                         if (rsm->r_flags & RACK_SACK_PASSED) {
3997                                 counter_u64_add(rack_reorder_seen, 1);
3998                                 rack->r_ctl.rc_reorder_ts = cts;
3999                         }
4000                         rsm->r_flags |= RACK_ACKED;
4001                         rsm->r_flags &= ~RACK_TLP;
4002                         if (rsm->r_in_tmap) {
4003                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4004                                 rsm->r_in_tmap = 0;
4005                         }
4006                 }
4007                 if (end == rsm->r_end) {
4008                         /* This block only - done */
4009                         goto out;
4010                 }
4011                 /* There is more not covered by this rsm, move on */
4012                 start = rsm->r_end;
4013                 nrsm = TAILQ_NEXT(rsm, r_next);
4014                 rsm = nrsm;
4015                 times = 0;
4016                 goto do_rest_ofb;
4017         }
4018         /* Ok we need to split off this one at the tail */
4019         nrsm = rack_alloc(rack);
4020         if (nrsm == NULL) {
4021                 /* failed, rrs: what can we do but lose the sack info? */
4022                 goto out;
4023         }
4024         /* Clone it */
4025         nrsm->r_start = end;
4026         nrsm->r_end = rsm->r_end;
4027         nrsm->r_rtr_bytes = 0;
4028         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4029         nrsm->r_flags = rsm->r_flags;
4030         nrsm->r_sndcnt = rsm->r_sndcnt;
4031         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4032                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4033         }
4034         /* The sack block does not cover this guy fully */
4035         rsm->r_flags &= (~RACK_HAS_FIN);
4036         rsm->r_end = end;
4037         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4038         if (rsm->r_in_tmap) {
4039                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4040                 nrsm->r_in_tmap = 1;
4041         }
4042         if (rsm->r_flags & RACK_ACKED) {
4043                 /* Been here done that */
4044                 goto out;
4045         }
4046         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4047         changed += (rsm->r_end - rsm->r_start);
4048         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4049         rack_log_sack_passed(tp, rack, rsm);
4050         /* Is reordering occurring? */
4051         if (rsm->r_flags & RACK_SACK_PASSED) {
4052                 counter_u64_add(rack_reorder_seen, 1);
4053                 rack->r_ctl.rc_reorder_ts = cts;
4054         }
4055         rsm->r_flags |= RACK_ACKED;
4056         rsm->r_flags &= ~RACK_TLP;
4057         if (rsm->r_in_tmap) {
4058                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4059                 rsm->r_in_tmap = 0;
4060         }
4061 out:
4062         if (used_ref == 0) {
4063                 counter_u64_add(rack_sack_proc_all, 1);
4064         } else {
4065                 counter_u64_add(rack_sack_proc_short, 1);
4066         }
4067         /* Save off where we last were */
4068         if (rsm)
4069                 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
4070         else
4071                 rack->r_ctl.rc_sacklast = NULL;
4072         *prsm = rsm;
4073         return (changed);
4074 }
4075
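/*
 * The peer appears to have reneged: its cumulative ACK stops at data we
 * had marked RACK_ACKED from earlier SACKs.  Undo the ACKED markings,
 * rebuild the affected entries into the transmit-order map so they can
 * be retransmitted, and (if enabled) clear the SACK filter at th_ack.
 */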
4076 static void inline 
4077 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4078 {
4079         struct rack_sendmap *tmap;
4080
4081         tmap = NULL;
4082         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4083                 /* It's no longer sacked, mark it so */
4084                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4085 #ifdef INVARIANTS
4086                 if (rsm->r_in_tmap) {
4087                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4088                               rack, rsm, rsm->r_flags);
4089                 }
4090 #endif
4091                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4092                 /* Rebuild it into our tmap */
4093                 if (tmap == NULL) {
4094                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4095                         tmap = rsm;
4096                 } else {
4097                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4098                         tmap = rsm;
4099                 }
4100                 tmap->r_in_tmap = 1;
4101                 rsm = TAILQ_NEXT(rsm, r_next);
4102         }
4103         /* 
4104          * Now let's possibly clear the sack filter so we start
4105          * recognizing sacks that cover this area.
4106          */
4107         if (rack_use_sack_filter)
4108                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4109
4110 }
4111
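/*
 * Digest an incoming ACK against the RACK send map: advance past
 * cum-acked entries (collecting RTT samples), detect peer reneging,
 * then filter, sort and de-duplicate any SACK blocks before applying
 * them to the scoreboard.  Finally decide whether recovery should be
 * entered and update the PRR accounting.
 */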
4112 static void
4113 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4114 {
4115         uint32_t changed, last_seq, entered_recovery = 0;
4116         struct tcp_rack *rack;
4117         struct rack_sendmap *rsm;
4118         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4119         register uint32_t th_ack;
4120         int32_t i, j, k, num_sack_blks = 0;
4121         uint32_t cts, acked, ack_point, sack_changed = 0;
4122
4123         INP_WLOCK_ASSERT(tp->t_inpcb);
4124         if (th->th_flags & TH_RST) {
4125                 /* We don't log resets */
4126                 return;
4127         }
4128         rack = (struct tcp_rack *)tp->t_fb_ptr;
4129         cts = tcp_ts_getticks();
4130         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4131         changed = 0;
4132         th_ack = th->th_ack;
4133
4134         if (SEQ_GT(th_ack, tp->snd_una)) {
4135                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4136                 tp->t_acktime = ticks;
4137         }
4138         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4139                 changed = th_ack - rsm->r_start;
4140         if (changed) {
4141                 /*
4142                  * The ACK point is advancing to th_ack; we must drop off
4143                  * the packets in the rack log and calculate any eligible
4144                  * RTT's.
4145                  */
4146                 rack->r_wanted_output++;
4147 more:
4148                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4149                 if (rsm == NULL) {
4150                         if ((th_ack - 1) == tp->iss) {
4151                                 /*
4152                                  * For the SYN incoming case we will not
4153                                  * have called tcp_output for the sending of
4154                                  * the SYN, so there will be no map. All
4155                                  * other cases should probably be a panic.
4156                                  */
4157                                 goto proc_sack;
4158                         }
4159                         if (tp->t_flags & TF_SENTFIN) {
4160                         /* if we sent a FIN we will not have a map */
4161                                 goto proc_sack;
4162                         }
4163 #ifdef INVARIANTS
4164                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4165                             tp,
4166                             th, tp->t_state, rack,
4167                             tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4168 #endif
4169                         goto proc_sack;
4170                 }
4171                 if (SEQ_LT(th_ack, rsm->r_start)) {
4172                         /* Huh map is missing this */
4173 #ifdef INVARIANTS
4174                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4175                             rsm->r_start,
4176                             th_ack, tp->t_state, rack->r_state);
4177 #endif
4178                         goto proc_sack;
4179                 }
4180                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4181                 /* Now do we consume the whole thing? */
4182                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4183                         /* It's all consumed. */
4184                         uint32_t left;
4185
4186                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4187                         rsm->r_rtr_bytes = 0;
4188                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
4189                         if (rsm->r_in_tmap) {
4190                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4191                                 rsm->r_in_tmap = 0;
4192                         }
4193                         if (rack->r_ctl.rc_next == rsm) {
4194                                 /* scoot along the marker */
4195                                 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
4196                         }
4197                         if (rsm->r_flags & RACK_ACKED) {
4198                                 /*
4199                                  * It was acked on the scoreboard -- remove
4200                                  * it from total
4201                                  */
4202                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4203                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4204                                 /*
4205                                  * There are acked segments ACKED on the
4206                                  * scoreboard further up. We are seeing
4207                                  * reordering.
4208                                  */
4209                                 counter_u64_add(rack_reorder_seen, 1);
4210                                 rsm->r_flags |= RACK_ACKED;
4211                                 rack->r_ctl.rc_reorder_ts = cts;
4212                         }
4213                         left = th_ack - rsm->r_end;
4214                         if (rsm->r_rtr_cnt > 1) {
4215                                 /*
4216                                  * Technically we should make r_rtr_cnt be
4217                                  * monotonically increasing and just mod it to
4218                                  * the timestamp it is replacing... that way
4219                                  * we would have the last 3 retransmits. Now
4220                                  * rc_loss_count will be wrong if we
4221                                  * retransmit something more than 2 times in
4222                                  * recovery :(
4223                                  */
4224                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4225                         }
4226                         /* Free back to zone */
4227                         rack_free(rack, rsm);
4228                         if (left) {
4229                                 goto more;
4230                         }
4231                         goto proc_sack;
4232                 }
4233                 if (rsm->r_flags & RACK_ACKED) {
4234                         /*
4235                          * It was acked on the scoreboard -- remove it from
4236                          * total for the part being cum-acked.
4237                          */
4238                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4239                 }
4240                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4241                 rsm->r_rtr_bytes = 0;
4242                 rsm->r_start = th_ack;
4243         }
4244 proc_sack:
4245         /* Check for reneging */
4246         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4247         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4248                 /*
4249                  * The peer has moved snd_una up to
4250                  * the edge of this send, i.e. one
4251                  * that it had previously acked. The only
4252                  * way that can be true is if the peer threw
4253                  * away data (space issues) that it had
4254                  * previously sacked (else it would have
4255                  * given us snd_una up to rsm->r_end).
4256                  * We need to undo the acked markings here.
4257                  *
4258                  * Note we have to look to make sure th_ack is
4259                  * our rsm->r_start in case we get an old ack
4260                  * where th_ack is behind snd_una.
4261                  */
4262                 rack_peer_reneges(rack, rsm, th->th_ack);
4263         }
4264         if ((to->to_flags & TOF_SACK) == 0) {
4265                 /* We are done, nothing left to log */
4266                 goto out;
4267         }
4268         rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4269         if (rsm) {
4270                 last_seq = rsm->r_end;
4271         } else {
4272                 last_seq = tp->snd_max;
4273         }
4274         /* Sack block processing */
4275         if (SEQ_GT(th_ack, tp->snd_una))
4276                 ack_point = th_ack;
4277         else
4278                 ack_point = tp->snd_una;
4279         for (i = 0; i < to->to_nsacks; i++) {
4280                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4281                     &sack, sizeof(sack));
4282                 sack.start = ntohl(sack.start);
4283                 sack.end = ntohl(sack.end);
4284                 if (SEQ_GT(sack.end, sack.start) &&
4285                     SEQ_GT(sack.start, ack_point) &&
4286                     SEQ_LT(sack.start, tp->snd_max) &&
4287                     SEQ_GT(sack.end, ack_point) &&
4288                     SEQ_LEQ(sack.end, tp->snd_max)) {
4289                         if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
4290                             (SEQ_LT(sack.end, last_seq)) &&
4291                             ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
4292                                 /*
4293                                  * Not the last piece and it's smaller than
4294                                  * 1/8th of a MSS. We ignore this.
4295                                  */
4296                                 counter_u64_add(rack_runt_sacks, 1);
4297                                 continue;
4298                         }
4299                         sack_blocks[num_sack_blks] = sack;
4300                         num_sack_blks++;
4301 #ifdef NETFLIX_STATS
4302                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4303                            SEQ_LEQ(sack.end, th_ack)) {
4304                         /*
4305                          * It's a D-SACK block.
4306                          */
4307                         tcp_record_dsack(sack.start, sack.end);
4308 #endif
4309                 }
4310
4311         }
4312         if (num_sack_blks == 0)
4313                 goto out;
4314         /*
4315          * Sort the SACK blocks so we can update the rack scoreboard with
4316          * just one pass.
4317          */
4318         if (rack_use_sack_filter) {
4319                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
4320         }
4321         if (num_sack_blks < 2) {
4322                 goto do_sack_work;
4323         }
4324         /* Sort the sacks */
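        /* At most TCP_MAX_SACK + 1 blocks, so a simple O(n^2) pass is cheap. */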
4325         for (i = 0; i < num_sack_blks; i++) {
4326                 for (j = i + 1; j < num_sack_blks; j++) {
4327                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4328                                 sack = sack_blocks[i];
4329                                 sack_blocks[i] = sack_blocks[j];
4330                                 sack_blocks[j] = sack;
4331                         }
4332                 }
4333         }
4334         /*
4335          * Now are any of the sack block ends the same (yes some
4336          * implementations send these)?
4337          */
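        /*
         * Example: blocks [700,1000] and [900,1000] share an end; the loop
         * below keeps a single [700,1000] (the smaller start) and collapses
         * the duplicate out of the array.
         */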
4338 again:
4339         if (num_sack_blks > 1) {
4340                 for (i = 0; i < num_sack_blks; i++) {
4341                         for (j = i + 1; j < num_sack_blks; j++) {
4342                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4343                                         /*
4344                                          * Ok these two have the same end; we
4345                                          * keep the one with the smallest start,
4346                                          * throw away the other, and start
4347                                          * again.
4348                                          */
4349                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4350                                                 /*
4351                                                  * The second block covers
4352                                                  * more area, use that
4353                                                  */
4354                                                 sack_blocks[i].start = sack_blocks[j].start;
4355                                         }
4356                                         /*
4357                                          * Now collapse out the dup-sack and
4358                                          * lower the count
4359                                          */
4360                                         for (k = (j + 1); k < num_sack_blks; k++) {
4361                                                 sack_blocks[j].start = sack_blocks[k].start;
4362                                                 sack_blocks[j].end = sack_blocks[k].end;
4363                                                 j++;
4364                                         }
4365                                         num_sack_blks--;
4366                                         goto again;
4367                                 }
4368                         }
4369                 }
4370         }
4371 do_sack_work:
4372         rsm = rack->r_ctl.rc_sacklast;
4373         for (i = 0; i < num_sack_blks; i++) {
4374                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
4375                 if (acked) {
4376                         rack->r_wanted_output++;
4377                         changed += acked;
4378                         sack_changed += acked;
4379                 }
4380         }
4381 out:
4382         if (changed) {
4383                 /* Something changed, cancel the rack timer */
4384                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4385         }
4386         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
4387                 /*
4388                  * Ok we have a high probability that we need to go into
4389                  * recovery since we have data sack'd
4390                  */
4391                 struct rack_sendmap *rsm;
4392                 uint32_t tsused;
4393
4394                 tsused = tcp_ts_getticks();
4395                 rsm = tcp_rack_output(tp, rack, tsused);
4396                 if (rsm) {
4397                         /* Enter recovery */
4398                         rack->r_ctl.rc_rsm_start = rsm->r_start;
4399                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4400                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4401                         entered_recovery = 1;
4402                         rack_cong_signal(tp, NULL, CC_NDUPACK);
4403                         /*
4404                          * When we enter recovery we need to assure we send
4405                          * one packet.
4406                          */
4407                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
4408                         rack->r_timer_override = 1;
4409                 }
4410         }
4411         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
4412                 /* Deal with changed and PRR here (in recovery only) */
4413                 uint32_t pipe, snd_una;
4414
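                /*
                 * This follows Proportional Rate Reduction (cf. RFC 6937).
                 * While pipe > ssthresh the code computes, in effect,
                 *   sndcnt = (prr_delivered * ssthresh) / RecoverFS + 1 - prr_out
                 * (the +1 stands in for the RFC's ceiling).  Illustrative
                 * numbers: ssthresh = 10 segs, RecoverFS = 20 segs,
                 * prr_delivered = 4 segs, prr_out = 1 seg gives
                 * 40 / 20 + 1 - 1 = 2 segments that may be sent.  Once pipe
                 * drops to or below ssthresh, sending is instead limited to
                 * min(ssthresh - pipe, max(prr_delivered - prr_out, changed) + 1 MSS).
                 */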
4415                 rack->r_ctl.rc_prr_delivered += changed;
4416                 /* Compute prr_sndcnt */
4417                 if (SEQ_GT(tp->snd_una, th_ack)) {
4418                         snd_una = tp->snd_una;
4419                 } else {
4420                         snd_una = th_ack;
4421                 }
4422                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
4423                 if (pipe > tp->snd_ssthresh) {
4424                         long sndcnt;
4425
4426                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
4427                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
4428                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
4429                         else {
4430                                 rack->r_ctl.rc_prr_sndcnt = 0;
4431                                 sndcnt = 0;
4432                         }
4433                         sndcnt++;
4434                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
4435                                 sndcnt -= rack->r_ctl.rc_prr_out;
4436                         else
4437                                 sndcnt = 0;
4438                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
4439                 } else {
4440                         uint32_t limit;
4441
4442                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
4443                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
4444                         else
4445                                 limit = 0;
4446                         if (changed > limit)
4447                                 limit = changed;
4448                         limit += tp->t_maxseg;
4449                         if (tp->snd_ssthresh > pipe) {
4450                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
4451                         } else {
4452                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
4453                         }
4454                 }
4455                 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
4456                         rack->r_timer_override = 1;
4457                 }
4458         }
4459 }
4460
4461 /*
4462  * Return value of 1: we do not need to call rack_process_data().
4463  * Return value of 0: rack_process_data() can be called.
4464  * For ret_val, if it's 0 the TCP is locked; if it's non-zero
4465  * it's unlocked and probably unsafe to touch the TCB.
4466  */
4467 static int
4468 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
4469     struct tcpcb *tp, struct tcpopt *to,
4470     uint32_t tiwin, int32_t tlen,
4471     int32_t * ofia, int32_t thflags, int32_t * ret_val)
4472 {
4473         int32_t ourfinisacked = 0;
4474         int32_t nsegs, acked_amount;
4475         int32_t acked;
4476         struct mbuf *mfree;
4477         struct tcp_rack *rack;
4478         int32_t recovery = 0;
4479
4480         rack = (struct tcp_rack *)tp->t_fb_ptr;
4481         if (SEQ_GT(th->th_ack, tp->snd_max)) {
4482                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
4483                 return (1);
4484         }
4485         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
4486                 rack_log_ack(tp, to, th);
4487         }
4488         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
4489                 /*
4490                  * Old ack, behind (or duplicate to) the last one rcv'd.
4491                  * Note: Should mark that reordering is occurring! We should also
4492                  * look for sack blocks arriving, e.g. ack 1, 4-4 then ack 1,
4493                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
4494                  * retran and> ack 3.
4495                  */
4496                 return (0);
4497         }
4498         /*
4499          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
4500          * something we sent.
4501          */
4502         if (tp->t_flags & TF_NEEDSYN) {
4503                 /*
4504                  * T/TCP: Connection was half-synchronized, and our SYN has
4505                  * been ACK'd (so connection is now fully synchronized).  Go
4506                  * to non-starred state, increment snd_una for ACK of SYN,
4507                  * and check if we can do window scaling.
4508                  */
4509                 tp->t_flags &= ~TF_NEEDSYN;
4510                 tp->snd_una++;
4511                 /* Do window scaling? */
4512                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
4513                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
4514                         tp->rcv_scale = tp->request_r_scale;
4515                         /* Send window already scaled. */
4516                 }
4517         }
4518         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4519         INP_WLOCK_ASSERT(tp->t_inpcb);
4520
4521         acked = BYTES_THIS_ACK(tp, th);
4522         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
4523         TCPSTAT_ADD(tcps_rcvackbyte, acked);
4524
4525         /*
4526          * If we just performed our first retransmit, and the ACK arrives
4527          * within our recovery window, then it was a mistake to do the
4528          * retransmit in the first place.  Recover our original cwnd and
4529          * ssthresh, and proceed to transmit where we left off.
4530          */
4531         if (tp->t_flags & TF_PREVVALID) {
4532                 tp->t_flags &= ~TF_PREVVALID;
4533                 if (tp->t_rxtshift == 1 &&
4534                     (int)(ticks - tp->t_badrxtwin) < 0)
4535                         rack_cong_signal(tp, th, CC_RTO_ERR);
4536         }
4537         /*
4538          * If we have a timestamp reply, update smoothed round trip time. If
4539          * no timestamp is present but transmit timer is running and timed
4540          * sequence number was acked, update smoothed round trip time. Since
4541          * we now have an rtt measurement, cancel the timer backoff (cf.,
4542          * Phil Karn's retransmit alg.). Recompute the initial retransmit
4543          * timer.
4544          *
4545          * Some boxes send broken timestamp replies during the SYN+ACK
4546          * phase, ignore timestamps of 0 or we could calculate a huge RTT
4547          * and blow up the retransmit timer.
4548          */
4549         /*
4550          * If all outstanding data is acked, stop retransmit timer and
4551          * remember to restart (more output or persist). If there is more
4552          * data to be acked, restart retransmit timer, using current
4553          * (possibly backed-off) value.
4554          */
4555         if (th->th_ack == tp->snd_max) {
4556                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4557                 rack->r_wanted_output++;
4558         }
4559         /*
4560          * If no data (only SYN) was ACK'd, skip rest of ACK processing.
4561          */
4562         if (acked == 0) {
4563                 if (ofia)
4564                         *ofia = ourfinisacked;
4565                 return (0);
4566         }
4567         if (rack->r_ctl.rc_early_recovery) {
4568                 if (IN_FASTRECOVERY(tp->t_flags)) {
4569                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4570                                 tcp_rack_partialack(tp, th);
4571                         } else {
4572                                 rack_post_recovery(tp, th);
4573                                 recovery = 1;
4574                         }
4575                 }
4576         }
4577         /*
4578          * Let the congestion control algorithm update congestion control
4579          * related information. This typically means increasing the
4580          * congestion window.
4581          */
4582         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
4583         SOCKBUF_LOCK(&so->so_snd);
4584         acked_amount = min(acked, (int)sbavail(&so->so_snd));
4585         tp->snd_wnd -= acked_amount;
4586         mfree = sbcut_locked(&so->so_snd, acked_amount);
4587         if ((sbused(&so->so_snd) == 0) &&
4588             (acked > acked_amount) &&
4589             (tp->t_state >= TCPS_FIN_WAIT_1)) {
4590                 ourfinisacked = 1;
4591         }
4592         /* NB: sowwakeup_locked() does an implicit unlock. */
4593         sowwakeup_locked(so);
4594         m_freem(mfree);
4595         if (rack->r_ctl.rc_early_recovery == 0) {
4596                 if (IN_FASTRECOVERY(tp->t_flags)) {
4597                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4598                                 tcp_rack_partialack(tp, th);
4599                         } else {
4600                                 rack_post_recovery(tp, th);
4601                         }
4602                 }
4603         }
4604         tp->snd_una = th->th_ack;
4605         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4606                 tp->snd_recover = tp->snd_una;
4607
4608         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4609                 tp->snd_nxt = tp->snd_una;
4610         }
4611         if (tp->snd_una == tp->snd_max) {
4612                 /* Nothing left outstanding */
4613                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
4614                 tp->t_acktime = 0;
4615                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4616                 /* Set need output so persist might get set */
4617                 rack->r_wanted_output++;
4618                 if (rack_use_sack_filter)
4619                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
4620                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
4621                     (sbavail(&so->so_snd) == 0) &&
4622                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
4623                         /* 
4624                          * The socket was gone and the
4625                          * peer sent data, time to
4626                          * reset him.
4627                          */
4628                         *ret_val = 1;
4629                         tp = tcp_close(tp);
4630                         rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
4631                         return (1);
4632                 }
4633         }
4634         if (ofia)
4635                 *ofia = ourfinisacked;
4636         return (0);
4637 }
4638
4639
4640 /*
4641  * Return value of 1, the TCB is unlocked and most
4642  * likely gone, return value of 0, the TCP is still
4643  * locked.
4644  */
4645 static int
4646 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
4647     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
4648     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
4649 {
4650         /*
4651          * Update window information. Don't look at window if no ACK: TAC's
4652          * send garbage on first SYN.
4653          */
4654         int32_t nsegs;
4655         int32_t tfo_syn;
4656         struct tcp_rack *rack;
4657
4658         rack = (struct tcp_rack *)tp->t_fb_ptr;
4659         INP_WLOCK_ASSERT(tp->t_inpcb);
4660         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4661         if ((thflags & TH_ACK) &&
4662             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4663             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4664             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4665                 /* keep track of pure window updates */
4666                 if (tlen == 0 &&
4667                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4668                         TCPSTAT_INC(tcps_rcvwinupd);
4669                 tp->snd_wnd = tiwin;
4670                 tp->snd_wl1 = th->th_seq;
4671                 tp->snd_wl2 = th->th_ack;
4672                 if (tp->snd_wnd > tp->max_sndwnd)
4673                         tp->max_sndwnd = tp->snd_wnd;
4674                 rack->r_wanted_output++;
4675         } else if (thflags & TH_ACK) {
4676                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
4677                         tp->snd_wnd = tiwin;
4678                         tp->snd_wl1 = th->th_seq;
4679                         tp->snd_wl2 = th->th_ack;
4680                 }
4681         }
4682         /* Was persist timer active and now we have window space? */
4683         if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
4684                 rack_exit_persist(tp, rack);
4685                 tp->snd_nxt = tp->snd_max;
4686                 /* Make sure we output to start the timer */
4687                 rack->r_wanted_output++;
4688         }
4689         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
4690                 m_freem(m);
4691                 return (0);
4692         }
4693         /*
4694          * Process segments with URG.
4695          */
4696         if ((thflags & TH_URG) && th->th_urp &&
4697             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4698                 /*
4699                  * This is a kludge, but if we receive and accept random
4700                  * urgent pointers, we'll crash in soreceive.  It's hard to
4701                  * imagine someone actually wanting to send this much urgent
4702                  * data.
4703                  */
4704                 SOCKBUF_LOCK(&so->so_rcv);
4705                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
4706                         th->th_urp = 0; /* XXX */
4707                         thflags &= ~TH_URG;     /* XXX */
4708                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
4709                         goto dodata;    /* XXX */
4710                 }
4711                 /*
4712                  * If this segment advances the known urgent pointer, then
4713                  * mark the data stream.  This should not happen in
4714                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
4715                  * FIN has been received from the remote side. In these
4716                  * states we ignore the URG.
4717                  *
4718                  * According to RFC961 (Assigned Protocols), the urgent
4719                  * pointer points to the last octet of urgent data.  We
4720                  * continue, however, to consider it to indicate the first
4721                  * octet of data past the urgent section as the original
4722                  * spec states (in one of two places).
4723                  */
4724                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
4725                         tp->rcv_up = th->th_seq + th->th_urp;
4726                         so->so_oobmark = sbavail(&so->so_rcv) +
4727                             (tp->rcv_up - tp->rcv_nxt) - 1;
4728                         if (so->so_oobmark == 0)
4729                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
4730                         sohasoutofband(so);
4731                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4732                 }
4733                 SOCKBUF_UNLOCK(&so->so_rcv);
4734                 /*
4735                  * Remove out of band data so it doesn't get presented to the user.
4736                  * This can happen independent of advancing the URG pointer,
4737                  * but if two URG's are pending at once, some out-of-band
4738                  * data may creep in... ick.
4739                  */
4740                 if (th->th_urp <= (uint32_t) tlen &&
4741                     !(so->so_options & SO_OOBINLINE)) {
4742                         /* hdr drop is delayed */
4743                         tcp_pulloutofband(so, th, m, drop_hdrlen);
4744                 }
4745         } else {
4746                 /*
4747                  * If no out of band data is expected, pull receive urgent
4748                  * pointer along with the receive window.
4749                  */
4750                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4751                         tp->rcv_up = tp->rcv_nxt;
4752         }
4753 dodata:                         /* XXX */
4754         INP_WLOCK_ASSERT(tp->t_inpcb);
4755
4756         /*
4757          * Process the segment text, merging it into the TCP sequencing
4758          * queue, and arranging for acknowledgment of receipt if necessary.
4759          * This process logically involves adjusting tp->rcv_wnd as data is
4760          * presented to the user (this happens in tcp_usrreq.c, case
4761          * PRU_RCVD).  If a FIN has already been received on this connection
4762          * then we just ignore the text.
4763          */
4764         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
4765                    IS_FASTOPEN(tp->t_flags));
4766         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
4767             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4768                 tcp_seq save_start = th->th_seq;
4769
4770                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4771                 /*
4772                  * Insert segment which includes th into TCP reassembly
4773                  * queue with control block tp.  Set thflags to whether
4774                  * reassembly now includes a segment with FIN.  This handles
4775                  * the common case inline (segment is the next to be
4776                  * received on an established connection, and the queue is
4777                  * empty), avoiding linkage into and removal from the queue
4778                  * and repetition of various conversions. Set DELACK for
4779                  * segments received in order, but ack immediately when
4780                  * segments are out of order (so fast retransmit can work).
4781                  */
4782                 if (th->th_seq == tp->rcv_nxt &&
4783                     SEGQ_EMPTY(tp) &&
4784                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
4785                     tfo_syn)) {
4786                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
4787                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4788                                 tp->t_flags |= TF_DELACK;
4789                         } else {
4790                                 rack->r_wanted_output++;
4791                                 tp->t_flags |= TF_ACKNOW;
4792                         }
4793                         tp->rcv_nxt += tlen;
4794                         thflags = th->th_flags & TH_FIN;
4795                         TCPSTAT_ADD(tcps_rcvpack, nsegs);
4796                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
4797                         SOCKBUF_LOCK(&so->so_rcv);
4798                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
4799                                 m_freem(m);
4800                         else
4801                                 sbappendstream_locked(&so->so_rcv, m, 0);
4802                         /* NB: sorwakeup_locked() does an implicit unlock. */
4803                         sorwakeup_locked(so);
4804                 } else {
4805                         /*
4806                          * XXX: Due to the header drop above "th" is
4807                          * theoretically invalid by now.  Fortunately
4808                          * m_adj() doesn't actually free any mbufs when
4809                          * trimming from the head.
4810                          */
4811                         thflags = tcp_reass(tp, th, &save_start, &tlen, m);
4812                         tp->t_flags |= TF_ACKNOW;
4813                 }
4814                 if (tlen > 0)
4815                         tcp_update_sack_list(tp, save_start, save_start + tlen);
4816         } else {
4817                 m_freem(m);
4818                 thflags &= ~TH_FIN;
4819         }
4820
4821         /*
4822          * If FIN is received ACK the FIN and let the user know that the
4823          * connection is closing.
4824          */
4825         if (thflags & TH_FIN) {
4826                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4827                         socantrcvmore(so);
4828                         /*
4829                          * If connection is half-synchronized (ie NEEDSYN
4830                          * flag on) then delay ACK, so it may be piggybacked
4831                          * when SYN is sent. Otherwise, since we received a
4832                          * FIN then no more input can be expected, send ACK
4833                          * now.
4834                          */
4835                         if (tp->t_flags & TF_NEEDSYN) {
4836                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4837                                 tp->t_flags |= TF_DELACK;
4838                         } else {
4839                                 tp->t_flags |= TF_ACKNOW;
4840                         }
4841                         tp->rcv_nxt++;
4842                 }
4843                 switch (tp->t_state) {
4844
4845                         /*
4846                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
4847                          * CLOSE_WAIT state.
4848                          */
4849                 case TCPS_SYN_RECEIVED:
4850                         tp->t_starttime = ticks;
4851                         /* FALLTHROUGH */
4852                 case TCPS_ESTABLISHED:
4853                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4854                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
4855                         break;
4856
4857                         /*
4858                          * If still in FIN_WAIT_1 STATE FIN has not been
4859                          * acked so enter the CLOSING state.
4860                          */
4861                 case TCPS_FIN_WAIT_1:
4862                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4863                         tcp_state_change(tp, TCPS_CLOSING);
4864                         break;
4865
4866                         /*
4867                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
4868                          * starting the time-wait timer, turning off the
4869                          * other standard timers.
4870                          */
4871                 case TCPS_FIN_WAIT_2:
4872                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4873                         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
4874                         tcp_twstart(tp);
4875                         return (1);
4876                 }
4877         }
4878         /*
4879          * Return any desired output.
4880          */
4881         if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
4882                 rack->r_wanted_output++;
4883         }
4884         INP_WLOCK_ASSERT(tp->t_inpcb);
4885         return (0);
4886 }
4887
4888 /*
4889  * Here nothing is really faster, it's just that we
4890  * have broken out the fast-data path as well, just like
4891  * the fast-ack path.
4892  */
4893 static int
4894 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
4895     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
4896     uint32_t tiwin, int32_t nxt_pkt)
4897 {
4898         int32_t nsegs;
4899         int32_t newsize = 0;    /* automatic sockbuf scaling */
4900         struct tcp_rack *rack;
4901 #ifdef TCPDEBUG
4902         /*
4903          * The size of tcp_saveipgen must be the size of the max ip header,
4904          * now IPv6.
4905          */
4906         u_char tcp_saveipgen[IP6_HDR_LEN];
4907         struct tcphdr tcp_savetcp;
4908         short ostate = 0;
4909
4910 #endif
4911         /*
4912          * If last ACK falls within this segment's sequence numbers, record
4913          * the timestamp. NOTE that the test is modified according to the
4914          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
4915          */
4916         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
4917                 return (0);
4918         }
4919         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
4920                 return (0);
4921         }
4922         if (tiwin && tiwin != tp->snd_wnd) {
4923                 return (0);
4924         }
4925         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
4926                 return (0);
4927         }
4928         if (__predict_false((to->to_flags & TOF_TS) &&
4929             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
4930                 return (0);
4931         }
4932         if (__predict_false((th->th_ack != tp->snd_una))) {
4933                 return (0);
4934         }
4935         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
4936                 return (0);
4937         }
4938         if ((to->to_flags & TOF_TS) != 0 &&
4939             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
4940                 tp->ts_recent_age = tcp_ts_getticks();
4941                 tp->ts_recent = to->to_tsval;
4942         }
4943         rack = (struct tcp_rack *)tp->t_fb_ptr;
4944         /*
4945          * This is a pure, in-sequence data packet with nothing on the
4946          * reassembly queue and we have enough buffer space to take it.
4947          */
4948         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4949
4950
4951         /* Clean receiver SACK report if present */
4952         if (tp->rcv_numsacks)
4953                 tcp_clean_sackreport(tp);
4954         TCPSTAT_INC(tcps_preddat);
4955         tp->rcv_nxt += tlen;
4956         /*
4957          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
4958          */
4959         tp->snd_wl1 = th->th_seq;
4960         /*
4961          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
4962          */
4963         tp->rcv_up = tp->rcv_nxt;
4964         TCPSTAT_ADD(tcps_rcvpack, nsegs);
4965         TCPSTAT_ADD(tcps_rcvbyte, tlen);
4966 #ifdef TCPDEBUG
4967         if (so->so_options & SO_DEBUG)
4968                 tcp_trace(TA_INPUT, ostate, tp,
4969                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
4970 #endif
4971         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
4972
4973         /* Add data to socket buffer. */
4974         SOCKBUF_LOCK(&so->so_rcv);
4975         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
4976                 m_freem(m);
4977         } else {
4978                 /*
4979                  * Set new socket buffer size. Give up when limit is
4980                  * reached.
4981                  */
4982                 if (newsize)
4983                         if (!sbreserve_locked(&so->so_rcv,
4984                             newsize, so, NULL))
4985                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
4986                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4987                 sbappendstream_locked(&so->so_rcv, m, 0);
4988                 rack_calc_rwin(so, tp);
4989         }
4990         /* NB: sorwakeup_locked() does an implicit unlock. */
4991         sorwakeup_locked(so);
4992         if (DELAY_ACK(tp, tlen)) {
4993                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4994                 tp->t_flags |= TF_DELACK;
4995         } else {
4996                 tp->t_flags |= TF_ACKNOW;
4997                 rack->r_wanted_output++;
4998         }
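        /*
         * With everything sent now acknowledged, the SACK filter holds no
         * useful state, so (when enabled) reset it relative to snd_una.
         */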
4999         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
5000                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5001         return (1);
5002 }
5003
5004 /*
5005  * This subfunction attempts to highly optimize the
5006  * fast path. We again allow window updates that are
5007  * in sequence to remain in the fast path, and we add
5008  * the __predict hints to attempt to help the compiler.
5009  * Note that if we return 0, then we can *not* process
5010  * the segment and the caller should push the packet into
5011  * the slow path.
5012  */
5013 static int
5014 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5015     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5016     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
5017 {
5018         int32_t acked;
5019         int32_t nsegs;
5020
5021 #ifdef TCPDEBUG
5022         /*
5023          * The size of tcp_saveipgen must be the size of the max ip header,
5024          * currently IPv6.
5025          */
5026         u_char tcp_saveipgen[IP6_HDR_LEN];
5027         struct tcphdr tcp_savetcp;
5028         short ostate = 0;
5029
5030 #endif
5031         struct tcp_rack *rack;
5032
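        /*
         * The checks below admit only a plain ACK: new but not beyond
         * snd_max, no retransmission in progress, a non-zero window, no
         * pending SYN/FIN, no PAWS failure, not in recovery, and no SACK
         * holes on the scoreboard.  Anything else returns 0 and takes
         * the slow path.
         */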
5033         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5034                 /* Old ack, behind (or duplicate to) the last one rcv'd */
5035                 return (0);
5036         }
5037         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
5038                 /* Above what we have sent? */
5039                 return (0);
5040         }
5041         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5042                 /* We are retransmitting */
5043                 return (0);
5044         }
5045         if (__predict_false(tiwin == 0)) {
5046                 /* zero window */
5047                 return (0);
5048         }
5049         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
5050                 /* We need a SYN or a FIN, unlikely.. */
5051                 return (0);
5052         }
5053         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
5054                 /* Timestamp is behind .. old ack with seq wrap? */
5055                 return (0);
5056         }
5057         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
5058                 /* Still recovering */
5059                 return (0);
5060         }
5061         rack = (struct tcp_rack *)tp->t_fb_ptr;
5062         if (rack->r_ctl.rc_sacked) {
5063                 /* We have sack holes on our scoreboard */
5064                 return (0);
5065         }
5066         /* Ok if we reach here, we can process a fast-ack */
5067         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5068         rack_log_ack(tp, to, th);
5069         /* Did the window get updated? */
5070         if (tiwin != tp->snd_wnd) {
5071                 tp->snd_wnd = tiwin;
5072                 tp->snd_wl1 = th->th_seq;
5073                 if (tp->snd_wnd > tp->max_sndwnd)
5074                         tp->max_sndwnd = tp->snd_wnd;
5075         }
5076         if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
5077                 rack_exit_persist(tp, rack);
5078         }
5079         /*
5080          * If last ACK falls within this segment's sequence numbers, record
5081          * the timestamp. NOTE that the test is modified according to the
5082          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5083          */
5084         if ((to->to_flags & TOF_TS) != 0 &&
5085             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5086                 tp->ts_recent_age = tcp_ts_getticks();
5087                 tp->ts_recent = to->to_tsval;
5088         }
5089         /*
5090          * This is a pure ack for outstanding data.
5091          */
5092         TCPSTAT_INC(tcps_predack);
5093
5094         /*
5095          * "bad retransmit" recovery.
5096          */
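        /*
         * If the first RTO retransmission is acked while ticks is still
         * before t_badrxtwin, the retransmit was most likely spurious, so
         * signal CC_RTO_ERR to let congestion control undo the backoff.
         */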
5097         if (tp->t_flags & TF_PREVVALID) {
5098                 tp->t_flags &= ~TF_PREVVALID;
5099                 if (tp->t_rxtshift == 1 &&
5100                     (int)(ticks - tp->t_badrxtwin) < 0)
5101                         rack_cong_signal(tp, th, CC_RTO_ERR);
5102         }
5103         /*
5104          * Recalculate the transmit timer / rtt.
5105          *
5106          * Some boxes send broken timestamp replies during the SYN+ACK
5107          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5108          * and blow up the retransmit timer.
5109          */
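        /* acked is the number of bytes newly acknowledged: th->th_ack - snd_una. */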
5110         acked = BYTES_THIS_ACK(tp, th);
5111
5112 #ifdef TCP_HHOOK
5113         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
5114         hhook_run_tcp_est_in(tp, th, to);
5115 #endif
5116
5117         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5118         TCPSTAT_ADD(tcps_rcvackbyte, acked);
5119         sbdrop(&so->so_snd, acked);
5120         /*
5121          * Let the congestion control algorithm update congestion control
5122          * related information. This typically means increasing the
5123          * congestion window.
5124          */
5125         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
5126
5127         tp->snd_una = th->th_ack;
5128         /*
5129          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
5130          */
5131         tp->snd_wl2 = th->th_ack;
5132         tp->t_dupacks = 0;
5133         m_freem(m);
5134         /* ND6_HINT(tp);         *//* Some progress has been made. */
5135
5136         /*
5137          * If all outstanding data are acked, stop retransmit timer,
5138          * otherwise restart timer using current (possibly backed-off)
5139          * value. If process is waiting for space, wakeup/selwakeup/signal.
5140          * If data are ready to send, let tcp_output decide between more
5141          * output or persist.
5142          */
5143 #ifdef TCPDEBUG
5144         if (so->so_options & SO_DEBUG)
5145                 tcp_trace(TA_INPUT, ostate, tp,
5146                     (void *)tcp_saveipgen,
5147                     &tcp_savetcp, 0);
5148 #endif
5149         if (tp->snd_una == tp->snd_max) {
5150                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5151                 tp->t_acktime = 0;
5152                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5153         }
5154         /* Wake up the socket if we have room to write more */
5155         sowwakeup(so);
5156         if (sbavail(&so->so_snd)) {
5157                 rack->r_wanted_output++;
5158         }
5159         return (1);
5160 }
5161
5162 /*
5163  * Return value of 1, the TCB is unlocked and most
5164  * likely gone, return value of 0, the TCP is still
5165  * locked.
5166  */
5167 static int
5168 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
5169     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5170     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5171 {
5172         int32_t ret_val = 0;
5173         int32_t todrop;
5174         int32_t ourfinisacked = 0;
5175
5176         rack_calc_rwin(so, tp);
5177         /*
5178          * If the state is SYN_SENT: if the segment contains an ACK, but not
5179          * for our SYN, drop the input. If it contains a RST, drop the
5180          * connection. If it does not contain a SYN, drop it. Otherwise this
5181          * is an acceptable SYN segment: initialize tp->rcv_nxt and tp->irs;
5182          * if the segment contains an ACK, advance tp->snd_una; if it contains
5183          * an ECE and ECN support is enabled, the stream is ECN capable. If
5184          * the SYN has been acked, change to ESTABLISHED, else to SYN_RCVD
5185          * state; arrange for the segment to be acked (eventually); continue
5186          * processing the rest of the data/controls, beginning with URG.
5187          */
5188         if ((thflags & TH_ACK) &&
5189             (SEQ_LEQ(th->th_ack, tp->iss) ||
5190             SEQ_GT(th->th_ack, tp->snd_max))) {
5191                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5192                 return (1);
5193         }
5194         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
5195                 TCP_PROBE5(connect__refused, NULL, tp,
5196                     mtod(m, const char *), tp, th);
5197                 tp = tcp_drop(tp, ECONNREFUSED);
5198                 rack_do_drop(m, tp);
5199                 return (1);
5200         }
5201         if (thflags & TH_RST) {
5202                 rack_do_drop(m, tp);
5203                 return (1);
5204         }
5205         if (!(thflags & TH_SYN)) {
5206                 rack_do_drop(m, tp);
5207                 return (1);
5208         }
5209         tp->irs = th->th_seq;
5210         tcp_rcvseqinit(tp);
5211         if (thflags & TH_ACK) {
5212                 int tfo_partial = 0;
5213                 
5214                 TCPSTAT_INC(tcps_connects);
5215                 soisconnected(so);
5216 #ifdef MAC
5217                 mac_socketpeer_set_from_mbuf(m, so);
5218 #endif
5219                 /* Do window scaling on this connection? */
5220                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5221                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5222                         tp->rcv_scale = tp->request_r_scale;
5223                 }
5224                 tp->rcv_adv += min(tp->rcv_wnd,
5225                     TCP_MAXWIN << tp->rcv_scale);
5226                 /*
5227                  * If not all the data that was sent in the TFO SYN
5228                  * has been acked, resend the remainder right away.
5229                  */
5230                 if (IS_FASTOPEN(tp->t_flags) &&
5231                     (tp->snd_una != tp->snd_max)) {
5232                         tp->snd_nxt = th->th_ack;
5233                         tfo_partial = 1;
5234                 }
5235                 /*
5236                  * If there's data, delay ACK; if there's also a FIN, ACKNOW
5237                  * will be turned on later.
5238                  */
5239                 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
5240                         rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
5241                                           ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
5242                         tp->t_flags |= TF_DELACK;
5243                 } else {
5244                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
5245                         tp->t_flags |= TF_ACKNOW;
5246                 }
5247
5248                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
5249                     V_tcp_do_ecn) {
5250                         tp->t_flags |= TF_ECN_PERMIT;
5251                         TCPSTAT_INC(tcps_ecn_shs);
5252                 }
5253                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
5254                         /* 
5255                          * We advance snd_una for the 
5256                          * fast open case. If th_ack is
5257                          * acknowledging data beyond 
5258                          * snd_una we can't just call
5259                          * ack-processing since the 
5260                          * data stream in our send-map
5261                          * will start at snd_una + 1 (one
5262                          * beyond the SYN). If it's just
5263                          * equal we don't need to do that
5264                          * and there is no send_map.
5265                          */
5266                         tp->snd_una++;
5267                 }
5268                 /*
5269                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
5270                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
5271                  */
5272                 tp->t_starttime = ticks;
5273                 if (tp->t_flags & TF_NEEDFIN) {
5274                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
5275                         tp->t_flags &= ~TF_NEEDFIN;
5276                         thflags &= ~TH_SYN;
5277                 } else {
5278                         tcp_state_change(tp, TCPS_ESTABLISHED);
5279                         TCP_PROBE5(connect__established, NULL, tp,
5280                             mtod(m, const char *), tp, th);
5281                         cc_conn_init(tp);
5282                 }
5283         } else {
5284                 /*
5285                  * Received initial SYN in SYN-SENT[*] state => simultaneous
5286                  * open.  If segment contains CC option and there is a
5287                  * cached CC, apply TAO test. If it succeeds, connection is
5288                  * half-synchronized. Otherwise, do 3-way handshake:
5289                  * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
5290                  * there was no CC option, clear cached CC value.
5291                  */
5292                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
5293                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
5294         }
5295         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5296         INP_WLOCK_ASSERT(tp->t_inpcb);
5297         /*
5298          * Advance th->th_seq to correspond to first data byte. If data,
5299          * trim to stay within window, dropping FIN if necessary.
5300          */
5301         th->th_seq++;
5302         if (tlen > tp->rcv_wnd) {
5303                 todrop = tlen - tp->rcv_wnd;
5304                 m_adj(m, -todrop);
5305                 tlen = tp->rcv_wnd;
5306                 thflags &= ~TH_FIN;
5307                 TCPSTAT_INC(tcps_rcvpackafterwin);
5308                 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
5309         }
5310         tp->snd_wl1 = th->th_seq - 1;
5311         tp->rcv_up = th->th_seq;
5312         /*
5313          * Client side of transaction: already sent SYN and data. If the
5314          * remote host used T/TCP to validate the SYN, our data will be
5315          * ACK'd; if so, enter normal data segment processing in the middle
5316          * of step 5, ack processing. Otherwise, goto step 6.
5317          */
5318         if (thflags & TH_ACK) {
5319                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
5320                         return (ret_val);
5321                 /* We may have changed to FIN_WAIT_1 above */
5322                 if (tp->t_state == TCPS_FIN_WAIT_1) {
5323                         /*
5324                          * In FIN_WAIT_1 STATE in addition to the processing
5325                          * for the ESTABLISHED state if our FIN is now
5326                          * acknowledged then enter FIN_WAIT_2.
5327                          */
5328                         if (ourfinisacked) {
5329                                 /*
5330                                  * If we can't receive any more data, then
5331                                  * closing user can proceed. Starting the
5332                                  * timer is contrary to the specification,
5333                                  * but if we don't get a FIN we'll hang
5334                                  * forever.
5335                                  *
5336                                  * XXXjl: we should release the tp also, and
5337                                  * use a compressed state.
5338                                  */
5339                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5340                                         soisdisconnected(so);
5341                                         tcp_timer_activate(tp, TT_2MSL,
5342                                             (tcp_fast_finwait2_recycle ?
5343                                             tcp_finwait2_timeout :
5344                                             TP_MAXIDLE(tp)));
5345                                 }
5346                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5347                         }
5348                 }
5349         }
5350         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5351            tiwin, thflags, nxt_pkt));
5352 }
5353
5354 /*
5355  * Return value of 1, the TCB is unlocked and most
5356  * likely gone, return value of 0, the TCP is still
5357  * locked.
5358  */
5359 static int
5360 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
5361     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5362     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5363 {
5364         int32_t ret_val = 0;
5365         int32_t ourfinisacked = 0;
5366
5367         rack_calc_rwin(so, tp);
5368
5369         if ((thflags & TH_ACK) &&
5370             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
5371             SEQ_GT(th->th_ack, tp->snd_max))) {
5372                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5373                 return (1);
5374         }
5375         if (IS_FASTOPEN(tp->t_flags)) {
5376                 /*
5377                  * When a TFO connection is in SYN_RECEIVED, the
5378                  * only valid packets are the initial SYN, a
5379                  * retransmit/copy of the initial SYN (possibly with
5380                  * a subset of the original data), a valid ACK, a
5381                  * FIN, or a RST.
5382                  */
5383                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
5384                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5385                         return (1);
5386                 } else if (thflags & TH_SYN) {
5387                         /* non-initial SYN is ignored */
5388                         struct tcp_rack *rack;
5389
5390                         rack = (struct tcp_rack *)tp->t_fb_ptr;
5391                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
5392                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
5393                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
5394                                 rack_do_drop(m, NULL);
5395                                 return (0);
5396                         }
5397                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
5398                         rack_do_drop(m, NULL);
5399                         return (0);
5400                 }
5401         }
5402         if (thflags & TH_RST)
5403                 return (rack_process_rst(m, th, so, tp));
5404         /*
5405          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5406          * it's less than ts_recent, drop it.
5407          */
5408         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5409             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5410                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5411                         return (ret_val);
5412         }
5413         /*
5414          * In the SYN-RECEIVED state, validate that the packet belongs to
5415          * this connection before trimming the data to fit the receive
5416          * window.  Check the sequence number versus IRS since we know the
5417          * sequence numbers haven't wrapped.  This is a partial fix for the
5418          * "LAND" DoS attack.
5419          */
5420         if (SEQ_LT(th->th_seq, tp->irs)) {
5421                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5422                 return (1);
5423         }
5424         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5425                 return (ret_val);
5426         }
5427         /*
5428          * If last ACK falls within this segment's sequence numbers, record
5429          * its timestamp. NOTE: 1) That the test incorporates suggestions
5430          * from the latest proposal of the tcplw@cray.com list (Braden
5431          * 1993/04/26). 2) That updating only on newer timestamps interferes
5432          * with our earlier PAWS tests, so this check should be solely
5433          * predicated on the sequence space of this segment. 3) That we
5434          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5435          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5436          * SEG.Len. This modified check allows us to overcome RFC1323's
5437          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5438          * p.869. In such cases, we can still calculate the RTT correctly
5439          * when RCV.NXT == Last.ACK.Sent.
5440          */
5441         if ((to->to_flags & TOF_TS) != 0 &&
5442             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5443             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5444             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5445                 tp->ts_recent_age = tcp_ts_getticks();
5446                 tp->ts_recent = to->to_tsval;
5447         }
5448         /*
5449          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5450          * is on (half-synchronized state), then queue data for later
5451          * processing; else drop segment and return.
5452          */
5453         if ((thflags & TH_ACK) == 0) {
5454                 if (IS_FASTOPEN(tp->t_flags)) {
5455                         tp->snd_wnd = tiwin;
5456                         cc_conn_init(tp);
5457                 }
5458                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5459                     tiwin, thflags, nxt_pkt));
5460         }
5461         TCPSTAT_INC(tcps_connects);
5462         soisconnected(so);
5463         /* Do window scaling? */
5464         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5465             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5466                 tp->rcv_scale = tp->request_r_scale;
5467                 tp->snd_wnd = tiwin;
5468         }
5469         /*
5470          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
5471          * FIN-WAIT-1
5472          */
5473         tp->t_starttime = ticks;
5474         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
5475                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
5476                 tp->t_tfo_pending = NULL;
5477
5478                 /*
5479                  * Account for the ACK of our SYN prior to
5480                  * regular ACK processing below.
5481                  */ 
5482                 tp->snd_una++;
5483         }
5484         if (tp->t_flags & TF_NEEDFIN) {
5485                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
5486                 tp->t_flags &= ~TF_NEEDFIN;
5487         } else {
5488                 tcp_state_change(tp, TCPS_ESTABLISHED);
5489                 TCP_PROBE5(accept__established, NULL, tp,
5490                     mtod(m, const char *), tp, th);
5491                 /*
5492                  * TFO connections call cc_conn_init() during SYN
5493                  * processing.  Calling it again here for such connections
5494                  * is not harmless as it would undo the snd_cwnd reduction
5495                  * that occurs when a TFO SYN|ACK is retransmitted.
5496                  */
5497                 if (!IS_FASTOPEN(tp->t_flags))
5498                         cc_conn_init(tp);
5499         }
5500         /*
5501          * If segment contains data or ACK, will call tcp_reass() later; if
5502          * not, do so now to pass queued data to user.
5503          */
5504         if (tlen == 0 && (thflags & TH_FIN) == 0)
5505                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
5506                     (struct mbuf *)0);
5507         tp->snd_wl1 = th->th_seq - 1;
5508         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5509                 return (ret_val);
5510         }
5511         if (tp->t_state == TCPS_FIN_WAIT_1) {
5512                 /* We could have gone to FIN_WAIT_1 (or EST) above */
5513                 /*
5514                  * In FIN_WAIT_1 STATE in addition to the processing for the
5515                  * ESTABLISHED state if our FIN is now acknowledged then
5516                  * enter FIN_WAIT_2.
5517                  */
5518                 if (ourfinisacked) {
5519                         /*
5520                          * If we can't receive any more data, then closing
5521                          * user can proceed. Starting the timer is contrary
5522                          * to the specification, but if we don't get a FIN
5523                          * we'll hang forever.
5524                          *
5525                          * XXXjl: we should release the tp also, and use a
5526                          * compressed state.
5527                          */
5528                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5529                                 soisdisconnected(so);
5530                                 tcp_timer_activate(tp, TT_2MSL,
5531                                     (tcp_fast_finwait2_recycle ?
5532                                     tcp_finwait2_timeout :
5533                                     TP_MAXIDLE(tp)));
5534                         }
5535                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
5536                 }
5537         }
5538         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5539             tiwin, thflags, nxt_pkt));
5540 }
5541
5542 /*
5543  * Return value of 1, the TCB is unlocked and most
5544  * likely gone, return value of 0, the TCP is still
5545  * locked.
5546  */
5547 static int
5548 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5549     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5550     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5551 {
5552         int32_t ret_val = 0;
5553
5554         /*
5555          * Header prediction: check for the two common cases of a
5556          * uni-directional data xfer.  If the packet has no control flags,
5557          * is in-sequence, the window didn't change and we're not
5558          * retransmitting, it's a candidate.  If the length is zero and the
5559          * ack moved forward, we're the sender side of the xfer.  Just free
5560          * the data acked & wake any higher level process that was blocked
5561          * waiting for space.  If the length is non-zero and the ack didn't
5562          * move, we're the receiver side.  If we're getting packets in-order
5563          * (the reassembly queue is empty), add the data to the socket
5564          * buffer and note that we need a delayed ack. Make sure that the
5565          * hidden state-flags are also off. Since we check for
5566          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
5567          */
5568         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
5569             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
5570             __predict_true(SEGQ_EMPTY(tp)) &&
5571             __predict_true(th->th_seq == tp->rcv_nxt)) {
5572                 struct tcp_rack *rack;
5573
5574                 rack = (struct tcp_rack *)tp->t_fb_ptr;
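                /*
                 * A zero-length segment here is a pure ACK for the sending
                 * side; a non-zero length is in-order data for the receiving
                 * side.  Either handler returning non-zero means the segment
                 * was fully consumed on the fast path.
                 */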
5575                 if (tlen == 0) {
5576                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
5577                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
5578                                 return (0);
5579                         }
5580                 } else {
5581                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
5582                             tiwin, nxt_pkt)) {
5583                                 return (0);
5584                         }
5585                 }
5586         }
5587         rack_calc_rwin(so, tp);
5588
5589         if (thflags & TH_RST)
5590                 return (rack_process_rst(m, th, so, tp));
5591
5592         /*
5593          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5594          * synchronized state.
5595          */
5596         if (thflags & TH_SYN) {
5597                 rack_challenge_ack(m, th, tp, &ret_val);
5598                 return (ret_val);
5599         }
5600         /*
5601          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5602          * it's less than ts_recent, drop it.
5603          */
5604         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5605             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5606                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5607                         return (ret_val);
5608         }
5609         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5610                 return (ret_val);
5611         }
5612         /*
5613          * If last ACK falls within this segment's sequence numbers, record
5614          * its timestamp. NOTE: 1) That the test incorporates suggestions
5615          * from the latest proposal of the tcplw@cray.com list (Braden
5616          * 1993/04/26). 2) That updating only on newer timestamps interferes
5617          * with our earlier PAWS tests, so this check should be solely
5618          * predicated on the sequence space of this segment. 3) That we
5619          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5620          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5621          * SEG.Len. This modified check allows us to overcome RFC1323's
5622          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5623          * p.869. In such cases, we can still calculate the RTT correctly
5624          * when RCV.NXT == Last.ACK.Sent.
5625          */
5626         if ((to->to_flags & TOF_TS) != 0 &&
5627             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5628             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5629             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5630                 tp->ts_recent_age = tcp_ts_getticks();
5631                 tp->ts_recent = to->to_tsval;
5632         }
5633         /*
5634          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5635          * is on (half-synchronized state), then queue data for later
5636          * processing; else drop segment and return.
5637          */
5638         if ((thflags & TH_ACK) == 0) {
5639                 if (tp->t_flags & TF_NEEDSYN) {
5640
5641                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5642                             tiwin, thflags, nxt_pkt));
5643
5644                 } else if (tp->t_flags & TF_ACKNOW) {
5645                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5646                         return (ret_val);
5647                 } else {
5648                         rack_do_drop(m, NULL);
5649                         return (0);
5650                 }
5651         }
5652         /*
5653          * Ack processing.
5654          */
5655         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5656                 return (ret_val);
5657         }
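        /*
         * If data is still queued for transmission, make sure the
         * connection has not been stuck without progress for too long;
         * if it has, reset it rather than let it linger forever.
         */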
5658         if (sbavail(&so->so_snd)) {
5659                 if (rack_progress_timeout_check(tp)) {
5660                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5661                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5662                         return (1);
5663                 }
5664         }
5665         /* State changes only happen in rack_process_data() */
5666         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5667             tiwin, thflags, nxt_pkt));
5668 }
5669
5670 /*
5671  * Return value of 1, the TCB is unlocked and most
5672  * likely gone, return value of 0, the TCP is still
5673  * locked.
5674  */
5675 static int
5676 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
5677     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5678     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5679 {
5680         int32_t ret_val = 0;
5681
5682         rack_calc_rwin(so, tp);
5683         if (thflags & TH_RST)
5684                 return (rack_process_rst(m, th, so, tp));
5685         /*
5686          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5687          * synchronized state.
5688          */
5689         if (thflags & TH_SYN) {
5690                 rack_challenge_ack(m, th, tp, &ret_val);
5691                 return (ret_val);
5692         }
5693         /*
5694          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5695          * it's less than ts_recent, drop it.
5696          */
5697         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5698             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5699                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5700                         return (ret_val);
5701         }
5702         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5703                 return (ret_val);
5704         }
5705         /*
5706          * If last ACK falls within this segment's sequence numbers, record
5707          * its timestamp. NOTE: 1) That the test incorporates suggestions
5708          * from the latest proposal of the tcplw@cray.com list (Braden
5709          * 1993/04/26). 2) That updating only on newer timestamps interferes
5710          * with our earlier PAWS tests, so this check should be solely
5711          * predicated on the sequence space of this segment. 3) That we
5712          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5713          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5714          * SEG.Len. This modified check allows us to overcome RFC1323's
5715          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5716          * p.869. In such cases, we can still calculate the RTT correctly
5717          * when RCV.NXT == Last.ACK.Sent.
5718          */
5719         if ((to->to_flags & TOF_TS) != 0 &&
5720             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5721             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5722             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5723                 tp->ts_recent_age = tcp_ts_getticks();
5724                 tp->ts_recent = to->to_tsval;
5725         }
5726         /*
5727          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5728          * is on (half-synchronized state), then queue data for later
5729          * processing; else drop segment and return.
5730          */
5731         if ((thflags & TH_ACK) == 0) {
5732                 if (tp->t_flags & TF_NEEDSYN) {
5733                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5734                             tiwin, thflags, nxt_pkt));
5735
5736                 } else if (tp->t_flags & TF_ACKNOW) {
5737                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5738                         return (ret_val);
5739                 } else {
5740                         rack_do_drop(m, NULL);
5741                         return (0);
5742                 }
5743         }
5744         /*
5745          * Ack processing.
5746          */
5747         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5748                 return (ret_val);
5749         }
5750         if (sbavail(&so->so_snd)) {
5751                 if (rack_progress_timeout_check(tp)) {
5752                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5753                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5754                         return (1);
5755                 }
5756         }
5757         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5758             tiwin, thflags, nxt_pkt));
5759 }
5760
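/*
 * Handle data arriving after the local user has closed the socket.  If
 * the stack is not configured to tolerate such data (or nothing remains
 * to be sent), the connection is closed and reset; we return 1 and the
 * TCB may be gone.  Otherwise the data is ignored, a follow-up reset is
 * arranged, and 0 is returned.
 */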
5761 static int
5762 rack_check_data_after_close(struct mbuf *m, 
5763     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
5764 {
5765         struct tcp_rack *rack;
5766
5767         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5768         rack = (struct tcp_rack *)tp->t_fb_ptr;
5769         if (rack->rc_allow_data_af_clo == 0) {
5770         close_now:
5771                 tp = tcp_close(tp);
5772                 TCPSTAT_INC(tcps_rcvafterclose);
5773                 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
5774                 return (1);
5775         }
5776         if (sbavail(&so->so_snd) == 0)
5777                 goto close_now;
5778         /* Ok, we allow data that is ignored and a follow-up reset */
5779         tp->rcv_nxt = th->th_seq + *tlen;
5780         tp->t_flags2 |= TF2_DROP_AF_DATA;
5781         rack->r_wanted_output = 1;
5782         *tlen = 0;
5783         return (0);
5784 }
5785
5786 /*
5787  * Return value of 1, the TCB is unlocked and most
5788  * likely gone, return value of 0, the TCP is still
5789  * locked.
5790  */
5791 static int
5792 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
5793     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5794     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5795 {
5796         int32_t ret_val = 0;
5797         int32_t ourfinisacked = 0;
5798
5799         rack_calc_rwin(so, tp);
5800
5801         if (thflags & TH_RST)
5802                 return (rack_process_rst(m, th, so, tp));
5803         /*
5804          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5805          * synchronized state.
5806          */
5807         if (thflags & TH_SYN) {
5808                 rack_challenge_ack(m, th, tp, &ret_val);
5809                 return (ret_val);
5810         }
5811         /*
5812          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5813          * it's less than ts_recent, drop it.
5814          */
5815         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5816             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5817                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5818                         return (ret_val);
5819         }
5820         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5821                 return (ret_val);
5822         }
5823         /*
5824          * If new data are received on a connection after the user processes
5825          * are gone, then RST the other end.
5826          */
5827         if ((so->so_state & SS_NOFDREF) && tlen) {
5828                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
5829                         return (1);
5830         }
5831         /*
5832          * If last ACK falls within this segment's sequence numbers, record
5833          * its timestamp. NOTE: 1) That the test incorporates suggestions
5834          * from the latest proposal of the tcplw@cray.com list (Braden
5835          * 1993/04/26). 2) That updating only on newer timestamps interferes
5836          * with our earlier PAWS tests, so this check should be solely
5837          * predicated on the sequence space of this segment. 3) That we
5838          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5839          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5840          * SEG.Len. This modified check allows us to overcome RFC1323's
5841          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5842          * p.869. In such cases, we can still calculate the RTT correctly
5843          * when RCV.NXT == Last.ACK.Sent.
5844          */
5845         if ((to->to_flags & TOF_TS) != 0 &&
5846             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5847             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5848             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5849                 tp->ts_recent_age = tcp_ts_getticks();
5850                 tp->ts_recent = to->to_tsval;
5851         }
5852         /*
5853          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5854          * is on (half-synchronized state), then queue data for later
5855          * processing; else drop segment and return.
5856          */
5857         if ((thflags & TH_ACK) == 0) {
5858                 if (tp->t_flags & TF_NEEDSYN) {
5859                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5860                             tiwin, thflags, nxt_pkt));
5861                 } else if (tp->t_flags & TF_ACKNOW) {
5862                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5863                         return (ret_val);
5864                 } else {
5865                         rack_do_drop(m, NULL);
5866                         return (0);
5867                 }
5868         }
5869         /*
5870          * Ack processing.
5871          */
5872         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5873                 return (ret_val);
5874         }
5875         if (ourfinisacked) {
5876                 /*
5877                  * If we can't receive any more data, then closing user can
5878                  * proceed. Starting the timer is contrary to the
5879                  * specification, but if we don't get a FIN we'll hang
5880                  * forever.
5881                  *
5882                  * XXXjl: we should release the tp also, and use a
5883                  * compressed state.
5884                  */
5885                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5886                         soisdisconnected(so);
5887                         tcp_timer_activate(tp, TT_2MSL,
5888                             (tcp_fast_finwait2_recycle ?
5889                             tcp_finwait2_timeout :
5890                             TP_MAXIDLE(tp)));
5891                 }
5892                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5893         }
5894         if (sbavail(&so->so_snd)) {
5895                 if (rack_progress_timeout_check(tp)) {
5896                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5897                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5898                         return (1);
5899                 }
5900         }
5901         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5902             tiwin, thflags, nxt_pkt));
5903 }
5904
5905 /*
5906  * Return value of 1, the TCB is unlocked and most
5907  * likely gone, return value of 0, the TCP is still
5908  * locked.
5909  */
5910 static int
5911 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
5912     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5913     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5914 {
5915         int32_t ret_val = 0;
5916         int32_t ourfinisacked = 0;
5917
5918         rack_calc_rwin(so, tp);
5919
5920         if (thflags & TH_RST)
5921                 return (rack_process_rst(m, th, so, tp));
5922         /*
5923          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5924          * synchronized state.
5925          */
5926         if (thflags & TH_SYN) {
5927                 rack_challenge_ack(m, th, tp, &ret_val);
5928                 return (ret_val);
5929         }
5930         /*
5931          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5932          * it's less than ts_recent, drop it.
5933          */
5934         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5935             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5936                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5937                         return (ret_val);
5938         }
5939         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5940                 return (ret_val);
5941         }
5942         /*
5943          * If new data are received on a connection after the user processes
5944          * are gone, then RST the other end.
5945          */
5946         if ((so->so_state & SS_NOFDREF) && tlen) {
5947                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
5948                         return (1);
5949         }
5950         /*
5951          * If last ACK falls within this segment's sequence numbers, record
5952          * its timestamp. NOTE: 1) That the test incorporates suggestions
5953          * from the latest proposal of the tcplw@cray.com list (Braden
5954          * 1993/04/26). 2) That updating only on newer timestamps interferes
5955          * with our earlier PAWS tests, so this check should be solely
5956          * predicated on the sequence space of this segment. 3) That we
5957          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5958          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5959          * SEG.Len. This modified check allows us to overcome RFC1323's
5960          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5961          * p.869. In such cases, we can still calculate the RTT correctly
5962          * when RCV.NXT == Last.ACK.Sent.
5963          */
5964         if ((to->to_flags & TOF_TS) != 0 &&
5965             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5966             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5967             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5968                 tp->ts_recent_age = tcp_ts_getticks();
5969                 tp->ts_recent = to->to_tsval;
5970         }
5971         /*
5972          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5973          * is on (half-synchronized state), then queue data for later
5974          * processing; else drop segment and return.
5975          */
5976         if ((thflags & TH_ACK) == 0) {
5977                 if (tp->t_flags & TF_NEEDSYN) {
5978                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5979                             tiwin, thflags, nxt_pkt));
5980                 } else if (tp->t_flags & TF_ACKNOW) {
5981                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5982                         return (ret_val);
5983                 } else {
5984                         rack_do_drop(m, NULL);
5985                         return (0);
5986                 }
5987         }
5988         /*
5989          * Ack processing.
5990          */
5991         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5992                 return (ret_val);
5993         }
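        /*
         * In CLOSING, an ACK of our FIN means both directions are now
         * closed: enter TIME_WAIT via tcp_twstart(), which replaces the
         * tcb with a compressed TIME_WAIT state, and free the mbuf here.
         */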
5994         if (ourfinisacked) {
5995                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5996                 tcp_twstart(tp);
5997                 m_freem(m);
5998                 return (1);
5999         }
6000         if (sbavail(&so->so_snd)) {
6001                 if (rack_progress_timeout_check(tp)) {
6002                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6003                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6004                         return (1);
6005                 }
6006         }
6007         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6008             tiwin, thflags, nxt_pkt));
6009 }
6010
6011 /*
6012  * Return value of 1, the TCB is unlocked and most
6013  * likely gone, return value of 0, the TCP is still
6014  * locked.
6015  */
6016 static int
6017 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6018     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6019     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6020 {
6021         int32_t ret_val = 0;
6022         int32_t ourfinisacked = 0;
6023
6024         rack_calc_rwin(so, tp);
6025
6026         if (thflags & TH_RST)
6027                 return (rack_process_rst(m, th, so, tp));
6028         /*
6029          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6030          * synchronized state.
6031          */
6032         if (thflags & TH_SYN) {
6033                 rack_challenge_ack(m, th, tp, &ret_val);
6034                 return (ret_val);
6035         }
6036         /*
6037          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6038          * it's less than ts_recent, drop it.
6039          */
6040         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6041             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6042                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6043                         return (ret_val);
6044         }
6045         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6046                 return (ret_val);
6047         }
6048         /*
6049          * If new data are received on a connection after the user processes
6050          * are gone, then RST the other end.
6051          */
6052         if ((so->so_state & SS_NOFDREF) && tlen) {
6053                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6054                         return (1);
6055         }
6056         /*
6057          * If last ACK falls within this segment's sequence numbers, record
6058          * its timestamp. NOTE: 1) That the test incorporates suggestions
6059          * from the latest proposal of the tcplw@cray.com list (Braden
6060          * 1993/04/26). 2) That updating only on newer timestamps interferes
6061          * with our earlier PAWS tests, so this check should be solely
6062          * predicated on the sequence space of this segment. 3) That we
6063          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6064          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6065          * SEG.Len. This modified check allows us to overcome RFC1323's
6066          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6067          * p.869. In such cases, we can still calculate the RTT correctly
6068          * when RCV.NXT == Last.ACK.Sent.
6069          */
6070         if ((to->to_flags & TOF_TS) != 0 &&
6071             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6072             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6073             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6074                 tp->ts_recent_age = tcp_ts_getticks();
6075                 tp->ts_recent = to->to_tsval;
6076         }
6077         /*
6078          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6079          * is on (half-synchronized state), then queue data for later
6080          * processing; else drop segment and return.
6081          */
6082         if ((thflags & TH_ACK) == 0) {
6083                 if (tp->t_flags & TF_NEEDSYN) {
6084                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6085                             tiwin, thflags, nxt_pkt));
6086                 } else if (tp->t_flags & TF_ACKNOW) {
6087                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6088                         return (ret_val);
6089                 } else {
6090                         rack_do_drop(m, NULL);
6091                         return (0);
6092                 }
6093         }
6094         /*
6095          * case TCPS_LAST_ACK: Ack processing.
6096          */
6097         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6098                 return (ret_val);
6099         }
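        /*
         * In LAST_ACK, an ACK of our FIN completes the close: tear the
         * connection down with tcp_close() and drop the segment.
         */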
6100         if (ourfinisacked) {
6101                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6102                 tp = tcp_close(tp);
6103                 rack_do_drop(m, tp);
6104                 return (1);
6105         }
6106         if (sbavail(&so->so_snd)) {
6107                 if (rack_progress_timeout_check(tp)) {
6108                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6109                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6110                         return (1);
6111                 }
6112         }
6113         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6114             tiwin, thflags, nxt_pkt));
6115 }
6116
6117
6118 /*
6119  * Return value of 1, the TCB is unlocked and most
6120  * likely gone, return value of 0, the TCP is still
6121  * locked.
6122  */
6123 static int
6124 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
6125     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6126     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6127 {
6128         int32_t ret_val = 0;
6129         int32_t ourfinisacked = 0;
6130
6131         rack_calc_rwin(so, tp);
6132
6133         /* Reset receive buffer auto scaling when not in bulk receive mode. */
6134         if (thflags & TH_RST)
6135                 return (rack_process_rst(m, th, so, tp));
6136         /*
6137          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6138          * synchronized state.
6139          */
6140         if (thflags & TH_SYN) {
6141                 rack_challenge_ack(m, th, tp, &ret_val);
6142                 return (ret_val);
6143         }
6144         /*
6145          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6146          * it's less than ts_recent, drop it.
6147          */
6148         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6149             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6150                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6151                         return (ret_val);
6152         }
6153         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6154                 return (ret_val);
6155         }
6156         /*
6157          * If new data are received on a connection after the user processes
6158          * are gone, then RST the other end.
6159          */
6160         if ((so->so_state & SS_NOFDREF) &&
6161             tlen) {
6162                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6163                         return (1);
6164         }
6165         /*
6166          * If last ACK falls within this segment's sequence numbers, record
6167          * its timestamp. NOTE: 1) That the test incorporates suggestions
6168          * from the latest proposal of the tcplw@cray.com list (Braden
6169          * 1993/04/26). 2) That updating only on newer timestamps interferes
6170          * with our earlier PAWS tests, so this check should be solely
6171          * predicated on the sequence space of this segment. 3) That we
6172          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6173          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6174          * SEG.Len. This modified check allows us to overcome RFC1323's
6175          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6176          * p.869. In such cases, we can still calculate the RTT correctly
6177          * when RCV.NXT == Last.ACK.Sent.
6178          */
6179         if ((to->to_flags & TOF_TS) != 0 &&
6180             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6181             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6182             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6183                 tp->ts_recent_age = tcp_ts_getticks();
6184                 tp->ts_recent = to->to_tsval;
6185         }
6186         /*
6187          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6188          * is on (half-synchronized state), then queue data for later
6189          * processing; else drop segment and return.
6190          */
6191         if ((thflags & TH_ACK) == 0) {
6192                 if (tp->t_flags & TF_NEEDSYN) {
6193                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6194                             tiwin, thflags, nxt_pkt));
6195                 } else if (tp->t_flags & TF_ACKNOW) {
6196                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6197                         return (ret_val);
6198                 } else {
6199                         rack_do_drop(m, NULL);
6200                         return (0);
6201                 }
6202         }
6203         /*
6204          * Ack processing.
6205          */
6206         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6207                 return (ret_val);
6208         }
6209         if (sbavail(&so->so_snd)) {
6210                 if (rack_progress_timeout_check(tp)) {
6211                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6212                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6213                         return (1);
6214                 }
6215         }
6216         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6217             tiwin, thflags, nxt_pkt));
6218 }
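/*
 * Like the other rack_do_* state handlers, the function above follows a
 * common skeleton: handle RST, challenge-ACK any SYN, run the PAWS and
 * drop checks, record ts_recent, process the ACK, check the progress
 * timeout, and finally hand the segment to rack_process_data().
 */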
6219
6220
6221 static void inline
6222 rack_clear_rate_sample(struct tcp_rack *rack)
6223 {
6224         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6225         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6226         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6227 }
6228
6229 static int
6230 rack_init(struct tcpcb *tp)
6231 {
6232         struct tcp_rack *rack = NULL;
6233
6234         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6235         if (tp->t_fb_ptr == NULL) {
6236                 /*
6237                  * We need to allocate memory but can't.  The INP and
6238                  * INP_INFO locks are held and they are recursive (this
6239                  * happens during setup), so a scheme to drop the locks
6240                  * fails. :(
6241                  */
6242                 return (ENOMEM);
6243         }
6244         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6245
6246         rack = (struct tcp_rack *)tp->t_fb_ptr;
6247         TAILQ_INIT(&rack->r_ctl.rc_map);
6248         TAILQ_INIT(&rack->r_ctl.rc_free);
6249         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6250         rack->rc_tp = tp;
6251         if (tp->t_inpcb) {
6252                 rack->rc_inp = tp->t_inpcb;
6253         }
6254         /* Probably not needed but let's be sure */
6255         rack_clear_rate_sample(rack);
6256         rack->r_cpu = 0;
6257         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6258         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6259         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6260         rack->rc_pace_reduce = rack_slot_reduction;
6261         if (V_tcp_delack_enabled)
6262                 tp->t_delayed_ack = 1;
6263         else
6264                 tp->t_delayed_ack = 0;
6265         rack->rc_pace_max_segs = rack_hptsi_segments;
6266         rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6267         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6268         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6269         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6270         rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
6271         rack->r_enforce_min_pace = rack_min_pace_time;
6272         rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6273         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6274         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6275         rack->r_ctl.rc_early_recovery = rack_early_recovery;
6276         rack->rc_always_pace = rack_pace_every_seg;
6277         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6278         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6279         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6280         rack->r_ctl.rc_min_to = rack_min_to;
6281         rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6282         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6283         if (tp->snd_una != tp->snd_max) {
6284                 /* Create a send map for the current outstanding data */
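                /*
                 * We are taking over a connection that already has data in
                 * flight, so we have no per-send history for it.  Cover the
                 * entire snd_una..snd_max range with a single map entry
                 * stamped with the current tick time and place it on both
                 * the map and the transmit queue.
                 */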
6285                 struct rack_sendmap *rsm;
6286
6287                 rsm = rack_alloc(rack);
6288                 if (rsm == NULL) {
6289                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6290                         tp->t_fb_ptr = NULL;
6291                         return (ENOMEM);
6292                 }
6293                 rsm->r_flags = RACK_OVERMAX;
6294                 rsm->r_tim_lastsent[0] = tcp_ts_getticks();
6295                 rsm->r_rtr_cnt = 1;
6296                 rsm->r_rtr_bytes = 0;
6297                 rsm->r_start = tp->snd_una;
6298                 rsm->r_end = tp->snd_max;
6299                 rsm->r_sndcnt = 0;
6300                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
6301                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6302                 rsm->r_in_tmap = 1;
6303         }
6304         return (0);
6305 }
6306
6307 static int
6308 rack_handoff_ok(struct tcpcb *tp)
6309 {
6310         if ((tp->t_state == TCPS_CLOSED) ||
6311             (tp->t_state == TCPS_LISTEN)) {
6312                 /* Sure, no problem, though it may not stick */
6313                 return (0);
6314         }
6315         if ((tp->t_state == TCPS_SYN_SENT) ||
6316             (tp->t_state == TCPS_SYN_RECEIVED)) {
6317                 /*
6318                  * We really don't know; you have to get to ESTAB or beyond
6319                  * to tell.
6320                  */
6321                 return (EAGAIN);
6322         }
6323         if (tp->t_flags & TF_SACK_PERMIT) {
6324                 return (0);
6325         }
6326         /*
6327          * If we reach here we don't do SACK on this connection so we can
6328          * never do rack.
6329          */
6330         return (EINVAL);
6331 }
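/*
 * Illustrative only: when an existing connection is switched onto this
 * stack (for example via the TCP_FUNCTION_BLK socket option), the generic
 * code consults this callback roughly as follows before committing to the
 * change:
 *
 *	error = (*blk->tfb_tcp_handoff_ok)(tp);
 *	if (error != 0)
 *		return (error);		(stay on the current stack)
 *
 * So 0 above accepts the handoff, EAGAIN asks the caller to try again once
 * the connection is established, and EINVAL rejects it because RACK
 * requires SACK.
 */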
6332
6333 static void
6334 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
6335 {
6336         if (tp->t_fb_ptr) {
6337                 struct tcp_rack *rack;
6338                 struct rack_sendmap *rsm;
6339
6340                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6341 #ifdef TCP_BLACKBOX
6342                 tcp_log_flowend(tp);
6343 #endif
6344                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6345                 while (rsm) {
6346                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
6347                         uma_zfree(rack_zone, rsm);
6348                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6349                 }
6350                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6351                 while (rsm) {
6352                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
6353                         uma_zfree(rack_zone, rsm);
6354                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6355                 }
6356                 rack->rc_free_cnt = 0;
6357                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6358                 tp->t_fb_ptr = NULL;
6359         }
6360 }
6361
6362 static void
6363 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
6364 {
6365         switch (tp->t_state) {
6366         case TCPS_SYN_SENT:
6367                 rack->r_state = TCPS_SYN_SENT;
6368                 rack->r_substate = rack_do_syn_sent;
6369                 break;
6370         case TCPS_SYN_RECEIVED:
6371                 rack->r_state = TCPS_SYN_RECEIVED;
6372                 rack->r_substate = rack_do_syn_recv;
6373                 break;
6374         case TCPS_ESTABLISHED:
6375                 rack->r_state = TCPS_ESTABLISHED;
6376                 rack->r_substate = rack_do_established;
6377                 break;
6378         case TCPS_CLOSE_WAIT:
6379                 rack->r_state = TCPS_CLOSE_WAIT;
6380                 rack->r_substate = rack_do_close_wait;
6381                 break;
6382         case TCPS_FIN_WAIT_1:
6383                 rack->r_state = TCPS_FIN_WAIT_1;
6384                 rack->r_substate = rack_do_fin_wait_1;
6385                 break;
6386         case TCPS_CLOSING:
6387                 rack->r_state = TCPS_CLOSING;
6388                 rack->r_substate = rack_do_closing;
6389                 break;
6390         case TCPS_LAST_ACK:
6391                 rack->r_state = TCPS_LAST_ACK;
6392                 rack->r_substate = rack_do_lastack;
6393                 break;
6394         case TCPS_FIN_WAIT_2:
6395                 rack->r_state = TCPS_FIN_WAIT_2;
6396                 rack->r_substate = rack_do_fin_wait_2;
6397                 break;
6398         case TCPS_LISTEN:
6399         case TCPS_CLOSED:
6400         case TCPS_TIME_WAIT:
6401         default:
6402 #ifdef INVARIANTS
6403                 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
6404 #endif
6405                 break;
6406         };
6407 }
6408
6409
6410 static void
6411 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
6412 {
6413         /*
6414          * We received an ack, and then did not
6415          * call send or were bounced out because the
6416          * hpts was running. Now a timer is up as well;
6417          * is it the right timer?
6418          */
6419         struct rack_sendmap *rsm;
6420         int tmr_up;
6421         
6422         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
6423         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
6424                 return;
6425         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6426         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
6427             (tmr_up == PACE_TMR_RXT)) {
6428                 /* Should be an RXT */
6429                 return;
6430         }
6431         if (rsm == NULL) {
6432                 /* Nothing outstanding? */
6433                 if (tp->t_flags & TF_DELACK) {
6434                         if (tmr_up == PACE_TMR_DELACK)
6435                                 /* We are supposed to have delayed ack up and we do */
6436                                 return;
6437                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
6438                         /*
6439                          * If we hit ENOBUFS then we would expect the possibility
6440                          * of nothing outstanding and the RXT up (and the hptsi timer).
6441                          */
6442                         return;
6443                 } else if (((tcp_always_keepalive ||
6444                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6445                             (tp->t_state <= TCPS_CLOSING)) &&
6446                            (tmr_up == PACE_TMR_KEEP) &&
6447                            (tp->snd_max == tp->snd_una)) {
6448                         /* We should have the keep-alive timer up and we do */
6449                         return;
6450                 }
6451         }
6452         if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
6453                 if ((tp->t_flags & TF_SENTFIN) &&
6454                     ((tp->snd_max - tp->snd_una) == 1) &&
6455                     (rsm->r_flags & RACK_HAS_FIN)) {
6456                         /* needs to be a RXT */
6457                         if (tmr_up == PACE_TMR_RXT)
6458                                 return;
6459                 } else if (tmr_up == PACE_TMR_RACK)
6460                         return;
6461         } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
6462                    ((tmr_up == PACE_TMR_TLP) ||
6463                     (tmr_up == PACE_TMR_RXT))) {
6464                 /* 
6465                  * Either a TLP or RXT is fine if no sack-passed 
6466                  * is in place and data is outstanding.
6467                  */
6468                 return;
6469         } else if (tmr_up == PACE_TMR_DELACK) {
6470                 /*
6471                  * If the delayed ack was going to go off
6472                  * before the rtx/tlp/rack timer were going to
6473                  * expire, then that would be the timer in control.
6474                  * Note we don't check the time here trusting the
6475                  * code is correct.
6476                  */
6477                 return;
6478         }
6479         /*
6480          * OK, the timer originally started is not what we want now.
6481          * We will force the hpts to be stopped, if it is running, and
6482          * restart with the slot set to the saved slot value.
6483          */
6484         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6485         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6486 }
6487
6488 static void
6489 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6490     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
6491     int32_t nxt_pkt, struct timeval *tv)
6492 {
6493         int32_t thflags, retval, did_out = 0;
6494         int32_t way_out = 0;
6495         uint32_t cts;
6496         uint32_t tiwin;
6497         struct tcpopt to;
6498         struct tcp_rack *rack;
6499         struct rack_sendmap *rsm;
6500         int32_t prev_state = 0;
6501
6502         cts = tcp_tv_to_mssectick(tv);
6503         rack = (struct tcp_rack *)tp->t_fb_ptr;
6504
6505         kern_prefetch(rack, &prev_state);
6506         prev_state = 0;
6507         thflags = th->th_flags;
6508         /*
6509          * If this is either a state-changing packet or the current state
6510          * isn't established, we require a read lock on tcbinfo.  Otherwise,
6511          * we allow the tcbinfo to be either locked or unlocked, as the
6512          * caller may have unnecessarily acquired a lock due to a race.
6513          */
6514         if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
6515             tp->t_state != TCPS_ESTABLISHED) {
6516                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6517         }
6518         INP_WLOCK_ASSERT(tp->t_inpcb);
6519         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
6520             __func__));
6521         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
6522             __func__));
6523         {
6524                 union tcp_log_stackspecific log;
6525
6526                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6527                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
6528                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
6529                 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
6530                     tlen, &log, true);
6531         }
6532         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
6533                 way_out = 4;
6534                 goto done_with_input;
6535         }
6536         /*
6537          * If a segment with the ACK-bit set arrives in the SYN-SENT state,
6538          * check SEG.ACK first, as described on page 66 of RFC 793, section 3.9.
6539          */
6540         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
6541             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
6542                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6543                 return;
6544         }
6545         /*
6546          * Segment received on connection. Reset idle time and keep-alive
6547          * timer. XXX: This should be done after segment validation to
6548          * ignore broken/spoofed segs.
6549          */
6550         if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
6551 #ifdef NETFLIX_CWV
6552                 if ((tp->cwv_enabled) &&
6553                     ((tp->cwv_cwnd_valid == 0) &&
6554                      TCPS_HAVEESTABLISHED(tp->t_state) &&
6555                      (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
6556                         tcp_newcwv_nvp_closedown(tp);
6557                 } else 
6558 #endif
6559                        if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
6560                         counter_u64_add(rack_input_idle_reduces, 1);
6561                         rack_cc_after_idle(tp,
6562                             (rack->r_idle_reduce_largest ? 1 :0));
6563                 }
6564         }
6565         rack->r_ctl.rc_rcvtime = cts;
6566         tp->t_rcvtime = ticks;
6567
6568 #ifdef NETFLIX_CWV
6569         if (tp->cwv_enabled) {
6570                 if ((tp->cwv_cwnd_valid == 0) &&
6571                     TCPS_HAVEESTABLISHED(tp->t_state) &&
6572                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
6573                         tcp_newcwv_nvp_closedown(tp);
6574         }
6575 #endif
6576         /*
6577          * Unscale the window into a 32-bit value. For the SYN_SENT state
6578          * the scale is zero.
6579          */
6580         tiwin = th->th_win << tp->snd_scale;
6581 #ifdef NETFLIX_STATS
6582         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
6583 #endif
6584         /*
6585          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
6586          * this to occur after we've validated the segment.
6587          */
6588         if (tp->t_flags & TF_ECN_PERMIT) {
6589                 if (thflags & TH_CWR)
6590                         tp->t_flags &= ~TF_ECN_SND_ECE;
6591                 switch (iptos & IPTOS_ECN_MASK) {
6592                 case IPTOS_ECN_CE:
6593                         tp->t_flags |= TF_ECN_SND_ECE;
6594                         TCPSTAT_INC(tcps_ecn_ce);
6595                         break;
6596                 case IPTOS_ECN_ECT0:
6597                         TCPSTAT_INC(tcps_ecn_ect0);
6598                         break;
6599                 case IPTOS_ECN_ECT1:
6600                         TCPSTAT_INC(tcps_ecn_ect1);
6601                         break;
6602                 }
6603                 /* Congestion experienced. */
6604                 if (thflags & TH_ECE) {
6605                         rack_cong_signal(tp, th, CC_ECN);
6606                 }
6607         }
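        /*
         * To summarize the ECN handling above: a CWR from the peer clears
         * our pending ECE, an IP-level CE mark arms ECE on the ACKs we send
         * next, the ECT codepoints are merely counted, and an ECE from the
         * peer is handed to the congestion control as an ECN congestion
         * signal.
         */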
6608         /*
6609          * Parse options on any incoming segment.
6610          */
6611         tcp_dooptions(&to, (u_char *)(th + 1),
6612             (th->th_off << 2) - sizeof(struct tcphdr),
6613             (thflags & TH_SYN) ? TO_SYN : 0);
6614
6615         /*
6616          * If echoed timestamp is later than the current time, fall back to
6617          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
6618          * were used when this connection was established.
6619          */
6620         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
6621                 to.to_tsecr -= tp->ts_offset;
6622                 if (TSTMP_GT(to.to_tsecr, cts))
6623                         to.to_tsecr = 0;
6624         }
6625         /*
6626          * If it's the first time in, we need to take care of the options
6627          * and verify we can do SACK for rack!
6628          */
6629         if (rack->r_state == 0) {
6630                 /* Should be init'd by rack_init() */
6631                 KASSERT(rack->rc_inp != NULL,
6632                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
6633                 if (rack->rc_inp == NULL) {
6634                         rack->rc_inp = tp->t_inpcb;
6635                 }
6636
6637                 /*
6638                  * Process options only when we get SYN/ACK back. The SYN
6639                  * case for incoming connections is handled in tcp_syncache.
6640                  * According to RFC1323 the window field in a SYN (i.e., a
6641                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
6642                  * this is traditional behavior, may need to be cleaned up.
6643                  */
6644                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
6645                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
6646                         if ((to.to_flags & TOF_SCALE) &&
6647                             (tp->t_flags & TF_REQ_SCALE)) {
6648                                 tp->t_flags |= TF_RCVD_SCALE;
6649                                 tp->snd_scale = to.to_wscale;
6650                         }
6651                         /*
6652                          * Initial send window.  It will be updated with the
6653                          * next incoming segment to the scaled value.
6654                          */
6655                         tp->snd_wnd = th->th_win;
6656                         if (to.to_flags & TOF_TS) {
6657                                 tp->t_flags |= TF_RCVD_TSTMP;
6658                                 tp->ts_recent = to.to_tsval;
6659                                 tp->ts_recent_age = cts;
6660                         }
6661                         if (to.to_flags & TOF_MSS)
6662                                 tcp_mss(tp, to.to_mss);
6663                         if ((tp->t_flags & TF_SACK_PERMIT) &&
6664                             (to.to_flags & TOF_SACKPERM) == 0)
6665                                 tp->t_flags &= ~TF_SACK_PERMIT;
6666                         if (IS_FASTOPEN(tp->t_flags)) {
6667                                 if (to.to_flags & TOF_FASTOPEN) {
6668                                         uint16_t mss;
6669
6670                                         if (to.to_flags & TOF_MSS)
6671                                                 mss = to.to_mss;
6672                                         else
6673                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
6674                                                         mss = TCP6_MSS;
6675                                                 else
6676                                                         mss = TCP_MSS;
6677                                         tcp_fastopen_update_cache(tp, mss,
6678                                             to.to_tfo_len, to.to_tfo_cookie);
6679                                 } else
6680                                         tcp_fastopen_disable_path(tp);
6681                         }
6682                 }
6683                 /*
6684                  * At this point we are at the initial call. Here we decide
6685                  * if we are doing RACK or not. We do this by seeing if
6686                  * TF_SACK_PERMIT is set, if not rack is *not* possible and
6687                  * we switch to the default code.
6688                  */
6689                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
6690                         tcp_switch_back_to_default(tp);
6691                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
6692                             tlen, iptos);
6693                         return;
6694                 }
6695                 /* Set the flag */
6696                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
6697                 tcp_set_hpts(tp->t_inpcb);
6698                 rack_stop_all_timers(tp);
6699                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
6700         }
6701         /*
6702          * This is the one exception case where we set the rack state
6703          * always. All other times (timers etc) we must have a rack-state
6704          * set (so we assure we have done the checks above for SACK).
6705          */
6706         if (rack->r_state != tp->t_state)
6707                 rack_set_state(tp, rack);
6708         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
6709                 kern_prefetch(rsm, &prev_state);
6710         prev_state = rack->r_state;
6711         rack->r_ctl.rc_tlp_send_cnt = 0;
6712         rack_clear_rate_sample(rack);
6713         retval = (*rack->r_substate) (m, th, so,
6714             tp, &to, drop_hdrlen,
6715             tlen, tiwin, thflags, nxt_pkt);
6716 #ifdef INVARIANTS
6717         if ((retval == 0) &&
6718             (tp->t_inpcb == NULL)) {
6719                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
6720                     retval, tp, prev_state);
6721         }
6722 #endif
6723         if (retval == 0) {
6724                 /*
6725                  * If retval is 1 the tcb is unlocked and most likely the tp
6726                  * is gone.
6727                  */
6728                 INP_WLOCK_ASSERT(tp->t_inpcb);
6729                 tcp_rack_xmit_timer_commit(rack, tp);
6730                 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
6731                     (rack->rc_in_persist == 0)){
6732                         /* 
6733                          * The peer shrunk its window on us to the point
6734                          * where we have sent too much. The only thing
6735                          * we can do here is stop any timers and
6736                          * enter persist. We most likely lost the last
6737                          * bytes we sent but oh well, we will have to
6738                          * retransmit them after the peer is caught up.
6739                          */
6740                         if (rack->rc_inp->inp_in_hpts)
6741                                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6742                         rack_timer_cancel(tp, rack, cts, __LINE__);
6743                         rack_enter_persist(tp, rack, cts);
6744                         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6745                         way_out = 3;
6746                         goto done_with_input;
6747                 }
6748                 if (nxt_pkt == 0) {
6749                         if (rack->r_wanted_output != 0) {
6750                                 did_out = 1;
6751                                 (void)tp->t_fb->tfb_tcp_output(tp);
6752                         }
6753                         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
6754                 }
6755                 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
6756                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
6757                      (tp->t_flags & TF_DELACK) ||
6758                      ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6759                       (tp->t_state <= TCPS_CLOSING)))) {
6760                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
6761                         if ((tp->snd_max == tp->snd_una) &&
6762                             ((tp->t_flags & TF_DELACK) == 0) &&
6763                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
6764                                 /* keep-alive not needed while we are still scheduled for hptsi output */
6765                                 ;
6766                         } else {
6767                                 if (rack->rc_inp->inp_in_hpts)
6768                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6769                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6770                         }
6771                         way_out = 1;
6772                 } else {
6773                         /* Do we have the correct timer running? */
6774                         rack_timer_audit(tp, rack, &so->so_snd);
6775                         way_out = 2;
6776                 }
6777         done_with_input:
6778                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
6779                 if (did_out)
6780                         rack->r_wanted_output = 0;
6781 #ifdef INVARIANTS
6782                 if (tp->t_inpcb == NULL) {
6783                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
6784                               did_out,
6785                               retval, tp, prev_state);
6786                 }
6787 #endif
6788                 INP_WUNLOCK(tp->t_inpcb);
6789         }
6790 }
6791
6792 void
6793 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6794     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
6795 {
6796         struct timeval tv;
6797 #ifdef RSS
6798         struct tcp_function_block *tfb;
6799         struct tcp_rack *rack;
6800         struct epoch_tracker et;
6801
6802         rack = (struct tcp_rack *)tp->t_fb_ptr;
6803         if (rack->r_state == 0) {
6804                 /*
6805                  * Initial input (ACK to SYN-ACK etc.); let's go ahead and
6806                  * get it processed.
6807                  */
6808                 INP_INFO_RLOCK_ET(&V_tcbinfo, et);
6809                 tcp_get_usecs(&tv);
6810                 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6811                     tlen, iptos, 0, &tv);
6812                 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
6813                 return;
6814         }
6815         tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
6816         INP_WUNLOCK(tp->t_inpcb);
6817 #else
6818         tcp_get_usecs(&tv);
6819         rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6820             tlen, iptos, 0, &tv);
6821 #endif
6822 }
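/*
 * Under RSS the very first segment (r_state == 0) is handled inline so the
 * SACK/option checks in rack_hpts_do_segment() can run and possibly bounce
 * the connection back to the default stack; everything after that is queued
 * to the hpts input path and processed when the wheel runs.  Without RSS
 * every segment is processed inline.
 */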
6823
6824 struct rack_sendmap *
6825 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
6826 {
6827         struct rack_sendmap *rsm = NULL;
6828         int32_t idx;
6829         uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
6830
6831         /* Return the next guy to be re-transmitted */
6832         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
6833                 return (NULL);
6834         }
6835         if (tp->t_flags & TF_SENTFIN) {
6836                 /* retran the end FIN? */
6837                 return (NULL);
6838         }
6839         /* OK, let's look at this one */
6840         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6841         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
6842                 goto check_it;
6843         }
6844         rsm = rack_find_lowest_rsm(rack);
6845         if (rsm == NULL) {
6846                 return (NULL);
6847         }
6848 check_it:
6849         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
6850         srtt = TICKS_2_MSEC(srtt_cur);
6851         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
6852                 srtt = rack->rc_rack_rtt;
6853         if (rsm->r_flags & RACK_ACKED) {
6854                 return (NULL);
6855         }
6856         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
6857                 /* It's not yet ready */
6858                 return (NULL);
6859         }
6860         idx = rsm->r_rtr_cnt - 1;
6861         ts_low = rsm->r_tim_lastsent[idx];
6862         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
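        /*
         * The entry is only returned for retransmission once enough time
         * has passed since its last send.  As a purely illustrative example
         * (numbers made up, not derived from this code): with an effective
         * srtt of 40ms and a computed thresh of 50ms, a segment last sent
         * 60ms before 'tsused' is returned below, while one sent 30ms ago
         * is left alone until the RACK timer fires again.
         */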
6863         if (tsused <= ts_low) {
6864                 return (NULL);
6865         }
6866         if ((tsused - ts_low) >= thresh) {
6867                 return (rsm);
6868         }
6869         return (NULL);
6870 }
6871
6872 static int
6873 rack_output(struct tcpcb *tp)
6874 {
6875         struct socket *so;
6876         uint32_t recwin, sendwin;
6877         uint32_t sb_offset;
6878         int32_t len, flags, error = 0;
6879         struct mbuf *m;
6880         struct mbuf *mb;
6881         uint32_t if_hw_tsomaxsegcount = 0;
6882         uint32_t if_hw_tsomaxsegsize;
6883         long tot_len_this_send = 0;
6884         struct ip *ip = NULL;
6885 #ifdef TCPDEBUG
6886         struct ipovly *ipov = NULL;
6887 #endif
6888         struct udphdr *udp = NULL;
6889         struct tcp_rack *rack;
6890         struct tcphdr *th;
6891         uint8_t pass = 0;
6892         uint8_t wanted_cookie = 0;
6893         u_char opt[TCP_MAXOLEN];
6894         unsigned ipoptlen, optlen, hdrlen, ulen=0;
6895         uint32_t rack_seq;
6896
6897 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
6898         unsigned ipsec_optlen = 0;
6899
6900 #endif
6901         int32_t idle, sendalot;
6902         int32_t sub_from_prr = 0;
6903         volatile int32_t sack_rxmit;
6904         struct rack_sendmap *rsm = NULL;
6905         int32_t tso, mtu, would_have_fin = 0;
6906         struct tcpopt to;
6907         int32_t slot = 0;
6908         uint32_t cts;
6909         uint8_t hpts_calling, doing_tlp = 0;
6910         int32_t do_a_prefetch;
6911         int32_t prefetch_rsm = 0;
6912         int32_t prefetch_so_done = 0;
6913         struct tcp_log_buffer *lgb = NULL;
6914         struct inpcb *inp;
6915         struct sockbuf *sb;
6916 #ifdef INET6
6917         struct ip6_hdr *ip6 = NULL;
6918         int32_t isipv6;
6919 #endif
6920         /* setup and take the cache hits here */
6921         rack = (struct tcp_rack *)tp->t_fb_ptr;
6922         inp = rack->rc_inp;
6923         so = inp->inp_socket;
6924         sb = &so->so_snd;
6925         kern_prefetch(sb, &do_a_prefetch);
6926         do_a_prefetch = 1;
6927         
6928         INP_WLOCK_ASSERT(inp);
6929 #ifdef TCP_OFFLOAD
6930         if (tp->t_flags & TF_TOE)
6931                 return (tcp_offload_output(tp));
6932 #endif
6933 #ifdef INET6
6934         if (rack->r_state) {
6935                 /* Use the cache line loaded if possible */
6936                 isipv6 = rack->r_is_v6;
6937         } else {
6938                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
6939         }
6940 #endif
6941         cts = tcp_ts_getticks();
6942         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
6943             inp->inp_in_hpts) {
6944                 /*
6945                  * We are on the hpts for some timer but not hptsi output.
6946                  * Remove from the hpts unconditionally.
6947                  */
6948                 rack_timer_cancel(tp, rack, cts, __LINE__);
6949         }
6950         /* Mark that we have called rack_output(). */
6951         if ((rack->r_timer_override) ||
6952             (tp->t_flags & TF_FORCEDATA) ||
6953             (tp->t_state < TCPS_ESTABLISHED)) {
6954                 if (tp->t_inpcb->inp_in_hpts)
6955                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
6956         } else if (tp->t_inpcb->inp_in_hpts) {
6957                 /*
6958                  * On the hpts you can't pass; even if ACKNOW is on, we will
6959                  * send when the hpts fires.
6960                  */
6961                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
6962                 return (0);
6963         }
6964         hpts_calling = inp->inp_hpts_calls;
6965         inp->inp_hpts_calls = 0;
6966         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
6967                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
6968                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
6969                         return (0);
6970                 }
6971         }
6972         rack->r_wanted_output = 0;
6973         rack->r_timer_override = 0;
6974         /*
6975          * For TFO connections in SYN_SENT or SYN_RECEIVED,
6976          * only allow the initial SYN or SYN|ACK and those sent
6977          * by the retransmit timer.
6978          */
6979         if (IS_FASTOPEN(tp->t_flags) &&
6980             ((tp->t_state == TCPS_SYN_RECEIVED) ||
6981              (tp->t_state == TCPS_SYN_SENT)) &&
6982             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
6983             (tp->t_rxtshift == 0))              /* not a retransmit */
6984                 return (0);
6985         /*
6986          * Determine length of data that should be transmitted, and flags
6987          * that will be used. If there is some data or critical controls
6988          * (SYN, RST) to send, then transmit; otherwise, investigate
6989          * further.
6990          */
6991         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
6992 #ifdef NETFLIX_CWV
6993         if (tp->cwv_enabled) {
6994                 if ((tp->cwv_cwnd_valid == 0) &&
6995                     TCPS_HAVEESTABLISHED(tp->t_state) &&
6996                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
6997                         tcp_newcwv_nvp_closedown(tp);
6998         } else
6999 #endif
7000         if (tp->t_idle_reduce) {
7001                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
7002                         rack_cc_after_idle(tp,
7003                             (rack->r_idle_reduce_largest ? 1 :0));
7004         }
7005         tp->t_flags &= ~TF_LASTIDLE;
7006         if (idle) {
7007                 if (tp->t_flags & TF_MORETOCOME) {
7008                         tp->t_flags |= TF_LASTIDLE;
7009                         idle = 0;
7010                 }
7011         }
7012 again:
7013         /*
7014          * If we've recently taken a timeout, snd_max will be greater than
7015          * snd_nxt.  There may be SACK information that allows us to avoid
7016          * resending already delivered data.  Adjust snd_nxt accordingly.
7017          */
7018         sendalot = 0;
7019         cts = tcp_ts_getticks();
7020         tso = 0;
7021         mtu = 0;
7022         sb_offset = tp->snd_max - tp->snd_una;
7023         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
7024
7025         flags = tcp_outflags[tp->t_state];
7026         /*
7027          * Send any SACK-generated retransmissions.  If we're explicitly
7028          * trying to send out new data (when sendalot is 1), bypass this
7029          * function. If we retransmit in fast recovery mode, decrement
7030          * snd_cwnd, since we're replacing a (future) new transmission with
7031          * a retransmission now, and we previously incremented snd_cwnd in
7032          * tcp_input().
7033          */
7034         /*
7035          * Still in sack recovery, reset rxmit flag to zero.
7036          */
7037         while (rack->rc_free_cnt < rack_free_cache) {
7038                 rsm = rack_alloc(rack);
7039                 if (rsm == NULL) {
7040                         if (inp->inp_hpts_calls)
7041                                 /* Retry in a ms */
7042                                 slot = 1;
7043                         goto just_return_nolock;
7044                 }
7045                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
7046                 rack->rc_free_cnt++;
7047                 rsm = NULL;
7048         }
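        /*
         * The loop above keeps a small cache of pre-allocated sendmap
         * entries on rc_free so the send path below does not have to
         * allocate at an awkward moment.  If the cache cannot be topped
         * up we bail out of this send (scheduling a retry in about a
         * millisecond when we were called from the hpts).
         */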
7049         if (inp->inp_hpts_calls)
7050                 inp->inp_hpts_calls = 0;
7051         sack_rxmit = 0;
7052         len = 0;
7053         rsm = NULL;
7054         if (flags & TH_RST) {
7055                 SOCKBUF_LOCK(sb);
7056                 goto send;
7057         }
7058         if (rack->r_ctl.rc_tlpsend) {
7059                 /* Tail loss probe */
7060                 long cwin;
7061                 long tlen;
7062
7063                 doing_tlp = 1;
7064                 rsm = rack->r_ctl.rc_tlpsend;
7065                 rack->r_ctl.rc_tlpsend = NULL;
7066                 sack_rxmit = 1;
7067                 tlen = rsm->r_end - rsm->r_start;
7068                 if (tlen > tp->t_maxseg)
7069                         tlen = tp->t_maxseg;
7070                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7071                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7072                     __func__, __LINE__,
7073                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7074                 sb_offset = rsm->r_start - tp->snd_una;
7075                 cwin = min(tp->snd_wnd, tlen);
7076                 len = cwin;
7077         } else if (rack->r_ctl.rc_resend) {
7078                 /* Retransmit timer */
7079                 rsm = rack->r_ctl.rc_resend;
7080                 rack->r_ctl.rc_resend = NULL;
7081                 len = rsm->r_end - rsm->r_start;
7082                 sack_rxmit = 1;
7083                 sendalot = 0;
7084                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7085                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7086                     __func__, __LINE__,
7087                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7088                 sb_offset = rsm->r_start - tp->snd_una;
7089                 if (len >= tp->t_maxseg) {
7090                         len = tp->t_maxseg;
7091                 }
7092         } else if ((rack->rc_in_persist == 0) &&
7093             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
7094                 long tlen;
7095
7096                 if ((!IN_RECOVERY(tp->t_flags)) &&
7097                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
7098                         /* Enter recovery if not induced by a time-out */
7099                         rack->r_ctl.rc_rsm_start = rsm->r_start;
7100                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
7101                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
7102                         rack_cong_signal(tp, NULL, CC_NDUPACK);
7103                         /*
7104                          * When we enter recovery we need to assure we send
7105                          * one packet.
7106                          */
7107                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
7108                 }
7109 #ifdef INVARIANTS
7110                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
7111                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
7112                             tp, rack, rsm, rsm->r_start, tp->snd_una);
7113                 }
7114 #endif
7115                 tlen = rsm->r_end - rsm->r_start;
7116                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7117                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7118                     __func__, __LINE__,
7119                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7120                 sb_offset = rsm->r_start - tp->snd_una;
7121                 if (tlen > rack->r_ctl.rc_prr_sndcnt) {
7122                         len = rack->r_ctl.rc_prr_sndcnt;
7123                 } else {
7124                         len = tlen;
7125                 }
7126                 if (len >= tp->t_maxseg) {
7127                         sendalot = 1;
7128                         len = tp->t_maxseg;
7129                 } else {
7130                         sendalot = 0;
7131                         if ((rack->rc_timer_up == 0) &&
7132                             (len < tlen)) {
7133                                 /*
7134                                  * If its not a timer don't send a partial
7135                                  * segment.
7136                                  */
7137                                 len = 0;
7138                                 goto just_return_nolock;
7139                         }
7140                 }
7141                 if (len > 0) {
7142                         sub_from_prr = 1;
7143                         sack_rxmit = 1;
7144                         TCPSTAT_INC(tcps_sack_rexmits);
7145                         TCPSTAT_ADD(tcps_sack_rexmit_bytes,
7146                             min(len, tp->t_maxseg));
7147                         counter_u64_add(rack_rtm_prr_retran, 1);
7148                 }
7149         }
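        /*
         * At this point rsm (if set) identifies which of the three
         * retransmission sources chose data for us: a tail loss probe
         * (rc_tlpsend), the retransmit timer (rc_resend), or RACK
         * time-based loss detection via tcp_rack_output(), the last of
         * which also enters recovery and is paced by the PRR send count.
         */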
7150         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
7151                 /* we are retransmitting the fin */
7152                 len--;
7153                 if (len) {
7154                         /*
7155                          * When retransmitting data do *not* include the
7156                          * FIN. This could happen from a TLP probe.
7157                          */
7158                         flags &= ~TH_FIN;
7159                 }
7160         }
7161 #ifdef INVARIANTS
7162         /* For debugging */
7163         rack->r_ctl.rc_rsm_at_retran = rsm;
7164 #endif
7165         /*
7166          * Get standard flags, and add SYN or FIN if requested by 'hidden'
7167          * state flags.
7168          */
7169         if (tp->t_flags & TF_NEEDFIN)
7170                 flags |= TH_FIN;
7171         if (tp->t_flags & TF_NEEDSYN)
7172                 flags |= TH_SYN;
7173         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
7174                 void *end_rsm;
7175                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
7176                 if (end_rsm)
7177                         kern_prefetch(end_rsm, &prefetch_rsm);
7178                 prefetch_rsm = 1;
7179         }
7180         SOCKBUF_LOCK(sb);
7181         /*
7182          * If in persist timeout with window of 0, send 1 byte. Otherwise,
7183          * if window is small but nonzero and the timer has expired, we
7184          * will send what we can and go to transmit state.
7185          */
7186         if (tp->t_flags & TF_FORCEDATA) {
7187                 if (sendwin == 0) {
7188                         /*
7189                          * If we still have some data to send, then clear
7190                          * the FIN bit.  Usually this would happen below
7191                          * when it realizes that we aren't sending all the
7192                          * data.  However, if we have exactly 1 byte of
7193                          * unsent data, then it won't clear the FIN bit
7194                          * below, and if we are in persist state, we wind up
7195                          * sending the packet without recording that we sent
7196                          * the FIN bit.
7197                          *
7198                          * We can't just blindly clear the FIN bit, because
7199                          * if we don't have any more data to send then the
7200                          * probe will be the FIN itself.
7201                          */
7202                         if (sb_offset < sbused(sb))
7203                                 flags &= ~TH_FIN;
7204                         sendwin = 1;
7205                 } else {
7206                         if (rack->rc_in_persist)
7207                                 rack_exit_persist(tp, rack);
7208                         /*
7209                          * If we are dropping persist mode then we need to
7210                          * correct snd_nxt/snd_max and off.
7211                          */
7212                         tp->snd_nxt = tp->snd_max;
7213                         sb_offset = tp->snd_nxt - tp->snd_una;
7214                 }
7215         }
7216         /*
7217          * If snd_nxt == snd_max and we have transmitted a FIN, the
7218          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
7219          * negative length.  This can also occur when TCP opens up its
7220          * congestion window while receiving additional duplicate acks after
7221          * fast-retransmit because TCP will reset snd_nxt to snd_max after
7222          * the fast-retransmit.
7223          *
7224          * In the normal retransmit-FIN-only case, however, snd_nxt will be
7225          * set to snd_una, the sb_offset will be 0, and the length may wind
7226          * up 0.
7227          *
7228          * If sack_rxmit is true we are retransmitting from the scoreboard
7229          * in which case len is already set.
7230          */
7231         if (sack_rxmit == 0) {
7232                 uint32_t avail;
7233
7234                 avail = sbavail(sb);
7235                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
7236                         sb_offset = tp->snd_nxt - tp->snd_una;
7237                 else
7238                         sb_offset = 0;
7239                 if (IN_RECOVERY(tp->t_flags) == 0) {
7240                         if (rack->r_ctl.rc_tlp_new_data) {
7241                                 /* TLP is forcing out new data */
7242                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
7243                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
7244                                 }
7245                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
7246                                         len = tp->snd_wnd;
7247                                 else
7248                                         len = rack->r_ctl.rc_tlp_new_data;
7249                                 rack->r_ctl.rc_tlp_new_data = 0;
7250                                 doing_tlp = 1;
7251                         } else {
7252                                 if (sendwin > avail) {
7253                                         /* use the available */
7254                                         if (avail > sb_offset) {
7255                                                 len = (int32_t)(avail - sb_offset);
7256                                         } else {
7257                                                 len = 0;
7258                                         }
7259                                 } else {
7260                                         if (sendwin > sb_offset) {
7261                                                 len = (int32_t)(sendwin - sb_offset);
7262                                         } else {
7263                                                 len = 0;
7264                                         }
7265                                 }
7266                         }
7267                 } else {
7268                         uint32_t outstanding;
7269
7270                         /*
7271                          * We are inside of a SACK recovery episode and are
7272                          * sending new data, having retransmitted all the
7273                          * data possible so far in the scoreboard.
7274                          */
7275                         outstanding = tp->snd_max - tp->snd_una;
7276                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
7277                                 len = 0;
7278                         else if (avail > sb_offset)
7279                                 len = avail - sb_offset;
7280                         else
7281                                 len = 0;
7282                         if (len > 0) {
7283                                 if (len > rack->r_ctl.rc_prr_sndcnt)
7284                                         len = rack->r_ctl.rc_prr_sndcnt;
7285
7286                                 if (len > 0) {
7287                                         sub_from_prr = 1;
7288                                         counter_u64_add(rack_rtm_prr_newdata, 1);
7289                                 }
7290                         }
7291                         if (len > tp->t_maxseg) {
7292                                 /*
7293                                  * We should never send more than a MSS when
7294                                  * retransmitting or sending new data in prr
7295                                  * mode unless the override flag is on. Most
7296                                  * likely the PRR algorithm is not going to
7297                                  * let us send a lot as well :-)
7298                                  */
7299                                 if (rack->r_ctl.rc_prr_sendalot == 0)
7300                                         len = tp->t_maxseg;
7301                         } else if (len < tp->t_maxseg) {
7302                                 /*
7303                                  * Do we send any? The idea here is if the
7304                                  * send empties the socket buffer we want to
7305                                  * do it. However, if not, then let's just wait
7306                                  * for our prr_sndcnt to get bigger.
7307                                  */
7308                                 long leftinsb;
7309
7310                                 leftinsb = sbavail(sb) - sb_offset;
7311                                 if (leftinsb > len) {
7312                                         /* This send does not empty the sb */
7313                                         len = 0;
7314                                 }
7315                         }
7316                 }
7317         }
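        /*
         * A purely illustrative walk through the recovery branch above
         * (numbers made up): with 1460-byte segments, an rc_prr_sndcnt of
         * 2000 bytes and 10000 bytes still unsent in the socket buffer,
         * len is first clamped to 2000, then (with rc_prr_sendalot off)
         * further clamped to one MSS of 1460; had len come out below an
         * MSS it would only be sent if it emptied the socket buffer.
         */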
7318         if (prefetch_so_done == 0) {
7319                 kern_prefetch(so, &prefetch_so_done);
7320                 prefetch_so_done = 1;
7321         }
7322         /*
7323          * Lop off SYN bit if it has already been sent.  However, if this is
7324          * SYN-SENT state and if segment contains data and if we don't know
7325          * that foreign host supports TAO, suppress sending segment.
7326          */
7327         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
7328             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
7329                 if (tp->t_state != TCPS_SYN_RECEIVED)
7330                         flags &= ~TH_SYN;
7331                 /*
7332                  * When sending additional segments following a TFO SYN|ACK,
7333                  * do not include the SYN bit.
7334                  */
7335                 if (IS_FASTOPEN(tp->t_flags) &&
7336                     (tp->t_state == TCPS_SYN_RECEIVED))
7337                         flags &= ~TH_SYN;
7338                 sb_offset--, len++;
7339         }
7340         /*
7341          * Be careful not to send data and/or FIN on SYN segments. This
7342          * measure is needed to prevent interoperability problems with not
7343          * fully conformant TCP implementations.
7344          */
7345         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
7346                 len = 0;
7347                 flags &= ~TH_FIN;
7348         }
7349         /*
7350          * On TFO sockets, ensure no data is sent in the following cases:
7351          *
7352          *  - When retransmitting SYN|ACK on a passively-created socket
7353          *
7354          *  - When retransmitting SYN on an actively created socket
7355          *
7356          *  - When sending a zero-length cookie (cookie request) on an
7357          *    actively created socket
7358          *
7359          *  - When the socket is in the CLOSED state (RST is being sent)
7360          */
7361         if (IS_FASTOPEN(tp->t_flags) &&
7362             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
7363              ((tp->t_state == TCPS_SYN_SENT) &&
7364               (tp->t_tfo_client_cookie_len == 0)) ||
7365              (flags & TH_RST))) {
7366                 sack_rxmit = 0;
7367                 len = 0;
7368         }
7369         /* Without fast-open there should never be data sent on a SYN */
7370         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
7371                 len = 0;
7372         if (len <= 0) {
7373                 /*
7374                  * If FIN has been sent but not acked, but we haven't been
7375                  * called to retransmit, len will be < 0.  Otherwise, window
7376                  * shrank after we sent into it.  If window shrank to 0,
7377                  * cancel pending retransmit, pull snd_nxt back to (closed)
7378                  * window, and set the persist timer if it isn't already
7379                  * going.  If the window didn't close completely, just wait
7380                  * for an ACK.
7381                  *
7382                  * We also do a general check here to ensure that we will
7383                  * set the persist timer when we have data to send, but a
7384                  * 0-byte window. This makes sure the persist timer is set
7385                  * even if the packet hits one of the "goto send" lines
7386                  * below.
7387                  */
7388                 len = 0;
7389                 if ((tp->snd_wnd == 0) &&
7390                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
7391                     (sb_offset < (int)sbavail(sb))) {
7392                         tp->snd_nxt = tp->snd_una;
7393                         rack_enter_persist(tp, rack, cts);
7394                 }
7395         }
7396         /* len will be >= 0 after this point. */
7397         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7398         tcp_sndbuf_autoscale(tp, so, sendwin);
7399         /*
7400          * Decide if we can use TCP Segmentation Offloading (if supported by
7401          * hardware).
7402          *
7403          * TSO may only be used if we are in a pure bulk sending state.  The
7404          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
7405          * options prevent using TSO.  With TSO the TCP header is the same
7406          * (except for the sequence number) for all generated packets.  This
7407          * makes it impossible to transmit any options which vary per
7408          * generated segment or packet.
7409          *
7410          * IPv4 handling has a clear separation of ip options and ip header
7411          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
7412          * the right thing below to provide length of just ip options and thus
7413          * checking for ipoptlen is enough to decide if ip options are present.
7414          */
7415
7416 #ifdef INET6
7417         if (isipv6)
7418                 ipoptlen = ip6_optlen(tp->t_inpcb);
7419         else
7420 #endif
7421                 if (tp->t_inpcb->inp_options)
7422                         ipoptlen = tp->t_inpcb->inp_options->m_len -
7423                             offsetof(struct ipoption, ipopt_list);
7424                 else
7425                         ipoptlen = 0;
7426 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7427         /*
7428          * Pre-calculate here as we save another lookup into the darknesses
7429          * of IPsec that way and can actually decide if TSO is ok.
7430          */
7431 #ifdef INET6
7432         if (isipv6 && IPSEC_ENABLED(ipv6))
7433                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
7434 #ifdef INET
7435         else
7436 #endif
7437 #endif                          /* INET6 */
7438 #ifdef INET
7439         if (IPSEC_ENABLED(ipv4))
7440                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
7441 #endif                          /* INET */
7442 #endif
7443
7444 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7445         ipoptlen += ipsec_optlen;
7446 #endif
7447         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
7448             (tp->t_port == 0) &&
7449             ((tp->t_flags & TF_SIGNATURE) == 0) &&
7450             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
7451             ipoptlen == 0)
7452                 tso = 1;
7453         {
7454                 uint32_t outstanding;
7455
7456                 outstanding = tp->snd_max - tp->snd_una;
7457                 if (tp->t_flags & TF_SENTFIN) {
7458                         /*
7459                          * If we sent a fin, snd_max is 1 higher than
7460                          * snd_una
7461                          */
7462                         outstanding--;
7463                 }
7464                 if (outstanding > 0) {
7465                         /*
7466                          * This is sub-optimal. We only send a standalone
7467                          * FIN on its own segment.
7468                          */
7469                         if (flags & TH_FIN) {
7470                                 flags &= ~TH_FIN;
7471                                 would_have_fin = 1;
7472                         }
7473                 } else if (sack_rxmit) {
7474                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
7475                                 flags &= ~TH_FIN;
7476                 } else {
7477                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
7478                             sbused(sb)))
7479                                 flags &= ~TH_FIN;
7480                 }
7481         }
7482         recwin = sbspace(&so->so_rcv);
7483
7484         /*
7485          * Sender silly window avoidance.   We transmit under the following
7486          * conditions when len is non-zero:
7487          *
7488          * - We have a full segment (or more with TSO)
7489          * - This is the last buffer in a write()/send() and we are either idle or running NODELAY
7490          * - we've timed out (e.g. persist timer)
7491          * - we have more than 1/2 the maximum send window's worth of data (receiver may be limiting the window size)
7492          * - we need to retransmit
7493          */
7494         if (len) {
7495                 if (len >= tp->t_maxseg) {
7496                         pass = 1;
7497                         goto send;
7498                 }
7499                 /*
7500                  * NOTE! on localhost connections an 'ack' from the remote
7501                  * end may occur synchronously with the output and cause us
7502                  * to flush a buffer queued with moretocome.  XXX
7503                  *
7504                  */
7505                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
7506                     (idle || (tp->t_flags & TF_NODELAY)) &&
7507                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
7508                     (tp->t_flags & TF_NOPUSH) == 0) {
7509                         pass = 2;
7510                         goto send;
7511                 }
7512                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
7513                         pass = 3;
7514                         goto send;
7515                 }
7516                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
7517                         goto send;
7518                 }
7519                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
7520                         pass = 4;
7521                         goto send;
7522                 }
7523                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
7524                         pass = 5;
7525                         goto send;
7526                 }
7527                 if (sack_rxmit) {
7528                         pass = 6;
7529                         goto send;
7530                 }
7531         }
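             /*
              * The pass value set in the checks above (and in the window
              * update/ACK checks below) records which transmit condition
              * fired; it is handed to rack_log_output() later in this
              * function so the log shows which condition triggered the send.
              */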
7532         /*
7533          * Sending of standalone window updates.
7534          *
7535          * Window updates are important when we close our window due to a
7536          * full socket buffer and are opening it again after the application
7537          * reads data from it.  Once the window has opened again and the
7538          * remote end starts to send again the ACK clock takes over and
7539          * provides the most current window information.
7540          *
7541          * We must avoid the silly window syndrome whereby every read from
7542          * the receive buffer, no matter how small, causes a window update
7543          * to be sent.  We also should avoid sending a flurry of window
7544          * updates when the socket buffer had queued a lot of data and the
7545          * application is doing small reads.
7546          *
7547          * Prevent a flurry of pointless window updates by only sending an
7548          * update when we can increase the advertised window by more than
7549          * 1/4th of the socket buffer capacity.  When the buffer is getting
7550          * full or is very small, be more aggressive and send an update
7551          * whenever we can increase by two mss sized segments. In all other
7552          * situations the ACK's to new incoming data will carry further
7553          * window increases.
7554          *
7555          * Don't send an independent window update if a delayed ACK is
7556          * pending (it will get piggy-backed on it) or the remote side
7557          * already has done a half-close and won't send more data.  Skip
7558          * this if the connection is in T/TCP half-open state.
7559          */
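             /*
              * Example (hypothetical numbers): with a 64KB receive buffer and
              * a 1460 byte MSS, a standalone update goes out once the window
              * can open by at least 2 * MSS and by 16KB (sb_hiwat / 4), or by
              * just 2 * MSS when the buffer is nearly exhausted
              * (recwin <= sb_hiwat / 8) or the whole buffer is small
              * (sb_hiwat <= 8 * MSS), or when the increase alone covers at
              * least half the buffer.
              */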
7560         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
7561             !(tp->t_flags & TF_DELACK) &&
7562             !TCPS_HAVERCVDFIN(tp->t_state)) {
7563                 /*
7564                  * "adv" is the amount we could increase the window, taking
7565                  * into account that we are limited by TCP_MAXWIN <<
7566                  * tp->rcv_scale.
7567                  */
7568                 int32_t adv;
7569                 int oldwin;
7570
7571                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
7572                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
7573                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
7574                         adv -= oldwin;
7575                 } else
7576                         oldwin = 0;
7577
7578                 /*
7579                  * If the new window size ends up being the same as the old
7580                  * size when it is scaled, then don't force a window update.
7581                  */
7582                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
7583                         goto dontupdate;
7584
7585                 if (adv >= (int32_t)(2 * tp->t_maxseg) &&
7586                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
7587                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
7588                     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
7589                         pass = 7;
7590                         goto send;
7591                 }
7592                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
7593                         goto send;
7594         }
7595 dontupdate:
7596
7597         /*
7598          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
7599          * is also a catch-all for the retransmit timer timeout case.
7600          */
7601         if (tp->t_flags & TF_ACKNOW) {
7602                 pass = 8;
7603                 goto send;
7604         }
7605         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
7606                 pass = 9;
7607                 goto send;
7608         }
7609         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
7610                 pass = 10;
7611                 goto send;
7612         }
7613         /*
7614          * If our state indicates that FIN should be sent and we have not
7615          * yet done so, then we need to send.
7616          */
7617         if ((flags & TH_FIN) &&
7618             (tp->snd_nxt == tp->snd_una)) {
7619                 pass = 11;
7620                 goto send;
7621         }
7622         /*
7623          * No reason to send a segment, just return.
7624          */
7625 just_return:
7626         SOCKBUF_UNLOCK(sb);
7627 just_return_nolock:
7628         if (tot_len_this_send == 0)
7629                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
7630         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
7631         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
7632         tp->t_flags &= ~TF_FORCEDATA;
7633         return (0);
7634
7635 send:
7636         if (doing_tlp == 0) {
7637                  * Data is not a TLP, and it's not the rxt firing. If it is the
7638                  * Data not a TLP, and its not the rxt firing. If it is the
7639                  * rxt firing, we want to leave the tlp_in_progress flag on
7640                  * so we don't send another TLP. It has to be a rack timer
7641                  * or normal send (response to acked data) to clear the tlp
7642                  * in progress flag.
7643                  */
7644                 rack->rc_tlp_in_progress = 0;
7645         }
7646         SOCKBUF_LOCK_ASSERT(sb);
7647         if (len > 0) {
7648                 if (len >= tp->t_maxseg)
7649                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
7650                 else
7651                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
7652         }
7653         /*
7654          * Before ESTABLISHED, force sending of initial options unless TCP
7655          * set not to do any options. NOTE: we assume that the IP/TCP header
7656          * plus TCP options always fit in a single mbuf, leaving room for a
7657          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
7658          * + optlen <= MCLBYTES
7659          */
7660         optlen = 0;
7661 #ifdef INET6
7662         if (isipv6)
7663                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
7664         else
7665 #endif
7666                 hdrlen = sizeof(struct tcpiphdr);
7667
7668         /*
7669          * Compute options for segment. We only have to care about SYN and
7670          * established connection segments.  Options for SYN-ACK segments
7671          * are handled in TCP syncache.
7672          */
7673         to.to_flags = 0;
7674         if ((tp->t_flags & TF_NOOPT) == 0) {
7675                 /* Maximum segment size. */
7676                 if (flags & TH_SYN) {
7677                         tp->snd_nxt = tp->iss;
7678                         to.to_mss = tcp_mssopt(&inp->inp_inc);
7679 #ifdef NETFLIX_TCPOUDP
7680                         if (tp->t_port)
7681                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
7682 #endif
7683                         to.to_flags |= TOF_MSS;
7684
7685                         /*
7686                          * On SYN or SYN|ACK transmits on TFO connections,
7687                          * only include the TFO option if it is not a
7688                          * retransmit, as the presence of the TFO option may
7689                          * have caused the original SYN or SYN|ACK to have
7690                          * been dropped by a middlebox.
7691                          */
7692                         if (IS_FASTOPEN(tp->t_flags) &&
7693                             (tp->t_rxtshift == 0)) {
7694                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
7695                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
7696                                         to.to_tfo_cookie =
7697                                             (u_int8_t *)&tp->t_tfo_cookie.server;
7698                                         to.to_flags |= TOF_FASTOPEN;
7699                                         wanted_cookie = 1;
7700                                 } else if (tp->t_state == TCPS_SYN_SENT) {
7701                                         to.to_tfo_len =
7702                                             tp->t_tfo_client_cookie_len;
7703                                         to.to_tfo_cookie =
7704                                             tp->t_tfo_cookie.client;
7705                                         to.to_flags |= TOF_FASTOPEN;
7706                                         wanted_cookie = 1;
7707                                         /*
7708                                          * If we wind up having more data to
7709                                          * send with the SYN than can fit in
7710                                          * one segment, don't send any more
7711                                          * until the SYN|ACK comes back from
7712                                          * the other end.
7713                                          */
7714                                         sendalot = 0;
7715                                 }
7716                         }
7717                 }
7718                 /* Window scaling. */
7719                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
7720                         to.to_wscale = tp->request_r_scale;
7721                         to.to_flags |= TOF_SCALE;
7722                 }
7723                 /* Timestamps. */
7724                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
7725                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
7726                         to.to_tsval = cts + tp->ts_offset;
7727                         to.to_tsecr = tp->ts_recent;
7728                         to.to_flags |= TOF_TS;
7729                 }
7730                 /* Set receive buffer autosizing timestamp. */
7731                 if (tp->rfbuf_ts == 0 &&
7732                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
7733                         tp->rfbuf_ts = tcp_ts_getticks();
7734                 /* Selective ACK's. */
7735                 if (flags & TH_SYN)
7736                         to.to_flags |= TOF_SACKPERM;
7737                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7738                     tp->rcv_numsacks > 0) {
7739                         to.to_flags |= TOF_SACK;
7740                         to.to_nsacks = tp->rcv_numsacks;
7741                         to.to_sacks = (u_char *)tp->sackblks;
7742                 }
7743 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
7744                 /* TCP-MD5 (RFC2385). */
7745                 if (tp->t_flags & TF_SIGNATURE)
7746                         to.to_flags |= TOF_SIGNATURE;
7747 #endif                          /* TCP_SIGNATURE */
7748
7749                 /* Processing the options. */
7750                 hdrlen += optlen = tcp_addoptions(&to, opt);
7751                 /*
7752                  * If we wanted a TFO option to be added, but it was unable
7753                  * to fit, ensure no data is sent.
7754                  */
7755                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
7756                     !(to.to_flags & TOF_FASTOPEN))
7757                         len = 0;
7758         }
7759 #ifdef NETFLIX_TCPOUDP
7760         if (tp->t_port) {
7761                 if (V_tcp_udp_tunneling_port == 0) {
7762                         /* The port was removed?? */
7763                         SOCKBUF_UNLOCK(&so->so_snd);
7764                         return (EHOSTUNREACH);
7765                 }
7766                 hdrlen += sizeof(struct udphdr);
7767         }
7768 #endif
7769         ipoptlen = 0;
7770 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7771         ipoptlen += ipsec_optlen;
7772 #endif
7773
7774         /*
7775          * Adjust data length if insertion of options will bump the packet
7776          * length beyond the t_maxseg length. Clear the FIN bit because we
7777          * cut off the tail of the segment.
7778          */
7779         if (len + optlen + ipoptlen > tp->t_maxseg) {
7780                 if (flags & TH_FIN) {
7781                         would_have_fin = 1;
7782                         flags &= ~TH_FIN;
7783                 }
7784                 if (tso) {
7785                         uint32_t if_hw_tsomax;
7786                         uint32_t moff;
7787                         int32_t max_len;
7788
7789                         /* extract TSO information */
7790                         if_hw_tsomax = tp->t_tsomax;
7791                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
7792                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
7793                         KASSERT(ipoptlen == 0,
7794                             ("%s: TSO can't do IP options", __func__));
7795
7796                         /*
7797                          * Check if we should limit by maximum payload
7798                          * length:
7799                          */
7800                         if (if_hw_tsomax != 0) {
7801                                 /* compute maximum TSO length */
7802                                 max_len = (if_hw_tsomax - hdrlen -
7803                                     max_linkhdr);
7804                                 if (max_len <= 0) {
7805                                         len = 0;
7806                                 } else if (len > max_len) {
7807                                         sendalot = 1;
7808                                         len = max_len;
7809                                 }
7810                         }
7811                         /*
7812                          * Prevent the last segment from being fractional
7813                          * unless the send sockbuf can be emptied:
7814                          */
7815                         max_len = (tp->t_maxseg - optlen);
7816                         if ((sb_offset + len) < sbavail(sb)) {
7817                                 moff = len % (u_int)max_len;
7818                                 if (moff != 0) {
7819                                         len -= moff;
7820                                         sendalot = 1;
7821                                 }
7822                         }
7823                         /*
7824                          * In case there are too many small fragments don't
7825                          * use TSO:
7826                          */
7827                         if (len <= max_len) {
7828                                 len = max_len;
7829                                 sendalot = 1;
7830                                 tso = 0;
7831                         }
7832                         /*
7833                          * Send the FIN in a separate segment after the bulk
7834                          * sending is done. We don't trust the TSO
7835                          * implementations to clear the FIN flag on all but
7836                          * the last segment.
7837                          */
7838                         if (tp->t_flags & TF_NEEDFIN)
7839                                 sendalot = 1;
7840
7841                 } else {
7842                         len = tp->t_maxseg - optlen - ipoptlen;
7843                         sendalot = 1;
7844                 }
7845         } else
7846                 tso = 0;
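             /*
              * Example (hypothetical numbers): with t_maxseg = 1460 and
              * optlen = 12, a large TSO burst is first clamped to the
              * hardware limit (if_hw_tsomax - hdrlen - max_linkhdr) and then
              * trimmed to a multiple of 1448 bytes (t_maxseg - optlen) unless
              * it empties the socket buffer; sendalot is set so the remainder
              * goes out on the next pass.
              */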
7847         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
7848             ("%s: len > IP_MAXPACKET", __func__));
7849 #ifdef DIAGNOSTIC
7850 #ifdef INET6
7851         if (max_linkhdr + hdrlen > MCLBYTES)
7852 #else
7853         if (max_linkhdr + hdrlen > MHLEN)
7854 #endif
7855                 panic("tcphdr too big");
7856 #endif
7857
7858         /*
7859          * This KASSERT is here to catch edge cases at a well defined place.
7860          * Before, those had triggered (random) panic conditions further
7861          * down.
7862          */
7863         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7864         if ((len == 0) &&
7865             (flags & TH_FIN) &&
7866             (sbused(sb))) {
7867                 /*
7868                  * We have outstanding data; don't send a FIN by itself!
7869                  */
7870                 goto just_return;
7871         }
7872         /*
7873          * Grab a header mbuf, attaching a copy of data to be transmitted,
7874          * and initialize the header from the template for sends on this
7875          * connection.
7876          */
7877         if (len) {
7878                 uint32_t max_val;
7879                 uint32_t moff;
7880
7881                 if (rack->rc_pace_max_segs)
7882                         max_val = rack->rc_pace_max_segs * tp->t_maxseg;
7883                 else
7884                         max_val = len;
7885                 /*
7886                  * We allow a limit on sending with hptsi.
7887                  */
7888                 if (len > max_val) {
7889                         len = max_val;
7890                 }
7891 #ifdef INET6
7892                 if (MHLEN < hdrlen + max_linkhdr)
7893                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
7894                 else
7895 #endif
7896                         m = m_gethdr(M_NOWAIT, MT_DATA);
7897
7898                 if (m == NULL) {
7899                         SOCKBUF_UNLOCK(sb);
7900                         error = ENOBUFS;
7901                         sack_rxmit = 0;
7902                         goto out;
7903                 }
7904                 m->m_data += max_linkhdr;
7905                 m->m_len = hdrlen;
7906
7907                 /*
7908                  * Start the m_copy functions from the closest mbuf to the
7909                  * sb_offset in the socket buffer chain.
7910                  */
7911                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
7912                 if (len <= MHLEN - hdrlen - max_linkhdr) {
7913                         m_copydata(mb, moff, (int)len,
7914                             mtod(m, caddr_t)+hdrlen);
7915                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7916                                 sbsndptr_adv(sb, mb, len);
7917                         m->m_len += len;
7918                 } else {
7919                         struct sockbuf *msb;
7920
7921                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7922                                 msb = NULL;
7923                         else
7924                                 msb = sb;
7925                         m->m_next = tcp_m_copym(mb, moff, &len,
7926                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
7927                         if (len <= (tp->t_maxseg - optlen)) {
7928                                 /* 
7929                                  * We must have run out of mbufs for the copy;
7930                                  * shorten it so it no longer needs TSO. Let's
7931                                  * not set sendalot since we are low on
7932                                  * mbufs.
7933                                  */
7934                                 tso = 0;
7935                         }
7936                         if (m->m_next == NULL) {
7937                                 SOCKBUF_UNLOCK(sb);
7938                                 (void)m_free(m);
7939                                 error = ENOBUFS;
7940                                 sack_rxmit = 0;
7941                                 goto out;
7942                         }
7943                 }
7944                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
7945                         TCPSTAT_INC(tcps_sndprobe);
7946 #ifdef NETFLIX_STATS
7947                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7948                                 stats_voi_update_abs_u32(tp->t_stats,
7949                                     VOI_TCP_RETXPB, len);
7950                         else
7951                                 stats_voi_update_abs_u64(tp->t_stats,
7952                                     VOI_TCP_TXPB, len);
7953 #endif
7954                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
7955                         if (rsm && (rsm->r_flags & RACK_TLP)) {
7956                                 /*
7957                                  * TLP should not count in retran count, but
7958                                  * in its own bin
7959                                  */
7960                                 counter_u64_add(rack_tlp_retran, 1);
7961                                 counter_u64_add(rack_tlp_retran_bytes, len);
7962                         } else {
7963                                 tp->t_sndrexmitpack++;
7964                                 TCPSTAT_INC(tcps_sndrexmitpack);
7965                                 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
7966                         }
7967 #ifdef NETFLIX_STATS
7968                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
7969                             len);
7970 #endif
7971                 } else {
7972                         TCPSTAT_INC(tcps_sndpack);
7973                         TCPSTAT_ADD(tcps_sndbyte, len);
7974 #ifdef NETFLIX_STATS
7975                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
7976                             len);
7977 #endif
7978                 }
7979                 /*
7980                  * If we're sending everything we've got, set PUSH. (This
7981                  * will keep happy those implementations which only give
7982                  * data to the user when a buffer fills or a PUSH comes in.)
7983                  */
7984                 if (sb_offset + len == sbused(sb) &&
7985                     sbused(sb) &&
7986                     !(flags & TH_SYN))
7987                         flags |= TH_PUSH;
7988
7989                 /*
7990                  * Are we doing hptsi? If so we must calculate the slot. We
7991                  * only do hptsi in ESTABLISHED, with no RESET being
7992                  * sent, and where we have data to send.
7993                  */
7994                 if (((tp->t_state == TCPS_ESTABLISHED) ||
7995                     (tp->t_state == TCPS_CLOSE_WAIT) ||
7996                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
7997                     ((tp->t_flags & TF_SENTFIN) == 0) &&
7998                     ((flags & TH_FIN) == 0))) &&
7999                     ((flags & TH_RST) == 0) &&
8000                     (rack->rc_always_pace)) {
8001                         /*
8002                          * We use the most optimistic possible cwnd/srtt for
8003                          * sending calculations. This will make our
8004                          * calculation anticipate getting more through
8005                          * quicker than is possible. But that's OK; we don't
8006                          * want the peer to have a gap in data sending.
8007                          */
8008                         uint32_t srtt, cwnd, tr_perms = 0;
8009         
8010                         if (rack->r_ctl.rc_rack_min_rtt)
8011                                 srtt = rack->r_ctl.rc_rack_min_rtt;
8012                         else
8013                                 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8014                         if (rack->r_ctl.rc_rack_largest_cwnd)
8015                                 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8016                         else
8017                                 cwnd = tp->snd_cwnd;
8018                         tr_perms = cwnd / srtt;
8019                         if (tr_perms == 0) {
8020                                 tr_perms = tp->t_maxseg;
8021                         }
8022                         tot_len_this_send += len;
8023                         /*
8024                          * Calculate how long this will take to drain. If
8025                          * the calculation comes out to zero, that's OK; we
8026                          * will use sendalot to possibly spin around for
8027                          * more, increasing tot_len_this_send to the point
8028                          * that it is going to require a pace, or until we
8029                          * hit the cwnd, in which case we are just waiting
8030                          * for an ACK.
8031                          */
8032                         slot = tot_len_this_send / tr_perms;
8033                         /* Now do we reduce the time so we don't run dry? */
8034                         if (slot && rack->rc_pace_reduce) {
8035                                 int32_t reduce;
8036
8037                                 reduce = (slot / rack->rc_pace_reduce);
8038                                 if (reduce < slot) {
8039                                         slot -= reduce;
8040                                 } else
8041                                         slot = 0;
8042                         }
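                                     /*
                                      * Example (hypothetical numbers): cwnd =
                                      * 100000 bytes and srtt = 50ms give
                                      * tr_perms = 2000 bytes/ms, so a 10000
                                      * byte send drains in slot = 5ms; with
                                      * rc_pace_reduce = 5 the slot is
                                      * shortened by 1ms to 4ms so we restock
                                      * the pipe slightly early.
                                      */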
8043                         if (rack->r_enforce_min_pace &&
8044                             (slot == 0) &&
8045                             (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
8046                                 /* We are enforcing a minimum pace time of 1ms */
8047                                 slot = rack->r_enforce_min_pace;
8048                         }
8049                 }
8050                 SOCKBUF_UNLOCK(sb);
8051         } else {
8052                 SOCKBUF_UNLOCK(sb);
8053                 if (tp->t_flags & TF_ACKNOW)
8054                         TCPSTAT_INC(tcps_sndacks);
8055                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
8056                         TCPSTAT_INC(tcps_sndctrl);
8057                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
8058                         TCPSTAT_INC(tcps_sndurg);
8059                 else
8060                         TCPSTAT_INC(tcps_sndwinup);
8061
8062                 m = m_gethdr(M_NOWAIT, MT_DATA);
8063                 if (m == NULL) {
8064                         error = ENOBUFS;
8065                         sack_rxmit = 0;
8066                         goto out;
8067                 }
8068 #ifdef INET6
8069                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
8070                     MHLEN >= hdrlen) {
8071                         M_ALIGN(m, hdrlen);
8072                 } else
8073 #endif
8074                         m->m_data += max_linkhdr;
8075                 m->m_len = hdrlen;
8076         }
8077         SOCKBUF_UNLOCK_ASSERT(sb);
8078         m->m_pkthdr.rcvif = (struct ifnet *)0;
8079 #ifdef MAC
8080         mac_inpcb_create_mbuf(inp, m);
8081 #endif
8082 #ifdef INET6
8083         if (isipv6) {
8084                 ip6 = mtod(m, struct ip6_hdr *);
8085 #ifdef NETFLIX_TCPOUDP
8086                 if (tp->t_port) {
8087                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
8088                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8089                         udp->uh_dport = tp->t_port;
8090                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
8091                         udp->uh_ulen = htons(ulen);
8092                         th = (struct tcphdr *)(udp + 1);
8093                 } else
8094 #endif
8095                         th = (struct tcphdr *)(ip6 + 1);
8096                 tcpip_fillheaders(inp, ip6, th);
8097         } else
8098 #endif                          /* INET6 */
8099         {
8100                 ip = mtod(m, struct ip *);
8101 #ifdef TCPDEBUG
8102                 ipov = (struct ipovly *)ip;
8103 #endif
8104 #ifdef NETFLIX_TCPOUDP
8105                 if (tp->t_port) {
8106                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
8107                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8108                         udp->uh_dport = tp->t_port;
8109                         ulen = hdrlen + len - sizeof(struct ip);
8110                         udp->uh_ulen = htons(ulen);
8111                         th = (struct tcphdr *)(udp + 1);
8112                 } else
8113 #endif
8114                         th = (struct tcphdr *)(ip + 1);
8115                 tcpip_fillheaders(inp, ip, th);
8116         }
8117         /*
8118          * Fill in fields, remembering maximum advertised window for use in
8119          * delaying messages about window sizes. If resending a FIN, be sure
8120          * not to use a new sequence number.
8121          */
8122         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
8123             tp->snd_nxt == tp->snd_max)
8124                 tp->snd_nxt--;
8125         /*
8126          * If we are starting a connection, send ECN setup SYN packet. If we
8127          * are on a retransmit, we may resend those bits a number of times
8128          * as per RFC 3168.
8129          */
8130         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
8131                 if (tp->t_rxtshift >= 1) {
8132                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
8133                                 flags |= TH_ECE | TH_CWR;
8134                 } else
8135                         flags |= TH_ECE | TH_CWR;
8136         }
8137         if (tp->t_state == TCPS_ESTABLISHED &&
8138             (tp->t_flags & TF_ECN_PERMIT)) {
8139                 /*
8140                  * If the peer has ECN, mark data packets with ECN capable
8141                  * transmission (ECT). Ignore pure ack packets,
8142                  * retransmissions and window probes.
8143                  */
8144                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
8145                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
8146 #ifdef INET6
8147                         if (isipv6)
8148                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
8149                         else
8150 #endif
8151                                 ip->ip_tos |= IPTOS_ECN_ECT0;
8152                         TCPSTAT_INC(tcps_ecn_ect0);
8153                 }
8154                 /*
8155                  * Reply with proper ECN notifications.
8156                  */
8157                 if (tp->t_flags & TF_ECN_SND_CWR) {
8158                         flags |= TH_CWR;
8159                         tp->t_flags &= ~TF_ECN_SND_CWR;
8160                 }
8161                 if (tp->t_flags & TF_ECN_SND_ECE)
8162                         flags |= TH_ECE;
8163         }
8164         /*
8165          * If we are doing retransmissions, then snd_nxt will not reflect
8166          * the first unsent octet.  For ACK only packets, we do not want the
8167          * sequence number of the retransmitted packet, we want the sequence
8168          * number of the next unsent octet.  So, if there is no data (and no
8169          * SYN or FIN), use snd_max instead of snd_nxt when filling in
8170          * ti_seq.  But if we are in persist state, snd_max might reflect
8171          * one byte beyond the right edge of the window, so use snd_nxt in
8172          * that case, since we know we aren't doing a retransmission.
8173          * (retransmit and persist are mutually exclusive...)
8174          */
8175         if (sack_rxmit == 0) {
8176                 if (len || (flags & (TH_SYN | TH_FIN)) ||
8177                     rack->rc_in_persist) {
8178                         th->th_seq = htonl(tp->snd_nxt);
8179                         rack_seq = tp->snd_nxt;
8180                 } else if (flags & TH_RST) {
8181                         /*
8182                          * For a Reset send the last cum ack in sequence
8183                          * (this like any other choice may still generate a
8184                          * challenge ack, if an ack-update packet is in
8185                          * flight).
8186                          */
8187                         th->th_seq = htonl(tp->snd_una);
8188                         rack_seq = tp->snd_una;
8189                 } else {
8190                         th->th_seq = htonl(tp->snd_max);
8191                         rack_seq = tp->snd_max;
8192                 }
8193         } else {
8194                 th->th_seq = htonl(rsm->r_start);
8195                 rack_seq = rsm->r_start;
8196         }
8197         th->th_ack = htonl(tp->rcv_nxt);
8198         if (optlen) {
8199                 bcopy(opt, th + 1, optlen);
8200                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
8201         }
8202         th->th_flags = flags;
8203         /*
8204          * Calculate receive window.  Don't shrink window, but avoid silly
8205          * window syndrome.
8206          * If a RST segment is sent, advertise a window of zero.
8207          */
8208         if (flags & TH_RST) {
8209                 recwin = 0;
8210         } else {
8211                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
8212                     recwin < (long)tp->t_maxseg)
8213                         recwin = 0;
8214                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
8215                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
8216                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
8217                 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
8218                         recwin = (long)TCP_MAXWIN << tp->rcv_scale;
8219         }
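             /*
              * Example (hypothetical numbers): with a 64KB receive buffer the
              * window is advertised as zero while it is below both 16KB
              * (sb_hiwat / 4) and one MSS, is never allowed to shrink below
              * what was already advertised (rcv_adv - rcv_nxt), and is capped
              * at TCP_MAXWIN << rcv_scale.
              */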
8220
8221         /*
8222          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
8223          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
8224          * handled in syncache.
8225          */
8226         if (flags & TH_SYN)
8227                 th->th_win = htons((u_short)
8228                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
8229         else
8230                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
8231         /*
8232          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
8233          * window.  This may cause the remote transmitter to stall.  This
8234          * flag tells soreceive() to disable delayed acknowledgements when
8235          * draining the buffer.  This can occur if the receiver is
8236          * attempting to read more data than can be buffered prior to
8237          * transmitting on the connection.
8238          */
8239         if (th->th_win == 0) {
8240                 tp->t_sndzerowin++;
8241                 tp->t_flags |= TF_RXWIN0SENT;
8242         } else
8243                 tp->t_flags &= ~TF_RXWIN0SENT;
8244         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8245                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
8246                 th->th_flags |= TH_URG;
8247         } else
8248                 /*
8249                  * If no urgent pointer to send, then we pull the urgent
8250                  * pointer to the left edge of the send window so that it
8251                  * doesn't drift into the send window on sequence number
8252                  * wraparound.
8253                  */
8254                 tp->snd_up = tp->snd_una;       /* drag it along */
8255
8256 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
8257         if (to.to_flags & TOF_SIGNATURE) {
8258                 /*
8259                  * Calculate MD5 signature and put it into the place
8260                  * determined before.
8261                  * NOTE: since TCP options buffer doesn't point into
8262                  * mbuf's data, calculate offset and use it.
8263                  */
8264                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
8265                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
8266                         /*
8267                          * Do not send segment if the calculation of MD5
8268                          * digest has failed.
8269                          */
8270                         goto out;
8271                 }
8272         }
8273 #endif
8274
8275         /*
8276          * Put TCP length in extended header, and then checksum extended
8277          * header and data.
8278          */
8279         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
8280 #ifdef INET6
8281         if (isipv6) {
8282                 /*
8283                  * ip6_plen does not need to be filled now; it will be filled
8284                  * in ip6_output.
8285                  */
8286                 if (tp->t_port) {
8287                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
8288                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8289                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
8290                         th->th_sum = htons(0);
8291                 } else {
8292                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
8293                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8294                         th->th_sum = in6_cksum_pseudo(ip6,
8295                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
8296                             0);
8297                 }
8298         }
8299 #endif
8300 #if defined(INET6) && defined(INET)
8301         else
8302 #endif
8303 #ifdef INET
8304         {
8305                 if (tp->t_port) {
8306                         m->m_pkthdr.csum_flags = CSUM_UDP;
8307                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8308                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
8309                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
8310                         th->th_sum = htons(0);
8311                 } else {
8312                         m->m_pkthdr.csum_flags = CSUM_TCP;
8313                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8314                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
8315                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
8316                             IPPROTO_TCP + len + optlen));
8317                 }
8318                 /* IP version must be set here for ipv4/ipv6 checking later */
8319                 KASSERT(ip->ip_v == IPVERSION,
8320                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
8321         }
8322 #endif
8323
8324         /*
8325          * Enable TSO and specify the size of the segments. The TCP pseudo
8326          * header checksum is always provided. XXX: Fixme: This is currently
8327          * not the case for IPv6.
8328          */
8329         if (tso) {
8330                 KASSERT(len > tp->t_maxseg - optlen,
8331                     ("%s: len <= tso_segsz", __func__));
8332                 m->m_pkthdr.csum_flags |= CSUM_TSO;
8333                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
8334         }
8335 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8336         KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
8337             ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
8338             __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
8339 #else
8340         KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
8341             ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
8342             __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
8343 #endif
8344
8345 #ifdef TCP_HHOOK
8346         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
8347         hhook_run_tcp_est_out(tp, th, &to, len, tso);
8348 #endif
8349
8350 #ifdef TCPDEBUG
8351         /*
8352          * Trace.
8353          */
8354         if (so->so_options & SO_DEBUG) {
8355                 u_short save = 0;
8356
8357 #ifdef INET6
8358                 if (!isipv6)
8359 #endif
8360                 {
8361                         save = ipov->ih_len;
8362                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
8363                               * (th->th_off << 2) */ );
8364                 }
8365                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
8366 #ifdef INET6
8367                 if (!isipv6)
8368 #endif
8369                         ipov->ih_len = save;
8370         }
8371 #endif                          /* TCPDEBUG */
8372
8373         /* We're getting ready to send; log now. */
8374         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
8375                 union tcp_log_stackspecific log;
8376
8377                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
8378                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
8379                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
8380                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
8381                 if (rsm || sack_rxmit) {
8382                         log.u_bbr.flex8 = 1;
8383                 } else {
8384                         log.u_bbr.flex8 = 0;
8385                 }
8386                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
8387                     len, &log, false, NULL, NULL, 0, NULL);
8388         } else
8389                 lgb = NULL;
8390
8391         /*
8392          * Fill in IP length and desired time to live and send to IP level.
8393          * There should be a better way to handle ttl and tos; we could keep
8394          * them in the template, but need a way to checksum without them.
8395          */
8396         /*
8397          * m->m_pkthdr.len should have been set before the cksum calculation,
8398          * because in6_cksum() needs it.
8399          */
8400 #ifdef INET6
8401         if (isipv6) {
8402                 /*
8403                  * we separately set hoplimit for every segment, since the
8404                  * user might want to change the value via setsockopt. Also,
8405                  * desired default hop limit might be changed via Neighbor
8406                  * Discovery.
8407                  */
8408                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
8409
8410                 /*
8411                  * Set the packet size here for the benefit of DTrace
8412                  * probes. ip6_output() will set it properly; it's supposed
8413                  * to include the option header lengths as well.
8414                  */
8415                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
8416
8417                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
8418                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8419                 else
8420                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8421
8422                 if (tp->t_state == TCPS_SYN_SENT)
8423                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
8424
8425                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
8426                 /* TODO: IPv6 IP6TOS_ECT bit on */
8427                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
8428                     &inp->inp_route6,
8429                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
8430                     NULL, NULL, inp);
8431
8432                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
8433                         mtu = inp->inp_route6.ro_rt->rt_mtu;
8434         }
8435 #endif                          /* INET6 */
8436 #if defined(INET) && defined(INET6)
8437         else
8438 #endif
8439 #ifdef INET
8440         {
8441                 ip->ip_len = htons(m->m_pkthdr.len);
8442 #ifdef INET6
8443                 if (inp->inp_vflag & INP_IPV6PROTO)
8444                         ip->ip_ttl = in6_selecthlim(inp, NULL);
8445 #endif                          /* INET6 */
8446                 /*
8447                  * If we do path MTU discovery, then we set DF on every
8448                  * packet. This might not be the best thing to do according
8449          * to RFC3390 Section 2. However the tcp hostcache mitigates
8450                  * the problem so it affects only the first tcp connection
8451                  * with a host.
8452                  *
8453                  * NB: Don't set DF on small MTU/MSS to have a safe
8454                  * fallback.
8455                  */
8456                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
8457                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8458                         if (tp->t_port == 0 || len < V_tcp_minmss) {
8459                                 ip->ip_off |= htons(IP_DF);
8460                         }
8461                 } else {
8462                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8463                 }
8464
8465                 if (tp->t_state == TCPS_SYN_SENT)
8466                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
8467
8468                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
8469
8470                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
8471                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
8472                     inp);
8473                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
8474                         mtu = inp->inp_route.ro_rt->rt_mtu;
8475         }
8476 #endif                          /* INET */
8477
8478 out:
8479         if (lgb) {
8480                 lgb->tlb_errno = error;
8481                 lgb = NULL;
8482         }
8483         /*
8484          * In transmit state, time the transmission and arrange for the
8485          * retransmit.  In persist state, just set snd_max.
8486          */
8487         if (error == 0) {
8488                 if (len == 0)
8489                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
8490                 else if (len == 1) {
8491                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
8492                 } else if (len > 1) {
8493                         int idx;
8494
8495                         idx = (len / tp->t_maxseg) + 3;
8496                         if (idx >= TCP_MSS_ACCT_ATIMER)
8497                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
8498                         else
8499                                 counter_u64_add(rack_out_size[idx], 1);
8500                 }
8501         }
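             /*
              * Example (hypothetical numbers): with t_maxseg = 1460 a 5840
              * byte send (4 full segments) is counted in rack_out_size bucket
              * (5840 / 1460) + 3 = 7; indexes at or past TCP_MSS_ACCT_ATIMER
              * all fold into the last bucket (TCP_MSS_ACCT_ATIMER - 1).
              */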
8502         if (sub_from_prr && (error == 0)) {
8503                 rack->r_ctl.rc_prr_sndcnt -= len;
8504         }
8505         sub_from_prr = 0;
8506         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
8507             pass, rsm);
8508         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
8509             (rack->rc_in_persist == 0)) {
8510                 tcp_seq startseq = tp->snd_nxt;
8511
8512                 /*
8513                  * Advance snd_nxt over sequence space of this segment.
8514                  */
8515                 if (error)
8516                         /* We don't log or do anything with errors */
8517                         goto timer;
8518
8519                 if (flags & (TH_SYN | TH_FIN)) {
8520                         if (flags & TH_SYN)
8521                                 tp->snd_nxt++;
8522                         if (flags & TH_FIN) {
8523                                 tp->snd_nxt++;
8524                                 tp->t_flags |= TF_SENTFIN;
8525                         }
8526                 }
8527                 /* In the ENOBUFS case we do *not* update snd_max */
8528                 if (sack_rxmit)
8529                         goto timer;
8530
8531                 tp->snd_nxt += len;
8532                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
8533                         if (tp->snd_una == tp->snd_max) {
8534                                 /*
8535                                  * Update the time we just added data since
8536                                  * none was outstanding.
8537                                  */
8538                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8539                                 tp->t_acktime = ticks;
8540                         }
8541                         tp->snd_max = tp->snd_nxt;
8542                         /*
8543                          * Time this transmission if not a retransmission and
8544                          * not currently timing anything.
8545                          * This is only relevant in case of switching back to
8546                          * the base stack.
8547                          */
8548                         if (tp->t_rtttime == 0) {
8549                                 tp->t_rtttime = ticks;
8550                                 tp->t_rtseq = startseq;
8551                                 TCPSTAT_INC(tcps_segstimed);
8552                         }
8553 #ifdef NETFLIX_STATS
8554                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
8555                                 tp->t_flags |= TF_GPUTINPROG;
8556                                 tp->gput_seq = startseq;
8557                                 tp->gput_ack = startseq +
8558                                     ulmin(sbavail(sb) - sb_offset, sendwin);
8559                                 tp->gput_ts = tcp_ts_getticks();
8560                         }
8561 #endif
8562                 }
8563                 /*
8564                  * Set retransmit timer if not currently set, and not doing
8565                  * a pure ack or a keep-alive probe. Initial value for
8566                  * retransmit timer is smoothed round-trip time + 2 *
8567                  * round-trip time variance. Initialize shift counter which
8568                  * is used for backoff of retransmit time.
8569                  */
8570 timer:
8571                 if ((tp->snd_wnd == 0) &&
8572                     TCPS_HAVEESTABLISHED(tp->t_state)) {
8573                         /*
8574                          * Persist may have been entered above (right
8575                          * before the goto send) and still needs to be on.
8576                          * If we are not already in persist mode, enter it
8577                          * now.
8578                          */
8579                         if (rack->rc_in_persist == 0) {
8580                                 rack_enter_persist(tp, rack, cts);
8581                         }
8582                 }
8583         } else {
8584                 /*
8585                  * Persist case, update snd_max but since we are in persist
8586                  * mode (no window) we do not update snd_nxt.
8587                  */
8588                 int32_t xlen = len;
8589
8590                 if (error)
8591                         goto nomore;
8592
8593                 if (flags & TH_SYN)
8594                         ++xlen;
8595                 if (flags & TH_FIN) {
8596                         ++xlen;
8597                         tp->t_flags |= TF_SENTFIN;
8598                 }
8599                 /* In the ENOBUFS case we do *not* update snd_max */
8600                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
8601                         if (tp->snd_una == tp->snd_max) {
8602                                 /*
8603                                  * Update the time we just added data since
8604                                  * none was outstanding.
8605                                  */
8606                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8607                                 tp->t_acktime = ticks;
8608                         }
8609                         tp->snd_max = tp->snd_nxt + len;
8610                 }
8611         }
8612 nomore:
8613         if (error) {
8614                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
8615                 /*
8616                  * Failures do not advance the seq counter above. For the
8617                  * case of ENOBUFS we will fall out and retry in 1ms with
8618                  * the hpts. Everything else will just have to retransmit
8619                  * with the timer.
8620                  *
8621                  * In any case, we do not want to loop around for another
8622                  * send without a good reason.
8623                  */
8624                 sendalot = 0;
8625                 switch (error) {
8626                 case EPERM:
8627                         tp->t_flags &= ~TF_FORCEDATA;
8628                         tp->t_softerror = error;
8629                         return (error);
8630                 case ENOBUFS:
8631                         if (slot == 0) {
8632                                 /*
8633                                  * Pace us right away to retry in a
8634                                  * short amount of time
8635                                  */
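                                /*
                                 * Each successive ENOBUFS backs the delay
                                 * off by one more unit (via rc_enobuf),
                                 * capped at half the measured rack RTT and
                                 * floored at 10 below.
                                 */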
8636                                 slot = 1 + rack->rc_enobuf;
8637                                 if (rack->rc_enobuf < 255)
8638                                         rack->rc_enobuf++;
8639                                 if (slot > (rack->rc_rack_rtt / 2)) {
8640                                         slot = rack->rc_rack_rtt / 2;
8641                                 }
8642                                 if (slot < 10)
8643                                         slot = 10;
8644                         }
8645                         counter_u64_add(rack_saw_enobuf, 1);
8646                         error = 0;
8647                         goto enobufs;
8648                 case EMSGSIZE:
8649                         /*
8650                          * For some reason the interface we used initially
8651                          * to send segments changed to another or lowered
8652                          * its MTU. If TSO was active we either got an
8653                          * interface without TSO capabilities or TSO was
8654                          * turned off. If we obtained mtu from ip_output()
8655                          * then update it and try again.
8656                          */
8657                         if (tso)
8658                                 tp->t_flags &= ~TF_TSO;
8659                         if (mtu != 0) {
8660                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
8661                                 goto again;
8662                         }
8663                         slot = 10;
8664                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8665                         tp->t_flags &= ~TF_FORCEDATA;
8666                         return (error);
8667                 case ENETUNREACH:
8668                         counter_u64_add(rack_saw_enetunreach, 1);
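                        /* FALLTHROUGH */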
8669                 case EHOSTDOWN:
8670                 case EHOSTUNREACH:
8671                 case ENETDOWN:
8672                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
8673                                 tp->t_softerror = error;
8674                         }
8675                         /* FALLTHROUGH */
8676                 default:
8677                         slot = 10;
8678                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8679                         tp->t_flags &= ~TF_FORCEDATA;
8680                         return (error);
8681                 }
8682         } else {
8683                 rack->rc_enobuf = 0;
8684         }
8685         TCPSTAT_INC(tcps_sndtotal);
8686
8687         /*
8688          * Data sent (as far as we can tell). If this advertises a larger
8689          * window than any other segment, then remember the size of the
8690          * advertised window. Any pending ACK has now been sent.
8691          */
8692         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8693                 tp->rcv_adv = tp->rcv_nxt + recwin;
8694         tp->last_ack_sent = tp->rcv_nxt;
8695         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
8696 enobufs:
8697         rack->r_tlp_running = 0;
8698         if ((flags & TH_RST) || (would_have_fin == 1)) {
8699                 /*
8700                  * We don't send again after a RST. We also do *not* send
8701                  * again if we would have had a FIN, but now have
8702                  * outstanding data.
8703                  */
8704                 slot = 0;
8705                 sendalot = 0;
8706         }
8707         if (slot) {
8708                 /* We will pace via the hpts (slot N below); count a paced segment */
8709                 counter_u64_add(rack_paced_segments, 1);
8710         } else if (sendalot) {
8711                 if (len)
8712                         counter_u64_add(rack_unpaced_segments, 1);
8713                 sack_rxmit = 0;
8714                 tp->t_flags &= ~TF_FORCEDATA;
8715                 goto again;
8716         } else if (len) {
8717                 counter_u64_add(rack_unpaced_segments, 1);
8718         }
8719         tp->t_flags &= ~TF_FORCEDATA;
8720         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
8721         return (error);
8722 }
8723
8724 /*
8725  * rack_ctloutput() must drop the inpcb lock before performing copyin on
8726  * socket option arguments.  When it re-acquires the lock after the copy, it
8727  * has to revalidate that the connection is still valid for the socket
8728  * option.
8729  */
8730 static int
8731 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
8732     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8733 {
8734         int32_t error = 0, optval;
8735
8736         switch (sopt->sopt_name) {
8737         case TCP_RACK_PROP_RATE:
8738         case TCP_RACK_PROP:
8739         case TCP_RACK_TLP_REDUCE:
8740         case TCP_RACK_EARLY_RECOV:
8741         case TCP_RACK_PACE_ALWAYS:
8742         case TCP_DELACK:
8743         case TCP_RACK_PACE_REDUCE:
8744         case TCP_RACK_PACE_MAX_SEG:
8745         case TCP_RACK_PRR_SENDALOT:
8746         case TCP_RACK_MIN_TO:
8747         case TCP_RACK_EARLY_SEG:
8748         case TCP_RACK_REORD_THRESH:
8749         case TCP_RACK_REORD_FADE:
8750         case TCP_RACK_TLP_THRESH:
8751         case TCP_RACK_PKT_DELAY:
8752         case TCP_RACK_TLP_USE:
8753         case TCP_RACK_TLP_INC_VAR:
8754         case TCP_RACK_IDLE_REDUCE_HIGH:
8755         case TCP_RACK_MIN_PACE:
8756         case TCP_RACK_MIN_PACE_SEG:
8757         case TCP_BBR_RACK_RTT_USE:
8758         case TCP_DATA_AFTER_CLOSE:
8759                 break;
8760         default:
8761                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8762                 break;
8763         }
8764         INP_WUNLOCK(inp);
8765         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
8766         if (error)
8767                 return (error);
8768         INP_WLOCK(inp);
8769         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
8770                 INP_WUNLOCK(inp);
8771                 return (ECONNRESET);
8772         }
8773         tp = intotcpcb(inp);
8774         rack = (struct tcp_rack *)tp->t_fb_ptr;
8775         switch (sopt->sopt_name) {
8776         case TCP_RACK_PROP_RATE:
8777                 if ((optval <= 0) || (optval >= 100)) {
8778                         error = EINVAL;
8779                         break;
8780                 }
8781                 RACK_OPTS_INC(tcp_rack_prop_rate);
8782                 rack->r_ctl.rc_prop_rate = optval;
8783                 break;
8784         case TCP_RACK_TLP_USE:
8785                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
8786                         error = EINVAL;
8787                         break;
8788                 }
8789                 RACK_OPTS_INC(tcp_tlp_use);
8790                 rack->rack_tlp_threshold_use = optval;
8791                 break;
8792         case TCP_RACK_PROP:
8793                 /* RACK proportional rate reduction (bool) */
8794                 RACK_OPTS_INC(tcp_rack_prop);
8795                 rack->r_ctl.rc_prop_reduce = optval;
8796                 break;
8797         case TCP_RACK_TLP_REDUCE:
8798                 /* RACK TLP cwnd reduction (bool) */
8799                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
8800                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
8801                 break;
8802         case TCP_RACK_EARLY_RECOV:
8803                 /* Should recovery happen early (bool) */
8804                 RACK_OPTS_INC(tcp_rack_early_recov);
8805                 rack->r_ctl.rc_early_recovery = optval;
8806                 break;
8807         case TCP_RACK_PACE_ALWAYS:
8808                 /* Use the always pace method (bool)  */
8809                 RACK_OPTS_INC(tcp_rack_pace_always);
8810                 if (optval > 0)
8811                         rack->rc_always_pace = 1;
8812                 else
8813                         rack->rc_always_pace = 0;
8814                 break;
8815         case TCP_RACK_PACE_REDUCE:
8816                 /* RACK Hptsi reduction factor (divisor) */
8817                 RACK_OPTS_INC(tcp_rack_pace_reduce);
8818                 if (optval)
8819                         /* Must be non-zero */
8820                         rack->rc_pace_reduce = optval;
8821                 else
8822                         error = EINVAL;
8823                 break;
8824         case TCP_RACK_PACE_MAX_SEG:
8825                 /* Max segments in a pace */
8826                 RACK_OPTS_INC(tcp_rack_max_seg);
8827                 rack->rc_pace_max_segs = optval;
8828                 break;
8829         case TCP_RACK_PRR_SENDALOT:
8830                 /* Allow PRR to send more than one seg */
8831                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
8832                 rack->r_ctl.rc_prr_sendalot = optval;
8833                 break;
8834         case TCP_RACK_MIN_TO:
8835                 /* Minimum time between rack t-o's in ms */
8836                 RACK_OPTS_INC(tcp_rack_min_to);
8837                 rack->r_ctl.rc_min_to = optval;
8838                 break;
8839         case TCP_RACK_EARLY_SEG:
8840                 /* If early recovery max segments */
8841                 RACK_OPTS_INC(tcp_rack_early_seg);
8842                 rack->r_ctl.rc_early_recovery_segs = optval;
8843                 break;
8844         case TCP_RACK_REORD_THRESH:
8845                 /* RACK reorder threshold (shift amount) */
8846                 RACK_OPTS_INC(tcp_rack_reord_thresh);
8847                 if ((optval > 0) && (optval < 31))
8848                         rack->r_ctl.rc_reorder_shift = optval;
8849                 else
8850                         error = EINVAL;
8851                 break;
8852         case TCP_RACK_REORD_FADE:
8853                 /* Does reordering fade after ms time */
8854                 RACK_OPTS_INC(tcp_rack_reord_fade);
8855                 rack->r_ctl.rc_reorder_fade = optval;
8856                 break;
8857         case TCP_RACK_TLP_THRESH:
8858                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
8859                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
8860                 if (optval)
8861                         rack->r_ctl.rc_tlp_threshold = optval;
8862                 else
8863                         error = EINVAL;
8864                 break;
8865         case TCP_RACK_PKT_DELAY:
8866                 /* RACK added ms i.e. rack-rtt + reord + N */
8867                 RACK_OPTS_INC(tcp_rack_pkt_delay);
8868                 rack->r_ctl.rc_pkt_delay = optval;
8869                 break;
8870         case TCP_RACK_TLP_INC_VAR:
8871                 /* Does TLP include rtt variance in t-o */
8872                 RACK_OPTS_INC(tcp_rack_tlp_inc_var);
8873                 rack->r_ctl.rc_prr_inc_var = optval;
8874                 break;
8875         case TCP_RACK_IDLE_REDUCE_HIGH:
8876                 RACK_OPTS_INC(tcp_rack_idle_reduce_high);
8877                 if (optval)
8878                         rack->r_idle_reduce_largest = 1;
8879                 else
8880                         rack->r_idle_reduce_largest = 0;
8881                 break;
8882         case TCP_DELACK:
8883                 if (optval == 0)
8884                         tp->t_delayed_ack = 0;
8885                 else
8886                         tp->t_delayed_ack = 1;
8887                 if (tp->t_flags & TF_DELACK) {
8888                         tp->t_flags &= ~TF_DELACK;
8889                         tp->t_flags |= TF_ACKNOW;
8890                         rack_output(tp);
8891                 }
8892                 break;
8893         case TCP_RACK_MIN_PACE:
8894                 RACK_OPTS_INC(tcp_rack_min_pace);
8895                 if (optval > 3)
8896                         rack->r_enforce_min_pace = 3;
8897                 else
8898                         rack->r_enforce_min_pace = optval;
8899                 break;
8900         case TCP_RACK_MIN_PACE_SEG:
8901                 RACK_OPTS_INC(tcp_rack_min_pace_seg);
8902                 if (optval >= 16)
8903                         rack->r_min_pace_seg_thresh = 15;
8904                 else
8905                         rack->r_min_pace_seg_thresh = optval;
8906                 break;
8907         case TCP_BBR_RACK_RTT_USE:
8908                 if ((optval != USE_RTT_HIGH) &&
8909                     (optval != USE_RTT_LOW) &&
8910                     (optval != USE_RTT_AVG))
8911                         error = EINVAL;
8912                 else
8913                         rack->r_ctl.rc_rate_sample_method = optval;
8914                 break;
8915         case TCP_DATA_AFTER_CLOSE:
8916                 if (optval)
8917                         rack->rc_allow_data_af_clo = 1;
8918                 else
8919                         rack->rc_allow_data_af_clo = 0;
8920                 break;
8921         default:
8922                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8923                 break;
8924         }
8925 #ifdef NETFLIX_STATS
8926         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
8927 #endif
8928         INP_WUNLOCK(inp);
8929         return (error);
8930 }
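
/*
 * Illustrative only (not part of this file): with a connection already on
 * the rack stack, any of the integer options handled above can be tuned
 * from userland with a plain setsockopt() at the IPPROTO_TCP level, e.g.
 * to turn on pacing for every send:
 *
 *      int on = 1;
 *
 *      if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *          &on, sizeof(on)) == -1)
 *              warn("TCP_RACK_PACE_ALWAYS");
 *
 * Every option here is a single int, which is why the sooptcopyin() above
 * copies exactly sizeof(optval) bytes.
 */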
8931
8932 static int
8933 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
8934     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8935 {
8936         int32_t error, optval;
8937
8938         /*
8939          * Because all our options are either boolean or an int, we can just
8940          * pull everything into optval and then unlock and copy. If we ever
8941          * add an option that is not an int, then this will have quite an
8942          * impact on this routine.
8943          */
8944         switch (sopt->sopt_name) {
8945         case TCP_RACK_PROP_RATE:
8946                 optval = rack->r_ctl.rc_prop_rate;
8947                 break;
8948         case TCP_RACK_PROP:
8949                 /* RACK proportional rate reduction (bool) */
8950                 optval = rack->r_ctl.rc_prop_reduce;
8951                 break;
8952         case TCP_RACK_TLP_REDUCE:
8953                 /* RACK TLP cwnd reduction (bool) */
8954                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
8955                 break;
8956         case TCP_RACK_EARLY_RECOV:
8957                 /* Should recovery happen early (bool) */
8958                 optval = rack->r_ctl.rc_early_recovery;
8959                 break;
8960         case TCP_RACK_PACE_REDUCE:
8961                 /* RACK Hptsi reduction factor (divisor) */
8962                 optval = rack->rc_pace_reduce;
8963                 break;
8964         case TCP_RACK_PACE_MAX_SEG:
8965                 /* Max segments in a pace */
8966                 optval = rack->rc_pace_max_segs;
8967                 break;
8968         case TCP_RACK_PACE_ALWAYS:
8969                 /* Use the always pace method */
8970                 optval = rack->rc_always_pace;
8971                 break;
8972         case TCP_RACK_PRR_SENDALOT:
8973                 /* Allow PRR to send more than one seg */
8974                 optval = rack->r_ctl.rc_prr_sendalot;
8975                 break;
8976         case TCP_RACK_MIN_TO:
8977                 /* Minimum time between rack t-o's in ms */
8978                 optval = rack->r_ctl.rc_min_to;
8979                 break;
8980         case TCP_RACK_EARLY_SEG:
8981                 /* If early recovery max segments */
8982                 optval = rack->r_ctl.rc_early_recovery_segs;
8983                 break;
8984         case TCP_RACK_REORD_THRESH:
8985                 /* RACK reorder threshold (shift amount) */
8986                 optval = rack->r_ctl.rc_reorder_shift;
8987                 break;
8988         case TCP_RACK_REORD_FADE:
8989                 /* Does reordering fade after ms time */
8990                 optval = rack->r_ctl.rc_reorder_fade;
8991                 break;
8992         case TCP_RACK_TLP_THRESH:
8993         /* RACK TLP threshold i.e. srtt+(srtt/N) */
8994                 optval = rack->r_ctl.rc_tlp_threshold;
8995                 break;
8996         case TCP_RACK_PKT_DELAY:
8997                 /* RACK added ms i.e. rack-rtt + reord + N */
8998                 optval = rack->r_ctl.rc_pkt_delay;
8999                 break;
9000         case TCP_RACK_TLP_USE:
9001                 optval = rack->rack_tlp_threshold_use;
9002                 break;
9003         case TCP_RACK_TLP_INC_VAR:
9004                 /* Does TLP include rtt variance in t-o */
9005                 optval = rack->r_ctl.rc_prr_inc_var;
9006                 break;
9007         case TCP_RACK_IDLE_REDUCE_HIGH:
9008                 optval = rack->r_idle_reduce_largest;
9009                 break;
9010         case TCP_RACK_MIN_PACE:
9011                 optval = rack->r_enforce_min_pace;
9012                 break;
9013         case TCP_RACK_MIN_PACE_SEG:
9014                 optval = rack->r_min_pace_seg_thresh;
9015                 break;
9016         case TCP_BBR_RACK_RTT_USE:
9017                 optval = rack->r_ctl.rc_rate_sample_method;
9018                 break;
9019         case TCP_DELACK:
9020                 optval = tp->t_delayed_ack;
9021                 break;
9022         case TCP_DATA_AFTER_CLOSE:
9023                 optval = rack->rc_allow_data_af_clo;
9024                 break;
9025         default:
9026                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9027                 break;
9028         }
9029         INP_WUNLOCK(inp);
9030         error = sooptcopyout(sopt, &optval, sizeof optval);
9031         return (error);
9032 }
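
/*
 * Illustrative only: the matching read side from userland, e.g. querying
 * the minimum time between rack timeouts returned above:
 *
 *      int min_to;
 *      socklen_t len = sizeof(min_to);
 *
 *      if (getsockopt(fd, IPPROTO_TCP, TCP_RACK_MIN_TO, &min_to, &len) == -1)
 *              warn("TCP_RACK_MIN_TO");
 */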
9033
9034 static int
9035 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
9036 {
9037         int32_t error = EINVAL;
9038         struct tcp_rack *rack;
9039
9040         rack = (struct tcp_rack *)tp->t_fb_ptr;
9041         if (rack == NULL) {
9042                 /* No rack state is attached to this tcpcb; bail out. */
9043                 goto out;
9044         }
9045         if (sopt->sopt_dir == SOPT_SET) {
9046                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
9047         } else if (sopt->sopt_dir == SOPT_GET) {
9048                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
9049         }
9050 out:
9051         INP_WUNLOCK(inp);
9052         return (error);
9053 }
9054
9055
9056 struct tcp_function_block __tcp_rack = {
9057         .tfb_tcp_block_name = __XSTRING(STACKNAME),
9058         .tfb_tcp_output = rack_output,
9059         .tfb_tcp_do_segment = rack_do_segment,
9060         .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
9061         .tfb_tcp_ctloutput = rack_ctloutput,
9062         .tfb_tcp_fb_init = rack_init,
9063         .tfb_tcp_fb_fini = rack_fini,
9064         .tfb_tcp_timer_stop_all = rack_stopall,
9065         .tfb_tcp_timer_activate = rack_timer_activate,
9066         .tfb_tcp_timer_active = rack_timer_active,
9067         .tfb_tcp_timer_stop = rack_timer_stop,
9068         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
9069         .tfb_tcp_handoff_ok = rack_handoff_ok
9070 };
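
/*
 * Illustrative only: once this function block is registered, an individual
 * connection can be switched onto it by name (normally "rack") with the
 * TCP_FUNCTION_BLK socket option, e.g.:
 *
 *      struct tcp_function_set fs;
 *
 *      memset(&fs, 0, sizeof(fs));
 *      strlcpy(fs.function_set_name, "rack", sizeof(fs.function_set_name));
 *      setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &fs, sizeof(fs));
 */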
9071
9072 static const char *rack_stack_names[] = {
9073         __XSTRING(STACKNAME),
9074 #ifdef STACKALIAS
9075         __XSTRING(STACKALIAS),
9076 #endif
9077 };
9078
9079 static int
9080 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
9081 {
9082         memset(mem, 0, size);
9083         return (0);
9084 }
9085
9086 static void
9087 rack_dtor(void *mem, int32_t size, void *arg)
9088 {
9089
9090 }
9091
9092 static bool rack_mod_inited = false;
9093
9094 static int
9095 tcp_addrack(module_t mod, int32_t type, void *data)
9096 {
9097         int32_t err = 0;
9098         int num_stacks;
9099
9100         switch (type) {
9101         case MOD_LOAD:
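                /* Zone for the per-segment entries of the rack send map. */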
9102                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
9103                     sizeof(struct rack_sendmap),
9104                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
9105
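                /* Zone for the per-connection rack control block. */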
9106                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
9107                     sizeof(struct tcp_rack),
9108                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
9109
9110                 sysctl_ctx_init(&rack_sysctl_ctx);
9111                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
9112                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
9113                     OID_AUTO,
9114                     __XSTRING(STACKNAME),
9115                     CTLFLAG_RW, 0,
9116                     "");
9117                 if (rack_sysctl_root == NULL) {
9118                         printf("Failed to add sysctl node\n");
9119                         err = EFAULT;
9120                         goto free_uma;
9121                 }
9122                 rack_init_sysctls();
9123                 num_stacks = nitems(rack_stack_names);
9124                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
9125                     rack_stack_names, &num_stacks);
9126                 if (err) {
9127                         printf("Failed to register %s stack name for "
9128                             "%s module\n", rack_stack_names[num_stacks],
9129                             __XSTRING(MODNAME));
9130                         sysctl_ctx_free(&rack_sysctl_ctx);
9131 free_uma:
9132                         uma_zdestroy(rack_zone);
9133                         uma_zdestroy(rack_pcb_zone);
9134                         rack_counter_destroy();
9135                         printf("Failed to register rack module -- err:%d\n", err);
9136                         return (err);
9137                 }
9138                 rack_mod_inited = true;
9139                 break;
9140         case MOD_QUIESCE:
9141                 err = deregister_tcp_functions(&__tcp_rack, true, false);
9142                 break;
9143         case MOD_UNLOAD:
9144                 err = deregister_tcp_functions(&__tcp_rack, false, true);
9145                 if (err == EBUSY)
9146                         break;
9147                 if (rack_mod_inited) {
9148                         uma_zdestroy(rack_zone);
9149                         uma_zdestroy(rack_pcb_zone);
9150                         sysctl_ctx_free(&rack_sysctl_ctx);
9151                         rack_counter_destroy();
9152                         rack_mod_inited = false;
9153                 }
9154                 err = 0;
9155                 break;
9156         default:
9157                 return (EOPNOTSUPP);
9158         }
9159         return (err);
9160 }
9161
9162 static moduledata_t tcp_rack = {
9163         .name = __XSTRING(MODNAME),
9164         .evhand = tcp_addrack,
9165         .priv = 0
9166 };
9167
9168 MODULE_VERSION(MODNAME, 1);
9169 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
9170 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
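
/*
 * Illustrative only: the module declared above is normally loaded and made
 * the system default from the command line, e.g.:
 *
 *      kldload tcp_rack
 *      sysctl net.inet.tcp.functions_available
 *      sysctl net.inet.tcp.functions_default=rack
 *
 * The tcphpts dependency declared above must be available for the load to
 * succeed.
 */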