1 /*-
2  * Copyright (c) 2016-2018 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34
35 #include <sys/param.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #ifdef TCP_HHOOK
39 #include <sys/hhook.h>
40 #endif
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
44 #include <sys/mutex.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h>           /* for proc0 declaration */
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 #ifdef NETFLIX_STATS
52 #include <sys/stats.h>
53 #endif
54 #include <sys/refcount.h>
55 #include <sys/queue.h>
56 #include <sys/smp.h>
57 #include <sys/kthread.h>
58 #include <sys/kern_prefetch.h>
59
60 #include <vm/uma.h>
61
62 #include <net/route.h>
63 #include <net/vnet.h>
64
65 #define TCPSTATES               /* for logging */
66
67 #include <netinet/in.h>
68 #include <netinet/in_kdtrace.h>
69 #include <netinet/in_pcb.h>
70 #include <netinet/ip.h>
71 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
72 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
73 #include <netinet/ip_var.h>
74 #include <netinet/ip6.h>
75 #include <netinet6/in6_pcb.h>
76 #include <netinet6/ip6_var.h>
77 #include <netinet/tcp.h>
78 #define TCPOUTFLAGS
79 #include <netinet/tcp_fsm.h>
80 #include <netinet/tcp_log_buf.h>
81 #include <netinet/tcp_seq.h>
82 #include <netinet/tcp_timer.h>
83 #include <netinet/tcp_var.h>
84 #include <netinet/tcp_hpts.h>
85 #include <netinet/tcpip.h>
86 #include <netinet/cc/cc.h>
87 #ifdef NETFLIX_CWV
88 #include <netinet/tcp_newcwv.h>
89 #endif
90 #include <netinet/tcp_fastopen.h>
91 #ifdef TCPDEBUG
92 #include <netinet/tcp_debug.h>
93 #endif                          /* TCPDEBUG */
94 #ifdef TCP_OFFLOAD
95 #include <netinet/tcp_offload.h>
96 #endif
97 #ifdef INET6
98 #include <netinet6/tcp6_var.h>
99 #endif
100
101 #include <netipsec/ipsec_support.h>
102
103 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
104 #include <netipsec/ipsec.h>
105 #include <netipsec/ipsec6.h>
106 #endif                          /* IPSEC */
107
108 #include <netinet/udp.h>
109 #include <netinet/udp_var.h>
110 #include <machine/in_cksum.h>
111
112 #ifdef MAC
113 #include <security/mac/mac_framework.h>
114 #endif
115 #include "sack_filter.h"
116 #include "tcp_rack.h"
117 #include "rack_bbr_common.h"
118
119 uma_zone_t rack_zone;
120 uma_zone_t rack_pcb_zone;
121
122 #ifndef TICKS2SBT
123 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
124 #endif
125
126 struct sysctl_ctx_list rack_sysctl_ctx;
127 struct sysctl_oid *rack_sysctl_root;
128
129 #define CUM_ACKED 1
130 #define SACKED 2
131
132 /*
133  * The RACK module incorporates a number of
134  * TCP ideas that have been put out into the IETF
135  * over the last few years:
136  * - Matt Mathis's Rate Halving which slowly drops
137  *    the congestion window so that the ack clock can
138  *    be maintained during a recovery.
139  * - Yuchung Cheng's RACK TCP (for which it is named) that
140  *    will stop us using the number of dup acks and instead
141  *    use time as the gauge of when we retransmit.
142  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
143  *    of Dukkipati et al.
144  * RACK depends on SACK, so if an endpoint arrives that
145  * cannot do SACK the state machine below will shuttle the
146  * connection back to using the "default" TCP stack that is
147  * in FreeBSD.
148  *
149  * To implement RACK the original TCP stack was first decomposed
150  * into a functional state machine with individual states
151  * for each of the possible TCP connection states. The do_segment
152  * function's role in life is to mandate that the connection supports SACK
153  * initially and then to assure that the RACK state matches the connection
154  * state before calling the state's do_segment function. Each
155  * state is simplified due to the fact that the original do_segment
156  * has been decomposed and we *know* what state we are in (no
157  * switches on the state) and all tests for SACK are gone. This
158  * greatly simplifies what each state does.
159  *
160  * TCP output is also overwritten with a new version since it
161  * must maintain the new rack scoreboard.
162  *
163  */
164 static int32_t rack_precache = 1;
165 static int32_t rack_tlp_thresh = 1;
166 static int32_t rack_reorder_thresh = 2;
167 static int32_t rack_reorder_fade = 60000;       /* 0 = never fade; default
168                                                  * 60,000ms (60 seconds) */
169 static int32_t rack_pkt_delay = 1;
170 static int32_t rack_inc_var = 0;/* For TLP */
171 static int32_t rack_reduce_largest_on_idle = 0;
172 static int32_t rack_min_pace_time = 0;
173 static int32_t rack_min_pace_time_seg_req = 6;
174 static int32_t rack_early_recovery = 1;
175 static int32_t rack_early_recovery_max_seg = 6;
176 static int32_t rack_send_a_lot_in_prr = 1;
177 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
178 static int32_t rack_tlp_in_recovery = 1;        /* Can we do TLP in recovery? */
179 static int32_t rack_verbose_logging = 0;
180 static int32_t rack_ignore_data_after_close = 1;
181 /*
182  * Currently regular TCP has an rto_min of 30ms;
183  * the backoff goes 12 times, so that ends up
184  * being a total of 122.850 seconds before a
185  * connection is killed.
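 * (A rough derivation, assuming the RTO doubles on each of the 12
 * backoffs and is never clamped: 30ms * (2^0 + 2^1 + ... + 2^11)
 * = 30ms * 4095 = 122,850ms, i.e. ~122.85 seconds.)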
186  */
187 static int32_t rack_tlp_min = 10;
188 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
189 static int32_t rack_rto_max = 30000;    /* 30 seconds */
190 static const int32_t rack_free_cache = 2;
191 static int32_t rack_hptsi_segments = 40;
192 static int32_t rack_rate_sample_method = USE_RTT_LOW;
193 static int32_t rack_pace_every_seg = 1;
194 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
195 static int32_t rack_slot_reduction = 4;
196 static int32_t rack_lower_cwnd_at_tlp = 0;
197 static int32_t rack_use_proportional_reduce = 0;
198 static int32_t rack_proportional_rate = 10;
199 static int32_t rack_tlp_max_resend = 2;
200 static int32_t rack_limited_retran = 0;
201 static int32_t rack_always_send_oldest = 0;
202 static int32_t rack_sack_block_limit = 128;
203 static int32_t rack_use_sack_filter = 1;
204 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
205 static uint32_t rack_map_split_limit = 0;       /* unlimited by default */
206
207 /* Rack specific counters */
208 counter_u64_t rack_badfr;
209 counter_u64_t rack_badfr_bytes;
210 counter_u64_t rack_rtm_prr_retran;
211 counter_u64_t rack_rtm_prr_newdata;
212 counter_u64_t rack_timestamp_mismatch;
213 counter_u64_t rack_reorder_seen;
214 counter_u64_t rack_paced_segments;
215 counter_u64_t rack_unpaced_segments;
216 counter_u64_t rack_saw_enobuf;
217 counter_u64_t rack_saw_enetunreach;
218
219 /* Tail loss probe counters */
220 counter_u64_t rack_tlp_tot;
221 counter_u64_t rack_tlp_newdata;
222 counter_u64_t rack_tlp_retran;
223 counter_u64_t rack_tlp_retran_bytes;
224 counter_u64_t rack_tlp_retran_fail;
225 counter_u64_t rack_to_tot;
226 counter_u64_t rack_to_arm_rack;
227 counter_u64_t rack_to_arm_tlp;
228 counter_u64_t rack_to_alloc;
229 counter_u64_t rack_to_alloc_hard;
230 counter_u64_t rack_to_alloc_emerg;
231 counter_u64_t rack_alloc_limited_conns;
232 counter_u64_t rack_split_limited;
233
234 counter_u64_t rack_sack_proc_all;
235 counter_u64_t rack_sack_proc_short;
236 counter_u64_t rack_sack_proc_restart;
237 counter_u64_t rack_runt_sacks;
238 counter_u64_t rack_used_tlpmethod;
239 counter_u64_t rack_used_tlpmethod2;
240 counter_u64_t rack_enter_tlp_calc;
241 counter_u64_t rack_input_idle_reduces;
242 counter_u64_t rack_tlp_does_nada;
243
244 /* Temp CPU counters */
245 counter_u64_t rack_find_high;
246
247 counter_u64_t rack_progress_drops;
248 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
249 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
250
251 static void
252 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
253
254 static int
255 rack_process_ack(struct mbuf *m, struct tcphdr *th,
256     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
257     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
258 static int
259 rack_process_data(struct mbuf *m, struct tcphdr *th,
260     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
261     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
262 static void
263 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
264     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
265 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
266 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
267     uint8_t limit_type);
268 static struct rack_sendmap *
269 rack_check_recovery_mode(struct tcpcb *tp,
270     uint32_t tsused);
271 static void
272 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
273     uint32_t type);
274 static void rack_counter_destroy(void);
275 static int
276 rack_ctloutput(struct socket *so, struct sockopt *sopt,
277     struct inpcb *inp, struct tcpcb *tp);
278 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
279 static void
280 rack_do_segment(struct mbuf *m, struct tcphdr *th,
281     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
282     uint8_t iptos);
283 static void rack_dtor(void *mem, int32_t size, void *arg);
284 static void
285 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
286     uint32_t t, uint32_t cts);
287 static struct rack_sendmap *
288 rack_find_high_nonack(struct tcp_rack *rack,
289     struct rack_sendmap *rsm);
290 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
291 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
292 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
293 static int
294 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
295     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
296 static int32_t rack_handoff_ok(struct tcpcb *tp);
297 static int32_t rack_init(struct tcpcb *tp);
298 static void rack_init_sysctls(void);
299 static void
300 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
301     struct tcphdr *th);
302 static void
303 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
304     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
305     uint8_t pass, struct rack_sendmap *hintrsm);
306 static void
307 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
308     struct rack_sendmap *rsm);
309 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
310 static int32_t rack_output(struct tcpcb *tp);
311 static void
312 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
313     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
314     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
315
316 static uint32_t
317 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
318     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
319     uint32_t cts);
320 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
321 static void rack_remxt_tmr(struct tcpcb *tp);
322 static int
323 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
324     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
325 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
326 static int32_t rack_stopall(struct tcpcb *tp);
327 static void
328 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
329     uint32_t delta);
330 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
331 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
332 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
333 static uint32_t
334 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
335     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
336 static void
337 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
338     struct rack_sendmap *rsm, uint32_t ts);
339 static int
340 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
341     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
342 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
343 static void
344 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
345     struct tcpcb *tp, int32_t * ret_val);
346 static int
347 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
348     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
349     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
350 static int
351 rack_do_closing(struct mbuf *m, struct tcphdr *th,
352     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
353     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
354 static void
355 rack_do_drop(struct mbuf *m, struct tcpcb *tp);
356 static void
357 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
358     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
359 static void
360 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
361         struct tcphdr *th, int32_t rstreason, int32_t tlen);
362 static int
363 rack_do_established(struct mbuf *m, struct tcphdr *th,
364     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
365     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
366 static int
367 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
368     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
369     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
370 static int
371 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
372     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
373     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
374 static int
375 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
376     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
377     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
378 static int
379 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
380     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
381     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
382 static int
383 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
384     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
385     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
386 static int
387 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
388     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
389     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
390 static int
391 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
392     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
393     int32_t * drop_hdrlen, int32_t * ret_val);
394 static int
395 rack_process_rst(struct mbuf *m, struct tcphdr *th,
396     struct socket *so, struct tcpcb *tp);
397 struct rack_sendmap *
398 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
399     uint32_t tsused);
400 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
401 static void
402      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
403
404 static int
405 rack_ts_check(struct mbuf *m, struct tcphdr *th,
406     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
407
408 int32_t rack_clear_counter = 0;
409
410
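/*
 * Sysctl handler that zeroes all of the RACK debug counters when the
 * value 1 is written to it.  Typically exercised from userland with
 * something like "sysctl <rack sysctl root>.clear=1"; the exact OID path
 * depends on where rack_sysctl_root is attached.
 */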
411 static int
412 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
413 {
414         uint32_t stat;
415         int32_t error;
416
417         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
418         if (error || req->newptr == NULL)
419                 return error;
420
421         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
422         if (error)
423                 return (error);
424         if (stat == 1) {
425 #ifdef INVARIANTS
426                 printf("Clearing RACK counters\n");
427 #endif
428                 counter_u64_zero(rack_badfr);
429                 counter_u64_zero(rack_badfr_bytes);
430                 counter_u64_zero(rack_rtm_prr_retran);
431                 counter_u64_zero(rack_rtm_prr_newdata);
432                 counter_u64_zero(rack_timestamp_mismatch);
433                 counter_u64_zero(rack_reorder_seen);
434                 counter_u64_zero(rack_tlp_tot);
435                 counter_u64_zero(rack_tlp_newdata);
436                 counter_u64_zero(rack_tlp_retran);
437                 counter_u64_zero(rack_tlp_retran_bytes);
438                 counter_u64_zero(rack_tlp_retran_fail);
439                 counter_u64_zero(rack_to_tot);
440                 counter_u64_zero(rack_to_arm_rack);
441                 counter_u64_zero(rack_to_arm_tlp);
442                 counter_u64_zero(rack_paced_segments);
443                 counter_u64_zero(rack_unpaced_segments);
444                 counter_u64_zero(rack_saw_enobuf);
445                 counter_u64_zero(rack_saw_enetunreach);
446                 counter_u64_zero(rack_to_alloc_hard);
447                 counter_u64_zero(rack_to_alloc_emerg);
448                 counter_u64_zero(rack_sack_proc_all);
449                 counter_u64_zero(rack_sack_proc_short);
450                 counter_u64_zero(rack_sack_proc_restart);
451                 counter_u64_zero(rack_to_alloc);
452                 counter_u64_zero(rack_alloc_limited_conns);
453                 counter_u64_zero(rack_split_limited);
454                 counter_u64_zero(rack_find_high);
455                 counter_u64_zero(rack_runt_sacks);
456                 counter_u64_zero(rack_used_tlpmethod);
457                 counter_u64_zero(rack_used_tlpmethod2);
458                 counter_u64_zero(rack_enter_tlp_calc);
459                 counter_u64_zero(rack_progress_drops);
460                 counter_u64_zero(rack_tlp_does_nada);
461         }
462         rack_clear_counter = 0;
463         return (0);
464 }
465
466
467
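/*
 * Register the RACK sysctl tree: the tunable knobs declared above, a
 * read-only node for each per-stack counter (each counter is allocated
 * here with counter_u64_alloc(M_WAITOK)), the two counter arrays, and
 * the "clear" handler.
 */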
468 static void
469 rack_init_sysctls()
470 {
471         SYSCTL_ADD_S32(&rack_sysctl_ctx,
472             SYSCTL_CHILDREN(rack_sysctl_root),
473             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
474             &rack_rate_sample_method , USE_RTT_LOW,
475             "What method should we use for rate sampling 0=high, 1=low ");
476         SYSCTL_ADD_S32(&rack_sysctl_ctx,
477             SYSCTL_CHILDREN(rack_sysctl_root),
478             OID_AUTO, "data_after_close", CTLFLAG_RW,
479             &rack_ignore_data_after_close, 0,
480             "Do we hold off sending a RST until all pending data is ack'd");
481         SYSCTL_ADD_S32(&rack_sysctl_ctx,
482             SYSCTL_CHILDREN(rack_sysctl_root),
483             OID_AUTO, "tlpmethod", CTLFLAG_RW,
484             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
485             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
486         SYSCTL_ADD_S32(&rack_sysctl_ctx,
487             SYSCTL_CHILDREN(rack_sysctl_root),
488             OID_AUTO, "min_pace_time", CTLFLAG_RW,
489             &rack_min_pace_time, 0,
490             "Should we enforce a minimum pace time of 1ms");
491         SYSCTL_ADD_S32(&rack_sysctl_ctx,
492             SYSCTL_CHILDREN(rack_sysctl_root),
493             OID_AUTO, "min_pace_segs", CTLFLAG_RW,
494             &rack_min_pace_time_seg_req, 6,
495             "How many segments have to be in the len to enforce min-pace-time");
496         SYSCTL_ADD_S32(&rack_sysctl_ctx,
497             SYSCTL_CHILDREN(rack_sysctl_root),
498             OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
499             &rack_reduce_largest_on_idle, 0,
500             "Should we reduce the largest cwnd seen to IW on idle reduction");
501         SYSCTL_ADD_S32(&rack_sysctl_ctx,
502             SYSCTL_CHILDREN(rack_sysctl_root),
503             OID_AUTO, "bb_verbose", CTLFLAG_RW,
504             &rack_verbose_logging, 0,
505             "Should RACK black box logging be verbose");
506         SYSCTL_ADD_S32(&rack_sysctl_ctx,
507             SYSCTL_CHILDREN(rack_sysctl_root),
508             OID_AUTO, "sackfiltering", CTLFLAG_RW,
509             &rack_use_sack_filter, 1,
510             "Do we use sack filtering?");
511         SYSCTL_ADD_S32(&rack_sysctl_ctx,
512             SYSCTL_CHILDREN(rack_sysctl_root),
513             OID_AUTO, "delayed_ack", CTLFLAG_RW,
514             &rack_delayed_ack_time, 200,
515             "Delayed ack time (200ms)");
516         SYSCTL_ADD_S32(&rack_sysctl_ctx,
517             SYSCTL_CHILDREN(rack_sysctl_root),
518             OID_AUTO, "tlpminto", CTLFLAG_RW,
519             &rack_tlp_min, 10,
520             "TLP minimum timeout per the specification (10ms)");
521         SYSCTL_ADD_S32(&rack_sysctl_ctx,
522             SYSCTL_CHILDREN(rack_sysctl_root),
523             OID_AUTO, "precache", CTLFLAG_RW,
524             &rack_precache, 0,
525             "Where should we precache the mcopy (0 is not at all)");
526         SYSCTL_ADD_S32(&rack_sysctl_ctx,
527             SYSCTL_CHILDREN(rack_sysctl_root),
528             OID_AUTO, "sblklimit", CTLFLAG_RW,
529             &rack_sack_block_limit, 128,
530             "When do we start paying attention to small sack blocks");
531         SYSCTL_ADD_S32(&rack_sysctl_ctx,
532             SYSCTL_CHILDREN(rack_sysctl_root),
533             OID_AUTO, "send_oldest", CTLFLAG_RW,
534             &rack_always_send_oldest, 1,
535             "Should we always send the oldest TLP and RACK-TLP");
536         SYSCTL_ADD_S32(&rack_sysctl_ctx,
537             SYSCTL_CHILDREN(rack_sysctl_root),
538             OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
539             &rack_tlp_in_recovery, 1,
540             "Can we do a TLP during recovery?");
541         SYSCTL_ADD_S32(&rack_sysctl_ctx,
542             SYSCTL_CHILDREN(rack_sysctl_root),
543             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
544             &rack_limited_retran, 0,
545             "How many times can a rack timeout drive out sends");
546         SYSCTL_ADD_S32(&rack_sysctl_ctx,
547             SYSCTL_CHILDREN(rack_sysctl_root),
548             OID_AUTO, "minrto", CTLFLAG_RW,
549             &rack_rto_min, 0,
550             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
551         SYSCTL_ADD_S32(&rack_sysctl_ctx,
552             SYSCTL_CHILDREN(rack_sysctl_root),
553             OID_AUTO, "maxrto", CTLFLAG_RW,
554             &rack_rto_max, 0,
555             "Maximum RTO in ms -- should be at least as large as min_rto");
556         SYSCTL_ADD_S32(&rack_sysctl_ctx,
557             SYSCTL_CHILDREN(rack_sysctl_root),
558             OID_AUTO, "tlp_retry", CTLFLAG_RW,
559             &rack_tlp_max_resend, 2,
560             "How many times does TLP retry a single segment or multiple with no ACK");
561         SYSCTL_ADD_S32(&rack_sysctl_ctx,
562             SYSCTL_CHILDREN(rack_sysctl_root),
563             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
564             &rack_use_proportional_reduce, 0,
565             "Should we proportionally reduce cwnd based on the number of losses");
566         SYSCTL_ADD_S32(&rack_sysctl_ctx,
567             SYSCTL_CHILDREN(rack_sysctl_root),
568             OID_AUTO, "recovery_prop", CTLFLAG_RW,
569             &rack_proportional_rate, 10,
570             "What percent reduction per loss");
571         SYSCTL_ADD_S32(&rack_sysctl_ctx,
572             SYSCTL_CHILDREN(rack_sysctl_root),
573             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
574             &rack_lower_cwnd_at_tlp, 0,
575             "When a TLP completes a retran should we enter recovery?");
576         SYSCTL_ADD_S32(&rack_sysctl_ctx,
577             SYSCTL_CHILDREN(rack_sysctl_root),
578             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
579             &rack_slot_reduction, 4,
580             "When setting a slot should we reduce by divisor");
581         SYSCTL_ADD_S32(&rack_sysctl_ctx,
582             SYSCTL_CHILDREN(rack_sysctl_root),
583             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
584             &rack_pace_every_seg, 1,
585             "Should we pace out every segment hptsi");
586         SYSCTL_ADD_S32(&rack_sysctl_ctx,
587             SYSCTL_CHILDREN(rack_sysctl_root),
588             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
589             &rack_hptsi_segments, 6,
590             "Should we pace out only a limited size of segments");
591         SYSCTL_ADD_S32(&rack_sysctl_ctx,
592             SYSCTL_CHILDREN(rack_sysctl_root),
593             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
594             &rack_send_a_lot_in_prr, 1,
595             "Send a lot in prr");
596         SYSCTL_ADD_S32(&rack_sysctl_ctx,
597             SYSCTL_CHILDREN(rack_sysctl_root),
598             OID_AUTO, "minto", CTLFLAG_RW,
599             &rack_min_to, 1,
600             "Minimum rack timeout in milliseconds");
601         SYSCTL_ADD_S32(&rack_sysctl_ctx,
602             SYSCTL_CHILDREN(rack_sysctl_root),
603             OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
604             &rack_early_recovery_max_seg, 6,
605             "Max segments in early recovery");
606         SYSCTL_ADD_S32(&rack_sysctl_ctx,
607             SYSCTL_CHILDREN(rack_sysctl_root),
608             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
609             &rack_early_recovery, 1,
610             "Do we do early recovery with rack");
611         SYSCTL_ADD_S32(&rack_sysctl_ctx,
612             SYSCTL_CHILDREN(rack_sysctl_root),
613             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
614             &rack_reorder_thresh, 2,
615             "What factor for rack will be added when seeing reordering (shift right)");
616         SYSCTL_ADD_S32(&rack_sysctl_ctx,
617             SYSCTL_CHILDREN(rack_sysctl_root),
618             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
619             &rack_tlp_thresh, 1,
620             "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
621         SYSCTL_ADD_S32(&rack_sysctl_ctx,
622             SYSCTL_CHILDREN(rack_sysctl_root),
623             OID_AUTO, "reorder_fade", CTLFLAG_RW,
624             &rack_reorder_fade, 0,
625             "Does reorder detection fade, if so how many ms (0 means never)");
626         SYSCTL_ADD_S32(&rack_sysctl_ctx,
627             SYSCTL_CHILDREN(rack_sysctl_root),
628             OID_AUTO, "pktdelay", CTLFLAG_RW,
629             &rack_pkt_delay, 1,
630             "Extra RACK time (in ms) besides reordering thresh");
631         SYSCTL_ADD_U32(&rack_sysctl_ctx,
632             SYSCTL_CHILDREN(rack_sysctl_root),
633             OID_AUTO, "split_limit", CTLFLAG_RW,
634             &rack_map_split_limit, 0,
635             "Is there a limit on the number of map split entries (0=unlimited)");
636         SYSCTL_ADD_S32(&rack_sysctl_ctx,
637             SYSCTL_CHILDREN(rack_sysctl_root),
638             OID_AUTO, "inc_var", CTLFLAG_RW,
639             &rack_inc_var, 0,
640             "Should rack add to the TLP timer the variance in rtt calculation");
641         rack_badfr = counter_u64_alloc(M_WAITOK);
642         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
643             SYSCTL_CHILDREN(rack_sysctl_root),
644             OID_AUTO, "badfr", CTLFLAG_RD,
645             &rack_badfr, "Total number of bad FRs");
646         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
647         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
648             SYSCTL_CHILDREN(rack_sysctl_root),
649             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
650             &rack_badfr_bytes, "Total bytes of bad FRs");
651         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
652         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
653             SYSCTL_CHILDREN(rack_sysctl_root),
654             OID_AUTO, "prrsndret", CTLFLAG_RD,
655             &rack_rtm_prr_retran,
656             "Total number of prr based retransmits");
657         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
658         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
659             SYSCTL_CHILDREN(rack_sysctl_root),
660             OID_AUTO, "prrsndnew", CTLFLAG_RD,
661             &rack_rtm_prr_newdata,
662             "Total number of prr based new transmits");
663         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
664         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
665             SYSCTL_CHILDREN(rack_sysctl_root),
666             OID_AUTO, "tsnf", CTLFLAG_RD,
667             &rack_timestamp_mismatch,
668             "Total number of timestamps that we could not find the reported ts");
669         rack_find_high = counter_u64_alloc(M_WAITOK);
670         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
671             SYSCTL_CHILDREN(rack_sysctl_root),
672             OID_AUTO, "findhigh", CTLFLAG_RD,
673             &rack_find_high,
674             "Total number of FIN causing find-high");
675         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
676         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
677             SYSCTL_CHILDREN(rack_sysctl_root),
678             OID_AUTO, "reordering", CTLFLAG_RD,
679             &rack_reorder_seen,
680             "Total number of times we added delay due to reordering");
681         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
682         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
683             SYSCTL_CHILDREN(rack_sysctl_root),
684             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
685             &rack_tlp_tot,
686             "Total number of tail loss probe expirations");
687         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
688         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
689             SYSCTL_CHILDREN(rack_sysctl_root),
690             OID_AUTO, "tlp_new", CTLFLAG_RD,
691             &rack_tlp_newdata,
692             "Total number of tail loss probe sending new data");
693
694         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
695         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
696             SYSCTL_CHILDREN(rack_sysctl_root),
697             OID_AUTO, "tlp_retran", CTLFLAG_RD,
698             &rack_tlp_retran,
699             "Total number of tail loss probe sending retransmitted data");
700         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
701         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
702             SYSCTL_CHILDREN(rack_sysctl_root),
703             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
704             &rack_tlp_retran_bytes,
705             "Total bytes of tail loss probe sending retransmitted data");
706         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
707         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
708             SYSCTL_CHILDREN(rack_sysctl_root),
709             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
710             &rack_tlp_retran_fail,
711             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
712         rack_to_tot = counter_u64_alloc(M_WAITOK);
713         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
714             SYSCTL_CHILDREN(rack_sysctl_root),
715             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
716             &rack_to_tot,
717             "Total number of times the rack timeout expired");
718         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
719         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
720             SYSCTL_CHILDREN(rack_sysctl_root),
721             OID_AUTO, "arm_rack", CTLFLAG_RD,
722             &rack_to_arm_rack,
723             "Total number of times the rack timer was armed");
724         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
725         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
726             SYSCTL_CHILDREN(rack_sysctl_root),
727             OID_AUTO, "arm_tlp", CTLFLAG_RD,
728             &rack_to_arm_tlp,
729             "Total number of times the tlp timer was armed");
730         rack_paced_segments = counter_u64_alloc(M_WAITOK);
731         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
732             SYSCTL_CHILDREN(rack_sysctl_root),
733             OID_AUTO, "paced", CTLFLAG_RD,
734             &rack_paced_segments,
735             "Total number of times a segment send caused hptsi");
736         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
737         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
738             SYSCTL_CHILDREN(rack_sysctl_root),
739             OID_AUTO, "unpaced", CTLFLAG_RD,
740             &rack_unpaced_segments,
741             "Total number of times a segment did not cause hptsi");
742         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
743         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
744             SYSCTL_CHILDREN(rack_sysctl_root),
745             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
746             &rack_saw_enobuf,
747             "Total number of times we saw an ENOBUF on output");
748         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
749         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
750             SYSCTL_CHILDREN(rack_sysctl_root),
751             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
752             &rack_saw_enetunreach,
753             "Total number of times we saw an ENETUNREACH on output");
754         rack_to_alloc = counter_u64_alloc(M_WAITOK);
755         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
756             SYSCTL_CHILDREN(rack_sysctl_root),
757             OID_AUTO, "allocs", CTLFLAG_RD,
758             &rack_to_alloc,
759             "Total allocations of tracking structures");
760         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
761         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
762             SYSCTL_CHILDREN(rack_sysctl_root),
763             OID_AUTO, "allochard", CTLFLAG_RD,
764             &rack_to_alloc_hard,
765             "Total allocations done with sleeping the hard way");
766         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
767         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
768             SYSCTL_CHILDREN(rack_sysctl_root),
769             OID_AUTO, "allocemerg", CTLFLAG_RD,
770             &rack_to_alloc_emerg,
771             "Total allocations done from emergency cache");
772         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
773         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
774             SYSCTL_CHILDREN(rack_sysctl_root),
775             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
776             &rack_alloc_limited_conns,
777             "Connections with allocations dropped due to limit");
778         rack_split_limited = counter_u64_alloc(M_WAITOK);
779         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
780             SYSCTL_CHILDREN(rack_sysctl_root),
781             OID_AUTO, "split_limited", CTLFLAG_RD,
782             &rack_split_limited,
783             "Split allocations dropped due to limit");
784         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
785         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
786             SYSCTL_CHILDREN(rack_sysctl_root),
787             OID_AUTO, "sack_long", CTLFLAG_RD,
788             &rack_sack_proc_all,
789             "Total times we had to walk whole list for sack processing");
790
791         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
792         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
793             SYSCTL_CHILDREN(rack_sysctl_root),
794             OID_AUTO, "sack_restart", CTLFLAG_RD,
795             &rack_sack_proc_restart,
796             "Total times we had to walk whole list due to a restart");
797         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
798         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
799             SYSCTL_CHILDREN(rack_sysctl_root),
800             OID_AUTO, "sack_short", CTLFLAG_RD,
801             &rack_sack_proc_short,
802             "Total times we took shortcut for sack processing");
803         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
804         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
805             SYSCTL_CHILDREN(rack_sysctl_root),
806             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
807             &rack_enter_tlp_calc,
808             "Total times we called calc-tlp");
809         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
810         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
811             SYSCTL_CHILDREN(rack_sysctl_root),
812             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
813             &rack_used_tlpmethod,
814             "Total number of times we used TLP method one");
815         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
816         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
817             SYSCTL_CHILDREN(rack_sysctl_root),
818             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
819             &rack_used_tlpmethod2,
820             "Total number of times we used TLP method two");
821         rack_runt_sacks = counter_u64_alloc(M_WAITOK);
822         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
823             SYSCTL_CHILDREN(rack_sysctl_root),
824             OID_AUTO, "runtsacks", CTLFLAG_RD,
825             &rack_runt_sacks,
826             "Total number of runt sacks");
827         rack_progress_drops = counter_u64_alloc(M_WAITOK);
828         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
829             SYSCTL_CHILDREN(rack_sysctl_root),
830             OID_AUTO, "prog_drops", CTLFLAG_RD,
831             &rack_progress_drops,
832             "Total number of progress drops");
833         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
834         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
835             SYSCTL_CHILDREN(rack_sysctl_root),
836             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
837             &rack_input_idle_reduces,
838             "Total number of idle reductions on input");
839         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
840         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
841             SYSCTL_CHILDREN(rack_sysctl_root),
842             OID_AUTO, "tlp_nada", CTLFLAG_RD,
843             &rack_tlp_does_nada,
844             "Total number of nada tlp calls");
845         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
846         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
847             OID_AUTO, "outsize", CTLFLAG_RD,
848             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
849         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
850         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
851             OID_AUTO, "opts", CTLFLAG_RD,
852             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
853         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
854             SYSCTL_CHILDREN(rack_sysctl_root),
855             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
856             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
857 }
858
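/*
 * Return 1 if more than tp->t_maxunacktime ticks have elapsed since the
 * last time forward progress was made (tp->t_acktime); the caller is
 * expected to drop the connection in that case.
 */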
859 static inline int32_t
860 rack_progress_timeout_check(struct tcpcb *tp)
861 {
862         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
863                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
864                         /*
865                          * There is an assumption that the caller
866                          * will drop the connection so we will
867                          * increment the counters here.
868                          */
869                         struct tcp_rack *rack;
870                         rack = (struct tcp_rack *)tp->t_fb_ptr;
871                         counter_u64_add(rack_progress_drops, 1);
872 #ifdef NETFLIX_STATS
873                         TCPSTAT_INC(tcps_progdrops);
874 #endif
875                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
876                         return (1);
877                 }
878         }
879         return (0);
880 }
881
882
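/*
 * The rack_log_* helpers below feed the TCP black box logger
 * (TCP_LOG_EVENT) with stack-specific state, packing the fields of
 * interest into the flex slots of a tcp_log_stackspecific record.
 * They are all no-ops unless logging is enabled on the connection.
 */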
883 static void
884 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
885 {
886         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
887                 union tcp_log_stackspecific log;
888
889                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
890                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
891                 log.u_bbr.flex2 = to;
892                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
893                 log.u_bbr.flex4 = slot;
894                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
895                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
896                 log.u_bbr.flex8 = which;
897                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
898                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
899                 TCP_LOG_EVENT(rack->rc_tp, NULL,
900                     &rack->rc_inp->inp_socket->so_rcv,
901                     &rack->rc_inp->inp_socket->so_snd,
902                     BBR_LOG_TIMERSTAR, 0,
903                     0, &log, false);
904         }
905 }
906
907 static void
908 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
909 {
910         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
911                 union tcp_log_stackspecific log;
912
913                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
914                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
915                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
916                 log.u_bbr.flex8 = to_num;
917                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
918                 log.u_bbr.flex2 = rack->rc_rack_rtt;
919                 TCP_LOG_EVENT(rack->rc_tp, NULL,
920                     &rack->rc_inp->inp_socket->so_rcv,
921                     &rack->rc_inp->inp_socket->so_snd,
922                     BBR_LOG_RTO, 0,
923                     0, &log, false);
924         }
925 }
926
927 static void
928 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
929     uint32_t o_srtt, uint32_t o_var)
930 {
931         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
932                 union tcp_log_stackspecific log;
933
934                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
935                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
936                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
937                 log.u_bbr.flex1 = t;
938                 log.u_bbr.flex2 = o_srtt;
939                 log.u_bbr.flex3 = o_var;
940                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
941                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;           
942                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
943                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
944                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
945                 TCP_LOG_EVENT(tp, NULL,
946                     &rack->rc_inp->inp_socket->so_rcv,
947                     &rack->rc_inp->inp_socket->so_snd,
948                     BBR_LOG_BBRRTT, 0,
949                     0, &log, false);
950         }
951 }
952
953 static void
954 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
955 {
956         /* 
957          * Log the rtt sample we are
958          * applying to the srtt algorithm in
959          * useconds.
960          */
961         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
962                 union tcp_log_stackspecific log;
963                 struct timeval tv;
964
                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
965                 /* Convert our ms to microseconds */
966                 log.u_bbr.flex1 = rtt * 1000;
967                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
968                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
969                     &rack->rc_inp->inp_socket->so_rcv,
970                     &rack->rc_inp->inp_socket->so_snd,
971                     TCP_LOG_RTT, 0,
972                     0, &log, false, &tv);
973         }
974 }
975
976
977 static inline void
978 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
979 {
980         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
981                 union tcp_log_stackspecific log;
982
983                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
984                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
985                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
986                 log.u_bbr.flex1 = line;
987                 log.u_bbr.flex2 = tick;
988                 log.u_bbr.flex3 = tp->t_maxunacktime;
989                 log.u_bbr.flex4 = tp->t_acktime;
990                 log.u_bbr.flex8 = event;
991                 TCP_LOG_EVENT(tp, NULL,
992                     &rack->rc_inp->inp_socket->so_rcv,
993                     &rack->rc_inp->inp_socket->so_snd,
994                     BBR_LOG_PROGRESS, 0,
995                     0, &log, false);
996         }
997 }
998
999 static void
1000 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1001 {
1002         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1003                 union tcp_log_stackspecific log;
1004
1005                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1006                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1007                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1008                 log.u_bbr.flex1 = slot;
1009                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1010                 log.u_bbr.flex8 = rack->rc_in_persist;
1011                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1012                     &rack->rc_inp->inp_socket->so_rcv,
1013                     &rack->rc_inp->inp_socket->so_snd,
1014                     BBR_LOG_BBRSND, 0,
1015                     0, &log, false);
1016         }
1017 }
1018
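/*
 * Log the completion of a do-segment pass: whether we produced output
 * (did_out), whether another packet is pending (nxt_pkt), and which exit
 * path was taken (way_out).
 */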
1019 static void
1020 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1021 {
1022         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1023                 union tcp_log_stackspecific log;

                     memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1024                 log.u_bbr.flex1 = did_out;
1025                 log.u_bbr.flex2 = nxt_pkt;
1026                 log.u_bbr.flex3 = way_out;
1027                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1028                 log.u_bbr.flex7 = rack->r_wanted_output;
1029                 log.u_bbr.flex8 = rack->rc_in_persist;
1030                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1031                     &rack->rc_inp->inp_socket->so_rcv,
1032                     &rack->rc_inp->inp_socket->so_snd,
1033                     BBR_LOG_DOSEG_DONE, 0,
1034                     0, &log, false);
1035         }
1036 }
1037
1038
1039 static void
1040 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1041 {
1042         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1043                 union tcp_log_stackspecific log;
1044
1045                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1046                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1047                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1048                 log.u_bbr.flex1 = slot;
1049                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1050                 log.u_bbr.flex7 = hpts_calling;
1051                 log.u_bbr.flex8 = rack->rc_in_persist;
1052                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1053                     &rack->rc_inp->inp_socket->so_rcv,
1054                     &rack->rc_inp->inp_socket->so_snd,
1055                     BBR_LOG_JUSTRET, 0,
1056                     tlen, &log, false);
1057         }
1058 }
1059
1060 static void
1061 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1062 {
1063         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1064                 union tcp_log_stackspecific log;
1065
1066                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1067                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1068                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1069                 log.u_bbr.flex1 = line;
1070                 log.u_bbr.flex2 = 0;
1071                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1072                 log.u_bbr.flex4 = 0;
1073                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1074                 log.u_bbr.flex8 = hpts_removed;
1075                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1076                     &rack->rc_inp->inp_socket->so_rcv,
1077                     &rack->rc_inp->inp_socket->so_snd,
1078                     BBR_LOG_TIMERCANC, 0,
1079                     0, &log, false);
1080         }
1081 }
1082
1083 static void
1084 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1085 {
1086         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1087                 union tcp_log_stackspecific log;
1088
1089                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1090                 log.u_bbr.flex1 = timers;
1091                 log.u_bbr.flex2 = ret;
1092                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1093                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1094                 log.u_bbr.flex5 = cts;
1095                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1096                     &rack->rc_inp->inp_socket->so_rcv,
1097                     &rack->rc_inp->inp_socket->so_snd,
1098                     BBR_LOG_TO_PROCESS, 0,
1099                     0, &log, false);
1100         }
1101 }
1102
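/* Release the counters and counter arrays allocated in rack_init_sysctls(). */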
1103 static void
1104 rack_counter_destroy()
1105 {
1106         counter_u64_free(rack_badfr);
1107         counter_u64_free(rack_badfr_bytes);
1108         counter_u64_free(rack_rtm_prr_retran);
1109         counter_u64_free(rack_rtm_prr_newdata);
1110         counter_u64_free(rack_timestamp_mismatch);
1111         counter_u64_free(rack_reorder_seen);
1112         counter_u64_free(rack_tlp_tot);
1113         counter_u64_free(rack_tlp_newdata);
1114         counter_u64_free(rack_tlp_retran);
1115         counter_u64_free(rack_tlp_retran_bytes);
1116         counter_u64_free(rack_tlp_retran_fail);
1117         counter_u64_free(rack_to_tot);
1118         counter_u64_free(rack_to_arm_rack);
1119         counter_u64_free(rack_to_arm_tlp);
1120         counter_u64_free(rack_paced_segments);
1121         counter_u64_free(rack_unpaced_segments);
1122         counter_u64_free(rack_saw_enobuf);
1123         counter_u64_free(rack_saw_enetunreach);
1124         counter_u64_free(rack_to_alloc_hard);
1125         counter_u64_free(rack_to_alloc_emerg);
1126         counter_u64_free(rack_sack_proc_all);
1127         counter_u64_free(rack_sack_proc_short);
1128         counter_u64_free(rack_sack_proc_restart);
1129         counter_u64_free(rack_to_alloc);
         counter_u64_free(rack_alloc_limited_conns);
         counter_u64_free(rack_split_limited);
1130         counter_u64_free(rack_find_high);
1131         counter_u64_free(rack_runt_sacks);
1132         counter_u64_free(rack_enter_tlp_calc);
1133         counter_u64_free(rack_used_tlpmethod);
1134         counter_u64_free(rack_used_tlpmethod2);
1135         counter_u64_free(rack_progress_drops);
1136         counter_u64_free(rack_input_idle_reduces);
1137         counter_u64_free(rack_tlp_does_nada);
1138         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1139         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1140 }
1141
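/*
 * Allocate a sendmap entry.  Try the UMA zone first (M_NOWAIT); if that
 * fails, fall back to the small per-connection free cache that rack_free()
 * maintains.  Returns NULL only when both sources are exhausted.
 */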
1142 static struct rack_sendmap *
1143 rack_alloc(struct tcp_rack *rack)
1144 {
1145         struct rack_sendmap *rsm;
1146
1147         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1148         if (rsm) {
1149 alloc_done:
1150                 counter_u64_add(rack_to_alloc, 1);
1151                 rack->r_ctl.rc_num_maps_alloced++;
1152                 return (rsm);
1153         }
1154         if (rack->rc_free_cnt) {
1155                 counter_u64_add(rack_to_alloc_emerg, 1);
1156                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1157                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
1158                 rack->rc_free_cnt--;
1159                 goto alloc_done;
1160         }
1161         return (NULL);
1162 }
1163
1164 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1165 static struct rack_sendmap *
1166 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1167 {
1168         struct rack_sendmap *rsm;
1169
1170         if (limit_type) {
1171                 /* currently there is only one limit type */
1172                 if (rack_map_split_limit > 0 &&
1173                     rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
1174                         counter_u64_add(rack_split_limited, 1);
1175                         if (!rack->alloc_limit_reported) {
1176                                 rack->alloc_limit_reported = 1;
1177                                 counter_u64_add(rack_alloc_limited_conns, 1);
1178                         }
1179                         return (NULL);
1180                 }
1181         }
1182
1183         /* allocate and mark in the limit type, if set */
1184         rsm = rack_alloc(rack);
1185         if (rsm != NULL && limit_type) {
1186                 rsm->r_limit_type = limit_type;
1187                 rack->r_ctl.rc_num_split_allocs++;
1188         }
1189         return (rsm);
1190 }
1191
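/*
 * Return a sendmap entry.  Any control-block pointers that still reference
 * it are cleared, and up to rack_free_cache entries are recycled onto the
 * connection's free list instead of going back to the UMA zone.
 */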
1192 static void
1193 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1194 {
1195         if (rsm->r_limit_type) {
1196                 /* currently there is only one limit type */
1197                 rack->r_ctl.rc_num_split_allocs--;
1198         }
1199         rack->r_ctl.rc_num_maps_alloced--;
1200         if (rack->r_ctl.rc_tlpsend == rsm)
1201                 rack->r_ctl.rc_tlpsend = NULL;
1202         if (rack->r_ctl.rc_next == rsm)
1203                 rack->r_ctl.rc_next = NULL;
1204         if (rack->r_ctl.rc_sacklast == rsm)
1205                 rack->r_ctl.rc_sacklast = NULL;
1206         if (rack->rc_free_cnt < rack_free_cache) {
1207                 memset(rsm, 0, sizeof(struct rack_sendmap));
1208                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
1209                 rack->rc_free_cnt++;
1210                 return;
1211         }
1212         uma_zfree(rack_zone, rsm);
1213 }
1214
1215 /*
1216  * CC wrapper hook functions
1217  */
1218 static void
1219 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1220     uint16_t type, int32_t recovery)
1221 {
1222 #ifdef NETFLIX_STATS
1223         int32_t gput;
1224 #endif
1225 #ifdef NETFLIX_CWV
1226         u_long old_cwnd = tp->snd_cwnd;
1227 #endif
1228
1229         INP_WLOCK_ASSERT(tp->t_inpcb);
1230         tp->ccv->nsegs = nsegs;
1231         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
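        /*
         * In recovery (with early recovery segments configured), clamp the
         * bytes credited for this ack to rc_early_recovery_segs segments
         * before handing the ack to the congestion control module.
         */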
1232         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1233                 uint32_t max;
1234
1235                 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
1236                 if (tp->ccv->bytes_this_ack > max) {
1237                         tp->ccv->bytes_this_ack = max;
1238                 }
1239         }
1240         if (tp->snd_cwnd <= tp->snd_wnd)
1241                 tp->ccv->flags |= CCF_CWND_LIMITED;
1242         else
1243                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1244
1245         if (type == CC_ACK) {
1246 #ifdef NETFLIX_STATS
1247                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1248                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1249                 if ((tp->t_flags & TF_GPUTINPROG) &&
1250                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1251                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1252                             max(1, tcp_ts_getticks() - tp->gput_ts);
1253                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1254                             gput);
1255                         /*
1256                          * XXXLAS: This is a temporary hack, and should be
1257                          * chained off VOI_TCP_GPUT when stats(9) grows an
1258                          * API to deal with chained VOIs.
1259                          */
1260                         if (tp->t_stats_gput_prev > 0)
1261                                 stats_voi_update_abs_s32(tp->t_stats,
1262                                     VOI_TCP_GPUT_ND,
1263                                     ((gput - tp->t_stats_gput_prev) * 100) /
1264                                     tp->t_stats_gput_prev);
1265                         tp->t_flags &= ~TF_GPUTINPROG;
1266                         tp->t_stats_gput_prev = gput;
1267 #ifdef NETFLIX_CWV
1268                         if (tp->t_maxpeakrate) {
1269                                 /*
1270                                  * We update t_peakrate_thr. This gives us roughly
1271                                  * one update per round trip time.
1272                                  */
1273                                 tcp_update_peakrate_thr(tp);
1274                         }
1275 #endif
1276                 }
1277 #endif
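                     /*
                      * In congestion avoidance (cwnd above ssthresh) we accumulate
                      * the newly acked bytes, limited to V_tcp_abc_l_var segments
                      * per ACK; once a full cwnd worth has been acked we set
                      * CCF_ABC_SENTAWND so the CC module can see that a whole
                      * window has been acknowledged, and start counting again.
                      */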
1278                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1279                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1280                             nsegs * V_tcp_abc_l_var * tp->t_maxseg);
1281                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1282                                 tp->t_bytes_acked -= tp->snd_cwnd;
1283                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1284                         }
1285                 } else {
1286                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1287                         tp->t_bytes_acked = 0;
1288                 }
1289         }
1290         if (CC_ALGO(tp)->ack_received != NULL) {
1291                 /* XXXLAS: Find a way to live without this */
1292                 tp->ccv->curack = th->th_ack;
1293                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1294         }
1295 #ifdef NETFLIX_STATS
1296         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1297 #endif
1298         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1299                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1300         }
1301 #ifdef NETFLIX_CWV
1302         if (tp->cwv_enabled) {
1303                 /*
1304                  * Per RFC 7661: The behaviour in the non-validated phase is
1305                  * specified as: o  A sender determines whether to increase
1306                  * the cwnd based upon whether it is cwnd-limited (see
1307                  * Section 4.5.3): * A sender that is cwnd-limited MAY use
1308                  * the standard TCP method to increase cwnd (i.e., the
1309                  * standard method permits a TCP sender that fully utilises
1310                  * the cwnd to increase the cwnd each time it receives an
1311                  * ACK). * A sender that is not cwnd-limited MUST NOT
1312                  * increase the cwnd when ACK packets are received in this
1313                  * phase (i.e., needs to avoid growing the cwnd when it has
1314                  * not recently sent using the current size of cwnd).
1315                  */
1316                 if ((tp->snd_cwnd > old_cwnd) &&
1317                     (tp->cwv_cwnd_valid == 0) &&
1318                     (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
1319                         tp->snd_cwnd = old_cwnd;
1320                 }
1321                 /* Try to update pipeAck and NCWV state */
1322                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1323                     !IN_RECOVERY(tp->t_flags)) {
1324                         uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
1325
1326                         tcp_newcwv_update_pipeack(tp, data);
1327                 }
1328         }
1329         /* we enforce max peak rate if it is set. */
1330         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1331                 tp->snd_cwnd = tp->t_peakrate_thr;
1332         }
1333 #endif
1334 }
1335
1336 static void
1337 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1338 {
1339         struct tcp_rack *rack;
1340
1341         rack = (struct tcp_rack *)tp->t_fb_ptr;
1342         INP_WLOCK_ASSERT(tp->t_inpcb);
1343         if (rack->r_ctl.rc_prr_sndcnt > 0)
1344                 rack->r_wanted_output++;
1345 }
1346
1347 static void
1348 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1349 {
1350         struct tcp_rack *rack;
1351
1352         INP_WLOCK_ASSERT(tp->t_inpcb);
1353         rack = (struct tcp_rack *)tp->t_fb_ptr;
1354         if (CC_ALGO(tp)->post_recovery != NULL) {
1355                 tp->ccv->curack = th->th_ack;
1356                 CC_ALGO(tp)->post_recovery(tp->ccv);
1357         }
1358         /*
1359          * Here we can in theory adjust cwnd to be based on the number of
1360          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1361          * based on the rack_use_proportional flag.
1362          */
1363         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1364                 int32_t reduce;
1365
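                     /*
                      * reduce is the percentage of cwnd to shed: the number of
                      * losses in the window times the configured proportional
                      * rate, capped at 50%.
                      */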
1366                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1367                 if (reduce > 50) {
1368                         reduce = 50;
1369                 }
1370                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1371         } else {
1372                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1373                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1374                         tp->snd_cwnd = tp->snd_ssthresh;
1375                 }
1376         }
1377         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1378                 /* Suck the next prr cnt back into cwnd */
1379                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1380                 rack->r_ctl.rc_prr_sndcnt = 0;
1381         }
1382         EXIT_RECOVERY(tp->t_flags);
1383
1384
1385 #ifdef NETFLIX_CWV
1386         if (tp->cwv_enabled) {
1387                 if ((tp->cwv_cwnd_valid == 0) &&
1388                     (tp->snd_cwv.in_recovery))
1389                         tcp_newcwv_end_recovery(tp);
1390         }
1391 #endif
1392 }
1393
1394 static void
1395 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1396 {
1397         struct tcp_rack *rack;
1398
1399         INP_WLOCK_ASSERT(tp->t_inpcb);
1400
1401         rack = (struct tcp_rack *)tp->t_fb_ptr;
1402         switch (type) {
1403         case CC_NDUPACK:
1404 /*              rack->r_ctl.rc_ssthresh_set = 1;*/
1405                 if (!IN_FASTRECOVERY(tp->t_flags)) {
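                             /*
                              * First entry into fast recovery: clear the PRR and
                              * loss accounting, seed rc_prr_sndcnt with one
                              * segment, record the flight size at entry and mark
                              * the recovery point at snd_max.
                              */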
1406                         rack->r_ctl.rc_tlp_rtx_out = 0;
1407                         rack->r_ctl.rc_prr_delivered = 0;
1408                         rack->r_ctl.rc_prr_out = 0;
1409                         rack->r_ctl.rc_loss_count = 0;
1410                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
1411                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1412                         tp->snd_recover = tp->snd_max;
1413                         if (tp->t_flags & TF_ECN_PERMIT)
1414                                 tp->t_flags |= TF_ECN_SND_CWR;
1415                 }
1416                 break;
1417         case CC_ECN:
1418                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1419                         TCPSTAT_INC(tcps_ecn_rcwnd);
1420                         tp->snd_recover = tp->snd_max;
1421                         if (tp->t_flags & TF_ECN_PERMIT)
1422                                 tp->t_flags |= TF_ECN_SND_CWR;
1423                 }
1424                 break;
1425         case CC_RTO:
1426                 tp->t_dupacks = 0;
1427                 tp->t_bytes_acked = 0;
1428                 EXIT_RECOVERY(tp->t_flags);
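                     /*
                      * On RTO: ssthresh becomes half of the smaller of snd_wnd
                      * and snd_cwnd, rounded down to a multiple of t_maxseg but
                      * never less than two segments, and cwnd collapses to a
                      * single segment.
                      */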
1429                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1430                     tp->t_maxseg) * tp->t_maxseg;
1431                 tp->snd_cwnd = tp->t_maxseg;
1432                 break;
1433         case CC_RTO_ERR:
1434                 TCPSTAT_INC(tcps_sndrexmitbad);
1435                 /* RTO was unnecessary, so reset everything. */
1436                 tp->snd_cwnd = tp->snd_cwnd_prev;
1437                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1438                 tp->snd_recover = tp->snd_recover_prev;
1439                 if (tp->t_flags & TF_WASFRECOVERY)
1440                         ENTER_FASTRECOVERY(tp->t_flags);
1441                 if (tp->t_flags & TF_WASCRECOVERY)
1442                         ENTER_CONGRECOVERY(tp->t_flags);
1443                 tp->snd_nxt = tp->snd_max;
1444                 tp->t_badrxtwin = 0;
1445                 break;
1446         }
1447
1448         if (CC_ALGO(tp)->cong_signal != NULL) {
1449                 if (th != NULL)
1450                         tp->ccv->curack = th->th_ack;
1451                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1452         }
1453 #ifdef NETFLIX_CWV
1454         if (tp->cwv_enabled) {
1455                 if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
1456                         tcp_newcwv_enter_recovery(tp);
1457                 }
1458                 if (type == CC_RTO) {
1459                         tcp_newcwv_reset(tp);
1460                 }
1461         }
1462 #endif
1463 }
1464
1465
1466
1467 static inline void
1468 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
1469 {
1470         uint32_t i_cwnd;
1471
1472         INP_WLOCK_ASSERT(tp->t_inpcb);
1473
1474 #ifdef NETFLIX_STATS
1475         TCPSTAT_INC(tcps_idle_restarts);
1476         if (tp->t_state == TCPS_ESTABLISHED)
1477                 TCPSTAT_INC(tcps_idle_estrestarts);
1478 #endif
1479         if (CC_ALGO(tp)->after_idle != NULL)
1480                 CC_ALGO(tp)->after_idle(tp->ccv);
1481
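             /*
              * Pick the restart window: one segment if the SYN(-ACK) was lost,
              * the initcwnd_segments sysctl value if it is set, the RFC 3390
              * value if that is enabled, otherwise the RFC 5681 schedule below.
              */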
1482         if (tp->snd_cwnd == 1)
1483                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
1484         else if (V_tcp_initcwnd_segments)
1485                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
1486                     max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
1487         else if (V_tcp_do_rfc3390)
1488                 i_cwnd = min(4 * tp->t_maxseg,
1489                     max(2 * tp->t_maxseg, 4380));
1490         else {
1491                 /* Per RFC5681 Section 3.1 */
1492                 if (tp->t_maxseg > 2190)
1493                         i_cwnd = 2 * tp->t_maxseg;
1494                 else if (tp->t_maxseg > 1095)
1495                         i_cwnd = 3 * tp->t_maxseg;
1496                 else
1497                         i_cwnd = 4 * tp->t_maxseg;
1498         }
1499         if (reduce_largest) {
1500                 /*
1501                  * Do we reduce the largest cwnd to make rack
1502                  * play nicely, hptsi-wise, on restart?
1503                  */
1504                 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
1505                         ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
1506         }
1507         /*
1508          * Being idle is no different than the initial window. If the cc
1509          * clamps it down below the initial window, raise it to the initial
1510          * window.
1511          */
1512         if (tp->snd_cwnd < i_cwnd) {
1513                 tp->snd_cwnd = i_cwnd;
1514         }
1515 }
1516
1517
1518 /*
1519  * Indicate whether this ack should be delayed.  We can delay the ack if
1520  * the following conditions are met:
1521  *      - There is no delayed ack timer in progress.
1522  *      - Our last ack wasn't a 0-sized window. We never want to delay
1523  *        the ack that opens up a 0-sized window.
1524  *      - LRO wasn't used for this segment. We make sure by checking that the
1525  *        segment size is not larger than the MSS.
1526  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1527  *        connection.
1528  */
1529 #define DELAY_ACK(tp, tlen)                      \
1530         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1531         ((tp->t_flags & TF_DELACK) == 0) &&      \
1532         (tlen <= tp->t_maxseg) &&                \
1533         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
1534
1535 static inline void
1536 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
1537 {
1538         int32_t win;
1539
1540         /*
1541          * Calculate amount of space in receive window, and then do TCP
1542          * input processing. Receive window is amount of space in rcv queue,
1543          * but not less than advertised window.
1544          */
1545         win = sbspace(&so->so_rcv);
1546         if (win < 0)
1547                 win = 0;
1548         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1549 }
1550
1551 static void
1552 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
1553 {
1554         /*
1555          * Drop space held by incoming segment and return.
1556          */
1557         if (tp != NULL)
1558                 INP_WUNLOCK(tp->t_inpcb);
1559         if (m)
1560                 m_freem(m);
1561 }
1562
1563 static void
1564 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
1565     int32_t rstreason, int32_t tlen)
1566 {
1567         if (tp != NULL) {
1568                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1569                 INP_WUNLOCK(tp->t_inpcb);
1570         } else
1571                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1572 }
1573
1574 /*
1575  * The value in ret_val informs the caller
1576  * if we dropped the tcb (and lock) or not.
1577  * 1 = we dropped it, 0 = the TCB is still locked
1578  * and valid.
1579  */
1580 static void
1581 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
1582 {
1583         /*
1584          * Generate an ACK dropping incoming segment if it occupies sequence
1585          * space, where the ACK reflects our state.
1586          *
1587          * We can now skip the test for the RST flag since all paths to this
1588          * code happen after packets containing RST have been dropped.
1589          *
1590          * In the SYN-RECEIVED state, don't send an ACK unless the segment
1591          * we received passes the SYN-RECEIVED ACK test. If it fails send a
1592          * RST.  This breaks the loop in the "LAND" DoS attack, and also
1593          * prevents an ACK storm between two listening ports that have been
1594          * sent forged SYN segments, each with the source address of the
1595          * other.
1596          */
1597         struct tcp_rack *rack;
1598
1599         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1600             (SEQ_GT(tp->snd_una, th->th_ack) ||
1601             SEQ_GT(th->th_ack, tp->snd_max))) {
1602                 *ret_val = 1;
1603                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
1604                 return;
1605         } else
1606                 *ret_val = 0;
1607         rack = (struct tcp_rack *)tp->t_fb_ptr;
1608         rack->r_wanted_output++;
1609         tp->t_flags |= TF_ACKNOW;
1610         if (m)
1611                 m_freem(m);
1612 }
1613
1614
1615 static int
1616 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
1617 {
1618         /*
1619          * RFC5961 Section 3.2
1620          *
1621          * - RST drops connection only if SEG.SEQ == RCV.NXT.
1622          * - If RST is in window, we send challenge ACK.
1623          *
1624          * Note: to take into account delayed ACKs, we should test against
1625          * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
1626          * of closed window, not covered by the RFC.
1627          */
1628         int dropped = 0;
1629
1630         if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
1631             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1632             (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1633
1634                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1635                 KASSERT(tp->t_state != TCPS_SYN_SENT,
1636                     ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
1637                     __func__, th, tp));
1638
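                     /*
                      * Only drop the connection when insecure RST processing is
                      * enabled or the sequence number exactly matches what we
                      * expect (last_ack_sent, rcv_nxt or last_ack_sent - 1); any
                      * other in-window RST draws a challenge ACK below.
                      */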
1639                 if (V_tcp_insecure_rst ||
1640                     (tp->last_ack_sent == th->th_seq) ||
1641                     (tp->rcv_nxt == th->th_seq) ||
1642                     ((tp->last_ack_sent - 1) == th->th_seq)) {
1643                         TCPSTAT_INC(tcps_drops);
1644                         /* Drop the connection. */
1645                         switch (tp->t_state) {
1646                         case TCPS_SYN_RECEIVED:
1647                                 so->so_error = ECONNREFUSED;
1648                                 goto close;
1649                         case TCPS_ESTABLISHED:
1650                         case TCPS_FIN_WAIT_1:
1651                         case TCPS_FIN_WAIT_2:
1652                         case TCPS_CLOSE_WAIT:
1653                         case TCPS_CLOSING:
1654                         case TCPS_LAST_ACK:
1655                                 so->so_error = ECONNRESET;
1656                 close:
1657                                 tcp_state_change(tp, TCPS_CLOSED);
1658                                 /* FALLTHROUGH */
1659                         default:
1660                                 tp = tcp_close(tp);
1661                         }
1662                         dropped = 1;
1663                         rack_do_drop(m, tp);
1664                 } else {
1665                         TCPSTAT_INC(tcps_badrst);
1666                         /* Send challenge ACK. */
1667                         tcp_respond(tp, mtod(m, void *), th, m,
1668                             tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1669                         tp->last_ack_sent = tp->rcv_nxt;
1670                 }
1671         } else {
1672                 m_freem(m);
1673         }
1674         return (dropped);
1675 }
1676
1677 /*
1678  * The value in ret_val informs the caller
1679  * if we dropped the tcb (and lock) or not.
1680  * 1 = we dropped it, 0 = the TCB is still locked
1681  * and valid.
1682  */
1683 static void
1684 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
1685 {
1686         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1687
1688         TCPSTAT_INC(tcps_badsyn);
1689         if (V_tcp_insecure_syn &&
1690             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1691             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1692                 tp = tcp_drop(tp, ECONNRESET);
1693                 *ret_val = 1;
1694                 rack_do_drop(m, tp);
1695         } else {
1696                 /* Send challenge ACK. */
1697                 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
1698                     tp->snd_nxt, TH_ACK);
1699                 tp->last_ack_sent = tp->rcv_nxt;
1700                 m = NULL;
1701                 *ret_val = 0;
1702                 rack_do_drop(m, NULL);
1703         }
1704 }
1705
1706 /*
1707  * rack_ts_check returns 1 if you should not proceed. It places
1708  * in ret_val what the caller should return (1/0). A 1 indicates
1709  * that the TCB is unlocked and probably dropped. A 0 indicates the
1710  * TCB is still valid and locked.
1711  */
1712 static int
1713 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
1714 {
1715
1716         /* Check to see if ts_recent is over 24 days old.  */
1717         if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1718                 /*
1719                  * Invalidate ts_recent.  If this segment updates ts_recent,
1720                  * the age will be reset later and ts_recent will get a
1721                  * valid value.  If it does not, setting ts_recent to zero
1722                  * will at least satisfy the requirement that zero be placed
1723                  * in the timestamp echo reply when ts_recent isn't valid.
1724                  * The age isn't reset until we get a valid ts_recent
1725                  * because we don't want out-of-order segments to be dropped
1726                  * when ts_recent is old.
1727                  */
1728                 tp->ts_recent = 0;
1729         } else {
1730                 TCPSTAT_INC(tcps_rcvduppack);
1731                 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1732                 TCPSTAT_INC(tcps_pawsdrop);
1733                 *ret_val = 0;
1734                 if (tlen) {
1735                         rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1736                 } else {
1737                         rack_do_drop(m, NULL);
1738                 }
1739                 return (1);
1740         }
1741         return (0);
1742 }
1743
1744 /*
1745  * rack_drop_checks returns 1 if you should not proceed. It places
1746  * in ret_val what the caller should return (1/0). A 1 indicates
1747  * that the TCB is unlocked and probably dropped. A 0 indicates the
1748  * TCB is still valid and locked.
1749  */
1750 static int
1751 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
1752 {
1753         int32_t todrop;
1754         int32_t thflags;
1755         int32_t tlen;
1756
1757         thflags = *thf;
1758         tlen = *tlenp;
1759         todrop = tp->rcv_nxt - th->th_seq;
1760         if (todrop > 0) {
1761                 if (thflags & TH_SYN) {
1762                         thflags &= ~TH_SYN;
1763                         th->th_seq++;
1764                         if (th->th_urp > 1)
1765                                 th->th_urp--;
1766                         else
1767                                 thflags &= ~TH_URG;
1768                         todrop--;
1769                 }
1770                 /*
1771                  * Following if statement from Stevens, vol. 2, p. 960.
1772                  */
1773                 if (todrop > tlen
1774                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1775                         /*
1776                          * Any valid FIN must be to the left of the window.
1777                          * At this point the FIN must be a duplicate or out
1778                          * of sequence; drop it.
1779                          */
1780                         thflags &= ~TH_FIN;
1781                         /*
1782                          * Send an ACK to resynchronize and drop any data.
1783                          * But keep on processing for RST or ACK.
1784                          */
1785                         tp->t_flags |= TF_ACKNOW;
1786                         todrop = tlen;
1787                         TCPSTAT_INC(tcps_rcvduppack);
1788                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1789                 } else {
1790                         TCPSTAT_INC(tcps_rcvpartduppack);
1791                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1792                 }
1793                 /*
1794                  * DSACK - add SACK block for dropped range
1795                  */
1796                 if (tp->t_flags & TF_SACK_PERMIT) {
1797                         tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
1798                         /*
1799                          * ACK now, as the next in-sequence segment
1800                          * will clear the DSACK block again
1801                          */
1802                         tp->t_flags |= TF_ACKNOW;
1803                 }
1804                 *drop_hdrlen += todrop; /* drop from the top afterwards */
1805                 th->th_seq += todrop;
1806                 tlen -= todrop;
1807                 if (th->th_urp > todrop)
1808                         th->th_urp -= todrop;
1809                 else {
1810                         thflags &= ~TH_URG;
1811                         th->th_urp = 0;
1812                 }
1813         }
1814         /*
1815          * If segment ends after window, drop trailing data (and PUSH and
1816          * FIN); if nothing left, just ACK.
1817          */
1818         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1819         if (todrop > 0) {
1820                 TCPSTAT_INC(tcps_rcvpackafterwin);
1821                 if (todrop >= tlen) {
1822                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1823                         /*
1824                          * If window is closed can only take segments at
1825                          * window edge, and have to drop data and PUSH from
1826                          * incoming segments.  Continue processing, but
1827                          * remember to ack.  Otherwise, drop segment and
1828                          * ack.
1829                          */
1830                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1831                                 tp->t_flags |= TF_ACKNOW;
1832                                 TCPSTAT_INC(tcps_rcvwinprobe);
1833                         } else {
1834                                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1835                                 return (1);
1836                         }
1837                 } else
1838                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1839                 m_adj(m, -todrop);
1840                 tlen -= todrop;
1841                 thflags &= ~(TH_PUSH | TH_FIN);
1842         }
1843         *thf = thflags;
1844         *tlenp = tlen;
1845         return (0);
1846 }
1847
1848 static struct rack_sendmap *
1849 rack_find_lowest_rsm(struct tcp_rack *rack)
1850 {
1851         struct rack_sendmap *rsm;
1852
1853         /*
1854          * Walk the time-order transmitted list looking for an rsm that is
1855          * not acked. This will be the one that was sent the longest time
1856          * ago that is still outstanding.
1857          */
1858         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1859                 if (rsm->r_flags & RACK_ACKED) {
1860                         continue;
1861                 }
1862                 goto finish;
1863         }
1864 finish:
1865         return (rsm);
1866 }
1867
1868 static struct rack_sendmap *
1869 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1870 {
1871         struct rack_sendmap *prsm;
1872
1873         /*
1874          * Walk the sequence-ordered list backward until we arrive at
1875          * the highest seq not acked. In theory when this is called it
1876          * should be the last segment (which it was not).
1877          */
1878         counter_u64_add(rack_find_high, 1);
1879         prsm = rsm;
1880         TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1881                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1882                         continue;
1883                 }
1884                 return (prsm);
1885         }
1886         return (NULL);
1887 }
1888
1889
1890 static uint32_t
1891 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1892 {
1893         int32_t lro;
1894         uint32_t thresh;
1895
1896         /*
1897          * lro is the flag we use to determine if we have seen reordering.
1898          * If it gets set we have seen reordering. The reorder logic
1899          * works in one of two ways:
1900          *
1901          * If reorder-fade is configured, then we track the last time we saw
1902          * re-ordering occur. If we reach the point where enough time has
1903          * passed we no longer consider reordering as occurring.
1904          *
1905          * Or if reorder-fade is 0, then once we see reordering we consider
1906          * the connection to always be subject to reordering and just set lro
1907          * to 1.
1908          *
1909          * In the end if lro is non-zero we add the extra time for
1910          * reordering in.
1911          */
1912         if (srtt == 0)
1913                 srtt = 1;
1914         if (rack->r_ctl.rc_reorder_ts) {
1915                 if (rack->r_ctl.rc_reorder_fade) {
1916                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1917                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1918                                 if (lro == 0) {
1919                                         /*
1920                                          * No time has passed since the last
1921                                          * reorder, mark it as reordering.
1922                                          */
1923                                         lro = 1;
1924                                 }
1925                         } else {
1926                                 /* Negative time? */
1927                                 lro = 0;
1928                         }
1929                         if (lro > rack->r_ctl.rc_reorder_fade) {
1930                                 /* Turn off reordering seen too */
1931                                 rack->r_ctl.rc_reorder_ts = 0;
1932                                 lro = 0;
1933                         }
1934                 } else {
1935                         /* Reordering does not fade */
1936                         lro = 1;
1937                 }
1938         } else {
1939                 lro = 0;
1940         }
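             /*
              * The base threshold is srtt plus the configured per-packet delay.
              * If reordering has been seen we pad it with srtt >> rc_reorder_shift
              * (a quarter of srtt when the shift is unset), otherwise with just
              * 1ms; the result is then clamped to the current RTO and to
              * rack_rto_max below.
              */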
1941         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1942         if (lro) {
1943                 /* It must be set, if not you get 1/4 rtt */
1944                 if (rack->r_ctl.rc_reorder_shift)
1945                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1946                 else
1947                         thresh += (srtt >> 2);
1948         } else {
1949                 thresh += 1;
1950         }
1951         /* We don't let the rack timeout be above an RTO */
1952
1953         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1954                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1955         }
1956         /* And we don't want it above the RTO max either */
1957         if (thresh > rack_rto_max) {
1958                 thresh = rack_rto_max;
1959         }
1960         return (thresh);
1961 }
1962
1963 static uint32_t
1964 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1965                      struct rack_sendmap *rsm, uint32_t srtt)
1966 {
1967         struct rack_sendmap *prsm;
1968         uint32_t thresh, len;
1969         int maxseg;
1970         
1971         if (srtt == 0)
1972                 srtt = 1;
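             /*
              * The base TLP delay is srtt plus srtt / rc_tlp_threshold when that
              * is configured, otherwise 2 * srtt; the per-mode checks below may
              * raise it for delayed-ACK compensation or an inter-packet gap.
              */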
1973         if (rack->r_ctl.rc_tlp_threshold)
1974                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1975         else
1976                 thresh = (srtt * 2);
1977         
1978         /* Get the previous sent packet, if any  */
1979         maxseg = tcp_maxseg(tp);
1980         counter_u64_add(rack_enter_tlp_calc, 1);
1981         len = rsm->r_end - rsm->r_start;
1982         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1983                 /* Exactly like the ID */
1984                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1985                         uint32_t alt_thresh;
1986                         /*
1987                          * Compensate for delayed-ack with the d-ack time.
1988                          */
1989                         counter_u64_add(rack_used_tlpmethod, 1);
1990                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1991                         if (alt_thresh > thresh)
1992                                 thresh = alt_thresh;
1993                 }
1994         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1995                 /* 2.1 behavior */
1996                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1997                 if (prsm && (len <= maxseg)) {
1998                         /*
1999                          * Two packets outstanding, thresh should be (2*srtt) +
2000                          * possible inter-packet delay (if any).
2001                          */
2002                         uint32_t inter_gap = 0;
2003                         int idx, nidx;
2004                         
2005                         counter_u64_add(rack_used_tlpmethod, 1);
2006                         idx = rsm->r_rtr_cnt - 1;
2007                         nidx = prsm->r_rtr_cnt - 1;
2008                         if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
2009                                 /* Yes it was sent later (or at the same time) */
2010                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
2011                         }
2012                         thresh += inter_gap;
2013                 } else  if (len <= maxseg) {
2014                         /*
2015                          * Possibly compensate for delayed-ack.
2016                          */
2017                         uint32_t alt_thresh;
2018                         
2019                         counter_u64_add(rack_used_tlpmethod2, 1);
2020                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2021                         if (alt_thresh > thresh)
2022                                 thresh = alt_thresh;
2023                 }
2024         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2025                 /* 2.2 behavior */
2026                 if (len <= maxseg) {
2027                         uint32_t alt_thresh;
2028                         /*
2029                          * Compensate for delayed-ack with the d-ack time.
2030                          */
2031                         counter_u64_add(rack_used_tlpmethod, 1);
2032                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2033                         if (alt_thresh > thresh)
2034                                 thresh = alt_thresh;
2035                 }
2036         }
2037         /* Not above an RTO */
2038         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2039                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2040         }
2041         /* Not above an RTO max */
2042         if (thresh > rack_rto_max) {
2043                 thresh = rack_rto_max;
2044         }
2045         /* Apply user supplied min TLP */
2046         if (thresh < rack_tlp_min) {
2047                 thresh = rack_tlp_min;
2048         }
2049         return (thresh);
2050 }
2051
2052 static struct rack_sendmap *
2053 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2054 {
2055         /*
2056          * Check to see that we don't need to fall into recovery. We will
2057          * need to do so if our oldest transmit is past the time we should
2058          * have had an ack.
2059          */
2060         struct tcp_rack *rack;
2061         struct rack_sendmap *rsm;
2062         int32_t idx;
2063         uint32_t srtt_cur, srtt, thresh;
2064
2065         rack = (struct tcp_rack *)tp->t_fb_ptr;
2066         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
2067                 return (NULL);
2068         }
2069         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
2070         srtt = TICKS_2_MSEC(srtt_cur);
2071         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
2072                 srtt = rack->rc_rack_rtt;
2073
2074         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2075         if (rsm == NULL)
2076                 return (NULL);
2077
2078         if (rsm->r_flags & RACK_ACKED) {
2079                 rsm = rack_find_lowest_rsm(rack);
2080                 if (rsm == NULL)
2081                         return (NULL);
2082         }
2083         idx = rsm->r_rtr_cnt - 1;
2084         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2085         if (tsused < rsm->r_tim_lastsent[idx]) {
2086                 return (NULL);
2087         }
2088         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2089                 return (NULL);
2090         }
2091         /* Ok, if we reach here we are overdue */
2092         rack->r_ctl.rc_rsm_start = rsm->r_start;
2093         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2094         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2095         rack_cong_signal(tp, NULL, CC_NDUPACK);
2096         return (rsm);
2097 }
2098
2099 static uint32_t
2100 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2101 {
2102         int32_t t;
2103         int32_t tt;
2104         uint32_t ret_val;
2105
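             /*
              * The persist timeout starts from the RTO estimate
              * (srtt + 4 * rttvar, converted to ms), is scaled by the
              * exponential backoff table for the current shift and clamped
              * between tcp_persmin and tcp_persmax.
              */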
2106         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2107         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2108             tcp_persmin, tcp_persmax);
2109         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2110                 tp->t_rxtshift++;
2111         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2112         ret_val = (uint32_t)tt;
2113         return (ret_val);
2114 }
2115
2116 static uint32_t
2117 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2118 {
2119         /*
2120          * Start the FR timer; we do this based on getting the first one in
2121          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
2122          * events we need to stop the running timer (if it's running) before
2123          * starting the new one.
2124          */
2125         uint32_t thresh, exp, to, srtt, time_since_sent;
2126         uint32_t srtt_cur;
2127         int32_t idx;
2128         int32_t is_tlp_timer = 0;
2129         struct rack_sendmap *rsm;
2130         
2131         if (rack->t_timers_stopped) {
2132                 /* All timers have been stopped, none are to run */
2133                 return (0);
2134         }
2135         if (rack->rc_in_persist) {
2136                 /* We can't start any timer in persists */
2137                 return (rack_get_persists_timer_val(tp, rack));
2138         }
2139         if (tp->t_state < TCPS_ESTABLISHED)
2140                 goto activate_rxt;
2141         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2142         if (rsm == NULL) {
2143                 /* Nothing on the send map */
2144 activate_rxt:
2145                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2146                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2147                         to = TICKS_2_MSEC(tp->t_rxtcur);
2148                         if (to == 0)
2149                                 to = 1;
2150                         return (to);
2151                 }
2152                 return (0);
2153         }
2154         if (rsm->r_flags & RACK_ACKED) {
2155                 rsm = rack_find_lowest_rsm(rack);
2156                 if (rsm == NULL) {
2157                         /* No lowest? */
2158                         goto activate_rxt;
2159                 }
2160         }
2161         /* Convert from ms to usecs */
2162         if (rsm->r_flags & RACK_SACK_PASSED) {
2163                 if ((tp->t_flags & TF_SENTFIN) &&
2164                     ((tp->snd_max - tp->snd_una) == 1) &&
2165                     (rsm->r_flags & RACK_HAS_FIN)) {
2166                         /*
2167                          * We don't start a rack timer if all we have is a
2168                          * FIN outstanding.
2169                          */
2170                         goto activate_rxt;
2171                 }
2172                 if (tp->t_srtt) {
2173                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2174                         srtt = TICKS_2_MSEC(srtt_cur);
2175                 } else
2176                         srtt = RACK_INITIAL_RTO;
2177
2178                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2179                 idx = rsm->r_rtr_cnt - 1;
2180                 exp = rsm->r_tim_lastsent[idx] + thresh;
2181                 if (SEQ_GEQ(exp, cts)) {
2182                         to = exp - cts;
2183                         if (to < rack->r_ctl.rc_min_to) {
2184                                 to = rack->r_ctl.rc_min_to;
2185                         }
2186                 } else {
2187                         to = rack->r_ctl.rc_min_to;
2188                 }
2189         } else {
2190                 /* Ok we need to do a TLP not RACK */
2191                 if ((rack->rc_tlp_in_progress != 0) ||
2192                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2193                         /*
2194                          * The previous send was a TLP or a tlp_rtx is in
2195                          * progress.
2196                          */
2197                         goto activate_rxt;
2198                 }
2199                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2200                 if (rsm == NULL) {
2201                         /* We found no rsm to TLP with. */
2202                         goto activate_rxt;
2203                 }
2204                 if (rsm->r_flags & RACK_HAS_FIN) {
2205                         /* If it's a FIN we don't do TLP */
2206                         rsm = NULL;
2207                         goto activate_rxt;
2208                 }
2209                 idx = rsm->r_rtr_cnt - 1;
2210                 if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx])) 
2211                         time_since_sent = cts - rsm->r_tim_lastsent[idx];
2212                 else
2213                         time_since_sent = 0;
2214                 is_tlp_timer = 1;
2215                 if (tp->t_srtt) {
2216                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2217                         srtt = TICKS_2_MSEC(srtt_cur);
2218                 } else
2219                         srtt = RACK_INITIAL_RTO;
2220                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2221                 if (thresh > time_since_sent)
2222                         to = thresh - time_since_sent;
2223                 else
2224                         to = rack->r_ctl.rc_min_to;
2225                 if (to > TCPTV_REXMTMAX) {
2226                         /*
2227                          * If the TLP time works out to be larger than the max
2228                          * RTO, let's not do TLP; just RTO.
2229                          */
2230                         goto activate_rxt;
2231                 }
2232                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2233                         /*
2234                          * The tail is no longer the last one I did a probe
2235                          * on
2236                          */
2237                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2238                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2239                 }
2240         }
2241         if (is_tlp_timer == 0) {
2242                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2243         } else {
2244                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2245                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2246                         /*
2247                          * We have exceeded how many times we can retransmit the
2248                          * current TLP timer; switch to the RTO timer.
2249                          */
2250                         goto activate_rxt;
2251                 } else {
2252                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2253                 }
2254         }
2255         if (to == 0)
2256                 to = 1;
2257         return (to);
2258 }
2259
2260 static void
2261 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2262 {
2263         if (rack->rc_in_persist == 0) {
2264                 if (((tp->t_flags & TF_SENTFIN) == 0) &&
2265                     (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
2266                         /* We must have more data to send to enter persist */
2267                         return;
2268                 rack->r_ctl.rc_went_idle_time = cts;
2269                 rack_timer_cancel(tp, rack, cts, __LINE__);
2270                 tp->t_rxtshift = 0;
2271                 rack->rc_in_persist = 1;
2272         }
2273 }
2274
2275 static void
2276 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2277 {
2278         if (rack->rc_inp->inp_in_hpts)  {
2279                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2280                 rack->r_ctl.rc_hpts_flags  = 0;
2281         }
2282         rack->rc_in_persist = 0;
2283         rack->r_ctl.rc_went_idle_time = 0;
2284         tp->t_flags &= ~TF_FORCEDATA;
2285         tp->t_rxtshift = 0;
2286 }
2287
2288 static void
2289 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
2290     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
2291 {
2292         struct inpcb *inp;
2293         uint32_t delayed_ack = 0;
2294         uint32_t hpts_timeout;
2295         uint8_t stopped;
2296         uint32_t left = 0;
2297
2298         inp = tp->t_inpcb;
2299         if (inp->inp_in_hpts) {
2300                 /* A previous call is already set up */
2301                 return;
2302         }
2303         if (tp->t_state == TCPS_CLOSED) {
2304                 return;
2305         }
2306         stopped = rack->rc_tmr_stopped;
2307         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2308                 left = rack->r_ctl.rc_timer_exp - cts;
2309         }
2310         rack->r_ctl.rc_timer_exp = 0;
2311         if (rack->rc_inp->inp_in_hpts == 0) {
2312                 rack->r_ctl.rc_hpts_flags = 0;
2313         } 
2314         if (slot) {
2315                 /* We are hptsi too */
2316                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2317         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2318                 /*
2319                  * We are still left on the hpts; when the timeout fires
2320                  * it will be for output.
2321                  */
2322                 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
2323                         slot = cts - rack->r_ctl.rc_last_output_to;
2324                 else
2325                         slot = 1;
2326         }
2327         if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2328                 /* No send window.. we must enter persist */
2329                 rack_enter_persist(tp, rack, cts);
2330         } else if ((frm_out_sbavail &&
2331                     (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
2332                     (tp->snd_wnd < tp->t_maxseg)) &&
2333             TCPS_HAVEESTABLISHED(tp->t_state)) {
2334                 /*
2335                  * If we have no window or we can't send a segment (and have
2336                  * data to send; we cheat here and frm_out_sbavail is
2337                  * passed in with the sbavail(sb) only from bbr_output) and
2338                  * we are established, then we must enter persist (if not
2339                  * already in persist).
2340                  */
2341                 rack_enter_persist(tp, rack, cts);
2342         }
2343         hpts_timeout = rack_timer_start(tp, rack, cts);
2344         if (tp->t_flags & TF_DELACK) {
2345                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2346                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2347         }
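             /*
              * A pending delayed ACK only becomes the timeout if it would fire
              * sooner than whatever rack_timer_start() chose; otherwise drop the
              * DELACK flag and keep the other timer.
              */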
2348         if (delayed_ack && ((hpts_timeout == 0) ||
2349                             (delayed_ack < hpts_timeout)))
2350                 hpts_timeout = delayed_ack;
2351         else 
2352                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2353         /*
2354          * If no timers are going to run and we will fall off the hptsi
2355          * wheel, we resort to a keep-alive timer if it's configured.
2356          */
2357         if ((hpts_timeout == 0) &&
2358             (slot == 0)) {
2359                 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2360                     (tp->t_state <= TCPS_CLOSING)) {
2361                         /*
2362                          * Ok, we have no timer (persists, rack, tlp, rxt or
2363                          * del-ack), we don't have segments being paced. So
2364                          * all that is left is the keepalive timer.
2365                          */
2366                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2367                                 /* Get the established keep-alive time */
2368                                 hpts_timeout = TP_KEEPIDLE(tp);
2369                         } else {
2370                                 /* Get the initial setup keep-alive time */
2371                                 hpts_timeout = TP_KEEPINIT(tp);
2372                         }
2373                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2374                 }
2375         }
2376         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2377             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2378                 /*
2379                  * RACK, TLP, persists and RXT timers are all restartable
2380                  * based on input actions, i.e. we received a packet (ack
2381                  * or sack) and that changes things (rw, or snd_una, etc.).
2382                  * Thus we can restart them with a new value. For
2383                  * keep-alive and delayed_ack we keep track of what was left
2384                  * and restart the timer with a smaller value.
2385                  */
2386                 if (left < hpts_timeout)
2387                         hpts_timeout = left;
2388         }
2389         if (hpts_timeout) {
2390                 /*
2391                  * Hack alert: for now we can't time-out over 2,147,483
2392                  * seconds (a bit more than 596 hours), which is probably ok
2393                  * :).
2394                  */
2395                 if (hpts_timeout > 0x7ffffffe)
2396                         hpts_timeout = 0x7ffffffe;
2397                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2398         }
2399         if (slot) {
2400                 rack->r_ctl.rc_last_output_to = cts + slot;
2401                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2402                         if (rack->rc_inp->inp_in_hpts == 0)
2403                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2404                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2405                 } else {
2406                         /*
2407                          * Arrange for the hpts to kick back in after the
2408                          * t-o if the t-o does not cause a send.
2409                          */
2410                         if (rack->rc_inp->inp_in_hpts == 0)
2411                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2412                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2413                 }
2414         } else if (hpts_timeout) {
2415                 if (rack->rc_inp->inp_in_hpts == 0)
2416                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2417                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2418         } else {
2419                 /* No timer starting */
2420 #ifdef INVARIANTS
2421                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2422                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2423                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2424                 }
2425 #endif
2426         }
2427         rack->rc_tmr_stopped = 0;
2428         if (slot)
2429                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2430 }
2431
2432 /*
2433  * RACK Timer, here we simply do logging and housekeeping.
2434  * The normal rack_output() function will call the
2435  * appropriate thing to check if we need to do a RACK retransmit.
2436  * We return 1, saying don't proceed with rack_output only
2437  * when all timers have been stopped (destroyed PCB?).
2438  */
2439 static int
2440 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2441 {
2442         /*
2443          * This timer simply provides an internal trigger to send out data.
2444          * The check_recovery_mode call will see if there are needed
2445          * retransmissions, if so we will enter fast-recovery. The output
2446          * call may or may not do the same thing depending on sysctl
2447          * settings.
2448          */
2449         struct rack_sendmap *rsm;
2450         int32_t recovery;
2451
2452         if (tp->t_timers->tt_flags & TT_STOPPED) {
2453                 return (1);
2454         }
2455         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2456                 /* It's not time yet */
2457                 return (0);
2458         }
2459         rack_log_to_event(rack, RACK_TO_FRM_RACK);
2460         recovery = IN_RECOVERY(tp->t_flags);
2461         counter_u64_add(rack_to_tot, 1);
2462         if (rack->r_state && (rack->r_state != tp->t_state))
2463                 rack_set_state(tp, rack);
2464         rsm = rack_check_recovery_mode(tp, cts);
2465         if (rsm) {
2466                 uint32_t rtt;
2467
2468                 rtt = rack->rc_rack_rtt;
2469                 if (rtt == 0)
2470                         rtt = 1;
2471                 if ((recovery == 0) &&
2472                     (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
2473                         /*
2474                          * The rack-timeout that enters us into recovery
2475                          * will force out one MSS and set us up so that we
2476                          * can do one more send in 2*rtt (transitioning the
2477                          * rack timeout into a rack-tlp).
2478                          */
2479                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2480                 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
2481                     ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
2482                         /*
2483                          * When a rack timer goes, we have to send at 
2484                          * least one segment. They will be paced a min of 1ms
2485                          * apart via the next rack timer (or further
2486                          * if the rack timer dictates it).
2487                          */
2488                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2489                 }
2490         } else {
2491                 /* This is a case that should happen rarely, if ever */
2492                 counter_u64_add(rack_tlp_does_nada, 1);
2493 #ifdef TCP_BLACKBOX
2494                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2495 #endif
2496                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2497         }
2498         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2499         return (0);
2500 }
2501
2502 /*
2503  * TLP Timer, here we simply setup what segment we want to
2504  * have the TLP expire on, the normal rack_output() will then
2505  * send it out.
2506  *
2507  * We return 1, saying don't proceed with rack_output only
2508  * when all timers have been stopped (destroyed PCB?).
2509  */
2510 static int
2511 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2512 {
2513         /*
2514          * Tail Loss Probe.
2515          */
2516         struct rack_sendmap *rsm = NULL;
2517         struct socket *so;
2518         uint32_t amm, old_prr_snd = 0;
2519         uint32_t out, avail;
2520
2521         if (tp->t_timers->tt_flags & TT_STOPPED) {
2522                 return (1);
2523         }
2524         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2525                 /* It's not time yet */
2526                 return (0);
2527         }
2528         if (rack_progress_timeout_check(tp)) {
2529                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2530                 return (1);
2531         }
2532         /*
2533          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2534          * need to figure out how to force a full MSS segment out.
2535          */
2536         rack_log_to_event(rack, RACK_TO_FRM_TLP);
2537         counter_u64_add(rack_tlp_tot, 1);
2538         if (rack->r_state && (rack->r_state != tp->t_state))
2539                 rack_set_state(tp, rack);
2540         so = tp->t_inpcb->inp_socket;
2541         avail = sbavail(&so->so_snd);
2542         out = tp->snd_max - tp->snd_una;
2543         rack->rc_timer_up = 1;
2544         /*
2545          * If we are in recovery we can jazz out a segment if new data is
2546          * present simply by setting rc_prr_sndcnt to a segment.
2547          */
2548         if ((avail > out) &&
2549             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2550                 /* New data is available */
2551                 amm = avail - out;
2552                 if (amm > tp->t_maxseg) {
2553                         amm = tp->t_maxseg;
2554                 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
2555                         /* not enough to fill an MTU and no-delay is off */
2556                         goto need_retran;
2557                 }
2558                 if (IN_RECOVERY(tp->t_flags)) {
2559                         /* Unlikely */
2560                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2561                         if (out + amm <= tp->snd_wnd)
2562                                 rack->r_ctl.rc_prr_sndcnt = amm;
2563                         else
2564                                 goto need_retran;
2565                 } else {
2566                         /* Set the send-new override */
2567                         if (out + amm <= tp->snd_wnd)
2568                                 rack->r_ctl.rc_tlp_new_data = amm;
2569                         else
2570                                 goto need_retran;
2571                 }
2572                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2573                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2574                 rack->r_ctl.rc_tlpsend = NULL;
2575                 counter_u64_add(rack_tlp_newdata, 1);
2576                 goto send;
2577         }
2578 need_retran:
2579         /*
2580          * Ok we need to arrange the last un-acked segment to be re-sent, or
2581          * optionally the first un-acked segment.
2582          */
2583         if (rack_always_send_oldest)
2584                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2585         else {
2586                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
2587                 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2588                         rsm = rack_find_high_nonack(rack, rsm);
2589                 }
2590         }
2591         if (rsm == NULL) {
2592                 counter_u64_add(rack_tlp_does_nada, 1);
2593 #ifdef TCP_BLACKBOX
2594                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2595 #endif
2596                 goto out;
2597         }
2598         if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
2599                 /*
2600                  * We need to split this last segment in two.
2601                  */
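                /*
                 * For example (illustrative numbers only, not from the
                 * source): if the trailing rsm covers [1000, 4000) and
                 * tp->t_maxseg is 1460, the split below shrinks the original
                 * rsm to [1000, 2540) and creates nrsm as [2540, 4000); the
                 * TLP then probes that trailing piece.
                 */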
2602                 int32_t idx;
2603                 struct rack_sendmap *nrsm;
2604
2605                 nrsm = rack_alloc(rack);
2606                 if (nrsm == NULL) {
2607                         /*
2608                          * No memory to split, we will just exit and punt
2609                          * off to the RXT timer.
2610                          */
2611                         counter_u64_add(rack_tlp_does_nada, 1);
2612                         goto out;
2613                 }
2614                 nrsm->r_start = (rsm->r_end - tp->t_maxseg);
2615                 nrsm->r_end = rsm->r_end;
2616                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2617                 nrsm->r_flags = rsm->r_flags;
2618                 nrsm->r_sndcnt = rsm->r_sndcnt;
2619                 nrsm->r_rtr_bytes = 0;
2620                 rsm->r_end = nrsm->r_start;
2621                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2622                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2623                 }
2624                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
2625                 if (rsm->r_in_tmap) {
2626                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2627                         nrsm->r_in_tmap = 1;
2628                 }
2629                 rsm->r_flags &= (~RACK_HAS_FIN);
2630                 rsm = nrsm;
2631         }
2632         rack->r_ctl.rc_tlpsend = rsm;
2633         rack->r_ctl.rc_tlp_rtx_out = 1;
2634         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2635                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2636                 tp->t_rxtshift++;
2637         } else {
2638                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2639                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2640         }
2641 send:
2642         rack->r_ctl.rc_tlp_send_cnt++;
2643         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2644                 /*
2645                  * Can't [re]transmit a segment we have not heard from the
2646                  * peer on more than the max number of times. We need the
2647                  * retransmit timer to take over.
2648                  */
2649 restore:
2650                 rack->r_ctl.rc_tlpsend = NULL;
2651                 if (rsm)
2652                         rsm->r_flags &= ~RACK_TLP;
2653                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2654                 counter_u64_add(rack_tlp_retran_fail, 1);
2655                 goto out;
2656         } else if (rsm) {
2657                 rsm->r_flags |= RACK_TLP;
2658         }
2659         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2660             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2661                 /*
2662                  * We don't want to send a single segment more than the max
2663                  * number of times either.
2664                  */
2665                 goto restore;
2666         }
2667         rack->r_timer_override = 1;
2668         rack->r_tlp_running = 1;
2669         rack->rc_tlp_in_progress = 1;
2670         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2671         return (0);
2672 out:
2673         rack->rc_timer_up = 0;
2674         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2675         return (0);
2676 }
2677
2678 /*
2679  * Delayed ack timer; here we simply need to set up the
2680  * ACK_NOW flag and remove the DELACK flag. From there
2681  * the output routine will send the ack out.
2682  *
2683  * We only return 1, saying don't proceed, if all timers
2684  * are stopped (destroyed PCB?).
2685  */
2686 static int
2687 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2688 {
2689         if (tp->t_timers->tt_flags & TT_STOPPED) {
2690                 return (1);
2691         }
2692         rack_log_to_event(rack, RACK_TO_FRM_DELACK);
2693         tp->t_flags &= ~TF_DELACK;
2694         tp->t_flags |= TF_ACKNOW;
2695         TCPSTAT_INC(tcps_delack);
2696         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2697         return (0);
2698 }
2699
2700 /*
2701  * Persist timer; here we simply need to set up the
2702  * FORCE-DATA flag and the output routine will send
2703  * the one byte probe.
2704  *
2705  * We only return 1, saying don't proceed, if all timers
2706  * are stopped (destroyed PCB?).
2707  */
2708 static int
2709 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2710 {
2711         struct inpcb *inp;
2712         int32_t retval = 0;
2713
2714         inp = tp->t_inpcb;
2715
2716         if (tp->t_timers->tt_flags & TT_STOPPED) {
2717                 return (1);
2718         }
2719         if (rack->rc_in_persist == 0)
2720                 return (0);
2721         if (rack_progress_timeout_check(tp)) {
2722                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2723                 return (1);
2724         }
2725         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2726         /*
2727          * Persistence timer into zero window. Force a byte to be output, if
2728          * possible.
2729          */
2730         TCPSTAT_INC(tcps_persisttimeo);
2731         /*
2732          * Hack: if the peer is dead/unreachable, we do not time out if the
2733          * window is closed.  After a full backoff, drop the connection if
2734          * the idle time (no responses to probes) reaches the maximum
2735          * backoff that we would use if retransmitting.
2736          */
2737         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2738             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2739             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2740                 TCPSTAT_INC(tcps_persistdrop);
2741                 retval = 1;
2742                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2743                 goto out;
2744         }
2745         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2746             tp->snd_una == tp->snd_max)
2747                 rack_exit_persist(tp, rack);
2748         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2749         /*
2750          * If the user has closed the socket then drop a persisting
2751          * connection after a much reduced timeout.
2752          */
2753         if (tp->t_state > TCPS_CLOSE_WAIT &&
2754             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2755                 retval = 1;
2756                 TCPSTAT_INC(tcps_persistdrop);
2757                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2758                 goto out;
2759         }
2760         tp->t_flags |= TF_FORCEDATA;
2761 out:
2762         rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
2763         return (retval);
2764 }
2765
2766 /*
2767  * If a keepalive goes off, we had no other timers
2768  * happening. We always return 1 here since this
2769  * routine either drops the connection or sends
2770  * out a segment to elicit a response.
2771  */
2772 static int
2773 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2774 {
2775         struct tcptemp *t_template;
2776         struct inpcb *inp;
2777
2778         if (tp->t_timers->tt_flags & TT_STOPPED) {
2779                 return (1);
2780         }
2781         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
2782         inp = tp->t_inpcb;
2783         rack_log_to_event(rack, RACK_TO_FRM_KEEP);
2784         /*
2785          * Keep-alive timer went off; send something or drop connection if
2786          * idle for too long.
2787          */
2788         TCPSTAT_INC(tcps_keeptimeo);
2789         if (tp->t_state < TCPS_ESTABLISHED)
2790                 goto dropit;
2791         if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2792             tp->t_state <= TCPS_CLOSING) {
2793                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
2794                         goto dropit;
2795                 /*
2796                  * Send a packet designed to force a response if the peer is
2797                  * up and reachable: either an ACK if the connection is
2798                  * still alive, or an RST if the peer has closed the
2799                  * connection due to timeout or reboot. Using sequence
2800                  * number tp->snd_una-1 causes the transmitted zero-length
2801                  * segment to lie outside the receive window; by the
2802                  * protocol spec, this requires the correspondent TCP to
2803                  * respond.
2804                  */
2805                 TCPSTAT_INC(tcps_keepprobe);
2806                 t_template = tcpip_maketemplate(inp);
2807                 if (t_template) {
2808                         tcp_respond(tp, t_template->tt_ipgen,
2809                             &t_template->tt_t, (struct mbuf *)NULL,
2810                             tp->rcv_nxt, tp->snd_una - 1, 0);
2811                         free(t_template, M_TEMP);
2812                 }
2813         }
2814         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
2815         return (1);
2816 dropit:
2817         TCPSTAT_INC(tcps_keepdrops);
2818         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2819         return (1);
2820 }
2821
2822 /*
2823  * Retransmit helper function; clear all the ack
2824  * flags and take care of important bookkeeping.
2825  */
2826 static void
2827 rack_remxt_tmr(struct tcpcb *tp)
2828 {
2829         /*
2830          * The retransmit timer went off; all sack'd blocks must be
2831          * marked un-acked.
2832          */
2833         struct rack_sendmap *rsm, *trsm = NULL;
2834         struct tcp_rack *rack;
2835         int32_t cnt = 0;
2836
2837         rack = (struct tcp_rack *)tp->t_fb_ptr;
2838         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
2839         rack_log_to_event(rack, RACK_TO_FRM_TMR);
2840         if (rack->r_state && (rack->r_state != tp->t_state))
2841                 rack_set_state(tp, rack);
2842         /*
2843          * Ideally we would like to be able to
2844          * mark SACK-PASS on anything not acked here.
2845          * However, if we do that we would burst out
2846          * all that data 1ms apart. This would be unwise,
2847          * so for now we will just let the normal rxt timer
2848          * and tlp timer take care of it.
2849          */
2850         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
2851                 if (rsm->r_flags & RACK_ACKED) {
2852                         cnt++;
2853                         rsm->r_sndcnt = 0;
2854                         if (rsm->r_in_tmap == 0) {
2855                                 /* We must re-add it to the tlist */
2856                                 if (trsm == NULL) {
2857                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
2858                                 } else {
2859                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
2860                                 }
2861                                 rsm->r_in_tmap = 1;
2862                                 trsm = rsm;
2863                         }
2864                 }
2865                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
2866         }
2867         /* Clear the count (we just un-acked them) */
2868         rack->r_ctl.rc_sacked = 0;
2869         /* Clear the tlp rtx mark */
2870         rack->r_ctl.rc_tlp_rtx_out = 0;
2871         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2872         rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
2873         /* Set up so we send one segment */
2874         if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
2875                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2876         rack->r_timer_override = 1;
2877 }
2878
2879 /*
2880  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
2881  * we will set up to retransmit the lowest seq number outstanding.
2882  */
2883 static int
2884 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2885 {
2886         int32_t rexmt;
2887         struct inpcb *inp;
2888         int32_t retval = 0;
2889
2890         inp = tp->t_inpcb;
2891         if (tp->t_timers->tt_flags & TT_STOPPED) {
2892                 return (1);
2893         }
2894         if (rack_progress_timeout_check(tp)) {
2895                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2896                 return (1);
2897         }
2898         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
2899         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2900             (tp->snd_una == tp->snd_max)) {
2901                 /* Nothing outstanding .. nothing to do */
2902                 return (0);
2903         }
2904         /*
2905          * Retransmission timer went off.  Message has not been acked within
2906          * retransmit interval.  Back off to a longer retransmit interval
2907          * and retransmit one segment.
2908          */
2909         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
2910                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
2911                 TCPSTAT_INC(tcps_timeoutdrop);
2912                 retval = 1;
2913                 tcp_set_inp_to_drop(rack->rc_inp,
2914                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
2915                 goto out;
2916         }
2917         rack_remxt_tmr(tp);
2918         if (tp->t_state == TCPS_SYN_SENT) {
2919                 /*
2920                  * If the SYN was retransmitted, indicate CWND to be limited
2921                  * to 1 segment in cc_conn_init().
2922                  */
2923                 tp->snd_cwnd = 1;
2924         } else if (tp->t_rxtshift == 1) {
2925                 /*
2926                  * first retransmit; record ssthresh and cwnd so they can be
2927                  * recovered if this turns out to be a "bad" retransmit. A
2928                  * retransmit is considered "bad" if an ACK for this segment
2929                  * is received within RTT/2 interval; the assumption here is
2930                  * that the ACK was already in flight.  See "On Estimating
2931                  * End-to-End Network Path Properties" by Allman and Paxson
2932                  * for more details.
2933                  */
2934                 tp->snd_cwnd_prev = tp->snd_cwnd;
2935                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
2936                 tp->snd_recover_prev = tp->snd_recover;
2937                 if (IN_FASTRECOVERY(tp->t_flags))
2938                         tp->t_flags |= TF_WASFRECOVERY;
2939                 else
2940                         tp->t_flags &= ~TF_WASFRECOVERY;
2941                 if (IN_CONGRECOVERY(tp->t_flags))
2942                         tp->t_flags |= TF_WASCRECOVERY;
2943                 else
2944                         tp->t_flags &= ~TF_WASCRECOVERY;
2945                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
2946                 tp->t_flags |= TF_PREVVALID;
2947         } else
2948                 tp->t_flags &= ~TF_PREVVALID;
2949         TCPSTAT_INC(tcps_rexmttimeo);
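        /*
         * Back off the RTO exponentially. As a rough, illustrative example
         * (the numbers are assumptions, not taken from this code): if
         * TCP_REXMTVAL() works out to about 200ms, the third consecutive
         * timeout (t_rxtshift == 3) gives roughly 200ms * tcp_backoff[3]
         * (typically 8) = 1.6s, which is then clamped below between
         * rack_rto_min and rack_rto_max.
         */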
2950         if ((tp->t_state == TCPS_SYN_SENT) ||
2951             (tp->t_state == TCPS_SYN_RECEIVED))
2952                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
2953         else
2954                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
2955         TCPT_RANGESET(tp->t_rxtcur, rexmt,
2956            max(MSEC_2_TICKS(rack_rto_min), rexmt),
2957            MSEC_2_TICKS(rack_rto_max));
2958         /*
2959          * We enter the path for PLMTUD if the connection is established or
2960          * if the connection is in FIN_WAIT_1 status; the reason for the
2961          * latter is that if the amount of data we send is very small, we
2962          * could send it in a couple of packets and proceed straight to FIN.
2963          * In that case we won't catch the ESTABLISHED state.
2964          */
2965         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
2966             || (tp->t_state == TCPS_FIN_WAIT_1))) {
2967 #ifdef INET6
2968                 int32_t isipv6;
2969 #endif
2970
2971                 /*
2972                  * The idea here is that each stage of the mtu probe (usually
2973                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
2974                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
2975                  * should take care of that.
2976                  */
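                /*
                 * Roughly (an illustration of the checks below, not a spec):
                 * the first qualifying timeout (t_rxtshift == 2) saves the
                 * current MSS and typically clamps it to the blackhole MSS,
                 * the second (t_rxtshift == 4) typically clamps further to
                 * the default MSS, and if we are still failing by
                 * t_rxtshift >= 6 the saved MSS and PMTUD flags are restored.
                 */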
2977                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
2978                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
2979                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
2980                     tp->t_rxtshift % 2 == 0)) {
2981                         /*
2982                          * Enter Path MTU Black-hole Detection mechanism:
2983                          * - Disable Path MTU Discovery (IP "DF" bit).
2984                          * - Reduce MTU to a lower value than what we
2985                          *   negotiated with the peer.
2986                          */
2987                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
2988                                 /* Record that we may have found a black hole. */
2989                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
2990                                 /* Keep track of previous MSS. */
2991                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
2992                         }
2993
2994                         /*
2995                          * Reduce the MSS to blackhole value or to the
2996                          * default in an attempt to retransmit.
2997                          */
2998 #ifdef INET6
2999                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
3000                         if (isipv6 &&
3001                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3002                                 /* Use the sysctl tuneable blackhole MSS. */
3003                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3004                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3005                         } else if (isipv6) {
3006                                 /* Use the default MSS. */
3007                                 tp->t_maxseg = V_tcp_v6mssdflt;
3008                                 /*
3009                                  * Disable Path MTU Discovery when we switch
3010                                  * to minmss.
3011                                  */
3012                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3013                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3014                         }
3015 #endif
3016 #if defined(INET6) && defined(INET)
3017                         else
3018 #endif
3019 #ifdef INET
3020                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3021                                 /* Use the sysctl tuneable blackhole MSS. */
3022                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3023                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3024                         } else {
3025                                 /* Use the default MSS. */
3026                                 tp->t_maxseg = V_tcp_mssdflt;
3027                                 /*
3028                                  * Disable Path MTU Discovery when we switch
3029                                  * to minmss.
3030                                  */
3031                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3032                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3033                         }
3034 #endif
3035                 } else {
3036                         /*
3037                          * If further retransmissions are still unsuccessful
3038                          * with a lowered MTU, maybe this isn't a blackhole
3039                          * and we restore the previous MSS and blackhole
3040                          * detection flags. The limit '6' is determined by
3041                          * giving each probe stage (1448, 1188, 524) 2
3042                          * chances to recover.
3043                          */
3044                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3045                             (tp->t_rxtshift >= 6)) {
3046                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3047                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3048                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3049                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3050                         }
3051                 }
3052         }
3053         /*
3054          * Disable RFC1323 and SACK if we haven't got any response to our
3055          * third SYN to work-around some broken terminal servers (most of
3056          * which have hopefully been retired) that have bad VJ header
3057          * compression code which trashes TCP segments containing
3058          * unknown-to-them TCP options.
3059          */
3060         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
3061             (tp->t_rxtshift == 3))
3062                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
3063         /*
3064          * If we backed off this far, our srtt estimate is probably bogus.
3065          * Clobber it so we'll take the next rtt measurement as our srtt;
3066          * move the current srtt into rttvar to keep the current retransmit
3067          * times until then.
3068          */
3069         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3070 #ifdef INET6
3071                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3072                         in6_losing(tp->t_inpcb);
3073                 else
3074 #endif
3075                         in_losing(tp->t_inpcb);
3076                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3077                 tp->t_srtt = 0;
3078         }
3079         if (rack_use_sack_filter)
3080                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3081         tp->snd_recover = tp->snd_max;
3082         tp->t_flags |= TF_ACKNOW;
3083         tp->t_rtttime = 0;
3084         rack_cong_signal(tp, NULL, CC_RTO);
3085 out:
3086         return (retval);
3087 }
3088
3089 static int
3090 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3091 {
3092         int32_t ret = 0;
3093         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3094
3095         if (timers == 0) {
3096                 return (0);
3097         }
3098         if (tp->t_state == TCPS_LISTEN) {
3099                 /* no timers on listen sockets */
3100                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3101                         return (0);
3102                 return (1);
3103         }
3104         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3105                 uint32_t left;
3106
3107                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3108                         ret = -1;
3109                         rack_log_to_processing(rack, cts, ret, 0);
3110                         return (0);
3111                 }
3112                 if (hpts_calling == 0) {
3113                         ret = -2;
3114                         rack_log_to_processing(rack, cts, ret, 0);
3115                         return (0);
3116                 }
3117                 /*
3118                  * Ok, our timer went off early and we are not paced; false
3119                  * alarm, go back to sleep.
3120                  */
3121                 ret = -3;
3122                 left = rack->r_ctl.rc_timer_exp - cts;
3123                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3124                 rack_log_to_processing(rack, cts, ret, left);
3125                 rack->rc_last_pto_set = 0;
3126                 return (1);
3127         }
3128         rack->rc_tmr_stopped = 0;
3129         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3130         if (timers & PACE_TMR_DELACK) {
3131                 ret = rack_timeout_delack(tp, rack, cts);
3132         } else if (timers & PACE_TMR_RACK) {
3133                 ret = rack_timeout_rack(tp, rack, cts);
3134         } else if (timers & PACE_TMR_TLP) {
3135                 ret = rack_timeout_tlp(tp, rack, cts);
3136         } else if (timers & PACE_TMR_RXT) {
3137                 ret = rack_timeout_rxt(tp, rack, cts);
3138         } else if (timers & PACE_TMR_PERSIT) {
3139                 ret = rack_timeout_persist(tp, rack, cts);
3140         } else if (timers & PACE_TMR_KEEP) {
3141                 ret = rack_timeout_keepalive(tp, rack, cts);
3142         }
3143         rack_log_to_processing(rack, cts, ret, timers);
3144         return (ret);
3145 }
3146
3147 static void
3148 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3149 {
3150         uint8_t hpts_removed = 0;
3151
3152         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3153             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3154                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3155                 hpts_removed = 1;
3156         }
3157         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3158                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3159                 if (rack->rc_inp->inp_in_hpts &&
3160                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3161                         /*
3162                          * Canceling timers when we have no output being
3163                          * paced. We also must remove ourselves from the
3164                          * hpts.
3165                          */
3166                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3167                         hpts_removed = 1;
3168                 }
3169                 rack_log_to_cancel(rack, hpts_removed, line);
3170                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3171         }
3172 }
3173
3174 static void
3175 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3176 {
3177         return;
3178 }
3179
3180 static int
3181 rack_stopall(struct tcpcb *tp)
3182 {
3183         struct tcp_rack *rack;
3184         rack = (struct tcp_rack *)tp->t_fb_ptr;
3185         rack->t_timers_stopped = 1;
3186         return (0);
3187 }
3188
3189 static void
3190 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3191 {
3192         return;
3193 }
3194
3195 static int
3196 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3197 {
3198         return (0);
3199 }
3200
3201 static void
3202 rack_stop_all_timers(struct tcpcb *tp)
3203 {
3204         struct tcp_rack *rack;
3205
3206         /*
3207          * Assure no timers are running.
3208          */
3209         if (tcp_timer_active(tp, TT_PERSIST)) {
3210                 /* We enter in persists, set the flag appropriately */
3211                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3212                 rack->rc_in_persist = 1;
3213         }
3214         tcp_timer_suspend(tp, TT_PERSIST);
3215         tcp_timer_suspend(tp, TT_REXMT);
3216         tcp_timer_suspend(tp, TT_KEEP);
3217         tcp_timer_suspend(tp, TT_DELACK);
3218 }
3219
3220 static void
3221 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3222     struct rack_sendmap *rsm, uint32_t ts)
3223 {
3224         int32_t idx;
3225
3226         rsm->r_rtr_cnt++;
3227         rsm->r_sndcnt++;
3228         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3229                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3230                 rsm->r_flags |= RACK_OVERMAX;
3231         }
3232         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3233                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3234                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3235         }
3236         idx = rsm->r_rtr_cnt - 1;
3237         rsm->r_tim_lastsent[idx] = ts;
3238         if (rsm->r_flags & RACK_ACKED) {
3239                 /* Probably MTU discovery messing with us */
3240                 rsm->r_flags &= ~RACK_ACKED;
3241                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3242         }
3243         if (rsm->r_in_tmap) {
3244                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3245         }
3246         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3247         rsm->r_in_tmap = 1;
3248         if (rsm->r_flags & RACK_SACK_PASSED) {
3249                 /* We have retransmitted due to the SACK pass */
3250                 rsm->r_flags &= ~RACK_SACK_PASSED;
3251                 rsm->r_flags |= RACK_WAS_SACKPASS;
3252         }
3253         /* Update memory for next rtr */
3254         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3255 }
3256
3257
3258 static uint32_t
3259 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3260     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
3261 {
3262         /*
3263          * We (re-)transmitted starting at rsm->r_start for some length
3264          * (possibly less than r_end).
3265          */
3266         struct rack_sendmap *nrsm;
3267         uint32_t c_end;
3268         int32_t len;
3269         int32_t idx;
3270
3271         len = *lenp;
3272         c_end = rsm->r_start + len;
3273         if (SEQ_GEQ(c_end, rsm->r_end)) {
3274                 /*
3275                  * We retransmitted the whole piece, or more than the whole
3276                  * thing, slopping over into the next rsm.
3277                  */
3278                 rack_update_rsm(tp, rack, rsm, ts);
3279                 if (c_end == rsm->r_end) {
3280                         *lenp = 0;
3281                         return (0);
3282                 } else {
3283                         int32_t act_len;
3284
3285                         /* Hangs over the end; return what's left */
3286                         act_len = rsm->r_end - rsm->r_start;
3287                         *lenp = (len - act_len);
3288                         return (rsm->r_end);
3289                 }
3290                 /* We don't get out of this block. */
3291         }
3292         /*
3293          * Here we retransmitted less than the whole thing which means we
3294          * have to split this into what was transmitted and what was not.
3295          */
3296         nrsm = rack_alloc(rack);
3297         if (nrsm == NULL) {
3298                 /*
3299                  * We can't get memory, so let's not proceed.
3300                  */
3301                 *lenp = 0;
3302                 return (0);
3303         }
3304         /*
3305          * So here we are going to take the original rsm and make it what we
3306          * retransmitted. nrsm will be the tail portion we did not
3307          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3308          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3309          * 1, 6 and the new piece will be 6, 11.
3310          */
3311         nrsm->r_start = c_end;
3312         nrsm->r_end = rsm->r_end;
3313         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3314         nrsm->r_flags = rsm->r_flags;
3315         nrsm->r_sndcnt = rsm->r_sndcnt;
3316         nrsm->r_rtr_bytes = 0;
3317         rsm->r_end = c_end;
3318         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3319                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3320         }
3321         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3322         if (rsm->r_in_tmap) {
3323                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3324                 nrsm->r_in_tmap = 1;
3325         }
3326         rsm->r_flags &= (~RACK_HAS_FIN);
3327         rack_update_rsm(tp, rack, rsm, ts);
3328         *lenp = 0;
3329         return (0);
3330 }
3331
3332
3333 static void
3334 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3335     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3336     uint8_t pass, struct rack_sendmap *hintrsm)
3337 {
3338         struct tcp_rack *rack;
3339         struct rack_sendmap *rsm, *nrsm;
3340         register uint32_t snd_max, snd_una;
3341         int32_t idx;
3342
3343         /*
3344          * Add to the RACK log of packets in flight or retransmitted. If
3345          * there is a TS option we will use the TS echoed, if not we will
3346          * grab a TS.
3347          *
3348          * Retransmissions will increment the count and move the ts to its
3349          * proper place. Note that if options do not include TS's then we
3350          * won't be able to effectively use the ACK for an RTT on a retran.
3351          *
3352          * Notes about r_start and r_end. Let's consider a send starting at
3353          * sequence 1 for 10 bytes. In such an example the r_start would be
3354          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3355          * This means that r_end is actually the first sequence for the next
3356          * slot (11).
3357          *
3358          */
3359         /*
3360          * If err is set what do we do XXXrrs? should we not add the thing?
3361          * -- i.e. return if err != 0 or should we pretend we sent it? --
3362          * i.e. proceed with add ** do this for now.
3363          */
3364         INP_WLOCK_ASSERT(tp->t_inpcb);
3365         if (err)
3366                 /*
3367                  * We don't log errors -- we could but snd_max does not
3368                  * advance in this case either.
3369                  */
3370                 return;
3371
3372         if (th_flags & TH_RST) {
3373                 /*
3374                  * We don't log resets and we return immediately from
3375                  * sending
3376                  */
3377                 return;
3378         }
3379         rack = (struct tcp_rack *)tp->t_fb_ptr;
3380         snd_una = tp->snd_una;
3381         if (SEQ_LEQ((seq_out + len), snd_una)) {
3382                 /* Are we sending an old segment to induce an ack (keep-alive)? */
3383                 return;
3384         }
3385         if (SEQ_LT(seq_out, snd_una)) {
3386                 /* huh? should we panic? */
3387                 uint32_t end;
3388
3389                 end = seq_out + len;
3390                 seq_out = snd_una;
3391                 len = end - seq_out;
3392         }
3393         snd_max = tp->snd_max;
3394         if (th_flags & (TH_SYN | TH_FIN)) {
3395                 /*
3396                  * The call to rack_log_output is made before bumping
3397                  * snd_max. This means we can record one extra byte on a SYN
3398                  * or FIN if seq_out is adding more on and a FIN is present
3399                  * (and we are not resending).
3400                  */
3401                 if (th_flags & TH_SYN)
3402                         len++;
3403                 if (th_flags & TH_FIN)
3404                         len++;
3405                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3406                         /*
3407                          * The add/update has not been done for the FIN/SYN
3408                          * yet.
3409                          */
3410                         snd_max = tp->snd_nxt;
3411                 }
3412         }
3413         if (len == 0) {
3414                 /* We don't log zero window probes */
3415                 return;
3416         }
3417         rack->r_ctl.rc_time_last_sent = ts;
3418         if (IN_RECOVERY(tp->t_flags)) {
3419                 rack->r_ctl.rc_prr_out += len;
3420         }
3421         /* First question is it a retransmission? */
3422         if (seq_out == snd_max) {
3423 again:
3424                 rsm = rack_alloc(rack);
3425                 if (rsm == NULL) {
3426                         /*
3427                          * Hmm out of memory and the tcb got destroyed while
3428                          * we tried to wait.
3429                          */
3430 #ifdef INVARIANTS
3431                         panic("Out of memory when we should not be rack:%p", rack);
3432 #endif
3433                         return;
3434                 }
3435                 if (th_flags & TH_FIN) {
3436                         rsm->r_flags = RACK_HAS_FIN;
3437                 } else {
3438                         rsm->r_flags = 0;
3439                 }
3440                 rsm->r_tim_lastsent[0] = ts;
3441                 rsm->r_rtr_cnt = 1;
3442                 rsm->r_rtr_bytes = 0;
3443                 if (th_flags & TH_SYN) {
3444                         /* The data space is one beyond snd_una */
3445                         rsm->r_start = seq_out + 1;
3446                         rsm->r_end = rsm->r_start + (len - 1);
3447                 } else {
3448                         /* Normal case */
3449                         rsm->r_start = seq_out;
3450                         rsm->r_end = rsm->r_start + len;
3451                 }
3452                 rsm->r_sndcnt = 0;
3453                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
3454                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3455                 rsm->r_in_tmap = 1;
3456                 return;
3457         }
3458         /*
3459          * If we reach here it's a retransmission and we need to find it.
3460          */
3461 more:
3462         if (hintrsm && (hintrsm->r_start == seq_out)) {
3463                 rsm = hintrsm;
3464                 hintrsm = NULL;
3465         } else if (rack->r_ctl.rc_next) {
3466                 /* We have a hint from a previous run */
3467                 rsm = rack->r_ctl.rc_next;
3468         } else {
3469                 /* No hints sorry */
3470                 rsm = NULL;
3471         }
3472         if ((rsm) && (rsm->r_start == seq_out)) {
3473                 /*
3474                  * We used rc_next or hintrsm to retransmit; hopefully the
3475                  * likely case.
3476                  */
3477                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3478                 if (len == 0) {
3479                         return;
3480                 } else {
3481                         goto more;
3482                 }
3483         }
3484         /* Ok, it was not the last pointer; go through the map the hard way. */
3485         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3486                 if (rsm->r_start == seq_out) {
3487                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3488                         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3489                         if (len == 0) {
3490                                 return;
3491                         } else {
3492                                 continue;
3493                         }
3494                 }
3495                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3496                         /* Transmitted within this piece */
3497                         /*
3498                          * Ok we must split off the front and then let the
3499                          * update do the rest
3500                          */
3501                         nrsm = rack_alloc(rack);
3502                         if (nrsm == NULL) {
3503 #ifdef INVARIANTS
3504                                 panic("Ran out of memory that was preallocated? rack:%p", rack);
3505 #endif
3506                                 rack_update_rsm(tp, rack, rsm, ts);
3507                                 return;
3508                         }
3509                         /*
3510                          * copy rsm to nrsm and then trim the front of rsm
3511                          * to not include this part.
3512                          */
3513                         nrsm->r_start = seq_out;
3514                         nrsm->r_end = rsm->r_end;
3515                         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3516                         nrsm->r_flags = rsm->r_flags;
3517                         nrsm->r_sndcnt = rsm->r_sndcnt;
3518                         nrsm->r_rtr_bytes = 0;
3519                         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3520                                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3521                         }
3522                         rsm->r_end = nrsm->r_start;
3523                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3524                         if (rsm->r_in_tmap) {
3525                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3526                                 nrsm->r_in_tmap = 1;
3527                         }
3528                         rsm->r_flags &= (~RACK_HAS_FIN);
3529                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3530                         if (len == 0) {
3531                                 return;
3532                         }
3533                 }
3534         }
3535         /*
3536          * Hmm, not found in the map; did they retransmit both old data and on
3537          * into the new?
3538          */
3539         if (seq_out == tp->snd_max) {
3540                 goto again;
3541         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3542 #ifdef INVARIANTS
3543                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3544                     seq_out, len, tp->snd_una, tp->snd_max);
3545                 printf("Starting Dump of all rack entries\n");
3546                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3547                         printf("rsm:%p start:%u end:%u\n",
3548                             rsm, rsm->r_start, rsm->r_end);
3549                 }
3550                 printf("Dump complete\n");
3551                 panic("seq_out not found rack:%p tp:%p",
3552                     rack, tp);
3553 #endif
3554         } else {
3555 #ifdef INVARIANTS
3556                 /*
3557                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
3558                  * flag)
3559                  */
3560                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3561                     seq_out, len, tp->snd_max, tp);
3562 #endif
3563         }
3564 }
3565
3566 /*
3567  * Record one of the RTT updates from an ack into
3568  * our sample structure.
3569  */
3570 static void
3571 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3572 {
3573         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3574             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3575                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3576         }
3577         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3578             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3579                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3580         }
3581         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3582         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3583         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3584 }
3585
3586 /*
3587  * Collect new round-trip time estimate
3588  * and update averages and current timeout.
3589  */
3590 static void
3591 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3592 {
3593         int32_t delta;
3594         uint32_t o_srtt, o_var;
3595         int32_t rtt;
3596
3597         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3598                 /* No valid sample */
3599                 return;
3600         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3601                 /* We are to use the lowest RTT seen in a single ack */
3602                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3603         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3604                 /* We are to use the highest RTT seen in a single ack */
3605                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3606         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3607                 /* We are to use the average RTT seen in a single ack */
3608                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3609                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3610         } else {
3611 #ifdef INVARIANTS
3612                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3613 #endif          
3614                 return;
3615         }
3616         if (rtt == 0)
3617                 rtt = 1;
3618         rack_log_rtt_sample(rack, rtt);
3619         o_srtt = tp->t_srtt;
3620         o_var = tp->t_rttvar;
3621         rack = (struct tcp_rack *)tp->t_fb_ptr;
3622         if (tp->t_srtt != 0) {
3623                 /*
3624                  * srtt is stored as fixed point with 5 bits after the
3625                  * binary point (i.e., scaled by 32).  The following magic is
3626                  * equivalent to the smoothing algorithm in rfc793 with an
3627                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3628                  * Adjust rtt to origin 0.
3629                  */
3630                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3631                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3632
3633                 tp->t_srtt += delta;
3634                 if (tp->t_srtt <= 0)
3635                         tp->t_srtt = 1;
3636
3637                 /*
3638                  * We accumulate a smoothed rtt variance (actually, a
3639                  * smoothed mean difference), then set the retransmit timer
3640                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3641                  * is stored as fixed point with 4 bits after the binary
3642                  * point (scaled by 16).  The following is equivalent to
3643                  * rfc793 smoothing with an alpha of .75 (rttvar =
3644                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3645                  * wired-in beta.
3646                  */
3647                 if (delta < 0)
3648                         delta = -delta;
3649                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3650                 tp->t_rttvar += delta;
3651                 if (tp->t_rttvar <= 0)
3652                         tp->t_rttvar = 1;
3653                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3654                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3655         } else {
3656                 /*
3657                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3658                  * variance to half the rtt (so our first retransmit happens
3659                  * at 3*rtt).
3660                  */
3661                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3662                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3663                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3664         }
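        /*
         * A rough worked example (illustrative numbers only, assuming the
         * usual TCP_RTT_SHIFT == 5, TCP_RTTVAR_SHIFT == 4 and
         * TCP_DELTA_SHIFT == 2): with t_srtt == 3200 (srtt ~= 100 ticks),
         * t_rttvar == 320 (rttvar ~= 20 ticks) and a new rtt of 132 ticks,
         * delta = (132 - 1) * 4 - 3200 / 8 = 124, so t_srtt becomes 3324
         * (~103.9 ticks) and t_rttvar becomes 320 + (124 - 320 / 4) = 364
         * (~22.75 ticks), i.e. the 7/8 and 3/4 smoothing described above.
         */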
3665         TCPSTAT_INC(tcps_rttupdated);
3666         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3667         tp->t_rttupdated++;
3668 #ifdef NETFLIX_STATS
3669         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3670 #endif
3671         tp->t_rxtshift = 0;
3672
3673         /*
3674          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3675          * way we do the smoothing, srtt and rttvar will each average +1/2
3676          * tick of bias.  When we compute the retransmit timer, we want 1/2
3677          * tick of rounding and 1 extra tick because of +-1/2 tick
3678          * uncertainty in the firing of the timer.  The bias will give us
3679          * exactly the 1.5 tick we need.  But, because the bias is
3680          * statistical, we have to test that we don't drop below the minimum
3681          * feasible timer (which is 2 ticks).
3682          */
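        /*
         * For example, with the illustrative srtt of ~104 ticks and rttvar
         * of ~23 ticks from above, TCP_REXMTVAL() comes out near
         * srtt + 4 * rttvar ~= 195 ticks; the range-set below then keeps
         * the result between max(rack_rto_min, rtt + 2) and rack_rto_max.
         */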
3683         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3684            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3685         tp->t_softerror = 0;
3686 }
3687
3688 static void
3689 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3690     uint32_t t, uint32_t cts)
3691 {
3692         /*
3693          * For this RSM, we acknowledged the data from a previous
3694          * transmission, not the last one we made. This means we did a false
3695          * retransmit.
3696          */
3697         struct tcp_rack *rack;
3698
3699         if (rsm->r_flags & RACK_HAS_FIN) {
3700                 /*
3701                  * The FIN is often sent multiple times when we have
3702                  * everything outstanding ack'd. We ignore this case
3703                  * since it's over now.
3704                  */
3705                 return;
3706         }
3707         if (rsm->r_flags & RACK_TLP) {
3708                 /*
3709                  * We expect TLP's to have this occur.
3710                  */
3711                 return;
3712         }
3713         rack = (struct tcp_rack *)tp->t_fb_ptr;
3714         /* should we undo cc changes and exit recovery? */
3715         if (IN_RECOVERY(tp->t_flags)) {
3716                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3717                         /*
3718                          * Undo what we ratcheted down and exit recovery if
3719                          * possible
3720                          */
3721                         EXIT_RECOVERY(tp->t_flags);
3722                         tp->snd_recover = tp->snd_una;
3723                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3724                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3725                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3726                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3727                 }
3728         }
3729         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3730                 /*
3731                  * We retransmitted based on a sack and the earlier
3732                  * retransmission ack'd it - reordering is occurring.
3733                  */
3734                 counter_u64_add(rack_reorder_seen, 1);
3735                 rack->r_ctl.rc_reorder_ts = cts;
3736         }
3737         counter_u64_add(rack_badfr, 1);
3738         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3739 }
3740
3741
3742 static int
3743 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3744     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3745 {
3746         int32_t i;
3747         uint32_t t;
3748
3749         if (rsm->r_flags & RACK_ACKED)
3750                 /* Already done */
3751                 return (0);
3752
3753
3754         if ((rsm->r_rtr_cnt == 1) ||
3755             ((ack_type == CUM_ACKED) &&
3756             (to->to_flags & TOF_TS) &&
3757             (to->to_tsecr) &&
3758             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3759             ) {
3760                 /*
3761                  * We will only find a matching timestamp if it's cum-acked.
3762                  * But if it's only one (re)transmission it's for-sure matching
3763                  * :-)
3764                  */
3765                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3766                 if ((int)t <= 0)
3767                         t = 1;
3768                 if (!tp->t_rttlow || tp->t_rttlow > t)
3769                         tp->t_rttlow = t;
3770                 if (!rack->r_ctl.rc_rack_min_rtt ||
3771                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3772                         rack->r_ctl.rc_rack_min_rtt = t;
3773                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3774                                 rack->r_ctl.rc_rack_min_rtt = 1;
3775                         }
3776                 }
3777                 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
3778                 if ((rsm->r_flags & RACK_TLP) &&
3779                     (!IN_RECOVERY(tp->t_flags))) {
3780                         /* Segment was a TLP and our retrans matched */
3781                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
3782                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
3783                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
3784                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
3785                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
3786                                 /*
3787                                  * When we enter recovery we need to assure
3788                                  * we send one packet.
3789                                  */
3790                                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
3791                         } else
3792                                 rack->r_ctl.rc_tlp_rtx_out = 0;
3793                 }
3794                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3795                         /* New more recent rack_tmit_time */
3796                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3797                         rack->rc_rack_rtt = t;
3798                 }
3799                 return (1);
3800         }
3801         /* 
3802          * We clear the soft/rxtshift since we got an ack. 
3803          * There is no assurance we will call the commit() function
3804          * so we need to clear these to avoid incorrect handling.
3805          */
3806         tp->t_rxtshift = 0;
3807         tp->t_softerror = 0;
3808         if ((to->to_flags & TOF_TS) &&
3809             (ack_type == CUM_ACKED) &&
3810             (to->to_tsecr) &&
3811             ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
3812                 /*
3813                  * Now which timestamp does it match? In this block the ACK
3814                  * must be coming from a previous transmission.
3815                  */
3816                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
3817                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
3818                                 t = cts - rsm->r_tim_lastsent[i];
3819                                 if ((int)t <= 0)
3820                                         t = 1;
3821                                 if ((i + 1) < rsm->r_rtr_cnt) {
3822                                         /* Likely */
3823                                         rack_earlier_retran(tp, rsm, t, cts);
3824                                 }
3825                                 if (!tp->t_rttlow || tp->t_rttlow > t)
3826                                         tp->t_rttlow = t;
3827                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3828                                         rack->r_ctl.rc_rack_min_rtt = t;
3829                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3830                                                 rack->r_ctl.rc_rack_min_rtt = 1;
3831                                         }
3832                                 }
3833                                 /*
3834                                  * Note the following calls to
3835                                  * tcp_rack_xmit_timer() are being commented
3836                                  * out for now. They give us no more accuracy
3837                                  * and often lead to a wrong choice. We have
3838                                  * enough samples that have not been 
3839                                  * retransmitted. I leave the commented out
3840                                  * code in here in case in the future we
3841                                  * decide to add it back (though I can't forsee
3842                                  * decide to add it back (though I can't foresee
3843                                  * where they need to be placed.
3844                                  */
3845                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
3846                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3847                                         /* New more recent rack_tmit_time */
3848                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3849                                         rack->rc_rack_rtt = t;
3850                                 }
3851                                 return (1);
3852                         }
3853                 }
3854                 goto ts_not_found;
3855         } else {
3856                 /*
3857                  * Ok, it is a SACK block that we retransmitted, or a Windows
3858                  * machine without timestamps. We can tell nothing from the
3859                  * timestamp since it is not there, or it is the time the peer last
3860                  * received a segment that moved forward its cum-ack point.
3861                  */
3862 ts_not_found:
3863                 i = rsm->r_rtr_cnt - 1;
3864                 t = cts - rsm->r_tim_lastsent[i];
3865                 if ((int)t <= 0)
3866                         t = 1;
3867                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3868                         /*
3869                          * We retransmitted and the ack came back in less
3870                          * than the smallest rtt we have observed. We most
3871                          * likely did an improper retransmit as outlined in
3872                          * 4.2 Step 3 point 2 in the rack-draft.
3873                          */
3874                         i = rsm->r_rtr_cnt - 2;
3875                         t = cts - rsm->r_tim_lastsent[i];
3876                         rack_earlier_retran(tp, rsm, t, cts);
3877                 } else if (rack->r_ctl.rc_rack_min_rtt) {
3878                         /*
3879                          * We retransmitted it and the retransmit did the
3880                          * job.
3881                          */
3882                         if (!rack->r_ctl.rc_rack_min_rtt ||
3883                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3884                                 rack->r_ctl.rc_rack_min_rtt = t;
3885                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
3886                                         rack->r_ctl.rc_rack_min_rtt = 1;
3887                                 }
3888                         }
3889                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
3890                                 /* New more recent rack_tmit_time */
3891                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
3892                                 rack->rc_rack_rtt = t;
3893                         }
3894                         return (1);
3895                 }
3896         }
3897         return (0);
3898 }
3899
3900 /*
3901  * Mark the SACK_PASSED flag on all entries sent (send-wise) prior to rsm.
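      *
      * We walk the transmit map (rc_tmap) backwards from rsm, marking
      * every not-yet-acked entry that was sent earlier; a SACK for a
      * later send implies those were passed over.  Entries stamped with
      * the same send time (a TSO burst) are only marked when their
      * starting sequence is lower.  Illustrative example: a burst
      * 1000-2000, 2000-3000, 3000-4000 all sent at tick 500 and a SACK
      * arriving for 3000-4000 marks the first two as SACK_PASSED.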
3902  */
3903 static void
3904 rack_log_sack_passed(struct tcpcb *tp,
3905     struct tcp_rack *rack, struct rack_sendmap *rsm)
3906 {
3907         struct rack_sendmap *nrsm;
3908         uint32_t ts;
3909         int32_t idx;
3910
3911         idx = rsm->r_rtr_cnt - 1;
3912         ts = rsm->r_tim_lastsent[idx];
3913         nrsm = rsm;
3914         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
3915             rack_head, r_tnext) {
3916                 if (nrsm == rsm) {
3917                         /* Skip the original segment, it is acked */
3918                         continue;
3919                 }
3920                 if (nrsm->r_flags & RACK_ACKED) {
3921                         /* Skip ack'd segments */
3922                         continue;
3923                 }
3924                 idx = nrsm->r_rtr_cnt - 1;
3925                 if (ts == nrsm->r_tim_lastsent[idx]) {
3926                         /*
3927                          * For this case let's use the seq no; if we sent in a
3928                          * big block (TSO) we would have a bunch of segments
3929                          * sent at the same time.
3930                          *
3931                          * We would only get a report if its SEQ is earlier.
3932                          * If we have done multiple retransmits the times
3933                          * would not be equal.
3934                          */
3935                         if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
3936                                 nrsm->r_flags |= RACK_SACK_PASSED;
3937                                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3938                         }
3939                 } else {
3940                         /*
3941                          * Here they were sent at different times, not a big
3942                          * block. Since we transmitted this one later and
3943                          * see it sack'd then this must also be missing (or
3944                          * we would have gotten a sack block for it)
3945                          */
3946                         nrsm->r_flags |= RACK_SACK_PASSED;
3947                         nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3948                 }
3949         }
3950 }
3951
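     /*
      * Process one SACK block against the send map (rc_map).  Map
      * entries are split as needed so the SACKed range is covered
      * exactly, the covered pieces are marked RACK_ACKED and pulled
      * from the transmit map, and the number of newly SACKed bytes is
      * returned.  Illustrative example: a map entry covering 1000-3000
      * hit by a SACK for 1500-2500 becomes three entries, 1000-1500,
      * 1500-2500 (now acked) and 2500-3000, and 1000 is returned.
      */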
3952 static uint32_t
3953 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
3954     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
3955 {
3956         int32_t idx;
3957         int32_t times = 0;
3958         uint32_t start, end, changed = 0;
3959         struct rack_sendmap *rsm, *nrsm;
3960         int32_t used_ref = 1;
3961
3962         start = sack->start;
3963         end = sack->end;
3964         rsm = *prsm;
3965         if (rsm && SEQ_LT(start, rsm->r_start)) {
3966                 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
3967                         if (SEQ_GEQ(start, rsm->r_start) &&
3968                             SEQ_LT(start, rsm->r_end)) {
3969                                 goto do_rest_ofb;
3970                         }
3971                 }
3972         }
3973         if (rsm == NULL) {
3974 start_at_beginning:
3975                 rsm = NULL;
3976                 used_ref = 0;
3977         }
3978         /* First lets locate the block where this guy is */
3979         TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
3980                 if (SEQ_GEQ(start, rsm->r_start) &&
3981                     SEQ_LT(start, rsm->r_end)) {
3982                         break;
3983                 }
3984         }
3985 do_rest_ofb:
3986         if (rsm == NULL) {
3987                 /*
3988                  * This happens when we get duplicate sack blocks with the
3989                  * same end. For example SACK 4: 100 SACK 3: 100 The sort
3990                  * same end. For example SACK 4: 100, SACK 3: 100. The sort
3991                  * will not change their location so we would just start at
3992                  */
3993                 if (tp->t_flags & TF_SENTFIN) {
3994                         /*
3995                          * Check to see if we have not logged the FIN that
3996                          * went out.
3997                          */
3998                         nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
3999                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
4000                                 /*
4001                                  * Ok we did not get the FIN logged.
4002                                  */
4003                                 nrsm->r_end++;
4004                                 rsm = nrsm;
4005                                 goto do_rest_ofb;
4006                         }
4007                 }
4008                 if (times == 1) {
4009 #ifdef INVARIANTS
4010                         panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
4011                             tp, rack, sack, to, prsm);
4012 #else
4013                         goto out;
4014 #endif
4015                 }
4016                 times++;
4017                 counter_u64_add(rack_sack_proc_restart, 1);
4018                 goto start_at_beginning;
4019         }
4020         /* Ok we have an ACK for some piece of rsm */
4021         if (rsm->r_start != start) {
4022                 /*
4023                  * Need to split this in two pieces the before and after.
4024                  */
4025                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4026                 if (nrsm == NULL) {
4027                         /*
4028                          * failed XXXrrs what can we do but lose the sack
4029                          * info?
4030                          */
4031                         goto out;
4032                 }
4033                 nrsm->r_start = start;
4034                 nrsm->r_rtr_bytes = 0;
4035                 nrsm->r_end = rsm->r_end;
4036                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4037                 nrsm->r_flags = rsm->r_flags;
4038                 nrsm->r_sndcnt = rsm->r_sndcnt;
4039                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4040                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4041                 }
4042                 rsm->r_end = nrsm->r_start;
4043                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4044                 if (rsm->r_in_tmap) {
4045                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4046                         nrsm->r_in_tmap = 1;
4047                 }
4048                 rsm->r_flags &= (~RACK_HAS_FIN);
4049                 rsm = nrsm;
4050         }
4051         if (SEQ_GEQ(end, rsm->r_end)) {
4052                 /*
4053                  * The end of this block is either beyond this guy or right
4054                  * at this guy.
4055                  */
4056
4057                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4058                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4059                         changed += (rsm->r_end - rsm->r_start);
4060                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4061                         rack_log_sack_passed(tp, rack, rsm);
4062                         /* Is reordering occurring? */
4063                         if (rsm->r_flags & RACK_SACK_PASSED) {
4064                                 counter_u64_add(rack_reorder_seen, 1);
4065                                 rack->r_ctl.rc_reorder_ts = cts;
4066                         }
4067                         rsm->r_flags |= RACK_ACKED;
4068                         rsm->r_flags &= ~RACK_TLP;
4069                         if (rsm->r_in_tmap) {
4070                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4071                                 rsm->r_in_tmap = 0;
4072                         }
4073                 }
4074                 if (end == rsm->r_end) {
4075                         /* This block only - done */
4076                         goto out;
4077                 }
4078                 /* There is more not covered by this rsm, move on */
4079                 start = rsm->r_end;
4080                 nrsm = TAILQ_NEXT(rsm, r_next);
4081                 rsm = nrsm;
4082                 times = 0;
4083                 goto do_rest_ofb;
4084         }
4085         /* Ok we need to split off this one at the tail */
4086         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4087         if (nrsm == NULL) {
4088                 /* failed rrs what can we do but lose the sack info? */
4089                 goto out;
4090         }
4091         /* Clone it */
4092         nrsm->r_start = end;
4093         nrsm->r_end = rsm->r_end;
4094         nrsm->r_rtr_bytes = 0;
4095         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4096         nrsm->r_flags = rsm->r_flags;
4097         nrsm->r_sndcnt = rsm->r_sndcnt;
4098         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4099                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4100         }
4101         /* The sack block does not cover this guy fully */
4102         rsm->r_flags &= (~RACK_HAS_FIN);
4103         rsm->r_end = end;
4104         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4105         if (rsm->r_in_tmap) {
4106                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4107                 nrsm->r_in_tmap = 1;
4108         }
4109         if (rsm->r_flags & RACK_ACKED) {
4110                 /* Been here done that */
4111                 goto out;
4112         }
4113         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4114         changed += (rsm->r_end - rsm->r_start);
4115         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4116         rack_log_sack_passed(tp, rack, rsm);
4117         /* Is reordering occurring? */
4118         if (rsm->r_flags & RACK_SACK_PASSED) {
4119                 counter_u64_add(rack_reorder_seen, 1);
4120                 rack->r_ctl.rc_reorder_ts = cts;
4121         }
4122         rsm->r_flags |= RACK_ACKED;
4123         rsm->r_flags &= ~RACK_TLP;
4124         if (rsm->r_in_tmap) {
4125                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4126                 rsm->r_in_tmap = 0;
4127         }
4128 out:
4129         if (used_ref == 0) {
4130                 counter_u64_add(rack_sack_proc_all, 1);
4131         } else {
4132                 counter_u64_add(rack_sack_proc_short, 1);
4133         }
4134         /* Save off where we last were */
4135         if (rsm)
4136                 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
4137         else
4138                 rack->r_ctl.rc_sacklast = NULL;
4139         *prsm = rsm;
4140         return (changed);
4141 }
4142
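     /*
      * The peer has reneged: its cumulative ack stopped at the start of
      * data it had previously SACKed.  Walk forward over every block
      * still marked RACK_ACKED, take those bytes back out of rc_sacked,
      * clear the sack related flags and re-link the blocks (in order) at
      * the front of the transmit map so they are again eligible for
      * retransmission, then clear the sack filter from th_ack onward.
      */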
4143 static void inline 
4144 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4145 {
4146         struct rack_sendmap *tmap;
4147
4148         tmap = NULL;
4149         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4150                 /* Its no longer sacked, mark it so */
4151                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4152 #ifdef INVARIANTS
4153                 if (rsm->r_in_tmap) {
4154                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4155                               rack, rsm, rsm->r_flags);
4156                 }
4157 #endif
4158                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4159                 /* Rebuild it into our tmap */
4160                 if (tmap == NULL) {
4161                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4162                         tmap = rsm;
4163                 } else {
4164                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4165                         tmap = rsm;
4166                 }
4167                 tmap->r_in_tmap = 1;
4168                 rsm = TAILQ_NEXT(rsm, r_next);
4169         }
4170         /* 
4171          * Now let's possibly clear the sack filter so we start
4172          * recognizing sacks that cover this area.
4173          */
4174         if (rack_use_sack_filter)
4175                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4176
4177 }
4178
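     /*
      * Process an incoming ack against the rack send map: drop map
      * entries consumed by the new cumulative ack point (taking RTT
      * samples as we go), detect a reneging peer, then collect, filter,
      * sort and de-duplicate any SACK blocks and apply them to the
      * scoreboard.  If the SACK information indicates loss we enter
      * recovery; while in recovery the PRR send count is recomputed.
      */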
4179 static void
4180 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4181 {
4182         uint32_t changed, last_seq, entered_recovery = 0;
4183         struct tcp_rack *rack;
4184         struct rack_sendmap *rsm;
4185         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4186         register uint32_t th_ack;
4187         int32_t i, j, k, num_sack_blks = 0;
4188         uint32_t cts, acked, ack_point, sack_changed = 0;
4189
4190         INP_WLOCK_ASSERT(tp->t_inpcb);
4191         if (th->th_flags & TH_RST) {
4192                 /* We don't log resets */
4193                 return;
4194         }
4195         rack = (struct tcp_rack *)tp->t_fb_ptr;
4196         cts = tcp_ts_getticks();
4197         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4198         changed = 0;
4199         th_ack = th->th_ack;
4200
4201         if (SEQ_GT(th_ack, tp->snd_una)) {
4202                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4203                 tp->t_acktime = ticks;
4204         }
4205         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4206                 changed = th_ack - rsm->r_start;
4207         if (changed) {
4208                 /*
4209                  * The ACK point is advancing to th_ack, we must drop off
4210                  * the packets in the rack log and calculate any eligible
4211                  * RTTs.
4212                  */
4213                 rack->r_wanted_output++;
4214 more:
4215                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4216                 if (rsm == NULL) {
4217                         if ((th_ack - 1) == tp->iss) {
4218                                 /*
4219                                  * For the SYN incoming case we will not
4220                                  * have called tcp_output for the sending of
4221                                  * the SYN, so there will be no map. All
4222                                  * other cases should probably be a panic.
4223                                  */
4224                                 goto proc_sack;
4225                         }
4226                         if (tp->t_flags & TF_SENTFIN) {
4227                         /* if we sent a FIN we will not have a map */
4228                                 goto proc_sack;
4229                         }
4230 #ifdef INVARIANTS
4231                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4232                             tp,
4233                             th, tp->t_state, rack,
4234                             tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4235 #endif
4236                         goto proc_sack;
4237                 }
4238                 if (SEQ_LT(th_ack, rsm->r_start)) {
4239                         /* Huh map is missing this */
4240 #ifdef INVARIANTS
4241                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4242                             rsm->r_start,
4243                             th_ack, tp->t_state, rack->r_state);
4244 #endif
4245                         goto proc_sack;
4246                 }
4247                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4248                 /* Now do we consume the whole thing? */
4249                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4250                         /* Its all consumed. */
4251                         uint32_t left;
4252
4253                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4254                         rsm->r_rtr_bytes = 0;
4255                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
4256                         if (rsm->r_in_tmap) {
4257                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4258                                 rsm->r_in_tmap = 0;
4259                         }
4260                         if (rack->r_ctl.rc_next == rsm) {
4261                                 /* scoot along the marker */
4262                                 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
4263                         }
4264                         if (rsm->r_flags & RACK_ACKED) {
4265                                 /*
4266                                  * It was acked on the scoreboard -- remove
4267                                  * it from total
4268                                  */
4269                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4270                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4271                                 /*
4272                                  * There are acked segments ACKED on the
4273                                  * scoreboard further up. We are seeing
4274                                  * reordering.
4275                                  */
4276                                 counter_u64_add(rack_reorder_seen, 1);
4277                                 rsm->r_flags |= RACK_ACKED;
4278                                 rack->r_ctl.rc_reorder_ts = cts;
4279                         }
4280                         left = th_ack - rsm->r_end;
4281                         if (rsm->r_rtr_cnt > 1) {
4282                                 /*
4283                                  * Technically we should make r_rtr_cnt be
4284                                  * monotonically increasing and just mod it to
4285                                  * the timestamp it is replacing.. that way
4286                                  * we would have the last 3 retransmits. Now
4287                                  * rc_loss_count will be wrong if we
4288                                  * retransmit something more than 2 times in
4289                                  * recovery :(
4290                                  */
4291                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4292                         }
4293                         /* Free back to zone */
4294                         rack_free(rack, rsm);
4295                         if (left) {
4296                                 goto more;
4297                         }
4298                         goto proc_sack;
4299                 }
4300                 if (rsm->r_flags & RACK_ACKED) {
4301                         /*
4302                          * It was acked on the scoreboard -- remove it from
4303                          * total for the part being cum-acked.
4304                          */
4305                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4306                 }
4307                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4308                 rsm->r_rtr_bytes = 0;
4309                 rsm->r_start = th_ack;
4310         }
4311 proc_sack:
4312         /* Check for reneging */
4313         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4314         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4315                 /*
4316                  * The peer has moved snd_una up to
4317                  * the edge of this send, i.e. one
4318                  * that it had previously acked. The only
4319                  * way that can be true is if the peer threw
4320                  * away data (space issues) that it had
4321                  * previously sacked (else it would have
4322                  * given us snd_una up to rsm->r_end).
4323                  * We need to undo the acked markings here.
4324                  *
4325                  * Note we have to look to make sure th_ack is
4326                  * our rsm->r_start in case we get an old ack
4327                  * where th_ack is behind snd_una.
4328                  */
4329                 rack_peer_reneges(rack, rsm, th->th_ack);
4330         }
4331         if ((to->to_flags & TOF_SACK) == 0) {
4332                 /* We are done, nothing left to log */
4333                 goto out;
4334         }
4335         rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4336         if (rsm) {
4337                 last_seq = rsm->r_end;
4338         } else {
4339                 last_seq = tp->snd_max;
4340         }
4341         /* Sack block processing */
4342         if (SEQ_GT(th_ack, tp->snd_una))
4343                 ack_point = th_ack;
4344         else
4345                 ack_point = tp->snd_una;
4346         for (i = 0; i < to->to_nsacks; i++) {
4347                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4348                     &sack, sizeof(sack));
4349                 sack.start = ntohl(sack.start);
4350                 sack.end = ntohl(sack.end);
4351                 if (SEQ_GT(sack.end, sack.start) &&
4352                     SEQ_GT(sack.start, ack_point) &&
4353                     SEQ_LT(sack.start, tp->snd_max) &&
4354                     SEQ_GT(sack.end, ack_point) &&
4355                     SEQ_LEQ(sack.end, tp->snd_max)) {
4356                         if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
4357                             (SEQ_LT(sack.end, last_seq)) &&
4358                             ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
4359                                 /*
4360                                  * Not the last piece and it is smaller than
4361                                  * 1/8th of a MSS. We ignore this.
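                                       * Splitting the map for such a
                                       * runt is not worth it once we
                                       * already have more than
                                       * rack_sack_block_limit map
                                       * entries allocated.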
4362                                  */
4363                                 counter_u64_add(rack_runt_sacks, 1);
4364                                 continue;
4365                         }
4366                         sack_blocks[num_sack_blks] = sack;
4367                         num_sack_blks++;
4368 #ifdef NETFLIX_STATS
4369                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4370                            SEQ_LEQ(sack.end, th_ack)) {
4371                         /*
4372                          * It's a D-SACK block.
4373                          */
4374                         tcp_record_dsack(sack.start, sack.end);
4375 #endif
4376                 }
4377
4378         }
4379         if (num_sack_blks == 0)
4380                 goto out;
4381         /*
4382          * Sort the SACK blocks so we can update the rack scoreboard with
4383          * just one pass.
4384          */
4385         if (rack_use_sack_filter) {
4386                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
4387         }
4388         if (num_sack_blks < 2) {
4389                 goto do_sack_work;
4390         }
4391         /* Sort the sacks */
4392         for (i = 0; i < num_sack_blks; i++) {
4393                 for (j = i + 1; j < num_sack_blks; j++) {
4394                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4395                                 sack = sack_blocks[i];
4396                                 sack_blocks[i] = sack_blocks[j];
4397                                 sack_blocks[j] = sack;
4398                         }
4399                 }
4400         }
4401         /*
4402          * Now are any of the sack block ends the same (yes some
4403  * implementations send these)?
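      * For example, blocks 150-200 and 100-200 share the end 200; the
      * loop below keeps a single block 100-200 (the one covering more),
      * shifts the remaining blocks down and rescans.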
4404          */
4405 again:
4406         if (num_sack_blks > 1) {
4407                 for (i = 0; i < num_sack_blks; i++) {
4408                         for (j = i + 1; j < num_sack_blks; j++) {
4409                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4410                                         /*
4411                                          * Ok, these two have the same end; we
4412                                          * keep the one with the smaller start
4413                                          * (it covers more), collapse out the
4414                                          * other, and start again.
4415                                          */
4416                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4417                                                 /*
4418                                                  * The second block covers
4419                                                  * more area, use its start
4420                                                  */
4421                                                 sack_blocks[i].start = sack_blocks[j].start;
4422                                         }
4423                                         /*
4424                                          * Now collapse out the dup-sack and
4425                                          * lower the count
4426                                          */
4427                                         for (k = (j + 1); k < num_sack_blks; k++) {
4428                                                 sack_blocks[j].start = sack_blocks[k].start;
4429                                                 sack_blocks[j].end = sack_blocks[k].end;
4430                                                 j++;
4431                                         }
4432                                         num_sack_blks--;
4433                                         goto again;
4434                                 }
4435                         }
4436                 }
4437         }
4438 do_sack_work:
4439         rsm = rack->r_ctl.rc_sacklast;
4440         for (i = 0; i < num_sack_blks; i++) {
4441                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
4442                 if (acked) {
4443                         rack->r_wanted_output++;
4444                         changed += acked;
4445                         sack_changed += acked;
4446                 }
4447         }
4448 out:
4449         if (changed) {
4450                 /* Something changed, cancel the rack timer */
4451                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4452         }
4453         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
4454                 /*
4455                  * Ok we have a high probability that we need to go in to
4456                  * recovery since we have data sack'd
4457                  */
4458                 struct rack_sendmap *rsm;
4459                 uint32_t tsused;
4460
4461                 tsused = tcp_ts_getticks();
4462                 rsm = tcp_rack_output(tp, rack, tsused);
4463                 if (rsm) {
4464                         /* Enter recovery */
4465                         rack->r_ctl.rc_rsm_start = rsm->r_start;
4466                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4467                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4468                         entered_recovery = 1;
4469                         rack_cong_signal(tp, NULL, CC_NDUPACK);
4470                         /*
4471                          * When we enter recovery we need to assure we send
4472                          * one packet.
4473                          */
4474                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
4475                         rack->r_timer_override = 1;
4476                 }
4477         }
4478         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
4479                 /* Deal with changed and PRR here (in recovery only) */
4480                 uint32_t pipe, snd_una;
4481
4482                 rack->r_ctl.rc_prr_delivered += changed;
4483                 /* Compute prr_sndcnt */
4484                 if (SEQ_GT(tp->snd_una, th_ack)) {
4485                         snd_una = tp->snd_una;
4486                 } else {
4487                         snd_una = th_ack;
4488                 }
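                     /*
                      * PRR (RFC 6937 style): pipe is our estimate of data
                      * still in flight.  When pipe exceeds ssthresh we send
                      * in proportion to what was delivered, e.g. (numbers
                      * purely illustrative) ssthresh 10000, prr_delivered
                      * 3000 and prr_recovery_fs 20000 give
                      * 3000 * 10000 / 20000 + 1 = 1501 bytes of credit,
                      * and subtracting 1000 already sent (prr_out) leaves
                      * 501.  Otherwise we fall into the limited-transmit
                      * style branch below and top pipe back up toward
                      * ssthresh.
                      */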
4489                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
4490                 if (pipe > tp->snd_ssthresh) {
4491                         long sndcnt;
4492
4493                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
4494                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
4495                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
4496                         else {
4497                                 rack->r_ctl.rc_prr_sndcnt = 0;
4498                                 sndcnt = 0;
4499                         }
4500                         sndcnt++;
4501                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
4502                                 sndcnt -= rack->r_ctl.rc_prr_out;
4503                         else
4504                                 sndcnt = 0;
4505                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
4506                 } else {
4507                         uint32_t limit;
4508
4509                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
4510                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
4511                         else
4512                                 limit = 0;
4513                         if (changed > limit)
4514                                 limit = changed;
4515                         limit += tp->t_maxseg;
4516                         if (tp->snd_ssthresh > pipe) {
4517                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
4518                         } else {
4519                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
4520                         }
4521                 }
4522                 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
4523                         rack->r_timer_override = 1;
4524                 }
4525         }
4526 }
4527
4528 /*
4529  * Return value of 1, we do not need to call rack_process_data().
4530  * return value of 0, rack_process_data can be called.
4531  * For ret_val if its 0 the TCP is locked, if its non-zero
4532  * its unlocked and probably unsafe to touch the TCB.
4533  */
4534 static int
4535 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
4536     struct tcpcb *tp, struct tcpopt *to,
4537     uint32_t tiwin, int32_t tlen,
4538     int32_t * ofia, int32_t thflags, int32_t * ret_val)
4539 {
4540         int32_t ourfinisacked = 0;
4541         int32_t nsegs, acked_amount;
4542         int32_t acked;
4543         struct mbuf *mfree;
4544         struct tcp_rack *rack;
4545         int32_t recovery = 0;
4546
4547         rack = (struct tcp_rack *)tp->t_fb_ptr;
4548         if (SEQ_GT(th->th_ack, tp->snd_max)) {
4549                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
4550                 return (1);
4551         }
4552         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
4553                 rack_log_ack(tp, to, th);
4554         }
4555         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
4556                 /*
4557                  * Old ack, behind (or duplicate to) the last one rcv'd
4558                  * Note: Should mark that reordering is occurring! We should also
4559                  * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
4560                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
4561                  * retran and> ack 3
4562                  */
4563                 return (0);
4564         }
4565         /*
4566          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
4567          * something we sent.
4568          */
4569         if (tp->t_flags & TF_NEEDSYN) {
4570                 /*
4571                  * T/TCP: Connection was half-synchronized, and our SYN has
4572                  * been ACK'd (so connection is now fully synchronized).  Go
4573                  * to non-starred state, increment snd_una for ACK of SYN,
4574                  * and check if we can do window scaling.
4575                  */
4576                 tp->t_flags &= ~TF_NEEDSYN;
4577                 tp->snd_una++;
4578                 /* Do window scaling? */
4579                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
4580                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
4581                         tp->rcv_scale = tp->request_r_scale;
4582                         /* Send window already scaled. */
4583                 }
4584         }
4585         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4586         INP_WLOCK_ASSERT(tp->t_inpcb);
4587
4588         acked = BYTES_THIS_ACK(tp, th);
4589         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
4590         TCPSTAT_ADD(tcps_rcvackbyte, acked);
4591
4592         /*
4593          * If we just performed our first retransmit, and the ACK arrives
4594          * within our recovery window, then it was a mistake to do the
4595          * retransmit in the first place.  Recover our original cwnd and
4596          * ssthresh, and proceed to transmit where we left off.
4597          */
4598         if (tp->t_flags & TF_PREVVALID) {
4599                 tp->t_flags &= ~TF_PREVVALID;
4600                 if (tp->t_rxtshift == 1 &&
4601                     (int)(ticks - tp->t_badrxtwin) < 0)
4602                         rack_cong_signal(tp, th, CC_RTO_ERR);
4603         }
4604         /*
4605          * If we have a timestamp reply, update smoothed round trip time. If
4606          * no timestamp is present but transmit timer is running and timed
4607          * sequence number was acked, update smoothed round trip time. Since
4608          * we now have an rtt measurement, cancel the timer backoff (cf.,
4609          * Phil Karn's retransmit alg.). Recompute the initial retransmit
4610          * timer.
4611          *
4612          * Some boxes send broken timestamp replies during the SYN+ACK
4613          * phase, ignore timestamps of 0 or we could calculate a huge RTT
4614          * and blow up the retransmit timer.
4615          */
4616         /*
4617          * If all outstanding data is acked, stop retransmit timer and
4618          * remember to restart (more output or persist). If there is more
4619          * data to be acked, restart retransmit timer, using current
4620          * (possibly backed-off) value.
4621          */
4622         if (th->th_ack == tp->snd_max) {
4623                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4624                 rack->r_wanted_output++;
4625         }
4626         /*
4627          * If no data (only SYN) was ACK'd, skip rest of ACK processing.
4628          */
4629         if (acked == 0) {
4630                 if (ofia)
4631                         *ofia = ourfinisacked;
4632                 return (0);
4633         }
4634         if (rack->r_ctl.rc_early_recovery) {
4635                 if (IN_FASTRECOVERY(tp->t_flags)) {
4636                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4637                                 tcp_rack_partialack(tp, th);
4638                         } else {
4639                                 rack_post_recovery(tp, th);
4640                                 recovery = 1;
4641                         }
4642                 }
4643         }
4644         /*
4645          * Let the congestion control algorithm update congestion control
4646          * related information. This typically means increasing the
4647          * congestion window.
4648          */
4649         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
4650         SOCKBUF_LOCK(&so->so_snd);
4651         acked_amount = min(acked, (int)sbavail(&so->so_snd));
4652         tp->snd_wnd -= acked_amount;
4653         mfree = sbcut_locked(&so->so_snd, acked_amount);
4654         if ((sbused(&so->so_snd) == 0) &&
4655             (acked > acked_amount) &&
4656             (tp->t_state >= TCPS_FIN_WAIT_1)) {
4657                 ourfinisacked = 1;
4658         }
4659         /* NB: sowwakeup_locked() does an implicit unlock. */
4660         sowwakeup_locked(so);
4661         m_freem(mfree);
4662         if (rack->r_ctl.rc_early_recovery == 0) {
4663                 if (IN_FASTRECOVERY(tp->t_flags)) {
4664                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4665                                 tcp_rack_partialack(tp, th);
4666                         } else {
4667                                 rack_post_recovery(tp, th);
4668                         }
4669                 }
4670         }
4671         tp->snd_una = th->th_ack;
4672         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4673                 tp->snd_recover = tp->snd_una;
4674
4675         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4676                 tp->snd_nxt = tp->snd_una;
4677         }
4678         if (tp->snd_una == tp->snd_max) {
4679                 /* Nothing left outstanding */
4680                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
4681                 tp->t_acktime = 0;
4682                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4683                 /* Set need output so persist might get set */
4684                 rack->r_wanted_output++;
4685                 if (rack_use_sack_filter)
4686                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
4687                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
4688                     (sbavail(&so->so_snd) == 0) &&
4689                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
4690                         /* 
4691                          * The socket was gone and the
4692                          * peer sent data; time to
4693                          * reset the connection.
4694                          */
4695                         *ret_val = 1;
4696                         tp = tcp_close(tp);
4697                         rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
4698                         return (1);
4699                 }
4700         }
4701         if (ofia)
4702                 *ofia = ourfinisacked;
4703         return (0);
4704 }
4705
4706
4707 /*
4708  * Return value of 1, the TCB is unlocked and most
4709  * likely gone, return value of 0, the TCP is still
4710  * locked.
4711  */
4712 static int
4713 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
4714     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
4715     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
4716 {
4717         /*
4718          * Update window information. Don't look at window if no ACK: TAC's
4719          * send garbage on first SYN.
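              *
              * A window update is taken only from a segment that is newer
              * than the one we last used: its sequence is beyond snd_wl1,
              * or the same sequence with a later ack (snd_wl2), or the
              * same sequence and ack but a larger advertised window.
              * This keeps old, reordered segments from installing a
              * stale window.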
4720          */
4721         int32_t nsegs;
4722         int32_t tfo_syn;
4723         struct tcp_rack *rack;
4724
4725         rack = (struct tcp_rack *)tp->t_fb_ptr;
4726         INP_WLOCK_ASSERT(tp->t_inpcb);
4727         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4728         if ((thflags & TH_ACK) &&
4729             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4730             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4731             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4732                 /* keep track of pure window updates */
4733                 if (tlen == 0 &&
4734                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4735                         TCPSTAT_INC(tcps_rcvwinupd);
4736                 tp->snd_wnd = tiwin;
4737                 tp->snd_wl1 = th->th_seq;
4738                 tp->snd_wl2 = th->th_ack;
4739                 if (tp->snd_wnd > tp->max_sndwnd)
4740                         tp->max_sndwnd = tp->snd_wnd;
4741                 rack->r_wanted_output++;
4742         } else if (thflags & TH_ACK) {
4743                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
4744                         tp->snd_wnd = tiwin;
4745                         tp->snd_wl1 = th->th_seq;
4746                         tp->snd_wl2 = th->th_ack;
4747                 }
4748         }
4749         /* Was persist timer active and now we have window space? */
4750         if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
4751                 rack_exit_persist(tp, rack);
4752                 tp->snd_nxt = tp->snd_max;
4753                 /* Make sure we output to start the timer */
4754                 rack->r_wanted_output++;
4755         }
4756         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
4757                 m_freem(m);
4758                 return (0);
4759         }
4760         /*
4761          * Process segments with URG.
4762          */
4763         if ((thflags & TH_URG) && th->th_urp &&
4764             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4765                 /*
4766                  * This is a kludge, but if we receive and accept random
4767                  * urgent pointers, we'll crash in soreceive.  It's hard to
4768                  * imagine someone actually wanting to send this much urgent
4769                  * data.
4770                  */
4771                 SOCKBUF_LOCK(&so->so_rcv);
4772                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
4773                         th->th_urp = 0; /* XXX */
4774                         thflags &= ~TH_URG;     /* XXX */
4775                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
4776                         goto dodata;    /* XXX */
4777                 }
4778                 /*
4779                  * If this segment advances the known urgent pointer, then
4780                  * mark the data stream.  This should not happen in
4781                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
4782                  * FIN has been received from the remote side. In these
4783                  * states we ignore the URG.
4784                  *
4785                  * According to RFC961 (Assigned Protocols), the urgent
4786                  * pointer points to the last octet of urgent data.  We
4787                  * continue, however, to consider it to indicate the first
4788                  * octet of data past the urgent section as the original
4789                  * spec states (in one of two places).
4790                  */
4791                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
4792                         tp->rcv_up = th->th_seq + th->th_urp;
4793                         so->so_oobmark = sbavail(&so->so_rcv) +
4794                             (tp->rcv_up - tp->rcv_nxt) - 1;
4795                         if (so->so_oobmark == 0)
4796                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
4797                         sohasoutofband(so);
4798                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4799                 }
4800                 SOCKBUF_UNLOCK(&so->so_rcv);
4801                 /*
4802                  * Remove out of band data so it doesn't get presented to the user.
4803                  * This can happen independent of advancing the URG pointer,
4804                  * but if two URG's are pending at once, some out-of-band
4805                  * data may creep in... ick.
4806                  */
4807                 if (th->th_urp <= (uint32_t) tlen &&
4808                     !(so->so_options & SO_OOBINLINE)) {
4809                         /* hdr drop is delayed */
4810                         tcp_pulloutofband(so, th, m, drop_hdrlen);
4811                 }
4812         } else {
4813                 /*
4814                  * If no out of band data is expected, pull receive urgent
4815                  * pointer along with the receive window.
4816                  */
4817                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4818                         tp->rcv_up = tp->rcv_nxt;
4819         }
4820 dodata:                         /* XXX */
4821         INP_WLOCK_ASSERT(tp->t_inpcb);
4822
4823         /*
4824          * Process the segment text, merging it into the TCP sequencing
4825          * queue, and arranging for acknowledgment of receipt if necessary.
4826          * This process logically involves adjusting tp->rcv_wnd as data is
4827          * presented to the user (this happens in tcp_usrreq.c, case
4828          * PRU_RCVD).  If a FIN has already been received on this connection
4829          * then we just ignore the text.
4830          */
4831         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
4832                    IS_FASTOPEN(tp->t_flags));
4833         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
4834             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4835                 tcp_seq save_start = th->th_seq;
4836                 tcp_seq save_rnxt  = tp->rcv_nxt;
4837                 int     save_tlen  = tlen;
4838
4839                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4840                 /*
4841                  * Insert segment which includes th into TCP reassembly
4842                  * queue with control block tp.  Set thflags to whether
4843                  * reassembly now includes a segment with FIN.  This handles
4844                  * the common case inline (segment is the next to be
4845                  * received on an established connection, and the queue is
4846                  * empty), avoiding linkage into and removal from the queue
4847                  * and repetition of various conversions. Set DELACK for
4848                  * segments received in order, but ack immediately when
4849                  * segments are out of order (so fast retransmit can work).
4850                  */
4851                 if (th->th_seq == tp->rcv_nxt &&
4852                     SEGQ_EMPTY(tp) &&
4853                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
4854                     tfo_syn)) {
4855                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
4856                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4857                                 tp->t_flags |= TF_DELACK;
4858                         } else {
4859                                 rack->r_wanted_output++;
4860                                 tp->t_flags |= TF_ACKNOW;
4861                         }
4862                         tp->rcv_nxt += tlen;
4863                         thflags = th->th_flags & TH_FIN;
4864                         TCPSTAT_ADD(tcps_rcvpack, nsegs);
4865                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
4866                         SOCKBUF_LOCK(&so->so_rcv);
4867                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
4868                                 m_freem(m);
4869                         else
4870                                 sbappendstream_locked(&so->so_rcv, m, 0);
4871                         /* NB: sorwakeup_locked() does an implicit unlock. */
4872                         sorwakeup_locked(so);
4873                 } else {
4874                         /*
4875                          * XXX: Due to the header drop above "th" is
4876                          * theoretically invalid by now.  Fortunately
4877                          * m_adj() doesn't actually free any mbufs when
4878                          * trimming from the head.
4879                          */
4880                         tcp_seq temp = save_start;
4881                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
4882                         tp->t_flags |= TF_ACKNOW;
4883                 }
4884                 if (((tlen == 0) && (save_tlen > 0) &&
4885                     (SEQ_LT(save_start, save_rnxt)))) {
4886                         /*
4887                          * DSACK actually handled in the fastpath
4888                          * above.
4889                          */
4890                         tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4891                 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
4892                         /*
4893                          * Cleaning sackblks by using zero length
4894                          * update.
4895                          */
4896                         tcp_update_sack_list(tp, save_start, save_start);
4897                 } else if ((tlen > 0) && (tlen >= save_tlen)) {
4898                         /* Update of sackblks. */
4899                         tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4900                 } else if (tlen > 0) {
4901                         tcp_update_sack_list(tp, save_start, save_start+tlen);
4902                 }
4903         } else {
4904                 m_freem(m);
4905                 thflags &= ~TH_FIN;
4906         }
4907
4908         /*
4909          * If FIN is received ACK the FIN and let the user know that the
4910          * connection is closing.
4911          */
4912         if (thflags & TH_FIN) {
4913                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4914                         socantrcvmore(so);
4915                         /*
4916                          * If connection is half-synchronized (ie NEEDSYN
4917                          * flag on) then delay ACK, so it may be piggybacked
4918                          * when SYN is sent. Otherwise, since we received a
4919                          * FIN then no more input can be expected, send ACK
4920                          * now.
4921                          */
4922                         if (tp->t_flags & TF_NEEDSYN) {
4923                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4924                                 tp->t_flags |= TF_DELACK;
4925                         } else {
4926                                 tp->t_flags |= TF_ACKNOW;
4927                         }
4928                         tp->rcv_nxt++;
4929                 }
4930                 switch (tp->t_state) {
4931
4932                         /*
4933                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
4934                          * CLOSE_WAIT state.
4935                          */
4936                 case TCPS_SYN_RECEIVED:
4937                         tp->t_starttime = ticks;
4938                         /* FALLTHROUGH */
4939                 case TCPS_ESTABLISHED:
4940                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4941                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
4942                         break;
4943
4944                         /*
4945                          * If still in FIN_WAIT_1 STATE FIN has not been
4946                          * acked so enter the CLOSING state.
4947                          */
4948                 case TCPS_FIN_WAIT_1:
4949                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4950                         tcp_state_change(tp, TCPS_CLOSING);
4951                         break;
4952
4953                         /*
4954                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
4955                          * starting the time-wait timer, turning off the
4956                          * other standard timers.
4957                          */
4958                 case TCPS_FIN_WAIT_2:
4959                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4960                         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
4961                         tcp_twstart(tp);
4962                         return (1);
4963                 }
4964         }
4965         /*
4966          * Return any desired output.
4967          */
4968         if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
4969                 rack->r_wanted_output++;
4970         }
4971         INP_WLOCK_ASSERT(tp->t_inpcb);
4972         return (0);
4973 }
4974
4975 /*
4976  * Here nothing is really faster, it's just that we
4977  * have broken out the fast-data path also, just like
4978  * the fast-ack.
4979  */
4980 static int
4981 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
4982     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
4983     uint32_t tiwin, int32_t nxt_pkt)
4984 {
4985         int32_t nsegs;
4986         int32_t newsize = 0;    /* automatic sockbuf scaling */
4987         struct tcp_rack *rack;
4988 #ifdef TCPDEBUG
4989         /*
4990          * The size of tcp_saveipgen must be the size of the max ip header,
4991          * now IPv6.
4992          */
4993         u_char tcp_saveipgen[IP6_HDR_LEN];
4994         struct tcphdr tcp_savetcp;
4995         short ostate = 0;
4996
4997 #endif
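        /*
         * Fast-path qualification: every check below bails out with a
         * return of 0 (the caller then falls back to the slow path) unless
         * this is an in-sequence, pure data segment -- nothing being
         * retransmitted, no window update to process, no pending SYN/FIN
         * work, an ACK for exactly snd_una, a timestamp that passes PAWS
         * and enough room in the receive buffer for the payload.
         */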
4998         /*
4999          * If last ACK falls within this segment's sequence numbers, record
5000          * the timestamp. NOTE that the test is modified according to the
5001          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5002          */
5003         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5004                 return (0);
5005         }
5006         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5007                 return (0);
5008         }
5009         if (tiwin && tiwin != tp->snd_wnd) {
5010                 return (0);
5011         }
5012         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5013                 return (0);
5014         }
5015         if (__predict_false((to->to_flags & TOF_TS) &&
5016             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5017                 return (0);
5018         }
5019         if (__predict_false((th->th_ack != tp->snd_una))) {
5020                 return (0);
5021         }
5022         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5023                 return (0);
5024         }
5025         if ((to->to_flags & TOF_TS) != 0 &&
5026             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5027                 tp->ts_recent_age = tcp_ts_getticks();
5028                 tp->ts_recent = to->to_tsval;
5029         }
5030         rack = (struct tcp_rack *)tp->t_fb_ptr;
5031         /*
5032          * This is a pure, in-sequence data packet with nothing on the
5033          * reassembly queue and we have enough buffer space to take it.
5034          */
5035         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5036
5037
5038         /* Clean receiver SACK report if present */
5039         if (tp->rcv_numsacks)
5040                 tcp_clean_sackreport(tp);
5041         TCPSTAT_INC(tcps_preddat);
5042         tp->rcv_nxt += tlen;
5043         /*
5044          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5045          */
5046         tp->snd_wl1 = th->th_seq;
5047         /*
5048          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5049          */
5050         tp->rcv_up = tp->rcv_nxt;
5051         TCPSTAT_ADD(tcps_rcvpack, nsegs);
5052         TCPSTAT_ADD(tcps_rcvbyte, tlen);
5053 #ifdef TCPDEBUG
5054         if (so->so_options & SO_DEBUG)
5055                 tcp_trace(TA_INPUT, ostate, tp,
5056                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
5057 #endif
5058         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5059
5060         /* Add data to socket buffer. */
5061         SOCKBUF_LOCK(&so->so_rcv);
5062         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5063                 m_freem(m);
5064         } else {
5065                 /*
5066                  * Set new socket buffer size. Give up when limit is
5067                  * reached.
5068                  */
5069                 if (newsize)
5070                         if (!sbreserve_locked(&so->so_rcv,
5071                             newsize, so, NULL))
5072                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5073                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5074                 sbappendstream_locked(&so->so_rcv, m, 0);
5075                 rack_calc_rwin(so, tp);
5076         }
5077         /* NB: sorwakeup_locked() does an implicit unlock. */
5078         sorwakeup_locked(so);
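        /*
         * Choose how to acknowledge the new data: take the delayed-ACK
         * path when DELAY_ACK() allows it (cancelling any pending rack
         * timer first), otherwise mark ACKNOW and ask for an immediate
         * pass through the output path.
         */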
5079         if (DELAY_ACK(tp, tlen)) {
5080                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5081                 tp->t_flags |= TF_DELACK;
5082         } else {
5083                 tp->t_flags |= TF_ACKNOW;
5084                 rack->r_wanted_output++;
5085         }
5086         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
5087                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5088         return (1);
5089 }
5090
5091 /*
5092  * This subfunction is used to try to highly optimize the
5093  * fast path. We again allow window updates that are
5094  * in sequence to remain in the fast-path. We also add
5095  * in the __predict's to attempt to help the compiler.
5096  * Note that if we return a 0, then we can *not* process
5097  * it and the caller should push the packet into the
5098  * slow-path.
5099  */
5100 static int
5101 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5102     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5103     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
5104 {
5105         int32_t acked;
5106         int32_t nsegs;
5107
5108 #ifdef TCPDEBUG
5109         /*
5110          * The size of tcp_saveipgen must be the size of the max ip header,
5111          * now IPv6.
5112          */
5113         u_char tcp_saveipgen[IP6_HDR_LEN];
5114         struct tcphdr tcp_savetcp;
5115         short ostate = 0;
5116
5117 #endif
5118         struct tcp_rack *rack;
5119
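        /*
         * Fast-ack qualification: each test below returns 0, sending the
         * caller to the slow path, unless this is a pure in-window ACK --
         * it must advance snd_una without exceeding snd_max, we must not
         * be retransmitting or in recovery, the offered window must be
         * non-zero, no SYN/FIN work may be pending, the timestamp must
         * pass PAWS and the RACK scoreboard must have nothing marked
         * SACKed.
         */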
5120         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5121                 /* Old ack, behind (or duplicate to) the last one rcv'd */
5122                 return (0);
5123         }
5124         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
5125                 /* Above what we have sent? */
5126                 return (0);
5127         }
5128         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5129                 /* We are retransmitting */
5130                 return (0);
5131         }
5132         if (__predict_false(tiwin == 0)) {
5133                 /* zero window */
5134                 return (0);
5135         }
5136         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
5137                 /* We need a SYN or a FIN, unlikely.. */
5138                 return (0);
5139         }
5140         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
5141                 /* Timestamp is behind .. old ack with seq wrap? */
5142                 return (0);
5143         }
5144         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
5145                 /* Still recovering */
5146                 return (0);
5147         }
5148         rack = (struct tcp_rack *)tp->t_fb_ptr;
5149         if (rack->r_ctl.rc_sacked) {
5150                 /* We have sack holes on our scoreboard */
5151                 return (0);
5152         }
5153         /* Ok if we reach here, we can process a fast-ack */
5154         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5155         rack_log_ack(tp, to, th);
5156         /* Did the window get updated? */
5157         if (tiwin != tp->snd_wnd) {
5158                 tp->snd_wnd = tiwin;
5159                 tp->snd_wl1 = th->th_seq;
5160                 if (tp->snd_wnd > tp->max_sndwnd)
5161                         tp->max_sndwnd = tp->snd_wnd;
5162         }
5163         if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
5164                 rack_exit_persist(tp, rack);
5165         }
5166         /*
5167          * If last ACK falls within this segment's sequence numbers, record
5168          * the timestamp. NOTE that the test is modified according to the
5169          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5170          */
5171         if ((to->to_flags & TOF_TS) != 0 &&
5172             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5173                 tp->ts_recent_age = tcp_ts_getticks();
5174                 tp->ts_recent = to->to_tsval;
5175         }
5176         /*
5177          * This is a pure ack for outstanding data.
5178          */
5179         TCPSTAT_INC(tcps_predack);
5180
5181         /*
5182          * "Bad retransmit" recovery: the RTO was spurious if this ACK arrives within the bad-retransmit window.
5183          */
5184         if (tp->t_flags & TF_PREVVALID) {
5185                 tp->t_flags &= ~TF_PREVVALID;
5186                 if (tp->t_rxtshift == 1 &&
5187                     (int)(ticks - tp->t_badrxtwin) < 0)
5188                         rack_cong_signal(tp, th, CC_RTO_ERR);
5189         }
5190         /*
5191          * Recalculate the transmit timer / rtt.
5192          *
5193          * Some boxes send broken timestamp replies during the SYN+ACK
5194          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5195          * and blow up the retransmit timer.
5196          */
5197         acked = BYTES_THIS_ACK(tp, th);
5198
5199 #ifdef TCP_HHOOK
5200         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
5201         hhook_run_tcp_est_in(tp, th, to);
5202 #endif
5203
5204         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5205         TCPSTAT_ADD(tcps_rcvackbyte, acked);
5206         sbdrop(&so->so_snd, acked);
5207         /*
5208          * Let the congestion control algorithm update congestion control
5209          * related information. This typically means increasing the
5210          * congestion window.
5211          */
5212         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
5213
5214         tp->snd_una = th->th_ack;
5215         /*
5216          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
5217          */
5218         tp->snd_wl2 = th->th_ack;
5219         tp->t_dupacks = 0;
5220         m_freem(m);
5221         /* ND6_HINT(tp);         *//* Some progress has been made. */
5222
5223         /*
5224          * If all outstanding data are acked, stop retransmit timer,
5225          * otherwise restart timer using current (possibly backed-off)
5226          * value. If process is waiting for space, wakeup/selwakeup/signal.
5227          * If data are ready to send, let tcp_output decide between more
5228          * output or persist.
5229          */
5230 #ifdef TCPDEBUG
5231         if (so->so_options & SO_DEBUG)
5232                 tcp_trace(TA_INPUT, ostate, tp,
5233                     (void *)tcp_saveipgen,
5234                     &tcp_savetcp, 0);
5235 #endif
5236         if (tp->snd_una == tp->snd_max) {
5237                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5238                 tp->t_acktime = 0;
5239                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5240         }
5241         /* Wake up the socket if we have room to write more */
5242         sowwakeup(so);
5243         if (sbavail(&so->so_snd)) {
5244                 rack->r_wanted_output++;
5245         }
5246         return (1);
5247 }
5248
5249 /*
5250  * Return value of 1, the TCB is unlocked and most
5251  * likely gone, return value of 0, the TCP is still
5252  * locked.
5253  */
5254 static int
5255 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
5256     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5257     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5258 {
5259         int32_t ret_val = 0;
5260         int32_t todrop;
5261         int32_t ourfinisacked = 0;
5262
5263         rack_calc_rwin(so, tp);
5264         /*
5265          * If the state is SYN_SENT: if the segment contains an ACK, but not
5266          * for our SYN, drop the input.  If it contains a RST, drop the
5267          * connection.  If it does not contain a SYN, drop it.  Otherwise this
5268          * is an acceptable SYN segment: initialize tp->rcv_nxt and tp->irs;
5269          * if the segment contains an ACK, advance tp->snd_una; if it carries
5270          * an ECE and ECN support is enabled, the stream is ECN capable.  If
5271          * the SYN has been acked, change to ESTABLISHED, else to SYN_RCVD;
5272          * arrange for the segment to be acked (eventually) and continue
5273          * processing the rest of the data/controls, beginning with URG.
5274          */
5275         if ((thflags & TH_ACK) &&
5276             (SEQ_LEQ(th->th_ack, tp->iss) ||
5277             SEQ_GT(th->th_ack, tp->snd_max))) {
5278                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5279                 return (1);
5280         }
5281         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
5282                 TCP_PROBE5(connect__refused, NULL, tp,
5283                     mtod(m, const char *), tp, th);
5284                 tp = tcp_drop(tp, ECONNREFUSED);
5285                 rack_do_drop(m, tp);
5286                 return (1);
5287         }
5288         if (thflags & TH_RST) {
5289                 rack_do_drop(m, tp);
5290                 return (1);
5291         }
5292         if (!(thflags & TH_SYN)) {
5293                 rack_do_drop(m, tp);
5294                 return (1);
5295         }
5296         tp->irs = th->th_seq;
5297         tcp_rcvseqinit(tp);
5298         if (thflags & TH_ACK) {
5299                 int tfo_partial = 0;
5300                 
5301                 TCPSTAT_INC(tcps_connects);
5302                 soisconnected(so);
5303 #ifdef MAC
5304                 mac_socketpeer_set_from_mbuf(m, so);
5305 #endif
5306                 /* Do window scaling on this connection? */
5307                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5308                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5309                         tp->rcv_scale = tp->request_r_scale;
5310                 }
5311                 tp->rcv_adv += min(tp->rcv_wnd,
5312                     TCP_MAXWIN << tp->rcv_scale);
5313                 /*
5314                  * If not all the data that was sent in the TFO SYN
5315                  * has been acked, resend the remainder right away.
5316                  */
5317                 if (IS_FASTOPEN(tp->t_flags) &&
5318                     (tp->snd_una != tp->snd_max)) {
5319                         tp->snd_nxt = th->th_ack;
5320                         tfo_partial = 1;
5321                 }
5322                 /*
5323                  * If there's data, delay ACK; if there's also a FIN, ACKNOW
5324                  * will be turned on later.
5325                  */
5326                 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
5327                         rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
5328                                           ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
5329                         tp->t_flags |= TF_DELACK;
5330                 } else {
5331                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
5332                         tp->t_flags |= TF_ACKNOW;
5333                 }
5334
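                /*
                 * ECN negotiation: an ECE without CWR in the SYN|ACK means
                 * the peer accepted ECN; if ECN is administratively enabled,
                 * mark the connection as ECN-permitted.
                 */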
5335                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
5336                     V_tcp_do_ecn) {
5337                         tp->t_flags |= TF_ECN_PERMIT;
5338                         TCPSTAT_INC(tcps_ecn_shs);
5339                 }
5340                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
5341                         /* 
5342                          * We advance snd_una for the 
5343                          * fast open case. If th_ack is
5344                          * acknowledging data beyond 
5345                          * snd_una we can't just call
5346                          * ack-processing since the 
5347                          * data stream in our send-map
5348                          * will start at snd_una + 1 (one
5349                          * beyond the SYN). If it's just
5350                          * equal we don't need to do that
5351                          * and there is no send_map.
5352                          */
5353                         tp->snd_una++;
5354                 }
5355                 /*
5356                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
5357                  * SYN_SENT --> ESTABLISHED, SYN_SENT* --> FIN_WAIT_1
5358                  */
5359                 tp->t_starttime = ticks;
5360                 if (tp->t_flags & TF_NEEDFIN) {
5361                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
5362                         tp->t_flags &= ~TF_NEEDFIN;
5363                         thflags &= ~TH_SYN;
5364                 } else {
5365                         tcp_state_change(tp, TCPS_ESTABLISHED);
5366                         TCP_PROBE5(connect__established, NULL, tp,
5367                             mtod(m, const char *), tp, th);
5368                         cc_conn_init(tp);
5369                 }
5370         } else {
5371                 /*
5372                  * Received initial SYN in SYN-SENT[*] state => simultaneous
5373                  * open.  If the segment contains a CC option and there is a
5374                  * cached CC, apply the TAO test.  If it succeeds, the connection
5375                  * is half-synchronized.  Otherwise, do the 3-way handshake:
5376                  * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*.  If
5377                  * there was no CC option, clear the cached CC value.
5378                  */
5379                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
5380                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
5381         }
5382         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5383         INP_WLOCK_ASSERT(tp->t_inpcb);
5384         /*
5385          * Advance th->th_seq to correspond to first data byte. If data,
5386          * trim to stay within window, dropping FIN if necessary.
5387          */
5388         th->th_seq++;
5389         if (tlen > tp->rcv_wnd) {
5390                 todrop = tlen - tp->rcv_wnd;
5391                 m_adj(m, -todrop);
5392                 tlen = tp->rcv_wnd;
5393                 thflags &= ~TH_FIN;
5394                 TCPSTAT_INC(tcps_rcvpackafterwin);
5395                 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
5396         }
5397         tp->snd_wl1 = th->th_seq - 1;
5398         tp->rcv_up = th->th_seq;
5399         /*
5400          * Client side of transaction: already sent SYN and data. If the
5401          * remote host used T/TCP to validate the SYN, our data will be
5402          * ACK'd; if so, enter normal data segment processing in the middle
5403          * of step 5, ack processing. Otherwise, goto step 6.
5404          */
5405         if (thflags & TH_ACK) {
5406                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
5407                         return (ret_val);
5408                 /* We may have changed to FIN_WAIT_1 above */
5409                 if (tp->t_state == TCPS_FIN_WAIT_1) {
5410                         /*
5411                          * In FIN_WAIT_1 STATE in addition to the processing
5412                          * for the ESTABLISHED state if our FIN is now
5413                          * acknowledged then enter FIN_WAIT_2.
5414                          */
5415                         if (ourfinisacked) {
5416                                 /*
5417                                  * If we can't receive any more data, then
5418                                  * closing user can proceed. Starting the
5419                                  * timer is contrary to the specification,
5420                                  * but if we don't get a FIN we'll hang
5421                                  * forever.
5422                                  *
5423                                  * XXXjl: we should release the tp also, and
5424                                  * use a compressed state.
5425                                  */
5426                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5427                                         soisdisconnected(so);
5428                                         tcp_timer_activate(tp, TT_2MSL,
5429                                             (tcp_fast_finwait2_recycle ?
5430                                             tcp_finwait2_timeout :
5431                                             TP_MAXIDLE(tp)));
5432                                 }
5433                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5434                         }
5435                 }
5436         }
5437         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5438            tiwin, thflags, nxt_pkt));
5439 }
5440
5441 /*
5442  * Return value of 1, the TCB is unlocked and most
5443  * likely gone, return value of 0, the TCP is still
5444  * locked.
5445  */
5446 static int
5447 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
5448     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5449     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5450 {
5451         int32_t ret_val = 0;
5452         int32_t ourfinisacked = 0;
5453
5454         rack_calc_rwin(so, tp);
5455
5456         if ((thflags & TH_ACK) &&
5457             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
5458             SEQ_GT(th->th_ack, tp->snd_max))) {
5459                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5460                 return (1);
5461         }
5462         if (IS_FASTOPEN(tp->t_flags)) {
5463                 /*
5464                  * When a TFO connection is in SYN_RECEIVED, the
5465                  * only valid packets are the initial SYN, a
5466                  * retransmit/copy of the initial SYN (possibly with
5467                  * a subset of the original data), a valid ACK, a
5468                  * FIN, or a RST.
5469                  */
5470                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
5471                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5472                         return (1);
5473                 } else if (thflags & TH_SYN) {
5474                         /* non-initial SYN is ignored */
5475                         struct tcp_rack *rack;
5476
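                        /*
                         * The retransmitted SYN is only dropped here if one
                         * of our retransmission-related timers (RXT, TLP or
                         * RACK) is pending; otherwise it falls through to
                         * normal processing below.
                         */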
5477                         rack = (struct tcp_rack *)tp->t_fb_ptr;
5478                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
5479                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
5480                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
5481                                 rack_do_drop(m, NULL);
5482                                 return (0);
5483                         }
5484                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
5485                         rack_do_drop(m, NULL);
5486                         return (0);
5487                 }
5488         }
5489         if (thflags & TH_RST)
5490                 return (rack_process_rst(m, th, so, tp));
5491         /*
5492          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5493          * it's less than ts_recent, drop it.
5494          */
5495         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5496             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5497                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5498                         return (ret_val);
5499         }
5500         /*
5501          * In the SYN-RECEIVED state, validate that the packet belongs to
5502          * this connection before trimming the data to fit the receive
5503          * window.  Check the sequence number versus IRS since we know the
5504          * sequence numbers haven't wrapped.  This is a partial fix for the
5505          * "LAND" DoS attack.
5506          */
5507         if (SEQ_LT(th->th_seq, tp->irs)) {
5508                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5509                 return (1);
5510         }
5511         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5512                 return (ret_val);
5513         }
5514         /*
5515          * If last ACK falls within this segment's sequence numbers, record
5516          * its timestamp. NOTE: 1) That the test incorporates suggestions
5517          * from the latest proposal of the tcplw@cray.com list (Braden
5518          * 1993/04/26). 2) That updating only on newer timestamps interferes
5519          * with our earlier PAWS tests, so this check should be solely
5520          * predicated on the sequence space of this segment. 3) That we
5521          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5522          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5523          * SEG.Len.  This modified check allows us to overcome RFC1323's
5524          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5525          * p.869. In such cases, we can still calculate the RTT correctly
5526          * when RCV.NXT == Last.ACK.Sent.
5527          */
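        /*
         * Worked example (illustrative values only): with
         * last_ack_sent = 1000, a segment with th_seq = 990 and tlen = 10
         * satisfies 990 <= 1000 and 1000 <= 1000, so its timestamp is
         * recorded; the strict RFC1323 test (1000 < 1000) would have
         * skipped it even though RCV.NXT == Last.ACK.Sent.
         */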
5528         if ((to->to_flags & TOF_TS) != 0 &&
5529             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5530             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5531             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5532                 tp->ts_recent_age = tcp_ts_getticks();
5533                 tp->ts_recent = to->to_tsval;
5534         }
5535         tp->snd_wnd = tiwin;
5536         /*
5537          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5538          * is on (half-synchronized state), then queue data for later
5539          * processing; else drop segment and return.
5540          */
5541         if ((thflags & TH_ACK) == 0) {
5542                 if (IS_FASTOPEN(tp->t_flags)) {
5543                         cc_conn_init(tp);
5544                 }
5545                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5546                     tiwin, thflags, nxt_pkt));
5547         }
5548         TCPSTAT_INC(tcps_connects);
5549         soisconnected(so);
5550         /* Do window scaling? */
5551         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5552             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5553                 tp->rcv_scale = tp->request_r_scale;
5554         }
5555         /*
5556          * Make transitions: SYN-RECEIVED -> ESTABLISHED, SYN-RECEIVED* ->
5557          * FIN-WAIT-1
5558          */
5559         tp->t_starttime = ticks;
5560         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
5561                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
5562                 tp->t_tfo_pending = NULL;
5563
5564                 /*
5565                  * Account for the ACK of our SYN prior to
5566                  * regular ACK processing below.
5567                  */ 
5568                 tp->snd_una++;
5569         }
5570         if (tp->t_flags & TF_NEEDFIN) {
5571                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
5572                 tp->t_flags &= ~TF_NEEDFIN;
5573         } else {
5574                 tcp_state_change(tp, TCPS_ESTABLISHED);
5575                 TCP_PROBE5(accept__established, NULL, tp,
5576                     mtod(m, const char *), tp, th);
5577                 /*
5578                  * TFO connections call cc_conn_init() during SYN
5579                  * processing.  Calling it again here for such connections
5580                  * is not harmless as it would undo the snd_cwnd reduction
5581                  * that occurs when a TFO SYN|ACK is retransmitted.
5582                  */
5583                 if (!IS_FASTOPEN(tp->t_flags))
5584                         cc_conn_init(tp);
5585         }
5586         /*
5587          * If segment contains data or FIN, will call tcp_reass() later; if
5588          * not, do so now to pass queued data to user.
5589          */
5590         if (tlen == 0 && (thflags & TH_FIN) == 0)
5591                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
5592                     (struct mbuf *)0);
5593         tp->snd_wl1 = th->th_seq - 1;
5594         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5595                 return (ret_val);
5596         }
5597         if (tp->t_state == TCPS_FIN_WAIT_1) {
5598                 /* We could have gone to FIN_WAIT_1 (or EST) above */
5599                 /*
5600                  * In FIN_WAIT_1 STATE in addition to the processing for the
5601                  * ESTABLISHED state if our FIN is now acknowledged then
5602                  * enter FIN_WAIT_2.
5603                  */
5604                 if (ourfinisacked) {
5605                         /*
5606                          * If we can't receive any more data, then closing
5607                          * user can proceed. Starting the timer is contrary
5608                          * to the specification, but if we don't get a FIN
5609                          * we'll hang forever.
5610                          *
5611                          * XXXjl: we should release the tp also, and use a
5612                          * compressed state.
5613                          */
5614                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5615                                 soisdisconnected(so);
5616                                 tcp_timer_activate(tp, TT_2MSL,
5617                                     (tcp_fast_finwait2_recycle ?
5618                                     tcp_finwait2_timeout :
5619                                     TP_MAXIDLE(tp)));
5620                         }
5621                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
5622                 }
5623         }
5624         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5625             tiwin, thflags, nxt_pkt));
5626 }
5627
5628 /*
5629  * Return value of 1, the TCB is unlocked and most
5630  * likely gone, return value of 0, the TCP is still
5631  * locked.
5632  */
5633 static int
5634 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5635     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5636     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5637 {
5638         int32_t ret_val = 0;
5639
5640         /*
5641          * Header prediction: check for the two common cases of a
5642          * uni-directional data xfer.  If the packet has no control flags,
5643          * is in-sequence, the window didn't change and we're not
5644          * retransmitting, it's a candidate.  If the length is zero and the
5645          * ack moved forward, we're the sender side of the xfer.  Just free
5646          * the data acked & wake any higher level process that was blocked
5647          * waiting for space.  If the length is non-zero and the ack didn't
5648          * move, we're the receiver side.  If we're getting packets in-order
5649          * (the reassembly queue is empty), add the data to the socket
5650          * buffer and note that we need a delayed ack. Make sure that the
5651          * hidden state-flags are also off. Since we check for
5652          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
5653          */
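        /*
         * The four __predict_true() tests below encode that candidate
         * check: no SACK option, ACK set with none of SYN, FIN, RST or
         * URG, an empty reassembly queue and an in-sequence segment.  A
         * zero-length segment is handed to the fast-ack path, anything
         * else to the fast new-data path; a return of 0 from either
         * drops us into the slow path that follows.
         */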
5654         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
5655             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
5656             __predict_true(SEGQ_EMPTY(tp)) &&
5657             __predict_true(th->th_seq == tp->rcv_nxt)) {
5658                 struct tcp_rack *rack;
5659
5660                 rack = (struct tcp_rack *)tp->t_fb_ptr;
5661                 if (tlen == 0) {
5662                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
5663                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
5664                                 return (0);
5665                         }
5666                 } else {
5667                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
5668                             tiwin, nxt_pkt)) {
5669                                 return (0);
5670                         }
5671                 }
5672         }
5673         rack_calc_rwin(so, tp);
5674
5675         if (thflags & TH_RST)
5676                 return (rack_process_rst(m, th, so, tp));
5677
5678         /*
5679          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5680          * synchronized state.
5681          */
5682         if (thflags & TH_SYN) {
5683                 rack_challenge_ack(m, th, tp, &ret_val);
5684                 return (ret_val);
5685         }
5686         /*
5687          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5688          * it's less than ts_recent, drop it.
5689          */
5690         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5691             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5692                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5693                         return (ret_val);
5694         }
5695         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5696                 return (ret_val);
5697         }
5698         /*
5699          * If last ACK falls within this segment's sequence numbers, record
5700          * its timestamp. NOTE: 1) That the test incorporates suggestions
5701          * from the latest proposal of the tcplw@cray.com list (Braden
5702          * 1993/04/26). 2) That updating only on newer timestamps interferes
5703          * with our earlier PAWS tests, so this check should be solely
5704          * predicated on the sequence space of this segment. 3) That we
5705          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5706          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5707          * SEG.Len.  This modified check allows us to overcome RFC1323's
5708          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5709          * p.869. In such cases, we can still calculate the RTT correctly
5710          * when RCV.NXT == Last.ACK.Sent.
5711          */
5712         if ((to->to_flags & TOF_TS) != 0 &&
5713             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5714             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5715             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5716                 tp->ts_recent_age = tcp_ts_getticks();
5717                 tp->ts_recent = to->to_tsval;
5718         }
5719         /*
5720          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5721          * is on (half-synchronized state), then queue data for later
5722          * processing; else drop segment and return.
5723          */
5724         if ((thflags & TH_ACK) == 0) {
5725                 if (tp->t_flags & TF_NEEDSYN) {
5726
5727                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5728                             tiwin, thflags, nxt_pkt));
5729
5730                 } else if (tp->t_flags & TF_ACKNOW) {
5731                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5732                         return (ret_val);
5733                 } else {
5734                         rack_do_drop(m, NULL);
5735                         return (0);
5736                 }
5737         }
5738         /*
5739          * Ack processing.
5740          */
5741         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5742                 return (ret_val);
5743         }
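        /*
         * With data still queued for transmission, enforce the progress
         * timeout: if the connection has failed to make progress within
         * the allowed time, drop it and send a reset.
         */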
5744         if (sbavail(&so->so_snd)) {
5745                 if (rack_progress_timeout_check(tp)) {
5746                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5747                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5748                         return (1);
5749                 }
5750         }
5751         /* State changes only happen in rack_process_data() */
5752         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5753             tiwin, thflags, nxt_pkt));
5754 }
5755
5756 /*
5757  * Return value of 1, the TCB is unlocked and most
5758  * likely gone, return value of 0, the TCP is still
5759  * locked.
5760  */
5761 static int
5762 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
5763     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5764     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5765 {
5766         int32_t ret_val = 0;
5767
5768         rack_calc_rwin(so, tp);
5769         if (thflags & TH_RST)
5770                 return (rack_process_rst(m, th, so, tp));
5771         /*
5772          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5773          * synchronized state.
5774          */
5775         if (thflags & TH_SYN) {
5776                 rack_challenge_ack(m, th, tp, &ret_val);
5777                 return (ret_val);
5778         }
5779         /*
5780          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5781          * it's less than ts_recent, drop it.
5782          */
5783         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5784             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5785                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5786                         return (ret_val);
5787         }
5788         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5789                 return (ret_val);
5790         }
5791         /*
5792          * If last ACK falls within this segment's sequence numbers, record
5793          * its timestamp. NOTE: 1) That the test incorporates suggestions
5794          * from the latest proposal of the tcplw@cray.com list (Braden
5795          * 1993/04/26). 2) That updating only on newer timestamps interferes
5796          * with our earlier PAWS tests, so this check should be solely
5797          * predicated on the sequence space of this segment. 3) That we
5798          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5799          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5800          * SEG.Len.  This modified check allows us to overcome RFC1323's
5801          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5802          * p.869. In such cases, we can still calculate the RTT correctly
5803          * when RCV.NXT == Last.ACK.Sent.
5804          */
5805         if ((to->to_flags & TOF_TS) != 0 &&
5806             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5807             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5808             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5809                 tp->ts_recent_age = tcp_ts_getticks();
5810                 tp->ts_recent = to->to_tsval;
5811         }
5812         /*
5813          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5814          * is on (half-synchronized state), then queue data for later
5815          * processing; else drop segment and return.
5816          */
5817         if ((thflags & TH_ACK) == 0) {
5818                 if (tp->t_flags & TF_NEEDSYN) {
5819                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5820                             tiwin, thflags, nxt_pkt));
5821
5822                 } else if (tp->t_flags & TF_ACKNOW) {
5823                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5824                         return (ret_val);
5825                 } else {
5826                         rack_do_drop(m, NULL);
5827                         return (0);
5828                 }
5829         }
5830         /*
5831          * Ack processing.
5832          */
5833         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5834                 return (ret_val);
5835         }
5836         if (sbavail(&so->so_snd)) {
5837                 if (rack_progress_timeout_check(tp)) {
5838                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5839                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5840                         return (1);
5841                 }
5842         }
5843         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5844             tiwin, thflags, nxt_pkt));
5845 }
5846
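/*
 * Handle data arriving after the socket has been closed by the user.
 * Unless the connection was configured to tolerate it (and we still have
 * data of our own to send), the connection is closed and a reset is sent;
 * a return of 1 means the TCB may be gone.  Otherwise the incoming data
 * is marked to be dropped, rcv_nxt is advanced past it and 0 is returned
 * so processing can continue.
 */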
5847 static int
5848 rack_check_data_after_close(struct mbuf *m, 
5849     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
5850 {
5851         struct tcp_rack *rack;
5852
5853         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5854         rack = (struct tcp_rack *)tp->t_fb_ptr;
5855         if (rack->rc_allow_data_af_clo == 0) {
5856         close_now:
5857                 tp = tcp_close(tp);
5858                 TCPSTAT_INC(tcps_rcvafterclose);
5859                 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
5860                 return (1);
5861         }
5862         if (sbavail(&so->so_snd) == 0)
5863                 goto close_now;
5864         /* Ok we allow data that is ignored and a followup reset */
5865         tp->rcv_nxt = th->th_seq + *tlen;
5866         tp->t_flags2 |= TF2_DROP_AF_DATA;
5867         rack->r_wanted_output = 1;
5868         *tlen = 0;
5869         return (0);
5870 }
5871
5872 /*
5873  * Return value of 1, the TCB is unlocked and most
5874  * likely gone, return value of 0, the TCP is still
5875  * locked.
5876  */
5877 static int
5878 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
5879     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5880     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5881 {
5882         int32_t ret_val = 0;
5883         int32_t ourfinisacked = 0;
5884
5885         rack_calc_rwin(so, tp);
5886
5887         if (thflags & TH_RST)
5888                 return (rack_process_rst(m, th, so, tp));
5889         /*
5890          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5891          * synchronized state.
5892          */
5893         if (thflags & TH_SYN) {
5894                 rack_challenge_ack(m, th, tp, &ret_val);
5895                 return (ret_val);
5896         }
5897         /*
5898          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5899          * it's less than ts_recent, drop it.
5900          */
5901         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5902             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5903                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5904                         return (ret_val);
5905         }
5906         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5907                 return (ret_val);
5908         }
5909         /*
5910          * If new data are received on a connection after the user processes
5911          * are gone, then RST the other end.
5912          */
5913         if ((so->so_state & SS_NOFDREF) && tlen) {
5914                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
5915                         return (1);
5916         }
5917         /*
5918          * If last ACK falls within this segment's sequence numbers, record
5919          * its timestamp. NOTE: 1) That the test incorporates suggestions
5920          * from the latest proposal of the tcplw@cray.com list (Braden
5921          * 1993/04/26). 2) That updating only on newer timestamps interferes
5922          * with our earlier PAWS tests, so this check should be solely
5923          * predicated on the sequence space of this segment. 3) That we
5924          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5925          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5926          * SEG.Len.  This modified check allows us to overcome RFC1323's
5927          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5928          * p.869. In such cases, we can still calculate the RTT correctly
5929          * when RCV.NXT == Last.ACK.Sent.
5930          */
5931         if ((to->to_flags & TOF_TS) != 0 &&
5932             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5933             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5934             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5935                 tp->ts_recent_age = tcp_ts_getticks();
5936                 tp->ts_recent = to->to_tsval;
5937         }
5938         /*
5939          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5940          * is on (half-synchronized state), then queue data for later
5941          * processing; else drop segment and return.
5942          */
5943         if ((thflags & TH_ACK) == 0) {
5944                 if (tp->t_flags & TF_NEEDSYN) {
5945                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5946                             tiwin, thflags, nxt_pkt));
5947                 } else if (tp->t_flags & TF_ACKNOW) {
5948                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5949                         return (ret_val);
5950                 } else {
5951                         rack_do_drop(m, NULL);
5952                         return (0);
5953                 }
5954         }
5955         /*
5956          * Ack processing.
5957          */
5958         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5959                 return (ret_val);
5960         }
5961         if (ourfinisacked) {
5962                 /*
5963                  * If we can't receive any more data, then closing user can
5964                  * proceed. Starting the timer is contrary to the
5965                  * specification, but if we don't get a FIN we'll hang
5966                  * forever.
5967                  *
5968                  * XXXjl: we should release the tp also, and use a
5969                  * compressed state.
5970                  */
5971                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5972                         soisdisconnected(so);
5973                         tcp_timer_activate(tp, TT_2MSL,
5974                             (tcp_fast_finwait2_recycle ?
5975                             tcp_finwait2_timeout :
5976                             TP_MAXIDLE(tp)));
5977                 }
5978                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5979         }
5980         if (sbavail(&so->so_snd)) {
5981                 if (rack_progress_timeout_check(tp)) {
5982                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5983                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5984                         return (1);
5985                 }
5986         }
5987         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5988             tiwin, thflags, nxt_pkt));
5989 }
5990
5991 /*
5992  * Return value of 1, the TCB is unlocked and most
5993  * likely gone, return value of 0, the TCP is still
5994  * locked.
5995  */
5996 static int
5997 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
5998     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5999     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6000 {
6001         int32_t ret_val = 0;
6002         int32_t ourfinisacked = 0;
6003
6004         rack_calc_rwin(so, tp);
6005
6006         if (thflags & TH_RST)
6007                 return (rack_process_rst(m, th, so, tp));
6008         /*
6009          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6010          * synchronized state.
6011          */
6012         if (thflags & TH_SYN) {
6013                 rack_challenge_ack(m, th, tp, &ret_val);
6014                 return (ret_val);
6015         }
6016         /*
6017          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6018          * it's less than ts_recent, drop it.
6019          */
6020         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6021             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6022                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6023                         return (ret_val);
6024         }
6025         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6026                 return (ret_val);
6027         }
6028         /*
6029          * If new data are received on a connection after the user processes
6030          * are gone, then RST the other end.
6031          */
6032         if ((so->so_state & SS_NOFDREF) && tlen) {
6033                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6034                         return (1);
6035         }
6036         /*
6037          * If last ACK falls within this segment's sequence numbers, record
6038          * its timestamp. NOTE: 1) That the test incorporates suggestions
6039          * from the latest proposal of the tcplw@cray.com list (Braden
6040          * 1993/04/26). 2) That updating only on newer timestamps interferes
6041          * with our earlier PAWS tests, so this check should be solely
6042          * predicated on the sequence space of this segment. 3) That we
6043          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6044          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6045          * SEG.Len.  This modified check allows us to overcome RFC1323's
6046          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6047          * p.869. In such cases, we can still calculate the RTT correctly
6048          * when RCV.NXT == Last.ACK.Sent.
6049          */
6050         if ((to->to_flags & TOF_TS) != 0 &&
6051             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6052             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6053             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6054                 tp->ts_recent_age = tcp_ts_getticks();
6055                 tp->ts_recent = to->to_tsval;
6056         }
6057         /*
6058          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6059          * is on (half-synchronized state), then queue data for later
6060          * processing; else drop segment and return.
6061          */
6062         if ((thflags & TH_ACK) == 0) {
6063                 if (tp->t_flags & TF_NEEDSYN) {
6064                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6065                             tiwin, thflags, nxt_pkt));
6066                 } else if (tp->t_flags & TF_ACKNOW) {
6067                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6068                         return (ret_val);
6069                 } else {
6070                         rack_do_drop(m, NULL);
6071                         return (0);
6072                 }
6073         }
6074         /*
6075          * Ack processing.
6076          */
6077         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6078                 return (ret_val);
6079         }
6080         if (ourfinisacked) {
6081                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6082                 tcp_twstart(tp);
6083                 m_freem(m);
6084                 return (1);
6085         }
6086         if (sbavail(&so->so_snd)) {
6087                 if (rack_progress_timeout_check(tp)) {
6088                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6089                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6090                         return (1);
6091                 }
6092         }
6093         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6094             tiwin, thflags, nxt_pkt));
6095 }
6096
6097 /*
6098  * Return value of 1, the TCB is unlocked and most
6099  * likely gone, return value of 0, the TCP is still
6100  * locked.
6101  */
6102 static int
6103 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6104     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6105     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6106 {
6107         int32_t ret_val = 0;
6108         int32_t ourfinisacked = 0;
6109
6110         rack_calc_rwin(so, tp);
6111
6112         if (thflags & TH_RST)
6113                 return (rack_process_rst(m, th, so, tp));
6114         /*
6115          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6116          * synchronized state.
6117          */
6118         if (thflags & TH_SYN) {
6119                 rack_challenge_ack(m, th, tp, &ret_val);
6120                 return (ret_val);
6121         }
6122         /*
6123          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6124          * it's less than ts_recent, drop it.
6125          */
6126         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6127             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6128                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6129                         return (ret_val);
6130         }
6131         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6132                 return (ret_val);
6133         }
6134         /*
6135          * If new data are received on a connection after the user processes
6136          * are gone, then RST the other end.
6137          */
6138         if ((so->so_state & SS_NOFDREF) && tlen) {
6139                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6140                         return (1);
6141         }
6142         /*
6143          * If last ACK falls within this segment's sequence numbers, record
6144          * its timestamp. NOTE: 1) That the test incorporates suggestions
6145          * from the latest proposal of the tcplw@cray.com list (Braden
6146          * 1993/04/26). 2) That updating only on newer timestamps interferes
6147          * with our earlier PAWS tests, so this check should be solely
6148          * predicated on the sequence space of this segment. 3) That we
6149          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6150          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6151          * SEG.Len.  This modified check allows us to overcome RFC1323's
6152          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6153          * p.869. In such cases, we can still calculate the RTT correctly
6154          * when RCV.NXT == Last.ACK.Sent.
6155          */
6156         if ((to->to_flags & TOF_TS) != 0 &&
6157             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6158             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6159             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6160                 tp->ts_recent_age = tcp_ts_getticks();
6161                 tp->ts_recent = to->to_tsval;
6162         }
6163         /*
6164          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6165          * is on (half-synchronized state), then queue data for later
6166          * processing; else drop segment and return.
6167          */
6168         if ((thflags & TH_ACK) == 0) {
6169                 if (tp->t_flags & TF_NEEDSYN) {
6170                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6171                             tiwin, thflags, nxt_pkt));
6172                 } else if (tp->t_flags & TF_ACKNOW) {
6173                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6174                         return (ret_val);
6175                 } else {
6176                         rack_do_drop(m, NULL);
6177                         return (0);
6178                 }
6179         }
6180         /*
6181          * case TCPS_LAST_ACK: Ack processing.
6182          */
6183         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6184                 return (ret_val);
6185         }
6186         if (ourfinisacked) {
6187                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6188                 tp = tcp_close(tp);
6189                 rack_do_drop(m, tp);
6190                 return (1);
6191         }
6192         if (sbavail(&so->so_snd)) {
6193                 if (rack_progress_timeout_check(tp)) {
6194                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6195                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6196                         return (1);
6197                 }
6198         }
6199         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6200             tiwin, thflags, nxt_pkt));
6201 }
6202
6203
6204 /*
6205  * Return value of 1, the TCB is unlocked and most
6206  * likely gone, return value of 0, the TCP is still
6207  * locked.
6208  */
6209 static int
6210 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
6211     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6212     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6213 {
6214         int32_t ret_val = 0;
6215         int32_t ourfinisacked = 0;
6216
6217         rack_calc_rwin(so, tp);
6218
6219         /* Reset receive buffer auto scaling when not in bulk receive mode. */
6220         if (thflags & TH_RST)
6221                 return (rack_process_rst(m, th, so, tp));
6222         /*
6223          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6224          * synchronized state.
6225          */
6226         if (thflags & TH_SYN) {
6227                 rack_challenge_ack(m, th, tp, &ret_val);
6228                 return (ret_val);
6229         }
6230         /*
6231          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6232          * it's less than ts_recent, drop it.
6233          */
6234         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6235             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6236                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6237                         return (ret_val);
6238         }
6239         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6240                 return (ret_val);
6241         }
6242         /*
6243          * If new data are received on a connection after the user processes
6244          * are gone, then RST the other end.
6245          */
6246         if ((so->so_state & SS_NOFDREF) &&
6247             tlen) {
6248                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6249                         return (1);
6250         }
6251         /*
6252          * If last ACK falls within this segment's sequence numbers, record
6253          * its timestamp. NOTE: 1) That the test incorporates suggestions
6254          * from the latest proposal of the tcplw@cray.com list (Braden
6255          * 1993/04/26). 2) That updating only on newer timestamps interferes
6256          * with our earlier PAWS tests, so this check should be solely
6257          * predicated on the sequence space of this segment. 3) That we
6258          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6259          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6260          * SEG.Len.  This modified check allows us to overcome RFC1323's
6261          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6262          * p.869. In such cases, we can still calculate the RTT correctly
6263          * when RCV.NXT == Last.ACK.Sent.
6264          */
6265         if ((to->to_flags & TOF_TS) != 0 &&
6266             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6267             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6268             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6269                 tp->ts_recent_age = tcp_ts_getticks();
6270                 tp->ts_recent = to->to_tsval;
6271         }
6272         /*
6273          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6274          * is on (half-synchronized state), then queue data for later
6275          * processing; else drop segment and return.
6276          */
6277         if ((thflags & TH_ACK) == 0) {
6278                 if (tp->t_flags & TF_NEEDSYN) {
6279                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6280                             tiwin, thflags, nxt_pkt));
6281                 } else if (tp->t_flags & TF_ACKNOW) {
6282                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6283                         return (ret_val);
6284                 } else {
6285                         rack_do_drop(m, NULL);
6286                         return (0);
6287                 }
6288         }
6289         /*
6290          * Ack processing.
6291          */
6292         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6293                 return (ret_val);
6294         }
6295         if (sbavail(&so->so_snd)) {
6296                 if (rack_progress_timeout_check(tp)) {
6297                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6298                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6299                         return (1);
6300                 }
6301         }
6302         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6303             tiwin, thflags, nxt_pkt));
6304 }
6305
6306
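/*
 * Reset the RTT rate-sample accumulator to its empty state; done at
 * init time and before handing a segment to the per-state handler.
 */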
6307 static void inline
6308 rack_clear_rate_sample(struct tcp_rack *rack)
6309 {
6310         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6311         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6312         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6313 }
6314
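/*
 * Allocate and set up the per-connection rack control block hung off
 * tp->t_fb_ptr.  Defaults are taken from the module's sysctl knobs.
 * If data is already outstanding (snd_una != snd_max), a single
 * sendmap entry covering it is created so the RACK/TLP machinery can
 * track it.
 */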
6315 static int
6316 rack_init(struct tcpcb *tp)
6317 {
6318         struct tcp_rack *rack = NULL;
6319
6320         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6321         if (tp->t_fb_ptr == NULL) {
6322                 /*
6323                  * We need to allocate memory but can't. The INP and INP_INFO
6324                  * locks are held and they are recursive (this happens during
6325                  * setup), so a scheme to drop the locks fails. :(
6326                  *
6327                  */
6328                 return (ENOMEM);
6329         }
6330         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6331
6332         rack = (struct tcp_rack *)tp->t_fb_ptr;
6333         TAILQ_INIT(&rack->r_ctl.rc_map);
6334         TAILQ_INIT(&rack->r_ctl.rc_free);
6335         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6336         rack->rc_tp = tp;
6337         if (tp->t_inpcb) {
6338                 rack->rc_inp = tp->t_inpcb;
6339         }
6340         /* Probably not needed but let's be sure */
6341         rack_clear_rate_sample(rack);
6342         rack->r_cpu = 0;
6343         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6344         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6345         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6346         rack->rc_pace_reduce = rack_slot_reduction;
6347         if (V_tcp_delack_enabled)
6348                 tp->t_delayed_ack = 1;
6349         else
6350                 tp->t_delayed_ack = 0;
6351         rack->rc_pace_max_segs = rack_hptsi_segments;
6352         rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6353         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6354         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6355         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6356         rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
6357         rack->r_enforce_min_pace = rack_min_pace_time;
6358         rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6359         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6360         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6361         rack->r_ctl.rc_early_recovery = rack_early_recovery;
6362         rack->rc_always_pace = rack_pace_every_seg;
6363         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6364         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6365         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6366         rack->r_ctl.rc_min_to = rack_min_to;
6367         rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6368         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6369         if (tp->snd_una != tp->snd_max) {
6370                 /* Create a send map for the current outstanding data */
6371                 struct rack_sendmap *rsm;
6372
6373                 rsm = rack_alloc(rack);
6374                 if (rsm == NULL) {
6375                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6376                         tp->t_fb_ptr = NULL;
6377                         return (ENOMEM);
6378                 }
6379                 rsm->r_flags = RACK_OVERMAX;
6380                 rsm->r_tim_lastsent[0] = tcp_ts_getticks();
6381                 rsm->r_rtr_cnt = 1;
6382                 rsm->r_rtr_bytes = 0;
6383                 rsm->r_start = tp->snd_una;
6384                 rsm->r_end = tp->snd_max;
6385                 rsm->r_sndcnt = 0;
6386                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
6387                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6388                 rsm->r_in_tmap = 1;
6389         }
6390         return (0);
6391 }
6392
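/*
 * Can this connection be switched to the rack stack?  Returns 0 when
 * the handoff is fine, EAGAIN when the connection is still in a SYN
 * state (too early to tell), and EINVAL when SACK was not negotiated,
 * since rack cannot run without SACK.
 */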
6393 static int
6394 rack_handoff_ok(struct tcpcb *tp)
6395 {
6396         if ((tp->t_state == TCPS_CLOSED) ||
6397             (tp->t_state == TCPS_LISTEN)) {
6398                 /* Sure no problem though it may not stick */
6399                 return (0);
6400         }
6401         if ((tp->t_state == TCPS_SYN_SENT) ||
6402             (tp->t_state == TCPS_SYN_RECEIVED)) {
6403                 /*
6404                  * We really don't know; you have to get to ESTAB or beyond
6405                  * to tell.
6406                  */
6407                 return (EAGAIN);
6408         }
6409         if (tp->t_flags & TF_SACK_PERMIT) {
6410                 return (0);
6411         }
6412         /*
6413          * If we reach here we don't do SACK on this connection so we can
6414          * never do rack.
6415          */
6416         return (EINVAL);
6417 }
6418
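/*
 * Tear down the rack state: free every sendmap entry on both the map
 * and free lists, then release the control block back to its zone.
 */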
6419 static void
6420 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
6421 {
6422         if (tp->t_fb_ptr) {
6423                 struct tcp_rack *rack;
6424                 struct rack_sendmap *rsm;
6425
6426                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6427 #ifdef TCP_BLACKBOX
6428                 tcp_log_flowend(tp);
6429 #endif
6430                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6431                 while (rsm) {
6432                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
6433                         uma_zfree(rack_zone, rsm);
6434                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6435                 }
6436                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6437                 while (rsm) {
6438                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
6439                         uma_zfree(rack_zone, rsm);
6440                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6441                 }
6442                 rack->rc_free_cnt = 0;
6443                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6444                 tp->t_fb_ptr = NULL;
6445         }
6446 }
6447
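/*
 * Point rack->r_state/r_substate at the input handler matching the
 * current TCP state.  LISTEN, CLOSED and TIME_WAIT are never expected
 * here.
 */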
6448 static void
6449 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
6450 {
6451         switch (tp->t_state) {
6452         case TCPS_SYN_SENT:
6453                 rack->r_state = TCPS_SYN_SENT;
6454                 rack->r_substate = rack_do_syn_sent;
6455                 break;
6456         case TCPS_SYN_RECEIVED:
6457                 rack->r_state = TCPS_SYN_RECEIVED;
6458                 rack->r_substate = rack_do_syn_recv;
6459                 break;
6460         case TCPS_ESTABLISHED:
6461                 rack->r_state = TCPS_ESTABLISHED;
6462                 rack->r_substate = rack_do_established;
6463                 break;
6464         case TCPS_CLOSE_WAIT:
6465                 rack->r_state = TCPS_CLOSE_WAIT;
6466                 rack->r_substate = rack_do_close_wait;
6467                 break;
6468         case TCPS_FIN_WAIT_1:
6469                 rack->r_state = TCPS_FIN_WAIT_1;
6470                 rack->r_substate = rack_do_fin_wait_1;
6471                 break;
6472         case TCPS_CLOSING:
6473                 rack->r_state = TCPS_CLOSING;
6474                 rack->r_substate = rack_do_closing;
6475                 break;
6476         case TCPS_LAST_ACK:
6477                 rack->r_state = TCPS_LAST_ACK;
6478                 rack->r_substate = rack_do_lastack;
6479                 break;
6480         case TCPS_FIN_WAIT_2:
6481                 rack->r_state = TCPS_FIN_WAIT_2;
6482                 rack->r_substate = rack_do_fin_wait_2;
6483                 break;
6484         case TCPS_LISTEN:
6485         case TCPS_CLOSED:
6486         case TCPS_TIME_WAIT:
6487         default:
6488 #ifdef INVARIANTS
6489                 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
6490 #endif
6491                 break;
6492         };
6493 }
6494
6495
6496 static void
6497 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
6498 {
6499         /*
6500          * We received an ack, and then did not
6501          * call send or were bounced out because the
6502          * hpts was running. Now a timer is up as well; is
6503          * it the right timer?
6504          */
6505         struct rack_sendmap *rsm;
6506         int tmr_up;
6507         
6508         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
6509         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
6510                 return;
6511         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6512         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
6513             (tmr_up == PACE_TMR_RXT)) {
6514                 /* Should be an RXT */
6515                 return;
6516         }
6517         if (rsm == NULL) {
6518                 /* Nothing outstanding? */
6519                 if (tp->t_flags & TF_DELACK) {
6520                         if (tmr_up == PACE_TMR_DELACK)
6521                                 /* We are supposed to have delayed ack up and we do */
6522                                 return;
6523                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
6524                         /* 
6525                          * If we hit ENOBUFS then we would expect the possibility
6526                          * of nothing outstanding and the RXT up (and the hptsi timer).
6527                          */
6528                         return;
6529                 } else if (((tcp_always_keepalive ||
6530                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6531                             (tp->t_state <= TCPS_CLOSING)) &&
6532                            (tmr_up == PACE_TMR_KEEP) &&
6533                            (tp->snd_max == tp->snd_una)) {
6534                         /* We should have keep alive up and we do */
6535                         return;
6536                 }
6537         }
6538         if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
6539                 if ((tp->t_flags & TF_SENTFIN) &&
6540                     ((tp->snd_max - tp->snd_una) == 1) &&
6541                     (rsm->r_flags & RACK_HAS_FIN)) {
6542                         /* needs to be an RXT */
6543                         if (tmr_up == PACE_TMR_RXT)
6544                                 return;
6545                 } else if (tmr_up == PACE_TMR_RACK)
6546                         return;
6547         } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
6548                    ((tmr_up == PACE_TMR_TLP) ||
6549                     (tmr_up == PACE_TMR_RXT))) {
6550                 /* 
6551                  * Either a TLP or RXT is fine if no sack-passed 
6552                  * is in place and data is outstanding.
6553                  */
6554                 return;
6555         } else if (tmr_up == PACE_TMR_DELACK) {
6556                 /*
6557                  * If the delayed ack was going to go off
6558                  * before the rtx/tlp/rack timer were going to
6559                  * expire, then that would be the timer in control.
6560                  * Note we don't check the time here trusting the
6561                  * code is correct.
6562                  */
6563                 return;
6564         }
6565         /* 
6566          * Ok, the timer originally started is not what we want now.
6567          * We will force the hpts to be stopped if it is running, and restart
6568          * with the slot set to what was in the saved slot.
6569          */
6570         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6571         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6572 }
6573
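/*
 * Core input path for the rack stack, called from rack_do_segment()
 * or when the hpts works through queued input.  A non-zero nxt_pkt
 * means more segments follow in the batch, so output and the normal
 * timer restart are deferred until the final call.
 */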
6574 static void
6575 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6576     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
6577     int32_t nxt_pkt, struct timeval *tv)
6578 {
6579         int32_t thflags, retval, did_out = 0;
6580         int32_t way_out = 0;
6581         uint32_t cts;
6582         uint32_t tiwin;
6583         struct tcpopt to;
6584         struct tcp_rack *rack;
6585         struct rack_sendmap *rsm;
6586         int32_t prev_state = 0;
6587
6588         cts = tcp_tv_to_mssectick(tv);
6589         rack = (struct tcp_rack *)tp->t_fb_ptr;
6590
6591         kern_prefetch(rack, &prev_state);
6592         prev_state = 0;
6593         thflags = th->th_flags;
6594         /*
6595          * If this is either a state-changing packet or current state isn't
6596          * established, we require a read lock on tcbinfo.  Otherwise, we
6597          * allow the tcbinfo to be either locked or unlocked, as the
6598          * caller may have unnecessarily acquired a lock due to a race.
6599          */
6600         if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
6601             tp->t_state != TCPS_ESTABLISHED) {
6602                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6603         }
6604         INP_WLOCK_ASSERT(tp->t_inpcb);
6605         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
6606             __func__));
6607         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
6608             __func__));
6609         {
6610                 union tcp_log_stackspecific log;
6611
6612                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6613                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
6614                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
6615                 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
6616                     tlen, &log, true);
6617         }
6618         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
6619                 way_out = 4;
6620                 goto done_with_input;
6621         }
6622         /*
6623          * If a segment with the ACK-bit set arrives in the SYN-SENT state
6624          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
6625          */
6626         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
6627             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
6628                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6629                 return;
6630         }
6631         /*
6632          * Segment received on connection. Reset idle time and keep-alive
6633          * timer. XXX: This should be done after segment validation to
6634          * ignore broken/spoofed segs.
6635          */
6636         if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
6637 #ifdef NETFLIX_CWV
6638                 if ((tp->cwv_enabled) &&
6639                     ((tp->cwv_cwnd_valid == 0) &&
6640                      TCPS_HAVEESTABLISHED(tp->t_state) &&
6641                      (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
6642                         tcp_newcwv_nvp_closedown(tp);
6643                 } else 
6644 #endif
6645                        if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
6646                         counter_u64_add(rack_input_idle_reduces, 1);
6647                         rack_cc_after_idle(tp,
6648                             (rack->r_idle_reduce_largest ? 1 :0));
6649                 }
6650         }
6651         rack->r_ctl.rc_rcvtime = cts;
6652         tp->t_rcvtime = ticks;
6653
6654 #ifdef NETFLIX_CWV
6655         if (tp->cwv_enabled) {
6656                 if ((tp->cwv_cwnd_valid == 0) &&
6657                     TCPS_HAVEESTABLISHED(tp->t_state) &&
6658                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
6659                         tcp_newcwv_nvp_closedown(tp);
6660         }
6661 #endif
6662         /*
6663          * Unscale the window into a 32-bit value. For the SYN_SENT state
6664          * the scale is zero.
6665          */
6666         tiwin = th->th_win << tp->snd_scale;
6667 #ifdef NETFLIX_STATS
6668         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
6669 #endif
6670         /*
6671          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
6672          * this to occur after we've validated the segment.
6673          */
6674         if (tp->t_flags & TF_ECN_PERMIT) {
6675                 if (thflags & TH_CWR)
6676                         tp->t_flags &= ~TF_ECN_SND_ECE;
6677                 switch (iptos & IPTOS_ECN_MASK) {
6678                 case IPTOS_ECN_CE:
6679                         tp->t_flags |= TF_ECN_SND_ECE;
6680                         TCPSTAT_INC(tcps_ecn_ce);
6681                         break;
6682                 case IPTOS_ECN_ECT0:
6683                         TCPSTAT_INC(tcps_ecn_ect0);
6684                         break;
6685                 case IPTOS_ECN_ECT1:
6686                         TCPSTAT_INC(tcps_ecn_ect1);
6687                         break;
6688                 }
6689                 /* Congestion experienced. */
6690                 if (thflags & TH_ECE) {
6691                         rack_cong_signal(tp, th, CC_ECN);
6692                 }
6693         }
6694         /*
6695          * Parse options on any incoming segment.
6696          */
6697         tcp_dooptions(&to, (u_char *)(th + 1),
6698             (th->th_off << 2) - sizeof(struct tcphdr),
6699             (thflags & TH_SYN) ? TO_SYN : 0);
6700
6701         /*
6702          * If echoed timestamp is later than the current time, fall back to
6703          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
6704          * were used when this connection was established.
6705          */
6706         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
6707                 to.to_tsecr -= tp->ts_offset;
6708                 if (TSTMP_GT(to.to_tsecr, cts))
6709                         to.to_tsecr = 0;
6710         }
6711         /*
6712          * If it's the first time in, we need to take care of options and
6713          * verify we can do SACK for rack!
6714          */
6715         if (rack->r_state == 0) {
6716                 /* Should be init'd by rack_init() */
6717                 KASSERT(rack->rc_inp != NULL,
6718                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
6719                 if (rack->rc_inp == NULL) {
6720                         rack->rc_inp = tp->t_inpcb;
6721                 }
6722
6723                 /*
6724                  * Process options only when we get SYN/ACK back. The SYN
6725                  * case for incoming connections is handled in tcp_syncache.
6726                  * According to RFC1323 the window field in a SYN (i.e., a
6727                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
6728                  * this is traditional behavior, may need to be cleaned up.
6729                  */
6730                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
6731                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
6732                         if ((to.to_flags & TOF_SCALE) &&
6733                             (tp->t_flags & TF_REQ_SCALE)) {
6734                                 tp->t_flags |= TF_RCVD_SCALE;
6735                                 tp->snd_scale = to.to_wscale;
6736                         }
6737                         /*
6738                          * Initial send window.  It will be updated with the
6739                          * next incoming segment to the scaled value.
6740                          */
6741                         tp->snd_wnd = th->th_win;
6742                         if (to.to_flags & TOF_TS) {
6743                                 tp->t_flags |= TF_RCVD_TSTMP;
6744                                 tp->ts_recent = to.to_tsval;
6745                                 tp->ts_recent_age = cts;
6746                         }
6747                         if (to.to_flags & TOF_MSS)
6748                                 tcp_mss(tp, to.to_mss);
6749                         if ((tp->t_flags & TF_SACK_PERMIT) &&
6750                             (to.to_flags & TOF_SACKPERM) == 0)
6751                                 tp->t_flags &= ~TF_SACK_PERMIT;
6752                         if (IS_FASTOPEN(tp->t_flags)) {
6753                                 if (to.to_flags & TOF_FASTOPEN) {
6754                                         uint16_t mss;
6755
6756                                         if (to.to_flags & TOF_MSS)
6757                                                 mss = to.to_mss;
6758                                         else
6759                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
6760                                                         mss = TCP6_MSS;
6761                                                 else
6762                                                         mss = TCP_MSS;
6763                                         tcp_fastopen_update_cache(tp, mss,
6764                                             to.to_tfo_len, to.to_tfo_cookie);
6765                                 } else
6766                                         tcp_fastopen_disable_path(tp);
6767                         }
6768                 }
6769                 /*
6770                  * At this point we are at the initial call. Here we decide
6771                  * if we are doing RACK or not. We do this by seeing if
6772                  * TF_SACK_PERMIT is set; if not, rack is *not* possible and
6773                  * we switch to the default code.
6774                  */
6775                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
6776                         tcp_switch_back_to_default(tp);
6777                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
6778                             tlen, iptos);
6779                         return;
6780                 }
6781                 /* Set the flag */
6782                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
6783                 tcp_set_hpts(tp->t_inpcb);
6784                 rack_stop_all_timers(tp);
6785                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
6786         }
6787         /*
6788          * This is the one exception case where we set the rack state
6789          * always. All other times (timers etc) we must have a rack-state
6790          * set (so we assure we have done the checks above for SACK).
6791          */
6792         if (rack->r_state != tp->t_state)
6793                 rack_set_state(tp, rack);
6794         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
6795                 kern_prefetch(rsm, &prev_state);
6796         prev_state = rack->r_state;
6797         rack->r_ctl.rc_tlp_send_cnt = 0;
6798         rack_clear_rate_sample(rack);
6799         retval = (*rack->r_substate) (m, th, so,
6800             tp, &to, drop_hdrlen,
6801             tlen, tiwin, thflags, nxt_pkt);
6802 #ifdef INVARIANTS
6803         if ((retval == 0) &&
6804             (tp->t_inpcb == NULL)) {
6805                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
6806                     retval, tp, prev_state);
6807         }
6808 #endif
6809         if (retval == 0) {
6810                 /*
6811                  * If retval is 1 the tcb is unlocked and most likely the tp
6812                  * is gone.
6813                  */
6814                 INP_WLOCK_ASSERT(tp->t_inpcb);
6815                 tcp_rack_xmit_timer_commit(rack, tp);
6816                 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
6817                     (rack->rc_in_persist == 0)){
6818                         /* 
6819                          * The peer shrunk its window on us to the point
6820                          * where we have sent too much. The only thing
6821                          * we can do here is stop any timers and
6822                          * enter persist. We most likely lost the last
6823                          * bytes we sent but oh well, we will have to
6824                          * retransmit them after the peer is caught up.
6825                          */
6826                         if (rack->rc_inp->inp_in_hpts)
6827                                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6828                         rack_timer_cancel(tp, rack, cts, __LINE__);
6829                         rack_enter_persist(tp, rack, cts);
6830                         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6831                         way_out = 3;
6832                         goto done_with_input;
6833                 }
6834                 if (nxt_pkt == 0) {
6835                         if (rack->r_wanted_output != 0) {
6836                                 did_out = 1;
6837                                 (void)tp->t_fb->tfb_tcp_output(tp);
6838                         }
6839                         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
6840                 }
6841                 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
6842                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
6843                      (tp->t_flags & TF_DELACK) ||
6844                      ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6845                       (tp->t_state <= TCPS_CLOSING)))) {
6846                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
6847                         if ((tp->snd_max == tp->snd_una) &&
6848                             ((tp->t_flags & TF_DELACK) == 0) &&
6849                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
6850                                 /* keep alive not needed while hptsi output is still scheduled */
6851                                 ;
6852                         } else {
6853                                 if (rack->rc_inp->inp_in_hpts)
6854                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6855                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6856                         }
6857                         way_out = 1;
6858                 } else {
6859                         /* Do we have the correct timer running? */
6860                         rack_timer_audit(tp, rack, &so->so_snd);
6861                         way_out = 2;
6862                 }
6863         done_with_input:
6864                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
6865                 if (did_out)
6866                         rack->r_wanted_output = 0;
6867 #ifdef INVARIANTS
6868                 if (tp->t_inpcb == NULL) {
6869                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
6870                               did_out,
6871                               retval, tp, prev_state);
6872                 }
6873 #endif
6874                 INP_WUNLOCK(tp->t_inpcb);
6875         }
6876 }
6877
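/*
 * tfb_tcp_do_segment entry point.  With RSS, once the rack state is
 * initialized the segment is queued to the hpts input path; the very
 * first segments (and all segments without RSS) are processed inline
 * via rack_hpts_do_segment().
 */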
6878 void
6879 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6880     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
6881 {
6882         struct timeval tv;
6883 #ifdef RSS
6884         struct tcp_function_block *tfb;
6885         struct tcp_rack *rack;
6886         struct epoch_tracker et;
6887
6888         rack = (struct tcp_rack *)tp->t_fb_ptr;
6889         if (rack->r_state == 0) {
6890                 /*
6891                  * Initial input (ACK to SYN-ACK, etc.); let's go ahead and get
6892                  * it processed.
6893                  */
6894                 INP_INFO_RLOCK_ET(&V_tcbinfo, et);
6895                 tcp_get_usecs(&tv);
6896                 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6897                     tlen, iptos, 0, &tv);
6898                 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
6899                 return;
6900         }
6901         tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
6902         INP_WUNLOCK(tp->t_inpcb);
6903 #else
6904         tcp_get_usecs(&tv);
6905         rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6906             tlen, iptos, 0, &tv);
6907 #endif
6908 }
6909
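/*
 * Return the next sendmap entry due for a RACK retransmission: the
 * lowest outstanding entry that a SACK has passed, is not yet acked,
 * and whose last transmission is older than the rack reordering
 * threshold relative to tsused.  Returns NULL when nothing qualifies.
 */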
6910 struct rack_sendmap *
6911 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
6912 {
6913         struct rack_sendmap *rsm = NULL;
6914         int32_t idx;
6915         uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
6916
6917         /* Return the next guy to be re-transmitted */
6918         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
6919                 return (NULL);
6920         }
6921         if (tp->t_flags & TF_SENTFIN) {
6922                 /* retran the end FIN? */
6923                 return (NULL);
6924         }
6925         /* Ok, let's look at this one */
6926         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6927         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
6928                 goto check_it;
6929         }
6930         rsm = rack_find_lowest_rsm(rack);
6931         if (rsm == NULL) {
6932                 return (NULL);
6933         }
6934 check_it:
6935         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
6936         srtt = TICKS_2_MSEC(srtt_cur);
6937         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
6938                 srtt = rack->rc_rack_rtt;
6939         if (rsm->r_flags & RACK_ACKED) {
6940                 return (NULL);
6941         }
6942         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
6943                 /* It's not yet ready */
6944                 return (NULL);
6945         }
6946         idx = rsm->r_rtr_cnt - 1;
6947         ts_low = rsm->r_tim_lastsent[idx];
6948         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
6949         if (tsused <= ts_low) {
6950                 return (NULL);
6951         }
6952         if ((tsused - ts_low) >= thresh) {
6953                 return (rsm);
6954         }
6955         return (NULL);
6956 }
6957
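/*
 * The rack stack's output routine.  If we are already queued on the
 * hpts for a paced send we return immediately, and any pending rack
 * timers are processed before building segments.  Retransmission
 * sources are consulted in order: a tail-loss probe (rc_tlpsend), the
 * retransmit timer (rc_resend), RACK-detected losses via
 * tcp_rack_output(), and finally new data from the socket buffer.
 */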
6958 static int
6959 rack_output(struct tcpcb *tp)
6960 {
6961         struct socket *so;
6962         uint32_t recwin, sendwin;
6963         uint32_t sb_offset;
6964         int32_t len, flags, error = 0;
6965         struct mbuf *m;
6966         struct mbuf *mb;
6967         uint32_t if_hw_tsomaxsegcount = 0;
6968         uint32_t if_hw_tsomaxsegsize;
6969         long tot_len_this_send = 0;
6970         struct ip *ip = NULL;
6971 #ifdef TCPDEBUG
6972         struct ipovly *ipov = NULL;
6973 #endif
6974         struct udphdr *udp = NULL;
6975         struct tcp_rack *rack;
6976         struct tcphdr *th;
6977         uint8_t pass = 0;
6978         uint8_t wanted_cookie = 0;
6979         u_char opt[TCP_MAXOLEN];
6980         unsigned ipoptlen, optlen, hdrlen, ulen=0;
6981         uint32_t rack_seq;
6982
6983 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
6984         unsigned ipsec_optlen = 0;
6985
6986 #endif
6987         int32_t idle, sendalot;
6988         int32_t sub_from_prr = 0;
6989         volatile int32_t sack_rxmit;
6990         struct rack_sendmap *rsm = NULL;
6991         int32_t tso, mtu, would_have_fin = 0;
6992         struct tcpopt to;
6993         int32_t slot = 0;
6994         uint32_t cts;
6995         uint8_t hpts_calling, doing_tlp = 0;
6996         int32_t do_a_prefetch;
6997         int32_t prefetch_rsm = 0;
6998         int32_t prefetch_so_done = 0;
6999         struct tcp_log_buffer *lgb = NULL;
7000         struct inpcb *inp;
7001         struct sockbuf *sb;
7002 #ifdef INET6
7003         struct ip6_hdr *ip6 = NULL;
7004         int32_t isipv6;
7005 #endif
7006         /* setup and take the cache hits here */
7007         rack = (struct tcp_rack *)tp->t_fb_ptr;
7008         inp = rack->rc_inp;
7009         so = inp->inp_socket;
7010         sb = &so->so_snd;
7011         kern_prefetch(sb, &do_a_prefetch);
7012         do_a_prefetch = 1;
7013         
7014         INP_WLOCK_ASSERT(inp);
7015 #ifdef TCP_OFFLOAD
7016         if (tp->t_flags & TF_TOE)
7017                 return (tcp_offload_output(tp));
7018 #endif
7019 #ifdef INET6
7020         if (rack->r_state) {
7021                 /* Use the cache line loaded if possible */
7022                 isipv6 = rack->r_is_v6;
7023         } else {
7024                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
7025         }
7026 #endif
7027         cts = tcp_ts_getticks();
7028         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
7029             inp->inp_in_hpts) {
7030                 /*
7031                  * We are on the hpts for some timer but not hptsi output.
7032                  * Remove from the hpts unconditionally.
7033                  */
7034                 rack_timer_cancel(tp, rack, cts, __LINE__);
7035         }
7036         /* Mark that we have called rack_output(). */
7037         if ((rack->r_timer_override) ||
7038             (tp->t_flags & TF_FORCEDATA) ||
7039             (tp->t_state < TCPS_ESTABLISHED)) {
7040                 if (tp->t_inpcb->inp_in_hpts)
7041                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
7042         } else if (tp->t_inpcb->inp_in_hpts) {
7043                 /*
7044                  * On the hpts you can't pass even if ACKNOW is on; we will
7045                  * send when the hpts fires.
7046                  */
7047                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
7048                 return (0);
7049         }
7050         hpts_calling = inp->inp_hpts_calls;
7051         inp->inp_hpts_calls = 0;
7052         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7053                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
7054                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
7055                         return (0);
7056                 }
7057         }
7058         rack->r_wanted_output = 0;
7059         rack->r_timer_override = 0;
7060         /*
7061          * For TFO connections in SYN_SENT or SYN_RECEIVED,
7062          * only allow the initial SYN or SYN|ACK and those sent
7063          * by the retransmit timer.
7064          */
7065         if (IS_FASTOPEN(tp->t_flags) &&
7066             ((tp->t_state == TCPS_SYN_RECEIVED) ||
7067              (tp->t_state == TCPS_SYN_SENT)) &&
7068             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
7069             (tp->t_rxtshift == 0))              /* not a retransmit */
7070                 return (0);
7071         /*
7072          * Determine length of data that should be transmitted, and flags
7073          * that will be used. If there is some data or critical controls
7074          * (SYN, RST) to send, then transmit; otherwise, investigate
7075          * further.
7076          */
7077         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
7078 #ifdef NETFLIX_CWV
7079         if (tp->cwv_enabled) {
7080                 if ((tp->cwv_cwnd_valid == 0) &&
7081                     TCPS_HAVEESTABLISHED(tp->t_state) &&
7082                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
7083                         tcp_newcwv_nvp_closedown(tp);
7084         } else
7085 #endif
7086         if (tp->t_idle_reduce) {
7087                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
7088                         rack_cc_after_idle(tp,
7089                             (rack->r_idle_reduce_largest ? 1 :0));
7090         }
7091         tp->t_flags &= ~TF_LASTIDLE;
7092         if (idle) {
7093                 if (tp->t_flags & TF_MORETOCOME) {
7094                         tp->t_flags |= TF_LASTIDLE;
7095                         idle = 0;
7096                 }
7097         }
7098 again:
7099         /*
7100          * If we've recently taken a timeout, snd_max will be greater than
7101          * snd_nxt.  There may be SACK information that allows us to avoid
7102          * resending already delivered data.  Adjust snd_nxt accordingly.
7103          */
7104         sendalot = 0;
7105         cts = tcp_ts_getticks();
7106         tso = 0;
7107         mtu = 0;
7108         sb_offset = tp->snd_max - tp->snd_una;
7109         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
7110
7111         flags = tcp_outflags[tp->t_state];
7112         /*
7113          * Send any SACK-generated retransmissions.  If we're explicitly
7114          * trying to send out new data (when sendalot is 1), bypass this
7115          * function. If we retransmit in fast recovery mode, decrement
7116          * snd_cwnd, since we're replacing a (future) new transmission with
7117          * a retransmission now, and we previously incremented snd_cwnd in
7118          * tcp_input().
7119          */
7120         /*
7121          * Still in SACK recovery, reset rxmit flag to zero.
7122          */
7123         while (rack->rc_free_cnt < rack_free_cache) {
7124                 rsm = rack_alloc(rack);
7125                 if (rsm == NULL) {
7126                         if (inp->inp_hpts_calls)
7127                                 /* Retry in a ms */
7128                                 slot = 1;
7129                         goto just_return_nolock;
7130                 }
7131                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
7132                 rack->rc_free_cnt++;
7133                 rsm = NULL;
7134         }
7135         if (inp->inp_hpts_calls)
7136                 inp->inp_hpts_calls = 0;
7137         sack_rxmit = 0;
7138         len = 0;
7139         rsm = NULL;
7140         if (flags & TH_RST) {
7141                 SOCKBUF_LOCK(sb);
7142                 goto send;
7143         }
7144         if (rack->r_ctl.rc_tlpsend) {
7145                 /* Tail loss probe */
7146                 long cwin;
7147                 long tlen;
7148
7149                 doing_tlp = 1;
7150                 rsm = rack->r_ctl.rc_tlpsend;
7151                 rack->r_ctl.rc_tlpsend = NULL;
7152                 sack_rxmit = 1;
7153                 tlen = rsm->r_end - rsm->r_start;
7154                 if (tlen > tp->t_maxseg)
7155                         tlen = tp->t_maxseg;
7156                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7157                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7158                     __func__, __LINE__,
7159                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7160                 sb_offset = rsm->r_start - tp->snd_una;
7161                 cwin = min(tp->snd_wnd, tlen);
7162                 len = cwin;
7163         } else if (rack->r_ctl.rc_resend) {
7164                 /* Retransmit timer */
7165                 rsm = rack->r_ctl.rc_resend;
7166                 rack->r_ctl.rc_resend = NULL;
7167                 len = rsm->r_end - rsm->r_start;
7168                 sack_rxmit = 1;
7169                 sendalot = 0;
7170                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7171                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7172                     __func__, __LINE__,
7173                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7174                 sb_offset = rsm->r_start - tp->snd_una;
7175                 if (len >= tp->t_maxseg) {
7176                         len = tp->t_maxseg;
7177                 }
7178         } else if ((rack->rc_in_persist == 0) &&
7179             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
7180                 long tlen;
7181
7182                 if ((!IN_RECOVERY(tp->t_flags)) &&
7183                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
7184                         /* Enter recovery if not induced by a time-out */
7185                         rack->r_ctl.rc_rsm_start = rsm->r_start;
7186                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
7187                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
7188                         rack_cong_signal(tp, NULL, CC_NDUPACK);
7189                         /*
7190                          * When we enter recovery we need to assure we send
7191                          * one packet.
7192                          */
7193                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
7194                 }
7195 #ifdef INVARIANTS
7196                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
7197                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
7198                             tp, rack, rsm, rsm->r_start, tp->snd_una);
7199                 }
7200 #endif
7201                 tlen = rsm->r_end - rsm->r_start;
7202                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7203                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7204                     __func__, __LINE__,
7205                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7206                 sb_offset = rsm->r_start - tp->snd_una;
7207                 if (tlen > rack->r_ctl.rc_prr_sndcnt) {
7208                         len = rack->r_ctl.rc_prr_sndcnt;
7209                 } else {
7210                         len = tlen;
7211                 }
7212                 if (len >= tp->t_maxseg) {
7213                         sendalot = 1;
7214                         len = tp->t_maxseg;
7215                 } else {
7216                         sendalot = 0;
7217                         if ((rack->rc_timer_up == 0) &&
7218                             (len < tlen)) {
7219                                 /*
7220                                  * If it's not a timer, don't send a partial
7221                                  * segment.
7222                                  */
7223                                 len = 0;
7224                                 goto just_return_nolock;
7225                         }
7226                 }
7227                 if (len > 0) {
7228                         sub_from_prr = 1;
7229                         sack_rxmit = 1;
7230                         TCPSTAT_INC(tcps_sack_rexmits);
7231                         TCPSTAT_ADD(tcps_sack_rexmit_bytes,
7232                             min(len, tp->t_maxseg));
7233                         counter_u64_add(rack_rtm_prr_retran, 1);
7234                 }
7235         }
7236         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
7237                 /* we are retransmitting the fin */
7238                 len--;
7239                 if (len) {
7240                         /*
7241                          * When retransmitting data do *not* include the
7242                          * FIN. This could happen from a TLP probe.
7243                          */
7244                         flags &= ~TH_FIN;
7245                 }
7246         }
7247 #ifdef INVARIANTS
7248         /* For debugging */
7249         rack->r_ctl.rc_rsm_at_retran = rsm;
7250 #endif
7251         /*
7252          * Get standard flags, and add SYN or FIN if requested by 'hidden'
7253          * state flags.
7254          */
7255         if (tp->t_flags & TF_NEEDFIN)
7256                 flags |= TH_FIN;
7257         if (tp->t_flags & TF_NEEDSYN)
7258                 flags |= TH_SYN;
7259         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
7260                 void *end_rsm;
7261                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
7262                 if (end_rsm)
7263                         kern_prefetch(end_rsm, &prefetch_rsm);
7264                 prefetch_rsm = 1;
7265         }
7266         SOCKBUF_LOCK(sb);
7267         /*
7268          * If in persist timeout with window of 0, send 1 byte. Otherwise,
7269          * if window is small but nonzero and the timer has expired, we
7270          * will send what we can and go to transmit state.
7271          */
7272         if (tp->t_flags & TF_FORCEDATA) {
7273                 if (sendwin == 0) {
7274                         /*
7275                          * If we still have some data to send, then clear
7276                          * the FIN bit.  Usually this would happen below
7277                          * when it realizes that we aren't sending all the
7278                          * data.  However, if we have exactly 1 byte of
7279                          * unsent data, then it won't clear the FIN bit
7280                          * below, and if we are in persist state, we wind up
7281                          * sending the packet without recording that we sent
7282                          * the FIN bit.
7283                          *
7284                          * We can't just blindly clear the FIN bit, because
7285                          * if we don't have any more data to send then the
7286                          * probe will be the FIN itself.
7287                          */
7288                         if (sb_offset < sbused(sb))
7289                                 flags &= ~TH_FIN;
7290                         sendwin = 1;
7291                 } else {
7292                         if (rack->rc_in_persist)
7293                                 rack_exit_persist(tp, rack);
7294                         /*
7295                          * If we are dropping persist mode then we need to
7296                          * correct snd_nxt/snd_max and off.
7297                          */
7298                         tp->snd_nxt = tp->snd_max;
7299                         sb_offset = tp->snd_nxt - tp->snd_una;
7300                 }
7301         }
7302         /*
7303          * If snd_nxt == snd_max and we have transmitted a FIN, the
7304          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
7305          * negative length.  This can also occur when TCP opens up its
7306          * congestion window while receiving additional duplicate acks after
7307          * fast-retransmit because TCP will reset snd_nxt to snd_max after
7308          * the fast-retransmit.
7309          *
7310          * In the normal retransmit-FIN-only case, however, snd_nxt will be
7311          * set to snd_una, the sb_offset will be 0, and the length may wind
7312          * up 0.
7313          *
7314          * If sack_rxmit is true we are retransmitting from the scoreboard
7315          * in which case len is already set.
7316          */
7317         if (sack_rxmit == 0) {
7318                 uint32_t avail;
7319
7320                 avail = sbavail(sb);
7321                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
7322                         sb_offset = tp->snd_nxt - tp->snd_una;
7323                 else
7324                         sb_offset = 0;
7325                 if (IN_RECOVERY(tp->t_flags) == 0) {
7326                         if (rack->r_ctl.rc_tlp_new_data) {
7327                                 /* TLP is forcing out new data */
7328                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
7329                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
7330                                 }
7331                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
7332                                         len = tp->snd_wnd;
7333                                 else
7334                                         len = rack->r_ctl.rc_tlp_new_data;
7335                                 rack->r_ctl.rc_tlp_new_data = 0;
7336                                 doing_tlp = 1;
7337                         } else {
7338                                 if (sendwin > avail) {
7339                                         /* use the available */
7340                                         if (avail > sb_offset) {
7341                                                 len = (int32_t)(avail - sb_offset);
7342                                         } else {
7343                                                 len = 0;
7344                                         }
7345                                 } else {
7346                                         if (sendwin > sb_offset) {
7347                                                 len = (int32_t)(sendwin - sb_offset);
7348                                         } else {
7349                                                 len = 0;
7350                                         }
7351                                 }
7352                         }
7353                 } else {
7354                         uint32_t outstanding;
7355
7356                         /*
7357                          * We are inside of a SACK recovery episode and are
7358                          * sending new data, having retransmitted all the
7359                          * data possible so far in the scoreboard.
7360                          */
7361                         outstanding = tp->snd_max - tp->snd_una;
7362                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
7363                                 len = 0;
7364                         else if (avail > sb_offset)
7365                                 len = avail - sb_offset;
7366                         else
7367                                 len = 0;
7368                         if (len > 0) {
7369                                 if (len > rack->r_ctl.rc_prr_sndcnt)
7370                                         len = rack->r_ctl.rc_prr_sndcnt;
7371
7372                                 if (len > 0) {
7373                                         sub_from_prr = 1;
7374                                         counter_u64_add(rack_rtm_prr_newdata, 1);
7375                                 }
7376                         }
7377                         if (len > tp->t_maxseg) {
7378                                 /*
7379                                  * We should never send more than a MSS when
7380                                  * retransmitting or sending new data in prr
7381                                  * mode unless the override flag is on. Most
7382                                  * likely the PRR algorithm is not going to
7383                                  * let us send a lot as well :-)
7384                                  */
7385                                 if (rack->r_ctl.rc_prr_sendalot == 0)
7386                                         len = tp->t_maxseg;
7387                         } else if (len < tp->t_maxseg) {
7388                                 /*
7389                                  * Do we send any? The idea here is if the
7390                                  * send empties the socket buffer we want to
7391                                  * do it. However, if not, let's just wait
7392                                  * for our prr_sndcnt to get bigger.
7393                                  */
7394                                 long leftinsb;
7395
7396                                 leftinsb = sbavail(sb) - sb_offset;
7397                                 if (leftinsb > len) {
7398                                         /* This send does not empty the sb */
7399                                         len = 0;
7400                                 }
7401                         }
7402                 }
7403         }
7404         if (prefetch_so_done == 0) {
7405                 kern_prefetch(so, &prefetch_so_done);
7406                 prefetch_so_done = 1;
7407         }
7408         /*
7409          * Lop off SYN bit if it has already been sent.  However, if this is
7410          * SYN-SENT state and if segment contains data and if we don't know
7411          * that foreign host supports TAO, suppress sending segment.
7412          */
7413         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
7414             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
7415                 if (tp->t_state != TCPS_SYN_RECEIVED)
7416                         flags &= ~TH_SYN;
7417                 /*
7418                  * When sending additional segments following a TFO SYN|ACK,
7419                  * do not include the SYN bit.
7420                  */
7421                 if (IS_FASTOPEN(tp->t_flags) &&
7422                     (tp->t_state == TCPS_SYN_RECEIVED))
7423                         flags &= ~TH_SYN;
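                     /*
                      * The SYN consumed one sequence number but no socket
                      * buffer data, so pull the buffer offset back by one
                      * and allow one more byte to be sent.
                      */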
7424                 sb_offset--, len++;
7425         }
7426         /*
7427          * Be careful not to send data and/or FIN on SYN segments. This
7428          * measure is needed to prevent interoperability problems with not
7429          * fully conformant TCP implementations.
7430          */
7431         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
7432                 len = 0;
7433                 flags &= ~TH_FIN;
7434         }
7435         /*
7436          * On TFO sockets, ensure no data is sent in the following cases:
7437          *
7438          *  - When retransmitting SYN|ACK on a passively-created socket
7439          *
7440          *  - When retransmitting SYN on an actively created socket
7441          *
7442          *  - When sending a zero-length cookie (cookie request) on an
7443          *    actively created socket
7444          *
7445          *  - When the socket is in the CLOSED state (RST is being sent)
7446          */
7447         if (IS_FASTOPEN(tp->t_flags) &&
7448             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
7449              ((tp->t_state == TCPS_SYN_SENT) &&
7450               (tp->t_tfo_client_cookie_len == 0)) ||
7451              (flags & TH_RST))) {
7452                 sack_rxmit = 0;
7453                 len = 0;
7454         }
7455         /* Without fast-open there should never be data sent on a SYN */
7456         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
7457                 len = 0;
7458         if (len <= 0) {
7459                 /*
7460                  * If FIN has been sent but not acked, but we haven't been
7461                  * called to retransmit, len will be < 0.  Otherwise, window
7462                  * shrank after we sent into it.  If window shrank to 0,
7463                  * cancel pending retransmit, pull snd_nxt back to (closed)
7464                  * window, and set the persist timer if it isn't already
7465                  * going.  If the window didn't close completely, just wait
7466                  * for an ACK.
7467                  *
7468                  * We also do a general check here to ensure that we will
7469                  * set the persist timer when we have data to send, but a
7470                  * 0-byte window. This makes sure the persist timer is set
7471                  * even if the packet hits one of the "goto send" lines
7472                  * below.
7473                  */
7474                 len = 0;
7475                 if ((tp->snd_wnd == 0) &&
7476                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
7477                     (sb_offset < (int)sbavail(sb))) {
7478                         tp->snd_nxt = tp->snd_una;
7479                         rack_enter_persist(tp, rack, cts);
7480                 }
7481         }
7482         /* len will be >= 0 after this point. */
7483         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7484         tcp_sndbuf_autoscale(tp, so, sendwin);
7485         /*
7486          * Decide if we can use TCP Segmentation Offloading (if supported by
7487          * hardware).
7488          *
7489          * TSO may only be used if we are in a pure bulk sending state.  The
7490          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
7491          * options prevent using TSO.  With TSO the TCP header is the same
7492          * (except for the sequence number) for all generated packets.  This
7493          * makes it impossible to transmit any options which vary per
7494          * generated segment or packet.
7495          *
7496          * IPv4 handling has a clear separation of ip options and ip header
7497          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
7498          * the right thing below to provide length of just ip options and thus
7499          * checking for ipoptlen is enough to decide if ip options are present.
7500          */
7501
7502 #ifdef INET6
7503         if (isipv6)
7504                 ipoptlen = ip6_optlen(tp->t_inpcb);
7505         else
7506 #endif
7507                 if (tp->t_inpcb->inp_options)
7508                         ipoptlen = tp->t_inpcb->inp_options->m_len -
7509                             offsetof(struct ipoption, ipopt_list);
7510                 else
7511                         ipoptlen = 0;
7512 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7513         /*
7514          * Pre-calculate here as we save another lookup into the darknesses
7515          * of IPsec that way and can actually decide if TSO is ok.
7516          */
7517 #ifdef INET6
7518         if (isipv6 && IPSEC_ENABLED(ipv6))
7519                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
7520 #ifdef INET
7521         else
7522 #endif
7523 #endif                          /* INET6 */
7524 #ifdef INET
7525         if (IPSEC_ENABLED(ipv4))
7526                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
7527 #endif                          /* INET */
7528 #endif
7529
7530 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7531         ipoptlen += ipsec_optlen;
7532 #endif
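             /*
              * TSO is only considered for bulk new data: more than one MSS
              * queued, no UDP encapsulation, no MD5 signature, no SACK blocks
              * to advertise, not a SACK retransmission, and no IP or IPsec
              * options (the IPsec header size, if any, was folded into
              * ipoptlen above).
              */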
7533         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
7534             (tp->t_port == 0) &&
7535             ((tp->t_flags & TF_SIGNATURE) == 0) &&
7536             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
7537             ipoptlen == 0)
7538                 tso = 1;
7539         {
7540                 uint32_t outstanding;
7541
7542                 outstanding = tp->snd_max - tp->snd_una;
7543                 if (tp->t_flags & TF_SENTFIN) {
7544                         /*
7545                          * If we sent a fin, snd_max is 1 higher than
7546                          * snd_una
7547                          */
7548                         outstanding--;
7549                 }
7550                 if (outstanding > 0) {
7551                         /*
7552                          * This is sub-optimal. We only send a standalone
7553                          * FIN on its own segment.
7554                          */
7555                         if (flags & TH_FIN) {
7556                                 flags &= ~TH_FIN;
7557                                 would_have_fin = 1;
7558                         }
7559                 } else if (sack_rxmit) {
7560                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
7561                                 flags &= ~TH_FIN;
7562                 } else {
7563                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
7564                             sbused(sb)))
7565                                 flags &= ~TH_FIN;
7566                 }
7567         }
7568         recwin = sbspace(&so->so_rcv);
7569         /*
7570          * Sender silly window avoidance.  We transmit under the following
7571          * conditions when len is non-zero:
7572          * - We have a full segment (or more with TSO)
7573          * - This is the last buffer in a write()/send() and we are
7574          *   either idle or running NODELAY
7575          * - We've timed out (e.g. persist timer)
7576          * - We have more than 1/2 the maximum send window's worth of
7577          *   data (the receiver may be limiting the window size)
7578          * - We need to retransmit
7579          */
7580         if (len) {
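                     /*
                      * The branches below record a 'pass' value that is later
                      * handed to rack_log_output(), identifying which test
                      * allowed the send.
                      */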
7581                 if (len >= tp->t_maxseg) {
7582                         pass = 1;
7583                         goto send;
7584                 }
7585                 /*
7586                  * NOTE! on localhost connections an 'ack' from the remote
7587                  * end may occur synchronously with the output and cause us
7588                  * to flush a buffer queued with moretocome.  XXX
7589                  *
7590                  */
7591                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
7592                     (idle || (tp->t_flags & TF_NODELAY)) &&
7593                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
7594                     (tp->t_flags & TF_NOPUSH) == 0) {
7595                         pass = 2;
7596                         goto send;
7597                 }
7598                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
7599                         pass = 3;
7600                         goto send;
7601                 }
7602                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
7603                         goto send;
7604                 }
7605                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
7606                         pass = 4;
7607                         goto send;
7608                 }
7609                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
7610                         pass = 5;
7611                         goto send;
7612                 }
7613                 if (sack_rxmit) {
7614                         pass = 6;
7615                         goto send;
7616                 }
7617         }
7618         /*
7619          * Sending of standalone window updates.
7620          *
7621          * Window updates are important when we close our window due to a
7622          * full socket buffer and are opening it again after the application
7623          * reads data from it.  Once the window has opened again and the
7624          * remote end starts to send again the ACK clock takes over and
7625          * provides the most current window information.
7626          *
7627          * We must avoid the silly window syndrome where every read from
7628          * the receive buffer, no matter how small, causes a window update
7629          * to be sent.  We also should avoid sending a flurry of window
7630          * updates when the socket buffer had queued a lot of data and the
7631          * application is doing small reads.
7632          *
7633          * Prevent a flurry of pointless window updates by only sending an
7634          * update when we can increase the advertised window by more than
7635          * 1/4th of the socket buffer capacity.  When the buffer is getting
7636          * full or is very small be more aggressive and send an update
7637          * whenever we can increase by two MSS-sized segments. In all other
7638          * situations the ACK's to new incoming data will carry further
7639          * window increases.
7640          *
7641          * Don't send an independent window update if a delayed ACK is
7642          * pending (it will get piggy-backed on it) or the remote side
7643          * already has done a half-close and won't send more data.  Skip
7644          * this if the connection is in T/TCP half-open state.
7645          */
7646         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
7647             !(tp->t_flags & TF_DELACK) &&
7648             !TCPS_HAVERCVDFIN(tp->t_state)) {
7649                 /*
7650                  * "adv" is the amount we could increase the window, taking
7651                  * into account that we are limited by TCP_MAXWIN <<
7652                  * tp->rcv_scale.
7653                  */
7654                 int32_t adv;
7655                 int oldwin;
7656
7657                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
7658                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
7659                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
7660                         adv -= oldwin;
7661                 } else
7662                         oldwin = 0;
7663
7664                 /*
7665                  * If the new window size ends up being the same as the old
7666                  * size when it is scaled, then don't force a window update.
7667                  */
7668                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
7669                         goto dontupdate;
7670
7671                 if (adv >= (int32_t)(2 * tp->t_maxseg) &&
7672                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
7673                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
7674                     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
7675                         pass = 7;
7676                         goto send;
7677                 }
7678                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
7679                         goto send;
7680         }
7681 dontupdate:
7682
7683         /*
7684          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
7685          * is also a catch-all for the retransmit timer timeout case.
7686          */
7687         if (tp->t_flags & TF_ACKNOW) {
7688                 pass = 8;
7689                 goto send;
7690         }
7691         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
7692                 pass = 9;
7693                 goto send;
7694         }
7695         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
7696                 pass = 10;
7697                 goto send;
7698         }
7699         /*
7700          * If our state indicates that FIN should be sent and we have not
7701          * yet done so, then we need to send.
7702          */
7703         if ((flags & TH_FIN) &&
7704             (tp->snd_nxt == tp->snd_una)) {
7705                 pass = 11;
7706                 goto send;
7707         }
7708         /*
7709          * No reason to send a segment, just return.
7710          */
7711 just_return:
7712         SOCKBUF_UNLOCK(sb);
7713 just_return_nolock:
7714         if (tot_len_this_send == 0)
7715                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
7716         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
7717         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
7718         tp->t_flags &= ~TF_FORCEDATA;
7719         return (0);
7720
7721 send:
7722         if (doing_tlp == 0) {
7723                 /*
7724                  * Data not a TLP, and it's not the rxt firing. If it is the
7725                  * rxt firing, we want to leave the tlp_in_progress flag on
7726                  * so we don't send another TLP. It has to be a rack timer
7727                  * or normal send (response to acked data) to clear the tlp
7728                  * in progress flag.
7729                  */
7730                 rack->rc_tlp_in_progress = 0;
7731         }
7732         SOCKBUF_LOCK_ASSERT(sb);
7733         if (len > 0) {
7734                 if (len >= tp->t_maxseg)
7735                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
7736                 else
7737                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
7738         }
7739         /*
7740          * Before ESTABLISHED, force sending of initial options unless TCP
7741          * set not to do any options. NOTE: we assume that the IP/TCP header
7742          * plus TCP options always fit in a single mbuf, leaving room for a
7743          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
7744          * + optlen <= MCLBYTES
7745          */
7746         optlen = 0;
7747 #ifdef INET6
7748         if (isipv6)
7749                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
7750         else
7751 #endif
7752                 hdrlen = sizeof(struct tcpiphdr);
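             /*
              * hdrlen covers only the fixed IP and TCP headers at this point;
              * TCP options are added below and a UDP header, when tunneling,
              * after that.
              */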
7753
7754         /*
7755          * Compute options for segment. We only have to care about SYN and
7756          * established connection segments.  Options for SYN-ACK segments
7757          * are handled in TCP syncache.
7758          */
7759         to.to_flags = 0;
7760         if ((tp->t_flags & TF_NOOPT) == 0) {
7761                 /* Maximum segment size. */
7762                 if (flags & TH_SYN) {
7763                         tp->snd_nxt = tp->iss;
7764                         to.to_mss = tcp_mssopt(&inp->inp_inc);
7765 #ifdef NETFLIX_TCPOUDP
7766                         if (tp->t_port)
7767                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
7768 #endif
7769                         to.to_flags |= TOF_MSS;
7770
7771                         /*
7772                          * On SYN or SYN|ACK transmits on TFO connections,
7773                          * only include the TFO option if it is not a
7774                          * retransmit, as the presence of the TFO option may
7775                          * have caused the original SYN or SYN|ACK to have
7776                          * been dropped by a middlebox.
7777                          */
7778                         if (IS_FASTOPEN(tp->t_flags) &&
7779                             (tp->t_rxtshift == 0)) {
7780                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
7781                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
7782                                         to.to_tfo_cookie =
7783                                             (u_int8_t *)&tp->t_tfo_cookie.server;
7784                                         to.to_flags |= TOF_FASTOPEN;
7785                                         wanted_cookie = 1;
7786                                 } else if (tp->t_state == TCPS_SYN_SENT) {
7787                                         to.to_tfo_len =
7788                                             tp->t_tfo_client_cookie_len;
7789                                         to.to_tfo_cookie =
7790                                             tp->t_tfo_cookie.client;
7791                                         to.to_flags |= TOF_FASTOPEN;
7792                                         wanted_cookie = 1;
7793                                         /*
7794                                          * If we wind up having more data to
7795                                          * send with the SYN than can fit in
7796                                          * one segment, don't send any more
7797                                          * until the SYN|ACK comes back from
7798                                          * the other end.
7799                                          */
7800                                         sendalot = 0;
7801                                 }
7802                         }
7803                 }
7804                 /* Window scaling. */
7805                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
7806                         to.to_wscale = tp->request_r_scale;
7807                         to.to_flags |= TOF_SCALE;
7808                 }
7809                 /* Timestamps. */
7810                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
7811                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
7812                         to.to_tsval = cts + tp->ts_offset;
7813                         to.to_tsecr = tp->ts_recent;
7814                         to.to_flags |= TOF_TS;
7815                 }
7816                 /* Set receive buffer autosizing timestamp. */
7817                 if (tp->rfbuf_ts == 0 &&
7818                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
7819                         tp->rfbuf_ts = tcp_ts_getticks();
7820                 /* Selective ACK's. */
7821                 if (flags & TH_SYN)
7822                         to.to_flags |= TOF_SACKPERM;
7823                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7824                     tp->rcv_numsacks > 0) {
7825                         to.to_flags |= TOF_SACK;
7826                         to.to_nsacks = tp->rcv_numsacks;
7827                         to.to_sacks = (u_char *)tp->sackblks;
7828                 }
7829 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
7830                 /* TCP-MD5 (RFC2385). */
7831                 if (tp->t_flags & TF_SIGNATURE)
7832                         to.to_flags |= TOF_SIGNATURE;
7833 #endif                          /* TCP_SIGNATURE */
7834
7835                 /* Processing the options. */
7836                 hdrlen += optlen = tcp_addoptions(&to, opt);
7837                 /*
7838                  * If we wanted a TFO option to be added, but it was unable
7839                  * to fit, ensure no data is sent.
7840                  */
7841                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
7842                     !(to.to_flags & TOF_FASTOPEN))
7843                         len = 0;
7844         }
7845 #ifdef NETFLIX_TCPOUDP
7846         if (tp->t_port) {
7847                 if (V_tcp_udp_tunneling_port == 0) {
7848                         /* The port was removed?? */
7849                         SOCKBUF_UNLOCK(&so->so_snd);
7850                         return (EHOSTUNREACH);
7851                 }
7852                 hdrlen += sizeof(struct udphdr);
7853         }
7854 #endif
7855         ipoptlen = 0;
7856 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7857         ipoptlen += ipsec_optlen;
7858 #endif
7859
7860         /*
7861          * Adjust data length if insertion of options will bump the packet
7862          * length beyond the t_maxseg length. Clear the FIN bit because we
7863          * cut off the tail of the segment.
7864          */
7865         if (len + optlen + ipoptlen > tp->t_maxseg) {
7866                 if (flags & TH_FIN) {
7867                         would_have_fin = 1;
7868                         flags &= ~TH_FIN;
7869                 }
7870                 if (tso) {
7871                         uint32_t if_hw_tsomax;
7872                         uint32_t moff;
7873                         int32_t max_len;
7874
7875                         /* extract TSO information */
7876                         if_hw_tsomax = tp->t_tsomax;
7877                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
7878                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
7879                         KASSERT(ipoptlen == 0,
7880                             ("%s: TSO can't do IP options", __func__));
7881
7882                         /*
7883                          * Check if we should limit by maximum payload
7884                          * length:
7885                          */
7886                         if (if_hw_tsomax != 0) {
7887                                 /* compute maximum TSO length */
7888                                 max_len = (if_hw_tsomax - hdrlen -
7889                                     max_linkhdr);
7890                                 if (max_len <= 0) {
7891                                         len = 0;
7892                                 } else if (len > max_len) {
7893                                         sendalot = 1;
7894                                         len = max_len;
7895                                 }
7896                         }
7897                         /*
7898                          * Prevent the last segment from being fractional
7899                          * unless the send sockbuf can be emptied:
7900                          */
7901                         max_len = (tp->t_maxseg - optlen);
7902                         if ((sb_offset + len) < sbavail(sb)) {
7903                                 moff = len % (u_int)max_len;
7904                                 if (moff != 0) {
7905                                         len -= moff;
7906                                         sendalot = 1;
7907                                 }
7908                         }
7909                         /*
7910                          * In case there are too many small fragments don't
7911                          * use TSO:
7912                          */
7913                         if (len <= max_len) {
7914                                 len = max_len;
7915                                 sendalot = 1;
7916                                 tso = 0;
7917                         }
7918                         /*
7919                          * Send the FIN in a separate segment after the bulk
7920                          * sending is done. We don't trust the TSO
7921                          * implementations to clear the FIN flag on all but
7922                          * the last segment.
7923                          */
7924                         if (tp->t_flags & TF_NEEDFIN)
7925                                 sendalot = 1;
7926
7927                 } else {
7928                         len = tp->t_maxseg - optlen - ipoptlen;
7929                         sendalot = 1;
7930                 }
7931         } else
7932                 tso = 0;
7933         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
7934             ("%s: len > IP_MAXPACKET", __func__));
7935 #ifdef DIAGNOSTIC
7936 #ifdef INET6
7937         if (max_linkhdr + hdrlen > MCLBYTES)
7938 #else
7939         if (max_linkhdr + hdrlen > MHLEN)
7940 #endif
7941                 panic("tcphdr too big");
7942 #endif
7943
7944         /*
7945          * This KASSERT is here to catch edge cases at a well defined place.
7946          * Before, those had triggered (random) panic conditions further
7947          * down.
7948          */
7949         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7950         if ((len == 0) &&
7951             (flags & TH_FIN) &&
7952             (sbused(sb))) {
7953                 /*
7954                  * We have outstanding data, don't send a FIN by itself!
7955                  */
7956                 goto just_return;
7957         }
7958         /*
7959          * Grab a header mbuf, attaching a copy of data to be transmitted,
7960          * and initialize the header from the template for sends on this
7961          * connection.
7962          */
7963         if (len) {
7964                 uint32_t max_val;
7965                 uint32_t moff;
7966
7967                 if (rack->rc_pace_max_segs)
7968                         max_val = rack->rc_pace_max_segs * tp->t_maxseg;
7969                 else
7970                         max_val = len;
7971                 /*
7972                  * We allow a limit on sending with hptsi.
7973                  */
7974                 if (len > max_val) {
7975                         len = max_val;
7976                 }
7977 #ifdef INET6
7978                 if (MHLEN < hdrlen + max_linkhdr)
7979                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
7980                 else
7981 #endif
7982                         m = m_gethdr(M_NOWAIT, MT_DATA);
7983
7984                 if (m == NULL) {
7985                         SOCKBUF_UNLOCK(sb);
7986                         error = ENOBUFS;
7987                         sack_rxmit = 0;
7988                         goto out;
7989                 }
7990                 m->m_data += max_linkhdr;
7991                 m->m_len = hdrlen;
7992
7993                 /*
7994                  * Start the m_copy functions from the closest mbuf to the
7995                  * sb_offset in the socket buffer chain.
7996                  */
7997                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
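                     /*
                      * Small payloads are copied directly into the header
                      * mbuf; larger ones get their own chain via tcp_m_copym().
                      */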
7998                 if (len <= MHLEN - hdrlen - max_linkhdr) {
7999                         m_copydata(mb, moff, (int)len,
8000                             mtod(m, caddr_t)+hdrlen);
8001                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8002                                 sbsndptr_adv(sb, mb, len);
8003                         m->m_len += len;
8004                 } else {
8005                         struct sockbuf *msb;
8006
8007                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8008                                 msb = NULL;
8009                         else
8010                                 msb = sb;
8011                         m->m_next = tcp_m_copym(mb, moff, &len,
8012                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
8013                         if (len <= (tp->t_maxseg - optlen)) {
8014                                 /* 
8015                                  * Must have run out of mbufs for the copy;
8016                                  * shorten it so we no longer need TSO. Let's
8017                                  * not set sendalot since we are low on
8018                                  * mbufs.
8019                                  */
8020                                 tso = 0;
8021                         }
8022                         if (m->m_next == NULL) {
8023                                 SOCKBUF_UNLOCK(sb);
8024                                 (void)m_free(m);
8025                                 error = ENOBUFS;
8026                                 sack_rxmit = 0;
8027                                 goto out;
8028                         }
8029                 }
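                     /*
                      * Classify the send for the stats counters: a forced
                      * single byte is a window probe, a send below snd_max
                      * (or a SACK rexmit) is a retransmission or TLP, and
                      * anything else is new data.
                      */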
8030                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
8031                         TCPSTAT_INC(tcps_sndprobe);
8032 #ifdef NETFLIX_STATS
8033                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8034                                 stats_voi_update_abs_u32(tp->t_stats,
8035                                     VOI_TCP_RETXPB, len);
8036                         else
8037                                 stats_voi_update_abs_u64(tp->t_stats,
8038                                     VOI_TCP_TXPB, len);
8039 #endif
8040                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
8041                         if (rsm && (rsm->r_flags & RACK_TLP)) {
8042                                 /*
8043                                  * TLP should not count in retran count, but
8044                                  * in its own bin
8045                                  */
8046                                 counter_u64_add(rack_tlp_retran, 1);
8047                                 counter_u64_add(rack_tlp_retran_bytes, len);
8048                         } else {
8049                                 tp->t_sndrexmitpack++;
8050                                 TCPSTAT_INC(tcps_sndrexmitpack);
8051                                 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
8052                         }
8053 #ifdef NETFLIX_STATS
8054                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
8055                             len);
8056 #endif
8057                 } else {
8058                         TCPSTAT_INC(tcps_sndpack);
8059                         TCPSTAT_ADD(tcps_sndbyte, len);
8060 #ifdef NETFLIX_STATS
8061                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
8062                             len);
8063 #endif
8064                 }
8065                 /*
8066                  * If we're sending everything we've got, set PUSH. (This
8067                  * will keep happy those implementations which only give
8068                  * data to the user when a buffer fills or a PUSH comes in.)
8069                  */
8070                 if (sb_offset + len == sbused(sb) &&
8071                     sbused(sb) &&
8072                     !(flags & TH_SYN))
8073                         flags |= TH_PUSH;
8074
8075                 /*
8076                  * Are we doing hptsi? If so we must calculate the slot. We
8077                  * only do hptsi in ESTABLISHED, with no RESET being
8078                  * sent, and where we have data to send.
8079                  */
8080                 if (((tp->t_state == TCPS_ESTABLISHED) ||
8081                     (tp->t_state == TCPS_CLOSE_WAIT) ||
8082                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
8083                     ((tp->t_flags & TF_SENTFIN) == 0) &&
8084                     ((flags & TH_FIN) == 0))) &&
8085                     ((flags & TH_RST) == 0) &&
8086                     (rack->rc_always_pace)) {
8087                         /*
8088                          * We use the most optimistic possible cwnd/srtt for
8089                          * sending calculations. This will make our
8090                          * calculation anticipate getting more through
8091                          * quicker then possible. But thats ok we don't want
8092                          * quicker than possible. But that's ok; we don't want
8093                          */
8094                         uint32_t srtt, cwnd, tr_perms = 0;
8095         
8096                         if (rack->r_ctl.rc_rack_min_rtt)
8097                                 srtt = rack->r_ctl.rc_rack_min_rtt;
8098                         else
8099                                 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8100                         if (rack->r_ctl.rc_rack_largest_cwnd)
8101                                 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8102                         else
8103                                 cwnd = tp->snd_cwnd;
8104                         tr_perms = cwnd / srtt;
8105                         if (tr_perms == 0) {
8106                                 tr_perms = tp->t_maxseg;
8107                         }
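                             /*
                              * With tr_perms pinned to at least one MSS, the
                              * slot calculation below cannot divide by zero.
                              */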
8108                         tot_len_this_send += len;
8109                         /*
8110                          * Calculate how long this will take to drain. If
8111                          * the calculation comes out to zero, that's ok; we
8112                          * will use sendalot to possibly spin around for
8113                          * more, increasing tot_len_this_send to the point
8114                          * that it is going to require a pace, or we hit
8115                          * the cwnd, in which case we are just waiting for
8116                          * an ACK.
8117                          */
8118                         slot = tot_len_this_send / tr_perms;
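                             /*
                              * For example, a cwnd of 64000 and an srtt of 10
                              * give tr_perms of 6400, so 12800 bytes queued for
                              * this send yields a slot of 2 before any
                              * reduction below.
                              */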
8119                         /* Now do we reduce the time so we don't run dry? */
8120                         if (slot && rack->rc_pace_reduce) {
8121                                 int32_t reduce;
8122
8123                                 reduce = (slot / rack->rc_pace_reduce);
8124                                 if (reduce < slot) {
8125                                         slot -= reduce;
8126                                 } else
8127                                         slot = 0;
8128                         }
8129                         if (rack->r_enforce_min_pace &&
8130                             (slot == 0) &&
8131                             (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
8132                                 /* We are enforcing a minimum pace time of 1ms */
8133                                 slot = rack->r_enforce_min_pace;
8134                         }
8135                 }
8136                 SOCKBUF_UNLOCK(sb);
8137         } else {
8138                 SOCKBUF_UNLOCK(sb);
8139                 if (tp->t_flags & TF_ACKNOW)
8140                         TCPSTAT_INC(tcps_sndacks);
8141                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
8142                         TCPSTAT_INC(tcps_sndctrl);
8143                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
8144                         TCPSTAT_INC(tcps_sndurg);
8145                 else
8146                         TCPSTAT_INC(tcps_sndwinup);
8147
8148                 m = m_gethdr(M_NOWAIT, MT_DATA);
8149                 if (m == NULL) {
8150                         error = ENOBUFS;
8151                         sack_rxmit = 0;
8152                         goto out;
8153                 }
8154 #ifdef INET6
8155                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
8156                     MHLEN >= hdrlen) {
8157                         M_ALIGN(m, hdrlen);
8158                 } else
8159 #endif
8160                         m->m_data += max_linkhdr;
8161                 m->m_len = hdrlen;
8162         }
8163         SOCKBUF_UNLOCK_ASSERT(sb);
8164         m->m_pkthdr.rcvif = (struct ifnet *)0;
8165 #ifdef MAC
8166         mac_inpcb_create_mbuf(inp, m);
8167 #endif
8168 #ifdef INET6
8169         if (isipv6) {
8170                 ip6 = mtod(m, struct ip6_hdr *);
8171 #ifdef NETFLIX_TCPOUDP
8172                 if (tp->t_port) {
8173                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
8174                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8175                         udp->uh_dport = tp->t_port;
8176                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
8177                         udp->uh_ulen = htons(ulen);
8178                         th = (struct tcphdr *)(udp + 1);
8179                 } else
8180 #endif
8181                         th = (struct tcphdr *)(ip6 + 1);
8182                 tcpip_fillheaders(inp, ip6, th);
8183         } else
8184 #endif                          /* INET6 */
8185         {
8186                 ip = mtod(m, struct ip *);
8187 #ifdef TCPDEBUG
8188                 ipov = (struct ipovly *)ip;
8189 #endif
8190 #ifdef NETFLIX_TCPOUDP
8191                 if (tp->t_port) {
8192                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
8193                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8194                         udp->uh_dport = tp->t_port;
8195                         ulen = hdrlen + len - sizeof(struct ip);
8196                         udp->uh_ulen = htons(ulen);
8197                         th = (struct tcphdr *)(udp + 1);
8198                 } else
8199 #endif
8200                         th = (struct tcphdr *)(ip + 1);
8201                 tcpip_fillheaders(inp, ip, th);
8202         }
8203         /*
8204          * Fill in fields, remembering maximum advertised window for use in
8205          * delaying messages about window sizes. If resending a FIN, be sure
8206          * not to use a new sequence number.
8207          */
8208         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
8209             tp->snd_nxt == tp->snd_max)
8210                 tp->snd_nxt--;
8211         /*
8212          * If we are starting a connection, send ECN setup SYN packet. If we
8213          * are on a retransmit, we may resend those bits a number of times
8214          * as per RFC 3168.
8215          */
8216         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
8217                 if (tp->t_rxtshift >= 1) {
8218                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
8219                                 flags |= TH_ECE | TH_CWR;
8220                 } else
8221                         flags |= TH_ECE | TH_CWR;
8222         }
8223         if (tp->t_state == TCPS_ESTABLISHED &&
8224             (tp->t_flags & TF_ECN_PERMIT)) {
8225                 /*
8226                  * If the peer has ECN, mark data packets with ECN capable
8227                  * transmission (ECT). Ignore pure ack packets,
8228                  * retransmissions and window probes.
8229                  */
8230                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
8231                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
8232 #ifdef INET6
8233                         if (isipv6)
8234                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
8235                         else
8236 #endif
8237                                 ip->ip_tos |= IPTOS_ECN_ECT0;
8238                         TCPSTAT_INC(tcps_ecn_ect0);
8239                 }
8240                 /*
8241                  * Reply with proper ECN notifications.
8242                  */
8243                 if (tp->t_flags & TF_ECN_SND_CWR) {
8244                         flags |= TH_CWR;
8245                         tp->t_flags &= ~TF_ECN_SND_CWR;
8246                 }
8247                 if (tp->t_flags & TF_ECN_SND_ECE)
8248                         flags |= TH_ECE;
8249         }
8250         /*
8251          * If we are doing retransmissions, then snd_nxt will not reflect
8252          * the first unsent octet.  For ACK only packets, we do not want the
8253          * sequence number of the retransmitted packet, we want the sequence
8254          * number of the next unsent octet.  So, if there is no data (and no
8255          * SYN or FIN), use snd_max instead of snd_nxt when filling in
8256          * ti_seq.  But if we are in persist state, snd_max might reflect
8257          * one byte beyond the right edge of the window, so use snd_nxt in
8258          * that case, since we know we aren't doing a retransmission.
8259          * (retransmit and persist are mutually exclusive...)
8260          */
8261         if (sack_rxmit == 0) {
8262                 if (len || (flags & (TH_SYN | TH_FIN)) ||
8263                     rack->rc_in_persist) {
8264                         th->th_seq = htonl(tp->snd_nxt);
8265                         rack_seq = tp->snd_nxt;
8266                 } else if (flags & TH_RST) {
8267                         /*
8268                          * For a Reset send the last cum ack in sequence
8269                          * (this like any other choice may still generate a
8270                          * challenge ack, if an ack-update packet is in
8271                          * flight).
8272                          */
8273                         th->th_seq = htonl(tp->snd_una);
8274                         rack_seq = tp->snd_una;
8275                 } else {
8276                         th->th_seq = htonl(tp->snd_max);
8277                         rack_seq = tp->snd_max;
8278                 }
8279         } else {
8280                 th->th_seq = htonl(rsm->r_start);
8281                 rack_seq = rsm->r_start;
8282         }
8283         th->th_ack = htonl(tp->rcv_nxt);
8284         if (optlen) {
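                     /*
                      * Append the options built above; th_off is expressed
                      * in 32-bit words.
                      */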
8285                 bcopy(opt, th + 1, optlen);
8286                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
8287         }
8288         th->th_flags = flags;
8289         /*
8290          * Calculate receive window.  Don't shrink window, but avoid silly
8291          * window syndrome.
8292          * If a RST segment is sent, advertise a window of zero.
8293          */
8294         if (flags & TH_RST) {
8295                 recwin = 0;
8296         } else {
8297                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
8298                     recwin < (long)tp->t_maxseg)
8299                         recwin = 0;
8300                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
8301                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
8302                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
8303                 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
8304                         recwin = (long)TCP_MAXWIN << tp->rcv_scale;
8305         }
8306
8307         /*
8308          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
8309          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
8310          * handled in syncache.
8311          */
8312         if (flags & TH_SYN)
8313                 th->th_win = htons((u_short)
8314                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
8315         else
8316                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
8317         /*
8318          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
8319          * window.  This may cause the remote transmitter to stall.  This
8320          * flag tells soreceive() to disable delayed acknowledgements when
8321          * draining the buffer.  This can occur if the receiver is
8322          * attempting to read more data than can be buffered prior to
8323          * transmitting on the connection.
8324          */
8325         if (th->th_win == 0) {
8326                 tp->t_sndzerowin++;
8327                 tp->t_flags |= TF_RXWIN0SENT;
8328         } else
8329                 tp->t_flags &= ~TF_RXWIN0SENT;
8330         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8331                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
8332                 th->th_flags |= TH_URG;
8333         } else
8334                 /*
8335                  * If no urgent pointer to send, then we pull the urgent
8336                  * pointer to the left edge of the send window so that it
8337                  * doesn't drift into the send window on sequence number
8338                  * wraparound.
8339                  */
8340                 tp->snd_up = tp->snd_una;       /* drag it along */
8341
8342 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
8343         if (to.to_flags & TOF_SIGNATURE) {
8344                 /*
8345                  * Calculate MD5 signature and put it into the place
8346                  * determined before.
8347                  * NOTE: since TCP options buffer doesn't point into
8348                  * mbuf's data, calculate offset and use it.
8349                  */
8350                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
8351                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
8352                         /*
8353                          * Do not send segment if the calculation of MD5
8354                          * digest has failed.
8355                          */
8356                         goto out;
8357                 }
8358         }
8359 #endif
8360
8361         /*
8362          * Put TCP length in extended header, and then checksum extended
8363          * header and data.
8364          */
8365         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
8366 #ifdef INET6
8367         if (isipv6) {
8368                 /*
8369                  * ip6_plen does not need to be filled in now; it will be
8370                  * filled in by ip6_output.
8371                  */
8372                 if (tp->t_port) {
8373                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
8374                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8375                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
8376                         th->th_sum = htons(0);
8377                 } else {
8378                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
8379                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8380                         th->th_sum = in6_cksum_pseudo(ip6,
8381                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
8382                             0);
8383                 }
8384         }
8385 #endif
8386 #if defined(INET6) && defined(INET)
8387         else
8388 #endif
8389 #ifdef INET
8390         {
8391                 if (tp->t_port) {
8392                         m->m_pkthdr.csum_flags = CSUM_UDP;
8393                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8394                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
8395                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
8396                         th->th_sum = htons(0);
8397                 } else {
8398                         m->m_pkthdr.csum_flags = CSUM_TCP;
8399                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8400                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
8401                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
8402                             IPPROTO_TCP + len + optlen));
8403                 }
8404                 /* IP version must be set here for ipv4/ipv6 checking later */
8405                 KASSERT(ip->ip_v == IPVERSION,
8406                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
8407         }
8408 #endif
8409
8410         /*
8411          * Enable TSO and specify the size of the segments. The TCP pseudo
8412          * header checksum is always provided. XXX: Fixme: This is currently
8413          * not the case for IPv6.
8414          */
8415         if (tso) {
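                     /*
                      * tso_segsz is the payload per generated segment: the
                      * MSS less whatever TCP options this packet carries.
                      */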
8416                 KASSERT(len > tp->t_maxseg - optlen,
8417                     ("%s: len <= tso_segsz", __func__));
8418                 m->m_pkthdr.csum_flags |= CSUM_TSO;
8419                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
8420         }
8421 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8422         KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
8423             ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
8424             __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
8425 #else
8426         KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
8427             ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
8428             __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
8429 #endif
8430
8431 #ifdef TCP_HHOOK
8432         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
8433         hhook_run_tcp_est_out(tp, th, &to, len, tso);
8434 #endif
8435
8436 #ifdef TCPDEBUG
8437         /*
8438          * Trace.
8439          */
8440         if (so->so_options & SO_DEBUG) {
8441                 u_short save = 0;
8442
8443 #ifdef INET6
8444                 if (!isipv6)
8445 #endif
8446                 {
8447                         save = ipov->ih_len;
8448                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
8449                               * (th->th_off << 2) */ );
8450                 }
8451                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
8452 #ifdef INET6
8453                 if (!isipv6)
8454 #endif
8455                         ipov->ih_len = save;
8456         }
8457 #endif                          /* TCPDEBUG */
8458
8459         /* We're getting ready to send; log now. */
8460         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
8461                 union tcp_log_stackspecific log;
8462
8463                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
8464                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
8465                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
8466                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
8467                 if (rsm || sack_rxmit) {
8468                         log.u_bbr.flex8 = 1;
8469                 } else {
8470                         log.u_bbr.flex8 = 0;
8471                 }
8472                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
8473                     len, &log, false, NULL, NULL, 0, NULL);
8474         } else
8475                 lgb = NULL;
8476
8477         /*
8478          * Fill in IP length and desired time to live and send to IP level.
8479          * There should be a better way to handle ttl and tos; we could keep
8480          * them in the template, but need a way to checksum without them.
8481          */
8482         /*
8483          * m->m_pkthdr.len should have been set before checksum calculation,
8484          * because in6_cksum() needs it.
8485          */
8486 #ifdef INET6
8487         if (isipv6) {
8488                 /*
8489                  * we separately set hoplimit for every segment, since the
8490                  * user might want to change the value via setsockopt. Also,
8491                  * desired default hop limit might be changed via Neighbor
8492                  * Discovery.
8493                  */
8494                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
8495
8496                 /*
8497                  * Set the packet size here for the benefit of DTrace
8498                  * probes. ip6_output() will set it properly; it's supposed
8499                  * to include the option header lengths as well.
8500                  */
8501                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
8502
8503                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
8504                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8505                 else
8506                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8507
8508                 if (tp->t_state == TCPS_SYN_SENT)
8509                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
8510
8511                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
8512                 /* TODO: IPv6 IP6TOS_ECT bit on */
8513                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
8514                     &inp->inp_route6,
8515                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
8516                     NULL, NULL, inp);
8517
8518                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
8519                         mtu = inp->inp_route6.ro_rt->rt_mtu;
8520         }
8521 #endif                          /* INET6 */
8522 #if defined(INET) && defined(INET6)
8523         else
8524 #endif
8525 #ifdef INET
8526         {
8527                 ip->ip_len = htons(m->m_pkthdr.len);
8528 #ifdef INET6
8529                 if (inp->inp_vflag & INP_IPV6PROTO)
8530                         ip->ip_ttl = in6_selecthlim(inp, NULL);
8531 #endif                          /* INET6 */
8532                 /*
8533                  * If we do path MTU discovery, then we set DF on every
8534                  * packet. This might not be the best thing to do according
8535                  * to RFC3390 Section 2. However the tcp hostcache mitigates
8536                  * the problem so it affects only the first tcp connection
8537                  * with a host.
8538                  *
8539                  * NB: Don't set DF on small MTU/MSS to have a safe
8540                  * fallback.
8541                  */
8542                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
8543                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8544                         if (tp->t_port == 0 || len < V_tcp_minmss) {
8545                                 ip->ip_off |= htons(IP_DF);
8546                         }
8547                 } else {
8548                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8549                 }
8550
8551                 if (tp->t_state == TCPS_SYN_SENT)
8552                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
8553
8554                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
8555
8556                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
8557                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
8558                     inp);
8559                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
8560                         mtu = inp->inp_route.ro_rt->rt_mtu;
8561         }
8562 #endif                          /* INET */
8563
8564 out:
8565         if (lgb) {
8566                 lgb->tlb_errno = error;
8567                 lgb = NULL;
8568         }
8569         /*
8570          * In transmit state, time the transmission and arrange for the
8571          * retransmit.  In persist state, just set snd_max.
8572          */
8573         if (error == 0) {
8574                 if (len == 0)
8575                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
8576                 else if (len == 1) {
8577                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
8578                 } else if (len > 1) {
8579                         int idx;
8580
8581                         idx = (len / tp->t_maxseg) + 3;
8582                         if (idx >= TCP_MSS_ACCT_ATIMER)
8583                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
8584                         else
8585                                 counter_u64_add(rack_out_size[idx], 1);
8586                 }
8587         }
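             /*
              * New data sent during recovery is charged against the PRR
              * budget that was set up earlier.
              */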
8588         if (sub_from_prr && (error == 0)) {
8589                 rack->r_ctl.rc_prr_sndcnt -= len;
8590         }
8591         sub_from_prr = 0;
8592         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
8593             pass, rsm);
8594         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
8595             (rack->rc_in_persist == 0)) {
8596                 tcp_seq startseq = tp->snd_nxt;
8597
8598                 /*
8599                  * Advance snd_nxt over sequence space of this segment.
8600                  */
8601                 if (error)
8602                         /* We don't log or do anything with errors */
8603                         goto timer;
8604
8605                 if (flags & (TH_SYN | TH_FIN)) {
8606                         if (flags & TH_SYN)
8607                                 tp->snd_nxt++;
8608                         if (flags & TH_FIN) {
8609                                 tp->snd_nxt++;
8610                                 tp->t_flags |= TF_SENTFIN;
8611                         }
8612                 }
8613                 /* A SACK retransmission does not advance snd_nxt or snd_max */
8614                 if (sack_rxmit)
8615                         goto timer;
8616
8617                 tp->snd_nxt += len;
8618                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
8619                         if (tp->snd_una == tp->snd_max) {
8620                                 /*
8621                                  * We just added data when none was
8622                                  * outstanding; record the time.
8623                                  */
8624                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8625                                 tp->t_acktime = ticks;
8626                         }
8627                         tp->snd_max = tp->snd_nxt;
8628                         /*
8629                          * Time this transmission if not a retransmission and
8630                          * not currently timing anything.
8631                          * This is only relevant in case of switching back to
8632                          * the base stack.
8633                          */
8634                         if (tp->t_rtttime == 0) {
8635                                 tp->t_rtttime = ticks;
8636                                 tp->t_rtseq = startseq;
8637                                 TCPSTAT_INC(tcps_segstimed);
8638                         }
8639 #ifdef NETFLIX_STATS
8640                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
8641                                 tp->t_flags |= TF_GPUTINPROG;
8642                                 tp->gput_seq = startseq;
8643                                 tp->gput_ack = startseq +
8644                                     ulmin(sbavail(sb) - sb_offset, sendwin);
8645                                 tp->gput_ts = tcp_ts_getticks();
8646                         }
8647 #endif
8648                 }
8649                 /*
8650                  * Set retransmit timer if not currently set, and not doing
8651                  * a pure ack or a keep-alive probe. Initial value for
8652                  * retransmit timer is smoothed round-trip time + 2 *
8653                  * round-trip time variance. Initialize shift counter which
8654                  * is used for backoff of retransmit time.
8655                  */
8656 timer:
8657                 if ((tp->snd_wnd == 0) &&
8658                     TCPS_HAVEESTABLISHED(tp->t_state)) {
8659                          * If the persist timer was set above (right before
8660                          * the goto send) it still needs to be on; make sure
8661                          * everything else is canceled. If the persist timer
8662                          * is not running, start it now.
8663                          * is not running, we want to get it up.
8664                          */
8665                         if (rack->rc_in_persist == 0) {
8666                                 rack_enter_persist(tp, rack, cts);
8667                         }
8668                 }
8669         } else {
8670                 /*
8671                  * Persist case, update snd_max but since we are in persist
8672                  * mode (no window) we do not update snd_nxt.
8673                  */
8674                 int32_t xlen = len;
8675
8676                 if (error)
8677                         goto nomore;
8678
8679                 if (flags & TH_SYN)
8680                         ++xlen;
8681                 if (flags & TH_FIN) {
8682                         ++xlen;
8683                         tp->t_flags |= TF_SENTFIN;
8684                 }
8685                 /* In the ENOBUFS case we do *not* update snd_max */
8686                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
8687                         if (tp->snd_una == tp->snd_max) {
8688                                 /*
8689                                  * We just added data when none was
8690                                  * outstanding; record the time.
8691                                  */
8692                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8693                                 tp->t_acktime = ticks;
8694                         }
8695                         tp->snd_max = tp->snd_nxt + xlen;
8696                 }
8697         }
8698 nomore:
8699         if (error) {
8700                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
8701                 /*
8702                  * Failures do not advance the seq counter above. For the
8703                  * case of ENOBUFS we will fall out and retry in 1ms with
8704                  * the hpts. Everything else will just have to retransmit
8705                  * with the timer.
8706                  *
8707                  * In any case, we do not want to loop around for another
8708                  * send without a good reason.
8709                  */
8710                 sendalot = 0;
8711                 switch (error) {
8712                 case EPERM:
8713                         tp->t_flags &= ~TF_FORCEDATA;
8714                         tp->t_softerror = error;
8715                         return (error);
8716                 case ENOBUFS:
8717                         if (slot == 0) {
8718                                 /*
8719                                  * Pace us right away so we retry after a
8720                                  * short, backed-off delay.
8721                                  */
8722                                 slot = 1 + rack->rc_enobuf;
8723                                 if (rack->rc_enobuf < 255)
8724                                         rack->rc_enobuf++;
8725                                 if (slot > (rack->rc_rack_rtt / 2)) {
8726                                         slot = rack->rc_rack_rtt / 2;
8727                                 }
8728                                 if (slot < 10)
8729                                         slot = 10;
8730                         }
8731                         counter_u64_add(rack_saw_enobuf, 1);
8732                         error = 0;
8733                         goto enobufs;
8734                 case EMSGSIZE:
8735                         /*
8736                          * For some reason the interface we used initially
8737                          * to send segments changed to another one or lowered
8738                          * its MTU. If TSO was active, we either got an
8739                          * interface without TSO capabilities or TSO was
8740                          * turned off. If we obtained mtu from ip_output()
8741                          * then update it and try again.
8742                          */
8743                         if (tso)
8744                                 tp->t_flags &= ~TF_TSO;
8745                         if (mtu != 0) {
8746                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
8747                                 goto again;
8748                         }
8749                         slot = 10;
8750                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8751                         tp->t_flags &= ~TF_FORCEDATA;
8752                         return (error);
8753                 case ENETUNREACH:
8754                         counter_u64_add(rack_saw_enetunreach, 1);
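                        /* FALLTHROUGH */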
8755                 case EHOSTDOWN:
8756                 case EHOSTUNREACH:
8757                 case ENETDOWN:
8758                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
8759                                 tp->t_softerror = error;
8760                         }
8761                         /* FALLTHROUGH */
8762                 default:
8763                         slot = 10;
8764                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8765                         tp->t_flags &= ~TF_FORCEDATA;
8766                         return (error);
8767                 }
8768         } else {
8769                 rack->rc_enobuf = 0;
8770         }
8771         TCPSTAT_INC(tcps_sndtotal);
8772
8773         /*
8774          * Data sent (as far as we can tell). If this advertises a larger
8775          * window than any other segment, then remember the size of the
8776          * advertised window. Any pending ACK has now been sent.
8777          */
8778         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8779                 tp->rcv_adv = tp->rcv_nxt + recwin;
8780         tp->last_ack_sent = tp->rcv_nxt;
8781         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
8782 enobufs:
8783         rack->r_tlp_running = 0;
8784         if ((flags & TH_RST) || (would_have_fin == 1)) {
8785                 /*
8786                  * We don't send again after a RST. We also do *not* send
8787                  * again if we would have had a FIN, but now have
8788                  * outstanding data.
8789                  */
8790                 slot = 0;
8791                 sendalot = 0;
8792         }
8793         if (slot) {
8794                 /* We have a pacing slot; the tcb will be queued on the hpts below. */
8795                 counter_u64_add(rack_paced_segments, 1);
8796         } else if (sendalot) {
8797                 if (len)
8798                         counter_u64_add(rack_unpaced_segments, 1);
8799                 sack_rxmit = 0;
8800                 tp->t_flags &= ~TF_FORCEDATA;
8801                 goto again;
8802         } else if (len) {
8803                 counter_u64_add(rack_unpaced_segments, 1);
8804         }
8805         tp->t_flags &= ~TF_FORCEDATA;
8806         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
8807         return (error);
8808 }
8809
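/*
 * Illustrative only (not part of this file): a userspace caller would
 * typically select this stack and then set one of the options handled
 * below.  A minimal sketch, assuming the TCP_FUNCTION_BLK and TCP_RACK_*
 * constants from <netinet/tcp.h> and a stack registered under the name
 * "rack":
 *
 *	struct tcp_function_set tfs;
 *	int one = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *	setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &one, sizeof(one));
 */
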
8810 /*
8811  * rack_ctloutput() must drop the inpcb lock before performing copyin on
8812  * socket option arguments.  When it re-acquires the lock after the copy, it
8813  * has to revalidate that the connection is still valid for the socket
8814  * option.
8815  */
8816 static int
8817 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
8818     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8819 {
8820         int32_t error = 0, optval;
8821
8822         switch (sopt->sopt_name) {
8823         case TCP_RACK_PROP_RATE:
8824         case TCP_RACK_PROP:
8825         case TCP_RACK_TLP_REDUCE:
8826         case TCP_RACK_EARLY_RECOV:
8827         case TCP_RACK_PACE_ALWAYS:
8828         case TCP_DELACK:
8829         case TCP_RACK_PACE_REDUCE:
8830         case TCP_RACK_PACE_MAX_SEG:
8831         case TCP_RACK_PRR_SENDALOT:
8832         case TCP_RACK_MIN_TO:
8833         case TCP_RACK_EARLY_SEG:
8834         case TCP_RACK_REORD_THRESH:
8835         case TCP_RACK_REORD_FADE:
8836         case TCP_RACK_TLP_THRESH:
8837         case TCP_RACK_PKT_DELAY:
8838         case TCP_RACK_TLP_USE:
8839         case TCP_RACK_TLP_INC_VAR:
8840         case TCP_RACK_IDLE_REDUCE_HIGH:
8841         case TCP_RACK_MIN_PACE:
8842         case TCP_RACK_MIN_PACE_SEG:
8843         case TCP_BBR_RACK_RTT_USE:
8844         case TCP_DATA_AFTER_CLOSE:
8845                 break;
8846         default:
8847                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8848                 break;
8849         }
8850         INP_WUNLOCK(inp);
8851         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
8852         if (error)
8853                 return (error);
8854         INP_WLOCK(inp);
8855         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
8856                 INP_WUNLOCK(inp);
8857                 return (ECONNRESET);
8858         }
8859         tp = intotcpcb(inp);
8860         rack = (struct tcp_rack *)tp->t_fb_ptr;
8861         switch (sopt->sopt_name) {
8862         case TCP_RACK_PROP_RATE:
8863                 if ((optval <= 0) || (optval >= 100)) {
8864                         error = EINVAL;
8865                         break;
8866                 }
8867                 RACK_OPTS_INC(tcp_rack_prop_rate);
8868                 rack->r_ctl.rc_prop_rate = optval;
8869                 break;
8870         case TCP_RACK_TLP_USE:
8871                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
8872                         error = EINVAL;
8873                         break;
8874                 }
8875                 RACK_OPTS_INC(tcp_tlp_use);
8876                 rack->rack_tlp_threshold_use = optval;
8877                 break;
8878         case TCP_RACK_PROP:
8879                 /* RACK proportional rate reduction (bool) */
8880                 RACK_OPTS_INC(tcp_rack_prop);
8881                 rack->r_ctl.rc_prop_reduce = optval;
8882                 break;
8883         case TCP_RACK_TLP_REDUCE:
8884                 /* RACK TLP cwnd reduction (bool) */
8885                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
8886                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
8887                 break;
8888         case TCP_RACK_EARLY_RECOV:
8889                 /* Should recovery happen early (bool) */
8890                 RACK_OPTS_INC(tcp_rack_early_recov);
8891                 rack->r_ctl.rc_early_recovery = optval;
8892                 break;
8893         case TCP_RACK_PACE_ALWAYS:
8894                 /* Use the always pace method (bool)  */
8895                 RACK_OPTS_INC(tcp_rack_pace_always);
8896                 if (optval > 0)
8897                         rack->rc_always_pace = 1;
8898                 else
8899                         rack->rc_always_pace = 0;
8900                 break;
8901         case TCP_RACK_PACE_REDUCE:
8902                 /* RACK Hptsi reduction factor (divisor) */
8903                 RACK_OPTS_INC(tcp_rack_pace_reduce);
8904                 if (optval)
8905                         /* Must be non-zero */
8906                         rack->rc_pace_reduce = optval;
8907                 else
8908                         error = EINVAL;
8909                 break;
8910         case TCP_RACK_PACE_MAX_SEG:
8911                 /* Max segments in a pace */
8912                 RACK_OPTS_INC(tcp_rack_max_seg);
8913                 rack->rc_pace_max_segs = optval;
8914                 break;
8915         case TCP_RACK_PRR_SENDALOT:
8916                 /* Allow PRR to send more than one seg */
8917                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
8918                 rack->r_ctl.rc_prr_sendalot = optval;
8919                 break;
8920         case TCP_RACK_MIN_TO:
8921                 /* Minimum time between rack t-o's in ms */
8922                 RACK_OPTS_INC(tcp_rack_min_to);
8923                 rack->r_ctl.rc_min_to = optval;
8924                 break;
8925         case TCP_RACK_EARLY_SEG:
8926                 /* If early recovery max segments */
8927                 RACK_OPTS_INC(tcp_rack_early_seg);
8928                 rack->r_ctl.rc_early_recovery_segs = optval;
8929                 break;
8930         case TCP_RACK_REORD_THRESH:
8931                 /* RACK reorder threshold (shift amount) */
8932                 RACK_OPTS_INC(tcp_rack_reord_thresh);
8933                 if ((optval > 0) && (optval < 31))
8934                         rack->r_ctl.rc_reorder_shift = optval;
8935                 else
8936                         error = EINVAL;
8937                 break;
8938         case TCP_RACK_REORD_FADE:
8939                 /* Does reordering fade after ms time */
8940                 RACK_OPTS_INC(tcp_rack_reord_fade);
8941                 rack->r_ctl.rc_reorder_fade = optval;
8942                 break;
8943         case TCP_RACK_TLP_THRESH:
8944                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
8945                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
8946                 if (optval)
8947                         rack->r_ctl.rc_tlp_threshold = optval;
8948                 else
8949                         error = EINVAL;
8950                 break;
8951         case TCP_RACK_PKT_DELAY:
8952                 /* RACK added ms i.e. rack-rtt + reord + N */
8953                 RACK_OPTS_INC(tcp_rack_pkt_delay);
8954                 rack->r_ctl.rc_pkt_delay = optval;
8955                 break;
8956         case TCP_RACK_TLP_INC_VAR:
8957                 /* Does TLP include rtt variance in t-o */
8958                 RACK_OPTS_INC(tcp_rack_tlp_inc_var);
8959                 rack->r_ctl.rc_prr_inc_var = optval;
8960                 break;
8961         case TCP_RACK_IDLE_REDUCE_HIGH:
8962                 RACK_OPTS_INC(tcp_rack_idle_reduce_high);
8963                 if (optval)
8964                         rack->r_idle_reduce_largest = 1;
8965                 else
8966                         rack->r_idle_reduce_largest = 0;
8967                 break;
8968         case TCP_DELACK:
8969                 if (optval == 0)
8970                         tp->t_delayed_ack = 0;
8971                 else
8972                         tp->t_delayed_ack = 1;
8973                 if (tp->t_flags & TF_DELACK) {
8974                         tp->t_flags &= ~TF_DELACK;
8975                         tp->t_flags |= TF_ACKNOW;
8976                         rack_output(tp);
8977                 }
8978                 break;
8979         case TCP_RACK_MIN_PACE:
8980                 RACK_OPTS_INC(tcp_rack_min_pace);
8981                 if (optval > 3)
8982                         rack->r_enforce_min_pace = 3;
8983                 else
8984                         rack->r_enforce_min_pace = optval;
8985                 break;
8986         case TCP_RACK_MIN_PACE_SEG:
8987                 RACK_OPTS_INC(tcp_rack_min_pace_seg);
8988                 if (optval >= 16)
8989                         rack->r_min_pace_seg_thresh = 15;
8990                 else
8991                         rack->r_min_pace_seg_thresh = optval;
8992                 break;
8993         case TCP_BBR_RACK_RTT_USE:
8994                 if ((optval != USE_RTT_HIGH) &&
8995                     (optval != USE_RTT_LOW) &&
8996                     (optval != USE_RTT_AVG))
8997                         error = EINVAL;
8998                 else
8999                         rack->r_ctl.rc_rate_sample_method = optval;
9000                 break;
9001         case TCP_DATA_AFTER_CLOSE:
9002                 if (optval)
9003                         rack->rc_allow_data_af_clo = 1;
9004                 else
9005                         rack->rc_allow_data_af_clo = 0;
9006                 break;
9007         default:
9008                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9009                 break;
9010         }
9011 #ifdef NETFLIX_STATS
9012         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
9013 #endif
9014         INP_WUNLOCK(inp);
9015         return (error);
9016 }
9017
9018 static int
9019 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
9020     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
9021 {
9022         int32_t error, optval;
9023
9024         /*
9025          * Because all our options are either boolean or an int, we can just
9026          * pull everything into optval and then unlock and copy. If we ever
9027          * add an option that is not an int, then this will have quite an
9028          * impact on this routine.
9029          */
9030         switch (sopt->sopt_name) {
9031         case TCP_RACK_PROP_RATE:
9032                 optval = rack->r_ctl.rc_prop_rate;
9033                 break;
9034         case TCP_RACK_PROP:
9035                 /* RACK proportional rate reduction (bool) */
9036                 optval = rack->r_ctl.rc_prop_reduce;
9037                 break;
9038         case TCP_RACK_TLP_REDUCE:
9039                 /* RACK TLP cwnd reduction (bool) */
9040                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
9041                 break;
9042         case TCP_RACK_EARLY_RECOV:
9043                 /* Should recovery happen early (bool) */
9044                 optval = rack->r_ctl.rc_early_recovery;
9045                 break;
9046         case TCP_RACK_PACE_REDUCE:
9047                 /* RACK Hptsi reduction factor (divisor) */
9048                 optval = rack->rc_pace_reduce;
9049                 break;
9050         case TCP_RACK_PACE_MAX_SEG:
9051                 /* Max segments in a pace */
9052                 optval = rack->rc_pace_max_segs;
9053                 break;
9054         case TCP_RACK_PACE_ALWAYS:
9055                 /* Use the always pace method */
9056                 optval = rack->rc_always_pace;
9057                 break;
9058         case TCP_RACK_PRR_SENDALOT:
9059                 /* Allow PRR to send more than one seg */
9060                 optval = rack->r_ctl.rc_prr_sendalot;
9061                 break;
9062         case TCP_RACK_MIN_TO:
9063                 /* Minimum time between rack t-o's in ms */
9064                 optval = rack->r_ctl.rc_min_to;
9065                 break;
9066         case TCP_RACK_EARLY_SEG:
9067                 /* If early recovery max segments */
9068                 optval = rack->r_ctl.rc_early_recovery_segs;
9069                 break;
9070         case TCP_RACK_REORD_THRESH:
9071                 /* RACK reorder threshold (shift amount) */
9072                 optval = rack->r_ctl.rc_reorder_shift;
9073                 break;
9074         case TCP_RACK_REORD_FADE:
9075                 /* Does reordering fade after ms time */
9076                 optval = rack->r_ctl.rc_reorder_fade;
9077                 break;
9078         case TCP_RACK_TLP_THRESH:
9079                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
9080                 optval = rack->r_ctl.rc_tlp_threshold;
9081                 break;
9082         case TCP_RACK_PKT_DELAY:
9083                 /* RACK added ms i.e. rack-rtt + reord + N */
9084                 optval = rack->r_ctl.rc_pkt_delay;
9085                 break;
9086         case TCP_RACK_TLP_USE:
9087                 optval = rack->rack_tlp_threshold_use;
9088                 break;
9089         case TCP_RACK_TLP_INC_VAR:
9090                 /* Does TLP include rtt variance in t-o */
9091                 optval = rack->r_ctl.rc_prr_inc_var;
9092                 break;
9093         case TCP_RACK_IDLE_REDUCE_HIGH:
9094                 optval = rack->r_idle_reduce_largest;
9095                 break;
9096         case TCP_RACK_MIN_PACE:
9097                 optval = rack->r_enforce_min_pace;
9098                 break;
9099         case TCP_RACK_MIN_PACE_SEG:
9100                 optval = rack->r_min_pace_seg_thresh;
9101                 break;
9102         case TCP_BBR_RACK_RTT_USE:
9103                 optval = rack->r_ctl.rc_rate_sample_method;
9104                 break;
9105         case TCP_DELACK:
9106                 optval = tp->t_delayed_ack;
9107                 break;
9108         case TCP_DATA_AFTER_CLOSE:
9109                 optval = rack->rc_allow_data_af_clo;
9110                 break;
9111         default:
9112                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9113                 break;
9114         }
9115         INP_WUNLOCK(inp);
9116         error = sooptcopyout(sopt, &optval, sizeof optval);
9117         return (error);
9118 }
9119
9120 static int
9121 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
9122 {
9123         int32_t error = EINVAL;
9124         struct tcp_rack *rack;
9125
9126         rack = (struct tcp_rack *)tp->t_fb_ptr;
9127         if (rack == NULL) {
9128                 /* Huh? */
9129                 goto out;
9130         }
9131         if (sopt->sopt_dir == SOPT_SET) {
9132                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
9133         } else if (sopt->sopt_dir == SOPT_GET) {
9134                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
9135         }
9136 out:
9137         INP_WUNLOCK(inp);
9138         return (error);
9139 }
9140
9141
9142 struct tcp_function_block __tcp_rack = {
9143         .tfb_tcp_block_name = __XSTRING(STACKNAME),
9144         .tfb_tcp_output = rack_output,
9145         .tfb_tcp_do_segment = rack_do_segment,
9146         .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
9147         .tfb_tcp_ctloutput = rack_ctloutput,
9148         .tfb_tcp_fb_init = rack_init,
9149         .tfb_tcp_fb_fini = rack_fini,
9150         .tfb_tcp_timer_stop_all = rack_stopall,
9151         .tfb_tcp_timer_activate = rack_timer_activate,
9152         .tfb_tcp_timer_active = rack_timer_active,
9153         .tfb_tcp_timer_stop = rack_timer_stop,
9154         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
9155         .tfb_tcp_handoff_ok = rack_handoff_ok
9156 };
9157
9158 static const char *rack_stack_names[] = {
9159         __XSTRING(STACKNAME),
9160 #ifdef STACKALIAS
9161         __XSTRING(STACKALIAS),
9162 #endif
9163 };
9164
9165 static int
9166 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
9167 {
9168         memset(mem, 0, size);
9169         return (0);
9170 }
9171
9172 static void
9173 rack_dtor(void *mem, int32_t size, void *arg)
9174 {
9175
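        /* Nothing to tear down; rack_ctor() merely zeroed the entry. */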
9176 }
9177
9178 static bool rack_mod_inited = false;
9179
9180 static int
9181 tcp_addrack(module_t mod, int32_t type, void *data)
9182 {
9183         int32_t err = 0;
9184         int num_stacks;
9185
9186         switch (type) {
9187         case MOD_LOAD:
9188                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
9189                     sizeof(struct rack_sendmap),
9190                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
9191
9192                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
9193                     sizeof(struct tcp_rack),
9194                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
9195
9196                 sysctl_ctx_init(&rack_sysctl_ctx);
9197                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
9198                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
9199                     OID_AUTO,
9200                     __XSTRING(STACKNAME),
9201                     CTLFLAG_RW, 0,
9202                     "");
9203                 if (rack_sysctl_root == NULL) {
9204                         printf("Failed to add sysctl node\n");
9205                         err = EFAULT;
9206                         goto free_uma;
9207                 }
9208                 rack_init_sysctls();
9209                 num_stacks = nitems(rack_stack_names);
9210                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
9211                     rack_stack_names, &num_stacks);
9212                 if (err) {
9213                         printf("Failed to register %s stack name for "
9214                             "%s module\n", rack_stack_names[num_stacks],
9215                             __XSTRING(MODNAME));
9216                         sysctl_ctx_free(&rack_sysctl_ctx);
9217 free_uma:
9218                         uma_zdestroy(rack_zone);
9219                         uma_zdestroy(rack_pcb_zone);
9220                         rack_counter_destroy();
9221                         printf("Failed to register rack module -- err:%d\n", err);
9222                         return (err);
9223                 }
9224                 rack_mod_inited = true;
9225                 break;
9226         case MOD_QUIESCE:
9227                 err = deregister_tcp_functions(&__tcp_rack, true, false);
9228                 break;
9229         case MOD_UNLOAD:
9230                 err = deregister_tcp_functions(&__tcp_rack, false, true);
9231                 if (err == EBUSY)
9232                         break;
9233                 if (rack_mod_inited) {
9234                         uma_zdestroy(rack_zone);
9235                         uma_zdestroy(rack_pcb_zone);
9236                         sysctl_ctx_free(&rack_sysctl_ctx);
9237                         rack_counter_destroy();
9238                         rack_mod_inited = false;
9239                 }
9240                 err = 0;
9241                 break;
9242         default:
9243                 return (EOPNOTSUPP);
9244         }
9245         return (err);
9246 }
9247
9248 static moduledata_t tcp_rack = {
9249         .name = __XSTRING(MODNAME),
9250         .evhand = tcp_addrack,
9251         .priv = 0
9252 };
9253
9254 MODULE_VERSION(MODNAME, 1);
9255 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
9256 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
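
/*
 * Usage sketch (informational, not part of the build): once this module
 * is loaded (e.g. "kldload tcp_rack"), the stack can be selected
 * system-wide with the sysctl net.inet.tcp.functions_default=rack, or
 * per-socket with the TCP_FUNCTION_BLK socket option shown above.
 * "sysctl net.inet.tcp.functions_available" lists the registered stacks.
 */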