1 /*-
2  * Copyright (c) 2016-2018 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34
35 #include <sys/param.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #ifdef TCP_HHOOK
39 #include <sys/hhook.h>
40 #endif
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
44 #include <sys/mutex.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h>           /* for proc0 declaration */
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 #ifdef NETFLIX_STATS
52 #include <sys/stats.h>
53 #endif
54 #include <sys/refcount.h>
55 #include <sys/queue.h>
56 #include <sys/smp.h>
57 #include <sys/kthread.h>
58 #include <sys/kern_prefetch.h>
59
60 #include <vm/uma.h>
61
62 #include <net/route.h>
63 #include <net/vnet.h>
64
65 #define TCPSTATES               /* for logging */
66
67 #include <netinet/in.h>
68 #include <netinet/in_kdtrace.h>
69 #include <netinet/in_pcb.h>
70 #include <netinet/ip.h>
71 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
72 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
73 #include <netinet/ip_var.h>
74 #include <netinet/ip6.h>
75 #include <netinet6/in6_pcb.h>
76 #include <netinet6/ip6_var.h>
77 #include <netinet/tcp.h>
78 #define TCPOUTFLAGS
79 #include <netinet/tcp_fsm.h>
80 #include <netinet/tcp_log_buf.h>
81 #include <netinet/tcp_seq.h>
82 #include <netinet/tcp_timer.h>
83 #include <netinet/tcp_var.h>
84 #include <netinet/tcp_hpts.h>
85 #include <netinet/tcpip.h>
86 #include <netinet/cc/cc.h>
87 #ifdef NETFLIX_CWV
88 #include <netinet/tcp_newcwv.h>
89 #endif
90 #include <netinet/tcp_fastopen.h>
91 #ifdef TCPDEBUG
92 #include <netinet/tcp_debug.h>
93 #endif                          /* TCPDEBUG */
94 #ifdef TCP_OFFLOAD
95 #include <netinet/tcp_offload.h>
96 #endif
97 #ifdef INET6
98 #include <netinet6/tcp6_var.h>
99 #endif
100
101 #include <netipsec/ipsec_support.h>
102
103 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
104 #include <netipsec/ipsec.h>
105 #include <netipsec/ipsec6.h>
106 #endif                          /* IPSEC */
107
108 #include <netinet/udp.h>
109 #include <netinet/udp_var.h>
110 #include <machine/in_cksum.h>
111
112 #ifdef MAC
113 #include <security/mac/mac_framework.h>
114 #endif
115 #include "sack_filter.h"
116 #include "tcp_rack.h"
117 #include "rack_bbr_common.h"
118
119 uma_zone_t rack_zone;
120 uma_zone_t rack_pcb_zone;
121
122 #ifndef TICKS2SBT
123 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
124 #endif
125
126 struct sysctl_ctx_list rack_sysctl_ctx;
127 struct sysctl_oid *rack_sysctl_root;
128
129 #define CUM_ACKED 1
130 #define SACKED 2
131
132 /*
133  * The RACK module incorporates a number of
134  * TCP ideas that have been put out into the IETF
135  * over the last few years:
136  * - Matt Mathis's Rate Halving which slowly drops
137  *    the congestion window so that the ack clock can
138  *    be maintained during a recovery.
139  * - Yuchung Cheng's RACK TCP (for which it is named) that
140  *    will stop us using the number of dup acks and instead
141  *    use time as the gauge of when we retransmit.
142  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
143  *    of Dukkipati et al.
144  * RACK depends on SACK, so if an endpoint arrives that
145  * cannot do SACK the state machine below will shuttle the
146  * connection back to using the "default" TCP stack that is
147  * in FreeBSD.
148  *
149  * To implement RACK the original TCP stack was first decomposed
150  * into a functional state machine with individual states
151  * for each of the possible TCP connection states. The do_segment
152  * function's role in life is to mandate that the connection supports SACK
153  * initially and then assure that the RACK state matches the connection
154  * state before calling the state's do_segment function. Each
155  * state is simplified due to the fact that the original do_segment
156  * has been decomposed and we *know* what state we are in (no
157  * switches on the state) and all tests for SACK are gone. This
158  * greatly simplifies what each state does.
159  *
160  * TCP output is also overridden with a new version since it
161  * must maintain the new rack scoreboard.
162  *
163  */
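/*
 * For illustration only: the decomposition described above means that
 * instead of one large do_segment() switching on tp->t_state, a per-state
 * handler is picked up front and called directly.  A minimal sketch (the
 * handler names are the rack_do_* functions declared further down in this
 * file; the selection shown is an approximation of what rack_set_state()
 * does, not a copy of it):
 *
 *	switch (tp->t_state) {
 *	case TCPS_SYN_SENT:
 *		rack->r_substate = rack_do_syn_sent;
 *		break;
 *	case TCPS_SYN_RECEIVED:
 *		rack->r_substate = rack_do_syn_recv;
 *		break;
 *	case TCPS_ESTABLISHED:
 *		rack->r_substate = rack_do_established;
 *		break;
 *	case TCPS_CLOSE_WAIT:
 *		rack->r_substate = rack_do_close_wait;
 *		break;
 *	default:
 *		break;
 *	}
 */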
164 static int32_t rack_precache = 1;
165 static int32_t rack_tlp_thresh = 1;
166 static int32_t rack_reorder_thresh = 2;
167 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
168                                                  * - 60 seconds */
169 static int32_t rack_pkt_delay = 1;
170 static int32_t rack_inc_var = 0;/* For TLP */
171 static int32_t rack_reduce_largest_on_idle = 0;
172 static int32_t rack_min_pace_time = 0;
173 static int32_t rack_min_pace_time_seg_req=6;
174 static int32_t rack_early_recovery = 1;
175 static int32_t rack_early_recovery_max_seg = 6;
176 static int32_t rack_send_a_lot_in_prr = 1;
177 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
178 static int32_t rack_tlp_in_recovery = 1;        /* Can we do TLP in recovery? */
179 static int32_t rack_verbose_logging = 0;
180 static int32_t rack_ignore_data_after_close = 1;
181 /*
182  * Currently regular TCP has an rto_min of 30ms;
183  * the backoff is applied 12 times, so that ends up
184  * being a total of 122.850 seconds before a
185  * connection is killed.
186  */
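/*
 * A worked example of the arithmetic above, assuming a pure doubling
 * backoff that never hits a cap: with rto_min = 30ms and 12 successive
 * timeouts, each double the previous, the total wait is
 * 30ms * (2^0 + 2^1 + ... + 2^11) = 30ms * 4095 = 122,850ms, i.e. the
 * 122.850 seconds quoted.
 */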
187 static int32_t rack_tlp_min = 10;
188 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
189 static int32_t rack_rto_max = 30000;    /* 30 seconds */
190 static const int32_t rack_free_cache = 2;
191 static int32_t rack_hptsi_segments = 40;
192 static int32_t rack_rate_sample_method = USE_RTT_LOW;
193 static int32_t rack_pace_every_seg = 1;
194 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
195 static int32_t rack_slot_reduction = 4;
196 static int32_t rack_lower_cwnd_at_tlp = 0;
197 static int32_t rack_use_proportional_reduce = 0;
198 static int32_t rack_proportional_rate = 10;
199 static int32_t rack_tlp_max_resend = 2;
200 static int32_t rack_limited_retran = 0;
201 static int32_t rack_always_send_oldest = 0;
202 static int32_t rack_sack_block_limit = 128;
203 static int32_t rack_use_sack_filter = 1;
204 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
205 static uint32_t rack_map_split_limit = 0;       /* unlimited by default */
206
207 /* Rack specific counters */
208 counter_u64_t rack_badfr;
209 counter_u64_t rack_badfr_bytes;
210 counter_u64_t rack_rtm_prr_retran;
211 counter_u64_t rack_rtm_prr_newdata;
212 counter_u64_t rack_timestamp_mismatch;
213 counter_u64_t rack_reorder_seen;
214 counter_u64_t rack_paced_segments;
215 counter_u64_t rack_unpaced_segments;
216 counter_u64_t rack_saw_enobuf;
217 counter_u64_t rack_saw_enetunreach;
218
219 /* Tail loss probe counters */
220 counter_u64_t rack_tlp_tot;
221 counter_u64_t rack_tlp_newdata;
222 counter_u64_t rack_tlp_retran;
223 counter_u64_t rack_tlp_retran_bytes;
224 counter_u64_t rack_tlp_retran_fail;
225 counter_u64_t rack_to_tot;
226 counter_u64_t rack_to_arm_rack;
227 counter_u64_t rack_to_arm_tlp;
228 counter_u64_t rack_to_alloc;
229 counter_u64_t rack_to_alloc_hard;
230 counter_u64_t rack_to_alloc_emerg;
231 counter_u64_t rack_alloc_limited_conns;
232 counter_u64_t rack_split_limited;
233
234 counter_u64_t rack_sack_proc_all;
235 counter_u64_t rack_sack_proc_short;
236 counter_u64_t rack_sack_proc_restart;
237 counter_u64_t rack_runt_sacks;
238 counter_u64_t rack_used_tlpmethod;
239 counter_u64_t rack_used_tlpmethod2;
240 counter_u64_t rack_enter_tlp_calc;
241 counter_u64_t rack_input_idle_reduces;
242 counter_u64_t rack_tlp_does_nada;
243
244 /* Temp CPU counters */
245 counter_u64_t rack_find_high;
246
247 counter_u64_t rack_progress_drops;
248 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
249 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
250
251 static void
252 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
253
254 static int
255 rack_process_ack(struct mbuf *m, struct tcphdr *th,
256     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
257     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
258 static int
259 rack_process_data(struct mbuf *m, struct tcphdr *th,
260     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
261     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
262 static void
263 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
264     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
265 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
266 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
267     uint8_t limit_type);
268 static struct rack_sendmap *
269 rack_check_recovery_mode(struct tcpcb *tp,
270     uint32_t tsused);
271 static void
272 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
273     uint32_t type);
274 static void rack_counter_destroy(void);
275 static int
276 rack_ctloutput(struct socket *so, struct sockopt *sopt,
277     struct inpcb *inp, struct tcpcb *tp);
278 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
279 static void
280 rack_do_segment(struct mbuf *m, struct tcphdr *th,
281     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
282     uint8_t iptos);
283 static void rack_dtor(void *mem, int32_t size, void *arg);
284 static void
285 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
286     uint32_t t, uint32_t cts);
287 static struct rack_sendmap *
288 rack_find_high_nonack(struct tcp_rack *rack,
289     struct rack_sendmap *rsm);
290 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
291 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
292 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
293 static int
294 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
295     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
296 static int32_t rack_handoff_ok(struct tcpcb *tp);
297 static int32_t rack_init(struct tcpcb *tp);
298 static void rack_init_sysctls(void);
299 static void
300 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
301     struct tcphdr *th);
302 static void
303 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
304     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
305     uint8_t pass, struct rack_sendmap *hintrsm);
306 static void
307 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
308     struct rack_sendmap *rsm);
309 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
310 static int32_t rack_output(struct tcpcb *tp);
311 static void
312 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
313     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
314     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
315
316 static uint32_t
317 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
318     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
319     uint32_t cts);
320 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
321 static void rack_remxt_tmr(struct tcpcb *tp);
322 static int
323 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
324     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
325 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
326 static int32_t rack_stopall(struct tcpcb *tp);
327 static void
328 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
329     uint32_t delta);
330 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
331 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
332 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
333 static uint32_t
334 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
335     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
336 static void
337 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
338     struct rack_sendmap *rsm, uint32_t ts);
339 static int
340 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
341     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
342 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
343 static void
344 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
345     struct tcpcb *tp, int32_t * ret_val);
346 static int
347 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
348     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
349     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
350 static int
351 rack_do_closing(struct mbuf *m, struct tcphdr *th,
352     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
353     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
354 static void
355 rack_do_drop(struct mbuf *m, struct tcpcb *tp);
356 static void
357 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
358     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
359 static void
360 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
361         struct tcphdr *th, int32_t rstreason, int32_t tlen);
362 static int
363 rack_do_established(struct mbuf *m, struct tcphdr *th,
364     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
365     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
366 static int
367 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
368     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
369     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
370 static int
371 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
372     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
373     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
374 static int
375 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
376     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
377     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
378 static int
379 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
380     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
381     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
382 static int
383 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
384     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
385     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
386 static int
387 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
388     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
389     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
390 static int
391 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
392     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
393     int32_t * drop_hdrlen, int32_t * ret_val);
394 static int
395 rack_process_rst(struct mbuf *m, struct tcphdr *th,
396     struct socket *so, struct tcpcb *tp);
397 struct rack_sendmap *
398 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
399     uint32_t tsused);
400 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
401 static void
402      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
403
404 static int
405 rack_ts_check(struct mbuf *m, struct tcphdr *th,
406     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
407
408 int32_t rack_clear_counter=0;
409
410
411 static int
412 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
413 {
414         uint32_t stat;
415         int32_t error;
416
417         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
418         if (error || req->newptr == NULL)
419                 return error;
420
421         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
422         if (error)
423                 return (error);
424         if (stat == 1) {
425 #ifdef INVARIANTS
426                 printf("Clearing RACK counters\n");
427 #endif
428                 counter_u64_zero(rack_badfr);
429                 counter_u64_zero(rack_badfr_bytes);
430                 counter_u64_zero(rack_rtm_prr_retran);
431                 counter_u64_zero(rack_rtm_prr_newdata);
432                 counter_u64_zero(rack_timestamp_mismatch);
433                 counter_u64_zero(rack_reorder_seen);
434                 counter_u64_zero(rack_tlp_tot);
435                 counter_u64_zero(rack_tlp_newdata);
436                 counter_u64_zero(rack_tlp_retran);
437                 counter_u64_zero(rack_tlp_retran_bytes);
438                 counter_u64_zero(rack_tlp_retran_fail);
439                 counter_u64_zero(rack_to_tot);
440                 counter_u64_zero(rack_to_arm_rack);
441                 counter_u64_zero(rack_to_arm_tlp);
442                 counter_u64_zero(rack_paced_segments);
443                 counter_u64_zero(rack_unpaced_segments);
444                 counter_u64_zero(rack_saw_enobuf);
445                 counter_u64_zero(rack_saw_enetunreach);
446                 counter_u64_zero(rack_to_alloc_hard);
447                 counter_u64_zero(rack_to_alloc_emerg);
448                 counter_u64_zero(rack_sack_proc_all);
449                 counter_u64_zero(rack_sack_proc_short);
450                 counter_u64_zero(rack_sack_proc_restart);
451                 counter_u64_zero(rack_to_alloc);
452                 counter_u64_zero(rack_alloc_limited_conns);
453                 counter_u64_zero(rack_split_limited);
454                 counter_u64_zero(rack_find_high);
455                 counter_u64_zero(rack_runt_sacks);
456                 counter_u64_zero(rack_used_tlpmethod);
457                 counter_u64_zero(rack_used_tlpmethod2);
458                 counter_u64_zero(rack_enter_tlp_calc);
459                 counter_u64_zero(rack_progress_drops);
460                 counter_u64_zero(rack_tlp_does_nada);
461         }
462         rack_clear_counter = 0;
463         return (0);
464 }
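/*
 * Example of clearing the counters from userland, assuming the stack's
 * sysctl tree is rooted at net.inet.tcp.rack (rack_sysctl_root itself is
 * created elsewhere in this file):
 *
 *	# sysctl net.inet.tcp.rack.clear=1
 *
 * Writing 1 zeroes the counters listed above; reading the node simply
 * returns the current value of rack_clear_counter.
 */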
465
466
467
468 static void
469 rack_init_sysctls()
470 {
471         SYSCTL_ADD_S32(&rack_sysctl_ctx,
472             SYSCTL_CHILDREN(rack_sysctl_root),
473             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
474             &rack_rate_sample_method , USE_RTT_LOW,
475             "What method should we use for rate sampling? 0=high, 1=low");
476         SYSCTL_ADD_S32(&rack_sysctl_ctx,
477             SYSCTL_CHILDREN(rack_sysctl_root),
478             OID_AUTO, "data_after_close", CTLFLAG_RW,
479             &rack_ignore_data_after_close, 0,
480             "Do we hold off sending a RST until all pending data is ack'd");
481         SYSCTL_ADD_S32(&rack_sysctl_ctx,
482             SYSCTL_CHILDREN(rack_sysctl_root),
483             OID_AUTO, "tlpmethod", CTLFLAG_RW,
484             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
485             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
486         SYSCTL_ADD_S32(&rack_sysctl_ctx,
487             SYSCTL_CHILDREN(rack_sysctl_root),
488             OID_AUTO, "min_pace_time", CTLFLAG_RW,
489             &rack_min_pace_time, 0,
490             "Should we enforce a minimum pace time of 1ms");
491         SYSCTL_ADD_S32(&rack_sysctl_ctx,
492             SYSCTL_CHILDREN(rack_sysctl_root),
493             OID_AUTO, "min_pace_segs", CTLFLAG_RW,
494             &rack_min_pace_time_seg_req, 6,
495             "How many segments have to be in the len to enforce min-pace-time");
496         SYSCTL_ADD_S32(&rack_sysctl_ctx,
497             SYSCTL_CHILDREN(rack_sysctl_root),
498             OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
499             &rack_reduce_largest_on_idle, 0,
500             "Should we reduce the largest cwnd seen to IW on idle reduction");
501         SYSCTL_ADD_S32(&rack_sysctl_ctx,
502             SYSCTL_CHILDREN(rack_sysctl_root),
503             OID_AUTO, "bb_verbose", CTLFLAG_RW,
504             &rack_verbose_logging, 0,
505             "Should RACK black box logging be verbose");
506         SYSCTL_ADD_S32(&rack_sysctl_ctx,
507             SYSCTL_CHILDREN(rack_sysctl_root),
508             OID_AUTO, "sackfiltering", CTLFLAG_RW,
509             &rack_use_sack_filter, 1,
510             "Do we use sack filtering?");
511         SYSCTL_ADD_S32(&rack_sysctl_ctx,
512             SYSCTL_CHILDREN(rack_sysctl_root),
513             OID_AUTO, "delayed_ack", CTLFLAG_RW,
514             &rack_delayed_ack_time, 200,
515             "Delayed ack time (200ms)");
516         SYSCTL_ADD_S32(&rack_sysctl_ctx,
517             SYSCTL_CHILDREN(rack_sysctl_root),
518             OID_AUTO, "tlpminto", CTLFLAG_RW,
519             &rack_tlp_min, 10,
520             "TLP minimum timeout per the specification (10ms)");
521         SYSCTL_ADD_S32(&rack_sysctl_ctx,
522             SYSCTL_CHILDREN(rack_sysctl_root),
523             OID_AUTO, "precache", CTLFLAG_RW,
524             &rack_precache, 0,
525             "Where should we precache the mcopy (0 is not at all)");
526         SYSCTL_ADD_S32(&rack_sysctl_ctx,
527             SYSCTL_CHILDREN(rack_sysctl_root),
528             OID_AUTO, "sblklimit", CTLFLAG_RW,
529             &rack_sack_block_limit, 128,
530             "When do we start paying attention to small sack blocks");
531         SYSCTL_ADD_S32(&rack_sysctl_ctx,
532             SYSCTL_CHILDREN(rack_sysctl_root),
533             OID_AUTO, "send_oldest", CTLFLAG_RW,
534             &rack_always_send_oldest, 1,
535             "Should we always send the oldest TLP and RACK-TLP");
536         SYSCTL_ADD_S32(&rack_sysctl_ctx,
537             SYSCTL_CHILDREN(rack_sysctl_root),
538             OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
539             &rack_tlp_in_recovery, 1,
540             "Can we do a TLP during recovery?");
541         SYSCTL_ADD_S32(&rack_sysctl_ctx,
542             SYSCTL_CHILDREN(rack_sysctl_root),
543             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
544             &rack_limited_retran, 0,
545             "How many times can a rack timeout drive out sends");
546         SYSCTL_ADD_S32(&rack_sysctl_ctx,
547             SYSCTL_CHILDREN(rack_sysctl_root),
548             OID_AUTO, "minrto", CTLFLAG_RW,
549             &rack_rto_min, 0,
550             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
551         SYSCTL_ADD_S32(&rack_sysctl_ctx,
552             SYSCTL_CHILDREN(rack_sysctl_root),
553             OID_AUTO, "maxrto", CTLFLAG_RW,
554             &rack_rto_max, 0,
555             "Maximum RTO in ms -- should be at least as large as min_rto");
556         SYSCTL_ADD_S32(&rack_sysctl_ctx,
557             SYSCTL_CHILDREN(rack_sysctl_root),
558             OID_AUTO, "tlp_retry", CTLFLAG_RW,
559             &rack_tlp_max_resend, 2,
560             "How many times does TLP retry a single segment or multiple with no ACK");
561         SYSCTL_ADD_S32(&rack_sysctl_ctx,
562             SYSCTL_CHILDREN(rack_sysctl_root),
563             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
564             &rack_use_proportional_reduce, 0,
565             "Should we proportionally reduce cwnd based on the number of losses");
566         SYSCTL_ADD_S32(&rack_sysctl_ctx,
567             SYSCTL_CHILDREN(rack_sysctl_root),
568             OID_AUTO, "recovery_prop", CTLFLAG_RW,
569             &rack_proportional_rate, 10,
570             "What percent reduction per loss");
571         SYSCTL_ADD_S32(&rack_sysctl_ctx,
572             SYSCTL_CHILDREN(rack_sysctl_root),
573             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
574             &rack_lower_cwnd_at_tlp, 0,
575             "When a TLP completes a retran should we enter recovery?");
576         SYSCTL_ADD_S32(&rack_sysctl_ctx,
577             SYSCTL_CHILDREN(rack_sysctl_root),
578             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
579             &rack_slot_reduction, 4,
580             "When setting a slot should we reduce by divisor");
581         SYSCTL_ADD_S32(&rack_sysctl_ctx,
582             SYSCTL_CHILDREN(rack_sysctl_root),
583             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
584             &rack_pace_every_seg, 1,
585             "Should we pace out every segment hptsi");
586         SYSCTL_ADD_S32(&rack_sysctl_ctx,
587             SYSCTL_CHILDREN(rack_sysctl_root),
588             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
589             &rack_hptsi_segments, 6,
590             "Should we pace out only a limited size of segments");
591         SYSCTL_ADD_S32(&rack_sysctl_ctx,
592             SYSCTL_CHILDREN(rack_sysctl_root),
593             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
594             &rack_send_a_lot_in_prr, 1,
595             "Send a lot in prr");
596         SYSCTL_ADD_S32(&rack_sysctl_ctx,
597             SYSCTL_CHILDREN(rack_sysctl_root),
598             OID_AUTO, "minto", CTLFLAG_RW,
599             &rack_min_to, 1,
600             "Minimum rack timeout in milliseconds");
601         SYSCTL_ADD_S32(&rack_sysctl_ctx,
602             SYSCTL_CHILDREN(rack_sysctl_root),
603             OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
604             &rack_early_recovery_max_seg, 6,
605             "Max segments in early recovery");
606         SYSCTL_ADD_S32(&rack_sysctl_ctx,
607             SYSCTL_CHILDREN(rack_sysctl_root),
608             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
609             &rack_early_recovery, 1,
610             "Do we do early recovery with rack");
611         SYSCTL_ADD_S32(&rack_sysctl_ctx,
612             SYSCTL_CHILDREN(rack_sysctl_root),
613             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
614             &rack_reorder_thresh, 2,
615             "What factor for rack will be added when seeing reordering (shift right)");
616         SYSCTL_ADD_S32(&rack_sysctl_ctx,
617             SYSCTL_CHILDREN(rack_sysctl_root),
618             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
619             &rack_tlp_thresh, 1,
620             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
621         SYSCTL_ADD_S32(&rack_sysctl_ctx,
622             SYSCTL_CHILDREN(rack_sysctl_root),
623             OID_AUTO, "reorder_fade", CTLFLAG_RW,
624             &rack_reorder_fade, 0,
625             "Does reorder detection fade, if so how many ms (0 means never)");
626         SYSCTL_ADD_S32(&rack_sysctl_ctx,
627             SYSCTL_CHILDREN(rack_sysctl_root),
628             OID_AUTO, "pktdelay", CTLFLAG_RW,
629             &rack_pkt_delay, 1,
630             "Extra RACK time (in ms) besides reordering thresh");
631         SYSCTL_ADD_U32(&rack_sysctl_ctx,
632             SYSCTL_CHILDREN(rack_sysctl_root),
633             OID_AUTO, "split_limit", CTLFLAG_RW,
634             &rack_map_split_limit, 0,
635             "Is there a limit on the number of map split entries (0=unlimited)");
636         SYSCTL_ADD_S32(&rack_sysctl_ctx,
637             SYSCTL_CHILDREN(rack_sysctl_root),
638             OID_AUTO, "inc_var", CTLFLAG_RW,
639             &rack_inc_var, 0,
640             "Should rack add to the TLP timer the variance in rtt calculation");
641         rack_badfr = counter_u64_alloc(M_WAITOK);
642         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
643             SYSCTL_CHILDREN(rack_sysctl_root),
644             OID_AUTO, "badfr", CTLFLAG_RD,
645             &rack_badfr, "Total number of bad FRs");
646         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
647         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
648             SYSCTL_CHILDREN(rack_sysctl_root),
649             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
650             &rack_badfr_bytes, "Total bytes of bad FRs");
651         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
652         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
653             SYSCTL_CHILDREN(rack_sysctl_root),
654             OID_AUTO, "prrsndret", CTLFLAG_RD,
655             &rack_rtm_prr_retran,
656             "Total number of prr based retransmits");
657         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
658         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
659             SYSCTL_CHILDREN(rack_sysctl_root),
660             OID_AUTO, "prrsndnew", CTLFLAG_RD,
661             &rack_rtm_prr_newdata,
662             "Total number of prr based new transmits");
663         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
664         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
665             SYSCTL_CHILDREN(rack_sysctl_root),
666             OID_AUTO, "tsnf", CTLFLAG_RD,
667             &rack_timestamp_mismatch,
668             "Total number of times we could not find the reported timestamp");
669         rack_find_high = counter_u64_alloc(M_WAITOK);
670         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
671             SYSCTL_CHILDREN(rack_sysctl_root),
672             OID_AUTO, "findhigh", CTLFLAG_RD,
673             &rack_find_high,
674             "Total number of FIN causing find-high");
675         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
676         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
677             SYSCTL_CHILDREN(rack_sysctl_root),
678             OID_AUTO, "reordering", CTLFLAG_RD,
679             &rack_reorder_seen,
680             "Total number of times we added delay due to reordering");
681         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
682         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
683             SYSCTL_CHILDREN(rack_sysctl_root),
684             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
685             &rack_tlp_tot,
686             "Total number of tail loss probe expirations");
687         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
688         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
689             SYSCTL_CHILDREN(rack_sysctl_root),
690             OID_AUTO, "tlp_new", CTLFLAG_RD,
691             &rack_tlp_newdata,
692             "Total number of tail loss probe sending new data");
693
694         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
695         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
696             SYSCTL_CHILDREN(rack_sysctl_root),
697             OID_AUTO, "tlp_retran", CTLFLAG_RD,
698             &rack_tlp_retran,
699             "Total number of tail loss probe sending retransmitted data");
700         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
701         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
702             SYSCTL_CHILDREN(rack_sysctl_root),
703             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
704             &rack_tlp_retran_bytes,
705             "Total bytes of tail loss probe sending retransmitted data");
706         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
707         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
708             SYSCTL_CHILDREN(rack_sysctl_root),
709             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
710             &rack_tlp_retran_fail,
711             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
712         rack_to_tot = counter_u64_alloc(M_WAITOK);
713         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
714             SYSCTL_CHILDREN(rack_sysctl_root),
715             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
716             &rack_to_tot,
717             "Total number of times the rack timeout expired");
718         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
719         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
720             SYSCTL_CHILDREN(rack_sysctl_root),
721             OID_AUTO, "arm_rack", CTLFLAG_RD,
722             &rack_to_arm_rack,
723             "Total number of times the rack timer was armed");
724         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
725         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
726             SYSCTL_CHILDREN(rack_sysctl_root),
727             OID_AUTO, "arm_tlp", CTLFLAG_RD,
728             &rack_to_arm_tlp,
729             "Total number of times the tlp timer was armed");
730         rack_paced_segments = counter_u64_alloc(M_WAITOK);
731         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
732             SYSCTL_CHILDREN(rack_sysctl_root),
733             OID_AUTO, "paced", CTLFLAG_RD,
734             &rack_paced_segments,
735             "Total number of times a segment send caused hptsi");
736         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
737         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
738             SYSCTL_CHILDREN(rack_sysctl_root),
739             OID_AUTO, "unpaced", CTLFLAG_RD,
740             &rack_unpaced_segments,
741             "Total number of times a segment did not cause hptsi");
742         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
743         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
744             SYSCTL_CHILDREN(rack_sysctl_root),
745             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
746             &rack_saw_enobuf,
747             "Total number of times we saw an ENOBUFS error on transmit");
748         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
749         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
750             SYSCTL_CHILDREN(rack_sysctl_root),
751             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
752             &rack_saw_enetunreach,
753             "Total number of times we saw an ENETUNREACH error on transmit");
754         rack_to_alloc = counter_u64_alloc(M_WAITOK);
755         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
756             SYSCTL_CHILDREN(rack_sysctl_root),
757             OID_AUTO, "allocs", CTLFLAG_RD,
758             &rack_to_alloc,
759             "Total allocations of tracking structures");
760         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
761         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
762             SYSCTL_CHILDREN(rack_sysctl_root),
763             OID_AUTO, "allochard", CTLFLAG_RD,
764             &rack_to_alloc_hard,
765             "Total allocations done with sleeping the hard way");
766         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
767         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
768             SYSCTL_CHILDREN(rack_sysctl_root),
769             OID_AUTO, "allocemerg", CTLFLAG_RD,
770             &rack_to_alloc_emerg,
771             "Total allocations done from emergency cache");
772         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
773         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
774             SYSCTL_CHILDREN(rack_sysctl_root),
775             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
776             &rack_alloc_limited_conns,
777             "Connections with allocations dropped due to limit");
778         rack_split_limited = counter_u64_alloc(M_WAITOK);
779         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
780             SYSCTL_CHILDREN(rack_sysctl_root),
781             OID_AUTO, "split_limited", CTLFLAG_RD,
782             &rack_split_limited,
783             "Split allocations dropped due to limit");
784         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
785         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
786             SYSCTL_CHILDREN(rack_sysctl_root),
787             OID_AUTO, "sack_long", CTLFLAG_RD,
788             &rack_sack_proc_all,
789             "Total times we had to walk whole list for sack processing");
790
791         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
792         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
793             SYSCTL_CHILDREN(rack_sysctl_root),
794             OID_AUTO, "sack_restart", CTLFLAG_RD,
795             &rack_sack_proc_restart,
796             "Total times we had to walk whole list due to a restart");
797         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
798         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
799             SYSCTL_CHILDREN(rack_sysctl_root),
800             OID_AUTO, "sack_short", CTLFLAG_RD,
801             &rack_sack_proc_short,
802             "Total times we took shortcut for sack processing");
803         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
804         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
805             SYSCTL_CHILDREN(rack_sysctl_root),
806             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
807             &rack_enter_tlp_calc,
808             "Total times we called calc-tlp");
809         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
810         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
811             SYSCTL_CHILDREN(rack_sysctl_root),
812             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
813             &rack_used_tlpmethod,
814             "Total number of times we used TLP method one");
815         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
816         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
817             SYSCTL_CHILDREN(rack_sysctl_root),
818             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
819             &rack_used_tlpmethod2,
820             "Total number of times we used TLP method two");
821         rack_runt_sacks = counter_u64_alloc(M_WAITOK);
822         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
823             SYSCTL_CHILDREN(rack_sysctl_root),
824             OID_AUTO, "runtsacks", CTLFLAG_RD,
825             &rack_runt_sacks,
826             "Total number of runt sacks");
827         rack_progress_drops = counter_u64_alloc(M_WAITOK);
828         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
829             SYSCTL_CHILDREN(rack_sysctl_root),
830             OID_AUTO, "prog_drops", CTLFLAG_RD,
831             &rack_progress_drops,
832             "Total number of progress drops");
833         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
834         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
835             SYSCTL_CHILDREN(rack_sysctl_root),
836             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
837             &rack_input_idle_reduces,
838             "Total number of idle reductions on input");
839         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
840         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
841             SYSCTL_CHILDREN(rack_sysctl_root),
842             OID_AUTO, "tlp_nada", CTLFLAG_RD,
843             &rack_tlp_does_nada,
844             "Total number of nada tlp calls");
845         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
846         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
847             OID_AUTO, "outsize", CTLFLAG_RD,
848             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
849         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
850         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
851             OID_AUTO, "opts", CTLFLAG_RD,
852             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
853         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
854             SYSCTL_CHILDREN(rack_sysctl_root),
855             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
856             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
857 }
858
859 static inline int32_t
860 rack_progress_timeout_check(struct tcpcb *tp)
861 {
862         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
863                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
864                         /*
865                          * There is an assumption that the caller
866                          * will drop the connection so we will
867                          * increment the counters here.
868                          */
869                         struct tcp_rack *rack;
870                         rack = (struct tcp_rack *)tp->t_fb_ptr;
871                         counter_u64_add(rack_progress_drops, 1);
872 #ifdef NETFLIX_STATS
873                         TCPSTAT_INC(tcps_progdrops);
874 #endif
875                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
876                         return (1);
877                 }
878         }
879         return (0);
880 }
881
882
883 static void
884 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
885 {
886         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
887                 union tcp_log_stackspecific log;
888
889                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
890                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
891                 log.u_bbr.flex2 = to;
892                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
893                 log.u_bbr.flex4 = slot;
894                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
895                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
896                 log.u_bbr.flex8 = which;
897                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
898                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
899                 TCP_LOG_EVENT(rack->rc_tp, NULL,
900                     &rack->rc_inp->inp_socket->so_rcv,
901                     &rack->rc_inp->inp_socket->so_snd,
902                     BBR_LOG_TIMERSTAR, 0,
903                     0, &log, false);
904         }
905 }
906
907 static void
908 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
909 {
910         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
911                 union tcp_log_stackspecific log;
912
913                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
914                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
915                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
916                 log.u_bbr.flex8 = to_num;
917                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
918                 log.u_bbr.flex2 = rack->rc_rack_rtt;
919                 TCP_LOG_EVENT(rack->rc_tp, NULL,
920                     &rack->rc_inp->inp_socket->so_rcv,
921                     &rack->rc_inp->inp_socket->so_snd,
922                     BBR_LOG_RTO, 0,
923                     0, &log, false);
924         }
925 }
926
927 static void
928 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
929     uint32_t o_srtt, uint32_t o_var)
930 {
931         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
932                 union tcp_log_stackspecific log;
933
934                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
935                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
936                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
937                 log.u_bbr.flex1 = t;
938                 log.u_bbr.flex2 = o_srtt;
939                 log.u_bbr.flex3 = o_var;
940                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
941                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;           
942                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
943                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
944                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
945                 TCP_LOG_EVENT(tp, NULL,
946                     &rack->rc_inp->inp_socket->so_rcv,
947                     &rack->rc_inp->inp_socket->so_snd,
948                     BBR_LOG_BBRRTT, 0,
949                     0, &log, false);
950         }
951 }
952
953 static void
954 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
955 {
956         /* 
957          * Log the rtt sample we are
958          * applying to the srtt algorithm in
959          * useconds.
960          */
961         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
962                 union tcp_log_stackspecific log;
963                 struct timeval tv;
964                 
965                 /* Convert our ms to a microsecond */
966                 log.u_bbr.flex1 = rtt * 1000;
967                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
968                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
969                     &rack->rc_inp->inp_socket->so_rcv,
970                     &rack->rc_inp->inp_socket->so_snd,
971                     TCP_LOG_RTT, 0,
972                     0, &log, false, &tv);
973         }
974 }
975
976
977 static inline void
978 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
979 {
980         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
981                 union tcp_log_stackspecific log;
982
983                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
984                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
985                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
986                 log.u_bbr.flex1 = line;
987                 log.u_bbr.flex2 = tick;
988                 log.u_bbr.flex3 = tp->t_maxunacktime;
989                 log.u_bbr.flex4 = tp->t_acktime;
990                 log.u_bbr.flex8 = event;
991                 TCP_LOG_EVENT(tp, NULL,
992                     &rack->rc_inp->inp_socket->so_rcv,
993                     &rack->rc_inp->inp_socket->so_snd,
994                     BBR_LOG_PROGRESS, 0,
995                     0, &log, false);
996         }
997 }
998
999 static void
1000 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1001 {
1002         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1003                 union tcp_log_stackspecific log;
1004
1005                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1006                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1007                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1008                 log.u_bbr.flex1 = slot;
1009                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1010                 log.u_bbr.flex8 = rack->rc_in_persist;
1011                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1012                     &rack->rc_inp->inp_socket->so_rcv,
1013                     &rack->rc_inp->inp_socket->so_snd,
1014                     BBR_LOG_BBRSND, 0,
1015                     0, &log, false);
1016         }
1017 }
1018
1019 static void
1020 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1021 {
1022         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1023                 union tcp_log_stackspecific log;
1024                 log.u_bbr.flex1 = did_out;
1025                 log.u_bbr.flex2 = nxt_pkt;
1026                 log.u_bbr.flex3 = way_out;
1027                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1028                 log.u_bbr.flex7 = rack->r_wanted_output;
1029                 log.u_bbr.flex8 = rack->rc_in_persist;
1030                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1031                     &rack->rc_inp->inp_socket->so_rcv,
1032                     &rack->rc_inp->inp_socket->so_snd,
1033                     BBR_LOG_DOSEG_DONE, 0,
1034                     0, &log, false);
1035         }
1036 }
1037
1038
1039 static void
1040 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1041 {
1042         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1043                 union tcp_log_stackspecific log;
1044
1045                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1046                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1047                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1048                 log.u_bbr.flex1 = slot;
1049                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1050                 log.u_bbr.flex7 = hpts_calling;
1051                 log.u_bbr.flex8 = rack->rc_in_persist;
1052                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1053                     &rack->rc_inp->inp_socket->so_rcv,
1054                     &rack->rc_inp->inp_socket->so_snd,
1055                     BBR_LOG_JUSTRET, 0,
1056                     tlen, &log, false);
1057         }
1058 }
1059
1060 static void
1061 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1062 {
1063         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1064                 union tcp_log_stackspecific log;
1065
1066                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1067                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1068                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1069                 log.u_bbr.flex1 = line;
1070                 log.u_bbr.flex2 = 0;
1071                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1072                 log.u_bbr.flex4 = 0;
1073                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1074                 log.u_bbr.flex8 = hpts_removed;
1075                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1076                     &rack->rc_inp->inp_socket->so_rcv,
1077                     &rack->rc_inp->inp_socket->so_snd,
1078                     BBR_LOG_TIMERCANC, 0,
1079                     0, &log, false);
1080         }
1081 }
1082
1083 static void
1084 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1085 {
1086         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1087                 union tcp_log_stackspecific log;
1088
1089                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1090                 log.u_bbr.flex1 = timers;
1091                 log.u_bbr.flex2 = ret;
1092                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1093                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1094                 log.u_bbr.flex5 = cts;
1095                 TCP_LOG_EVENT(rack->rc_tp, NULL,
1096                     &rack->rc_inp->inp_socket->so_rcv,
1097                     &rack->rc_inp->inp_socket->so_snd,
1098                     BBR_LOG_TO_PROCESS, 0,
1099                     0, &log, false);
1100         }
1101 }
1102
1103 static void
1104 rack_counter_destroy()
1105 {
1106         counter_u64_free(rack_badfr);
1107         counter_u64_free(rack_badfr_bytes);
1108         counter_u64_free(rack_rtm_prr_retran);
1109         counter_u64_free(rack_rtm_prr_newdata);
1110         counter_u64_free(rack_timestamp_mismatch);
1111         counter_u64_free(rack_reorder_seen);
1112         counter_u64_free(rack_tlp_tot);
1113         counter_u64_free(rack_tlp_newdata);
1114         counter_u64_free(rack_tlp_retran);
1115         counter_u64_free(rack_tlp_retran_bytes);
1116         counter_u64_free(rack_tlp_retran_fail);
1117         counter_u64_free(rack_to_tot);
1118         counter_u64_free(rack_to_arm_rack);
1119         counter_u64_free(rack_to_arm_tlp);
1120         counter_u64_free(rack_paced_segments);
1121         counter_u64_free(rack_unpaced_segments);
1122         counter_u64_free(rack_saw_enobuf);
1123         counter_u64_free(rack_saw_enetunreach);
1124         counter_u64_free(rack_to_alloc_hard);
1125         counter_u64_free(rack_to_alloc_emerg);
1126         counter_u64_free(rack_sack_proc_all);
1127         counter_u64_free(rack_sack_proc_short);
1128         counter_u64_free(rack_sack_proc_restart);
1129         counter_u64_free(rack_to_alloc);
1130         counter_u64_free(rack_find_high);
1131         counter_u64_free(rack_runt_sacks);
1132         counter_u64_free(rack_enter_tlp_calc);
1133         counter_u64_free(rack_used_tlpmethod);
1134         counter_u64_free(rack_used_tlpmethod2);
1135         counter_u64_free(rack_progress_drops);
1136         counter_u64_free(rack_input_idle_reduces);
1137         counter_u64_free(rack_tlp_does_nada);
             counter_u64_free(rack_alloc_limited_conns);
             counter_u64_free(rack_split_limited);
1138         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1139         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1140 }
1141
1142 static struct rack_sendmap *
1143 rack_alloc(struct tcp_rack *rack)
1144 {
1145         struct rack_sendmap *rsm;
1146
1147         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1148         if (rsm) {
1149 alloc_done:
1150                 counter_u64_add(rack_to_alloc, 1);
1151                 rack->r_ctl.rc_num_maps_alloced++;
1152                 return (rsm);
1153         }
1154         if (rack->rc_free_cnt) {
1155                 counter_u64_add(rack_to_alloc_emerg, 1);
1156                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1157                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
1158                 rack->rc_free_cnt--;
1159                 goto alloc_done;
1160         }
1161         return (NULL);
1162 }
1163
1164 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1165 static struct rack_sendmap *
1166 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1167 {
1168         struct rack_sendmap *rsm;
1169
1170         if (limit_type) {
1171                 /* currently there is only one limit type */
1172                 if (rack_map_split_limit > 0 &&
1173                     rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
1174                         counter_u64_add(rack_split_limited, 1);
1175                         if (!rack->alloc_limit_reported) {
1176                                 rack->alloc_limit_reported = 1;
1177                                 counter_u64_add(rack_alloc_limited_conns, 1);
1178                         }
1179                         return (NULL);
1180                 }
1181         }
1182
1183         /* allocate and mark in the limit type, if set */
1184         rsm = rack_alloc(rack);
1185         if (rsm != NULL && limit_type) {
1186                 rsm->r_limit_type = limit_type;
1187                 rack->r_ctl.rc_num_split_allocs++;
1188         }
1189         return (rsm);
1190 }
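/*
 * Sketch of the expected caller pattern for a split allocation (the real
 * callers live in the SACK-processing code further down; the limit-type
 * constant name below is assumed from this stack's headers rather than
 * shown here):
 *
 *	nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 *	if (nrsm == NULL)
 *		return;		(we are over rack_map_split_limit)
 *
 * Capping split entries bounds how much sendmap memory a peer can force
 * us to allocate by advertising many small, disjoint SACK blocks.
 */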
1191
1192 static void
1193 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1194 {
1195         if (rsm->r_limit_type) {
1196                 /* currently there is only one limit type */
1197                 rack->r_ctl.rc_num_split_allocs--;
1198         }
1199         rack->r_ctl.rc_num_maps_alloced--;
1200         if (rack->r_ctl.rc_tlpsend == rsm)
1201                 rack->r_ctl.rc_tlpsend = NULL;
1202         if (rack->r_ctl.rc_next == rsm)
1203                 rack->r_ctl.rc_next = NULL;
1204         if (rack->r_ctl.rc_sacklast == rsm)
1205                 rack->r_ctl.rc_sacklast = NULL;
1206         if (rack->rc_free_cnt < rack_free_cache) {
1207                 memset(rsm, 0, sizeof(struct rack_sendmap));
1208                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
1209                 rack->rc_free_cnt++;
1210                 return;
1211         }
1212         uma_zfree(rack_zone, rsm);
1213 }
1214
1215 /*
1216  * CC wrapper hook functions
1217  */
1218 static void
1219 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1220     uint16_t type, int32_t recovery)
1221 {
1222 #ifdef NETFLIX_STATS
1223         int32_t gput;
1224 #endif
1225 #ifdef NETFLIX_CWV
1226         u_long old_cwnd = tp->snd_cwnd;
1227 #endif
1228
1229         INP_WLOCK_ASSERT(tp->t_inpcb);
1230         tp->ccv->nsegs = nsegs;
1231         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1232         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1233                 uint32_t max;
1234
1235                 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
1236                 if (tp->ccv->bytes_this_ack > max) {
1237                         tp->ccv->bytes_this_ack = max;
1238                 }
1239         }
1240         if (tp->snd_cwnd <= tp->snd_wnd)
1241                 tp->ccv->flags |= CCF_CWND_LIMITED;
1242         else
1243                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1244
1245         if (type == CC_ACK) {
1246 #ifdef NETFLIX_STATS
1247                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1248                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1249                 if ((tp->t_flags & TF_GPUTINPROG) &&
1250                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1251                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1252                             max(1, tcp_ts_getticks() - tp->gput_ts);
1253                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1254                             gput);
1255                         /*
1256                          * XXXLAS: This is a temporary hack, and should be
1257                          * chained off VOI_TCP_GPUT when stats(9) grows an
1258                          * API to deal with chained VOIs.
1259                          */
1260                         if (tp->t_stats_gput_prev > 0)
1261                                 stats_voi_update_abs_s32(tp->t_stats,
1262                                     VOI_TCP_GPUT_ND,
1263                                     ((gput - tp->t_stats_gput_prev) * 100) /
1264                                     tp->t_stats_gput_prev);
1265                         tp->t_flags &= ~TF_GPUTINPROG;
1266                         tp->t_stats_gput_prev = gput;
1267 #ifdef NETFLIX_CWV
1268                         if (tp->t_maxpeakrate) {
1269                                 /*
1270                                  * We update t_peakrate_thr. This gives us roughly
1271                                  * one update per round trip time.
1272                                  */
1273                                 tcp_update_peakrate_thr(tp);
1274                         }
1275 #endif
1276                 }
1277 #endif
1278                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1279                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1280                             nsegs * V_tcp_abc_l_var * tp->t_maxseg);
1281                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1282                                 tp->t_bytes_acked -= tp->snd_cwnd;
1283                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1284                         }
1285                 } else {
1286                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1287                         tp->t_bytes_acked = 0;
1288                 }
1289         }
1290         if (CC_ALGO(tp)->ack_received != NULL) {
1291                 /* XXXLAS: Find a way to live without this */
1292                 tp->ccv->curack = th->th_ack;
1293                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1294         }
1295 #ifdef NETFLIX_STATS
1296         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1297 #endif
1298         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1299                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1300         }
1301 #ifdef NETFLIX_CWV
1302         if (tp->cwv_enabled) {
1303                 /*
1304                  * Per RFC 7661: The behaviour in the non-validated phase is
1305                  * specified as: o  A sender determines whether to increase
1306                  * the cwnd based upon whether it is cwnd-limited (see
1307                  * Section 4.5.3): * A sender that is cwnd-limited MAY use
1308                  * the standard TCP method to increase cwnd (i.e., the
1309                  * standard method permits a TCP sender that fully utilises
1310                  * the cwnd to increase the cwnd each time it receives an
1311                  * ACK). * A sender that is not cwnd-limited MUST NOT
1312                  * increase the cwnd when ACK packets are received in this
1313                  * phase (i.e., needs to avoid growing the cwnd when it has
1314                  * not recently sent using the current size of cwnd).
1315                  */
1316                 if ((tp->snd_cwnd > old_cwnd) &&
1317                     (tp->cwv_cwnd_valid == 0) &&
1318                     (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
1319                         tp->snd_cwnd = old_cwnd;
1320                 }
1321                 /* Try to update pipeAck and NCWV state */
1322                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1323                     !IN_RECOVERY(tp->t_flags)) {
1324                         uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
1325
1326                         tcp_newcwv_update_pipeack(tp, data);
1327                 }
1328         }
1329         /* we enforce max peak rate if it is set. */
1330         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1331                 tp->snd_cwnd = tp->t_peakrate_thr;
1332         }
1333 #endif
1334 }
1335
1336 static void
1337 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1338 {
1339         struct tcp_rack *rack;
1340
1341         rack = (struct tcp_rack *)tp->t_fb_ptr;
1342         INP_WLOCK_ASSERT(tp->t_inpcb);
1343         if (rack->r_ctl.rc_prr_sndcnt > 0)
1344                 rack->r_wanted_output++;
1345 }
1346
1347 static void
1348 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1349 {
1350         struct tcp_rack *rack;
1351
1352         INP_WLOCK_ASSERT(tp->t_inpcb);
1353         rack = (struct tcp_rack *)tp->t_fb_ptr;
1354         if (CC_ALGO(tp)->post_recovery != NULL) {
1355                 tp->ccv->curack = th->th_ack;
1356                 CC_ALGO(tp)->post_recovery(tp->ccv);
1357         }
1358         /*
1359          * Here we can in theory adjust cwnd to be based on the number of
1360          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1361          * based on the rack_use_proportional flag.
1362          */
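        /*
         * Worked example (illustrative): with rc_prop_rate = 10 and
         * rc_loss_count = 3, reduce = 30, so a 60000-byte cwnd becomes
         * 42000; reduce is capped at 50 (i.e. at most half the cwnd).
         */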
1363         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1364                 int32_t reduce;
1365
1366                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1367                 if (reduce > 50) {
1368                         reduce = 50;
1369                 }
1370                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1371         } else {
1372                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1373                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1374                         tp->snd_cwnd = tp->snd_ssthresh;
1375                 }
1376         }
1377         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1378                 /* Suck the next prr cnt back into cwnd */
1379                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1380                 rack->r_ctl.rc_prr_sndcnt = 0;
1381         }
1382         EXIT_RECOVERY(tp->t_flags);
1383
1384
1385 #ifdef NETFLIX_CWV
1386         if (tp->cwv_enabled) {
1387                 if ((tp->cwv_cwnd_valid == 0) &&
1388                     (tp->snd_cwv.in_recovery))
1389                         tcp_newcwv_end_recovery(tp);
1390         }
1391 #endif
1392 }
1393
1394 static void
1395 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1396 {
1397         struct tcp_rack *rack;
1398
1399         INP_WLOCK_ASSERT(tp->t_inpcb);
1400
1401         rack = (struct tcp_rack *)tp->t_fb_ptr;
1402         switch (type) {
1403         case CC_NDUPACK:
1404 /*              rack->r_ctl.rc_ssthresh_set = 1;*/
1405                 if (!IN_FASTRECOVERY(tp->t_flags)) {
1406                         rack->r_ctl.rc_tlp_rtx_out = 0;
1407                         rack->r_ctl.rc_prr_delivered = 0;
1408                         rack->r_ctl.rc_prr_out = 0;
1409                         rack->r_ctl.rc_loss_count = 0;
1410                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
1411                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1412                         tp->snd_recover = tp->snd_max;
1413                         if (tp->t_flags & TF_ECN_PERMIT)
1414                                 tp->t_flags |= TF_ECN_SND_CWR;
1415                 }
1416                 break;
1417         case CC_ECN:
1418                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1419                         TCPSTAT_INC(tcps_ecn_rcwnd);
1420                         tp->snd_recover = tp->snd_max;
1421                         if (tp->t_flags & TF_ECN_PERMIT)
1422                                 tp->t_flags |= TF_ECN_SND_CWR;
1423                 }
1424                 break;
1425         case CC_RTO:
1426                 tp->t_dupacks = 0;
1427                 tp->t_bytes_acked = 0;
1428                 EXIT_RECOVERY(tp->t_flags);
1429                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1430                     tp->t_maxseg) * tp->t_maxseg;
1431                 tp->snd_cwnd = tp->t_maxseg;
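                /*
                 * Example (illustrative): snd_wnd = 100000, snd_cwnd = 80000
                 * and t_maxseg = 1460 give
                 * ssthresh = max(2, 80000 / 2 / 1460) * 1460 = 27 * 1460
                 * = 39420, while cwnd restarts at one segment.
                 */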
1432                 break;
1433         case CC_RTO_ERR:
1434                 TCPSTAT_INC(tcps_sndrexmitbad);
1435                 /* RTO was unnecessary, so reset everything. */
1436                 tp->snd_cwnd = tp->snd_cwnd_prev;
1437                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1438                 tp->snd_recover = tp->snd_recover_prev;
1439                 if (tp->t_flags & TF_WASFRECOVERY)
1440                         ENTER_FASTRECOVERY(tp->t_flags);
1441                 if (tp->t_flags & TF_WASCRECOVERY)
1442                         ENTER_CONGRECOVERY(tp->t_flags);
1443                 tp->snd_nxt = tp->snd_max;
1444                 tp->t_badrxtwin = 0;
1445                 break;
1446         }
1447
1448         if (CC_ALGO(tp)->cong_signal != NULL) {
1449                 if (th != NULL)
1450                         tp->ccv->curack = th->th_ack;
1451                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1452         }
1453 #ifdef NETFLIX_CWV
1454         if (tp->cwv_enabled) {
1455                 if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
1456                         tcp_newcwv_enter_recovery(tp);
1457                 }
1458                 if (type == CC_RTO) {
1459                         tcp_newcwv_reset(tp);
1460                 }
1461         }
1462 #endif
1463 }
1464
1465
1466
1467 static inline void
1468 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
1469 {
1470         uint32_t i_cwnd;
1471
1472         INP_WLOCK_ASSERT(tp->t_inpcb);
1473
1474 #ifdef NETFLIX_STATS
1475         TCPSTAT_INC(tcps_idle_restarts);
1476         if (tp->t_state == TCPS_ESTABLISHED)
1477                 TCPSTAT_INC(tcps_idle_estrestarts);
1478 #endif
1479         if (CC_ALGO(tp)->after_idle != NULL)
1480                 CC_ALGO(tp)->after_idle(tp->ccv);
1481
1482         if (tp->snd_cwnd == 1)
1483                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
1484         else if (V_tcp_initcwnd_segments)
1485                 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
1486                     max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
1487         else if (V_tcp_do_rfc3390)
1488                 i_cwnd = min(4 * tp->t_maxseg,
1489                     max(2 * tp->t_maxseg, 4380));
1490         else {
1491                 /* Per RFC5681 Section 3.1 */
1492                 if (tp->t_maxseg > 2190)
1493                         i_cwnd = 2 * tp->t_maxseg;
1494                 else if (tp->t_maxseg > 1095)
1495                         i_cwnd = 3 * tp->t_maxseg;
1496                 else
1497                         i_cwnd = 4 * tp->t_maxseg;
1498         }
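        /*
         * Example (illustrative): with t_maxseg = 1460 and
         * V_tcp_initcwnd_segments = 10, i_cwnd = min(10 * 1460,
         * max(2 * 1460, 10 * 1460)) = 14600; under the RFC 5681 fallback
         * the same t_maxseg (<= 2190 and > 1095) gives
         * i_cwnd = 3 * 1460 = 4380.
         */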
1499         if (reduce_largest) {
1500                 /*
1501                  * Do we reduce the largest cwnd so that
1502                  * rack plays nicely, hptsi-wise, on restart?
1503                  */
1504                 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
1505                         ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
1506         }
1507         /*
1508          * Being idle is no different than the initial window. If the cc
1509          * clamps it down below the initial window, raise it to the
1510          * initial window.
1511          */
1512         if (tp->snd_cwnd < i_cwnd) {
1513                 tp->snd_cwnd = i_cwnd;
1514         }
1515 }
1516
1517
1518 /*
1519  * Indicate whether this ack should be delayed.  We can delay the ack if
1520  * the following conditions are met:
1521  *      - There is no delayed ack timer in progress.
1522  *      - Our last ack wasn't a 0-sized window. We never want to delay
1523  *        the ack that opens up a 0-sized window.
1524  *      - LRO wasn't used for this segment. We make sure by checking that the
1525  *        segment size is not larger than the MSS.
1526  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1527  *        connection.
1528  */
1529 #define DELAY_ACK(tp, tlen)                      \
1530         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1531         ((tp->t_flags & TF_DELACK) == 0) &&      \
1532         (tlen <= tp->t_maxseg) &&                \
1533         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
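
/*
 * Example (illustrative): a single 1448-byte in-order segment arriving with
 * no delayed-ack timer pending, a non-zero window previously advertised and
 * delayed ACKs enabled (t_delayed_ack non-zero) satisfies DELAY_ACK() and
 * the ACK is held for the delayed-ack timer; a 2896-byte LRO-aggregated
 * segment (tlen > t_maxseg) does not, and is ACKed immediately.
 */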
1534
1535 static inline void
1536 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
1537 {
1538         int32_t win;
1539
1540         /*
1541          * Calculate amount of space in receive window, and then do TCP
1542          * input processing. Receive window is amount of space in rcv queue,
1543          * but not less than advertised window.
1544          */
1545         win = sbspace(&so->so_rcv);
1546         if (win < 0)
1547                 win = 0;
1548         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1549 }
1550
1551 static void
1552 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
1553 {
1554         /*
1555          * Drop space held by incoming segment and return.
1556          */
1557         if (tp != NULL)
1558                 INP_WUNLOCK(tp->t_inpcb);
1559         if (m)
1560                 m_freem(m);
1561 }
1562
1563 static void
1564 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
1565     int32_t rstreason, int32_t tlen)
1566 {
1567         if (tp != NULL) {
1568                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
1569                 INP_WUNLOCK(tp->t_inpcb);
1570         } else
1571                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1572 }
1573
1574 /*
1575  * The value in ret_val informs the caller
1576  * if we dropped the tcb (and lock) or not.
1577  * 1 = we dropped it, 0 = the TCB is still locked
1578  * and valid.
1579  */
1580 static void
1581 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
1582 {
1583         /*
1584          * Generate an ACK dropping incoming segment if it occupies sequence
1585          * space, where the ACK reflects our state.
1586          *
1587          * We can now skip the test for the RST flag since all paths to this
1588          * code happen after packets containing RST have been dropped.
1589          *
1590          * In the SYN-RECEIVED state, don't send an ACK unless the segment
1591          * we received passes the SYN-RECEIVED ACK test. If it fails send a
1592          * RST.  This breaks the loop in the "LAND" DoS attack, and also
1593          * prevents an ACK storm between two listening ports that have been
1594          * sent forged SYN segments, each with the source address of the
1595          * other.
1596          */
1597         struct tcp_rack *rack;
1598
1599         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1600             (SEQ_GT(tp->snd_una, th->th_ack) ||
1601             SEQ_GT(th->th_ack, tp->snd_max))) {
1602                 *ret_val = 1;
1603                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
1604                 return;
1605         } else
1606                 *ret_val = 0;
1607         rack = (struct tcp_rack *)tp->t_fb_ptr;
1608         rack->r_wanted_output++;
1609         tp->t_flags |= TF_ACKNOW;
1610         if (m)
1611                 m_freem(m);
1612 }
1613
1614
1615 static int
1616 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
1617 {
1618         /*
1619          * RFC5961 Section 3.2
1620          *
1621          * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
1622          * window, we send challenge ACK.
1623          *
1624          * Note: to take into account delayed ACKs, we should test against
1625          * last_ack_sent instead of rcv_nxt. Note 2: we handle the special
1626          * case of a closed window, not covered by the RFC.
1627          */
1628         int dropped = 0;
1629
1630         if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
1631             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1632             (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1633
1634                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1635                 KASSERT(tp->t_state != TCPS_SYN_SENT,
1636                     ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
1637                     __func__, th, tp));
1638
1639                 if (V_tcp_insecure_rst ||
1640                     (tp->last_ack_sent == th->th_seq) ||
1641                     (tp->rcv_nxt == th->th_seq) ||
1642                     ((tp->last_ack_sent - 1) == th->th_seq)) {
1643                         TCPSTAT_INC(tcps_drops);
1644                         /* Drop the connection. */
1645                         switch (tp->t_state) {
1646                         case TCPS_SYN_RECEIVED:
1647                                 so->so_error = ECONNREFUSED;
1648                                 goto close;
1649                         case TCPS_ESTABLISHED:
1650                         case TCPS_FIN_WAIT_1:
1651                         case TCPS_FIN_WAIT_2:
1652                         case TCPS_CLOSE_WAIT:
1653                         case TCPS_CLOSING:
1654                         case TCPS_LAST_ACK:
1655                                 so->so_error = ECONNRESET;
1656                 close:
1657                                 tcp_state_change(tp, TCPS_CLOSED);
1658                                 /* FALLTHROUGH */
1659                         default:
1660                                 tp = tcp_close(tp);
1661                         }
1662                         dropped = 1;
1663                         rack_do_drop(m, tp);
1664                 } else {
1665                         TCPSTAT_INC(tcps_badrst);
1666                         /* Send challenge ACK. */
1667                         tcp_respond(tp, mtod(m, void *), th, m,
1668                             tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1669                         tp->last_ack_sent = tp->rcv_nxt;
1670                 }
1671         } else {
1672                 m_freem(m);
1673         }
1674         return (dropped);
1675 }
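
/*
 * Example (illustrative): with last_ack_sent = 5000 and rcv_wnd = 10000,
 * an RST carrying th_seq = 5000 matches exactly and the connection is
 * dropped, while an in-window RST at th_seq = 7000 only draws a challenge
 * ACK per RFC 5961 (unless V_tcp_insecure_rst is enabled).
 */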
1676
1677 /*
1678  * The value in ret_val informs the caller
1679  * if we dropped the tcb (and lock) or not.
1680  * 1 = we dropped it, 0 = the TCB is still locked
1681  * and valid.
1682  */
1683 static void
1684 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
1685 {
1686         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1687
1688         TCPSTAT_INC(tcps_badsyn);
1689         if (V_tcp_insecure_syn &&
1690             SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1691             SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1692                 tp = tcp_drop(tp, ECONNRESET);
1693                 *ret_val = 1;
1694                 rack_do_drop(m, tp);
1695         } else {
1696                 /* Send challenge ACK. */
1697                 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
1698                     tp->snd_nxt, TH_ACK);
1699                 tp->last_ack_sent = tp->rcv_nxt;
1700                 m = NULL;
1701                 *ret_val = 0;
1702                 rack_do_drop(m, NULL);
1703         }
1704 }
1705
1706 /*
1707  * rack_ts_check returns 1 if you should not proceed. It places
1708  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1709  * that the TCB is unlocked and probably dropped. The 0 indicates the
1710  * TCB is still valid and locked.
1711  */
1712 static int
1713 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
1714 {
1715
1716         /* Check to see if ts_recent is over 24 days old.  */
1717         if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1718                 /*
1719                  * Invalidate ts_recent.  If this segment updates ts_recent,
1720                  * the age will be reset later and ts_recent will get a
1721                  * valid value.  If it does not, setting ts_recent to zero
1722                  * will at least satisfy the requirement that zero be placed
1723                  * in the timestamp echo reply when ts_recent isn't valid.
1724                  * The age isn't reset until we get a valid ts_recent
1725                  * because we don't want out-of-order segments to be dropped
1726                  * when ts_recent is old.
1727                  */
1728                 tp->ts_recent = 0;
1729         } else {
1730                 TCPSTAT_INC(tcps_rcvduppack);
1731                 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1732                 TCPSTAT_INC(tcps_pawsdrop);
1733                 *ret_val = 0;
1734                 if (tlen) {
1735                         rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1736                 } else {
1737                         rack_do_drop(m, NULL);
1738                 }
1739                 return (1);
1740         }
1741         return (0);
1742 }
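
/*
 * Example (illustrative): TCP_PAWS_IDLE is 24 days' worth of ticks, so on a
 * connection idle longer than that ts_recent is simply invalidated and the
 * segment continues to be processed; a PAWS failure on a recently active
 * connection is instead counted as a duplicate and the segment is dropped
 * (after ACKing it if it carried data).
 */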
1743
1744 /*
1745  * rack_drop_checks returns 1 if you should not proceed. It places
1746  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1747  * that the TCB is unlocked and probably dropped. The 0 indicates the
1748  * TCB is still valid and locked.
1749  */
1750 static int
1751 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
1752 {
1753         int32_t todrop;
1754         int32_t thflags;
1755         int32_t tlen;
1756
1757         thflags = *thf;
1758         tlen = *tlenp;
1759         todrop = tp->rcv_nxt - th->th_seq;
1760         if (todrop > 0) {
1761                 if (thflags & TH_SYN) {
1762                         thflags &= ~TH_SYN;
1763                         th->th_seq++;
1764                         if (th->th_urp > 1)
1765                                 th->th_urp--;
1766                         else
1767                                 thflags &= ~TH_URG;
1768                         todrop--;
1769                 }
1770                 /*
1771                  * Following if statement from Stevens, vol. 2, p. 960.
1772                  */
1773                 if (todrop > tlen
1774                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1775                         /*
1776                          * Any valid FIN must be to the left of the window.
1777                          * At this point the FIN must be a duplicate or out
1778                          * of sequence; drop it.
1779                          */
1780                         thflags &= ~TH_FIN;
1781                         /*
1782                          * Send an ACK to resynchronize and drop any data.
1783                          * But keep on processing for RST or ACK.
1784                          */
1785                         tp->t_flags |= TF_ACKNOW;
1786                         todrop = tlen;
1787                         TCPSTAT_INC(tcps_rcvduppack);
1788                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1789                 } else {
1790                         TCPSTAT_INC(tcps_rcvpartduppack);
1791                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1792                 }
1793                 /*
1794                  * DSACK - add SACK block for dropped range
1795                  */
1796                 if (tp->t_flags & TF_SACK_PERMIT) {
1797                         tcp_update_sack_list(tp, th->th_seq,
1798                             th->th_seq + todrop);
1799                         /*
1800                          * ACK now, as the next in-sequence segment
1801                          * will clear the DSACK block again
1802                          */
1803                         tp->t_flags |= TF_ACKNOW;
1804                 }
1805                 *drop_hdrlen += todrop; /* drop from the top afterwards */
1806                 th->th_seq += todrop;
1807                 tlen -= todrop;
1808                 if (th->th_urp > todrop)
1809                         th->th_urp -= todrop;
1810                 else {
1811                         thflags &= ~TH_URG;
1812                         th->th_urp = 0;
1813                 }
1814         }
1815         /*
1816          * If segment ends after window, drop trailing data (and PUSH and
1817          * FIN); if nothing left, just ACK.
1818          */
1819         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1820         if (todrop > 0) {
1821                 TCPSTAT_INC(tcps_rcvpackafterwin);
1822                 if (todrop >= tlen) {
1823                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1824                         /*
1825                          * If window is closed can only take segments at
1826                          * window edge, and have to drop data and PUSH from
1827                          * incoming segments.  Continue processing, but
1828                          * remember to ack.  Otherwise, drop segment and
1829                          * ack.
1830                          */
1831                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1832                                 tp->t_flags |= TF_ACKNOW;
1833                                 TCPSTAT_INC(tcps_rcvwinprobe);
1834                         } else {
1835                                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1836                                 return (1);
1837                         }
1838                 } else
1839                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1840                 m_adj(m, -todrop);
1841                 tlen -= todrop;
1842                 thflags &= ~(TH_PUSH | TH_FIN);
1843         }
1844         *thf = thflags;
1845         *tlenp = tlen;
1846         return (0);
1847 }
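
/*
 * Example (illustrative): if rcv_nxt = 1000 and a 500-byte segment arrives
 * with th_seq = 900, todrop = 100, so 100 bytes are trimmed from the front
 * (drop_hdrlen += 100, th_seq becomes 1000, tlen becomes 400) and, when
 * SACK is permitted, a DSACK block for [900, 1000) is scheduled.
 */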
1848
1849 static struct rack_sendmap *
1850 rack_find_lowest_rsm(struct tcp_rack *rack)
1851 {
1852         struct rack_sendmap *rsm;
1853
1854         /*
1855          * Walk the time-order transmitted list looking for an rsm that is
1856          * not acked. This will be the one that was sent the longest time
1857          * ago that is still outstanding.
1858          */
1859         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1860                 if (rsm->r_flags & RACK_ACKED) {
1861                         continue;
1862                 }
1863                 goto finish;
1864         }
1865 finish:
1866         return (rsm);
1867 }
1868
1869 static struct rack_sendmap *
1870 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1871 {
1872         struct rack_sendmap *prsm;
1873
1874         /*
1875          * Walk the sequence-ordered list backward until we arrive at the
1876          * highest seq not acked. In theory, when this is called it should
1877          * be the last segment (which it was not).
1878          */
1879         counter_u64_add(rack_find_high, 1);
1880         prsm = rsm;
1881         TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1882                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1883                         continue;
1884                 }
1885                 return (prsm);
1886         }
1887         return (NULL);
1888 }
1889
1890
1891 static uint32_t
1892 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1893 {
1894         int32_t lro;
1895         uint32_t thresh;
1896
1897         /*
1898          * lro is the flag we use to determine if we have seen reordering.
1899          * If it gets set we have seen reordering. The reorder logic either
1900          * works in one of two ways:
1901          *
1902          * If reorder-fade is configured, then we track the last time we saw
1903          * re-ordering occur. If we reach the point where enough time has
1904          * passed, we no longer consider reordering to be occurring.
1905          *
1906          * Or, if reorder-fade is 0, then once we see reordering we consider
1907          * the connection to always be subject to reordering and just set lro
1908          * to 1.
1909          *
1910          * In the end if lro is non-zero we add the extra time for
1911          * reordering in.
1912          */
1913         if (srtt == 0)
1914                 srtt = 1;
1915         if (rack->r_ctl.rc_reorder_ts) {
1916                 if (rack->r_ctl.rc_reorder_fade) {
1917                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1918                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1919                                 if (lro == 0) {
1920                                         /*
1921                                          * No time has passed since the last
1922                                          * reorder, mark it as reordering.
1923                                          */
1924                                         lro = 1;
1925                                 }
1926                         } else {
1927                                 /* Negative time? */
1928                                 lro = 0;
1929                         }
1930                         if (lro > rack->r_ctl.rc_reorder_fade) {
1931                                 /* Turn off reordering seen too */
1932                                 rack->r_ctl.rc_reorder_ts = 0;
1933                                 lro = 0;
1934                         }
1935                 } else {
1936                         /* Reordering does not fade */
1937                         lro = 1;
1938                 }
1939         } else {
1940                 lro = 0;
1941         }
1942         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1943         if (lro) {
1944                 /* The shift must be set; if not you get 1/4 rtt */
1945                 if (rack->r_ctl.rc_reorder_shift)
1946                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1947                 else
1948                         thresh += (srtt >> 2);
1949         } else {
1950                 thresh += 1;
1951         }
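        /*
         * Worked example (illustrative): srtt = 40 ms, rc_pkt_delay = 1 ms
         * and reordering seen with rc_reorder_shift = 2 give
         * thresh = 40 + 1 + (40 >> 2) = 51 ms; with no reordering seen it
         * is 40 + 1 + 1 = 42 ms, before the RTO clamps below.
         */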
1952         /* We don't let the rack timeout be above an RTO */
1953         
1954         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1955                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1956         }
1957         /* And we don't want it above the RTO max either */
1958         if (thresh > rack_rto_max) {
1959                 thresh = rack_rto_max;
1960         }
1961         return (thresh);
1962 }
1963
1964 static uint32_t
1965 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1966                      struct rack_sendmap *rsm, uint32_t srtt)
1967 {
1968         struct rack_sendmap *prsm;
1969         uint32_t thresh, len;
1970         int maxseg;
1971         
1972         if (srtt == 0)
1973                 srtt = 1;
1974         if (rack->r_ctl.rc_tlp_threshold)
1975                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1976         else
1977                 thresh = (srtt * 2);
1978         
1979         /* Get the previous sent packet, if any  */
1980         maxseg = tcp_maxseg(tp);
1981         counter_u64_add(rack_enter_tlp_calc, 1);
1982         len = rsm->r_end - rsm->r_start;
1983         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1984                 /* Exactly like the ID */
1985                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1986                         uint32_t alt_thresh;
1987                         /*
1988                          * Compensate for delayed-ack with the d-ack time.
1989                          */
1990                         counter_u64_add(rack_used_tlpmethod, 1);
1991                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1992                         if (alt_thresh > thresh)
1993                                 thresh = alt_thresh;
1994                 }
1995         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1996                 /* 2.1 behavior */
1997                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1998                 if (prsm && (len <= maxseg)) {
1999                         /*
2000                          * Two packets outstanding, thresh should be (2*srtt) +
2001                          * possible inter-packet delay (if any).
2002                          */
2003                         uint32_t inter_gap = 0;
2004                         int idx, nidx;
2005                         
2006                         counter_u64_add(rack_used_tlpmethod, 1);
2007                         idx = rsm->r_rtr_cnt - 1;
2008                         nidx = prsm->r_rtr_cnt - 1;
2009                         if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
2010                                 /* Yes it was sent later (or at the same time) */
2011                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
2012                         }
2013                         thresh += inter_gap;
2014                 } else  if (len <= maxseg) {
2015                         /*
2016                          * Possibly compensate for delayed-ack.
2017                          */
2018                         uint32_t alt_thresh;
2019                         
2020                         counter_u64_add(rack_used_tlpmethod2, 1);
2021                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2022                         if (alt_thresh > thresh)
2023                                 thresh = alt_thresh;
2024                 }
2025         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2026                 /* 2.2 behavior */
2027                 if (len <= maxseg) {
2028                         uint32_t alt_thresh;
2029                         /*
2030                          * Compensate for delayed-ack with the d-ack time.
2031                          */
2032                         counter_u64_add(rack_used_tlpmethod, 1);
2033                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2034                         if (alt_thresh > thresh)
2035                                 thresh = alt_thresh;
2036                 }
2037         }
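        /*
         * Worked example (illustrative): srtt = 40 ms and
         * rc_tlp_threshold = 2 give a base thresh of 40 + 40 / 2 = 60 ms;
         * when only one small segment is outstanding, the delayed-ack
         * compensation above yields alt_thresh = 40 + 20 +
         * rack_delayed_ack_time (e.g. 260 ms with a 200 ms setting), and
         * the larger value is kept before the RTO and TLP-minimum clamps
         * below.
         */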
2038         /* Not above an RTO */
2039         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2040                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2041         }
2042         /* Not above the RTO max */
2043         if (thresh > rack_rto_max) {
2044                 thresh = rack_rto_max;
2045         }
2046         /* Apply user supplied min TLP */
2047         if (thresh < rack_tlp_min) {
2048                 thresh = rack_tlp_min;
2049         }
2050         return (thresh);
2051 }
2052
2053 static struct rack_sendmap *
2054 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2055 {
2056         /*
2057          * Check to see whether we need to fall into recovery. We will
2058          * need to do so if our oldest transmit is past the time by which
2059          * we should have had an ack.
2060          */
2061         struct tcp_rack *rack;
2062         struct rack_sendmap *rsm;
2063         int32_t idx;
2064         uint32_t srtt_cur, srtt, thresh;
2065
2066         rack = (struct tcp_rack *)tp->t_fb_ptr;
2067         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
2068                 return (NULL);
2069         }
2070         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
2071         srtt = TICKS_2_MSEC(srtt_cur);
2072         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
2073                 srtt = rack->rc_rack_rtt;
2074
2075         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2076         if (rsm == NULL)
2077                 return (NULL);
2078
2079         if (rsm->r_flags & RACK_ACKED) {
2080                 rsm = rack_find_lowest_rsm(rack);
2081                 if (rsm == NULL)
2082                         return (NULL);
2083         }
2084         idx = rsm->r_rtr_cnt - 1;
2085         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2086         if (tsused < rsm->r_tim_lastsent[idx]) {
2087                 return (NULL);
2088         }
2089         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2090                 return (NULL);
2091         }
2092         /* Ok if we reach here we are over-due */
2093         rack->r_ctl.rc_rsm_start = rsm->r_start;
2094         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2095         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2096         rack_cong_signal(tp, NULL, CC_NDUPACK);
2097         return (rsm);
2098 }
2099
2100 static uint32_t
2101 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2102 {
2103         int32_t t;
2104         int32_t tt;
2105         uint32_t ret_val;
2106
2107         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2108         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2109             tcp_persmin, tcp_persmax);
2110         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2111                 tp->t_rxtshift++;
2112         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2113         ret_val = (uint32_t)tt;
2114         return (ret_val);
2115 }
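
/*
 * Example (illustrative): if the smoothed RTT terms above work out to
 * t = 300 ms and t_rxtshift = 3, then tt = 300 * tcp_backoff[3] = 300 * 8
 * = 2400, which TCPT_RANGESET() clamps into [tcp_persmin, tcp_persmax]
 * before it is returned as the persist timeout.
 */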
2116
2117 static uint32_t
2118 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2119 {
2120         /*
2121          * Start the FR timer; we do this based on getting the first one in
2122          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
2123          * events we need to stop the running timer (if it's running) before
2124          * starting the new one.
2125          */
2126         uint32_t thresh, exp, to, srtt, time_since_sent;
2127         uint32_t srtt_cur;
2128         int32_t idx;
2129         int32_t is_tlp_timer = 0;
2130         struct rack_sendmap *rsm;
2131         
2132         if (rack->t_timers_stopped) {
2133                 /* All timers have been stopped; none are to run */
2134                 return (0);
2135         }
2136         if (rack->rc_in_persist) {
2137                 /* We can't start any timer in persists */
2138                 return (rack_get_persists_timer_val(tp, rack));
2139         }
2140         if (tp->t_state < TCPS_ESTABLISHED)
2141                 goto activate_rxt;
2142         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2143         if (rsm == NULL) {
2144                 /* Nothing on the send map */
2145 activate_rxt:
2146                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2147                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2148                         to = TICKS_2_MSEC(tp->t_rxtcur);
2149                         if (to == 0)
2150                                 to = 1;
2151                         return (to);
2152                 }
2153                 return (0);
2154         }
2155         if (rsm->r_flags & RACK_ACKED) {
2156                 rsm = rack_find_lowest_rsm(rack);
2157                 if (rsm == NULL) {
2158                         /* No lowest? */
2159                         goto activate_rxt;
2160                 }
2161         }
2162         /* Convert from ms to usecs */
2163         if (rsm->r_flags & RACK_SACK_PASSED) {
2164                 if ((tp->t_flags & TF_SENTFIN) &&
2165                     ((tp->snd_max - tp->snd_una) == 1) &&
2166                     (rsm->r_flags & RACK_HAS_FIN)) {
2167                         /*
2168                          * We don't start a rack timer if all we have is a
2169                          * FIN outstanding.
2170                          */
2171                         goto activate_rxt;
2172                 }
2173                 if (tp->t_srtt) {
2174                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2175                         srtt = TICKS_2_MSEC(srtt_cur);
2176                 } else
2177                         srtt = RACK_INITIAL_RTO;
2178
2179                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2180                 idx = rsm->r_rtr_cnt - 1;
2181                 exp = rsm->r_tim_lastsent[idx] + thresh;
2182                 if (SEQ_GEQ(exp, cts)) {
2183                         to = exp - cts;
2184                         if (to < rack->r_ctl.rc_min_to) {
2185                                 to = rack->r_ctl.rc_min_to;
2186                         }
2187                 } else {
2188                         to = rack->r_ctl.rc_min_to;
2189                 }
2190         } else {
2191                 /* Ok we need to do a TLP not RACK */
2192                 if ((rack->rc_tlp_in_progress != 0) ||
2193                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2194                         /*
2195                          * The previous send was a TLP or a tlp_rtx is in
2196                          * process.
2197                          */
2198                         goto activate_rxt;
2199                 }
2200                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2201                 if (rsm == NULL) {
2202                         /* We found no rsm to TLP with. */
2203                         goto activate_rxt;
2204                 }
2205                 if (rsm->r_flags & RACK_HAS_FIN) {
2206                         /* If it's a FIN we don't do TLP */
2207                         rsm = NULL;
2208                         goto activate_rxt;
2209                 }
2210                 idx = rsm->r_rtr_cnt - 1;
2211                 if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx])) 
2212                         time_since_sent = cts - rsm->r_tim_lastsent[idx];
2213                 else
2214                         time_since_sent = 0;
2215                 is_tlp_timer = 1;
2216                 if (tp->t_srtt) {
2217                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2218                         srtt = TICKS_2_MSEC(srtt_cur);
2219                 } else
2220                         srtt = RACK_INITIAL_RTO;
2221                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2222                 if (thresh > time_since_sent)
2223                         to = thresh - time_since_sent;
2224                 else
2225                         to = rack->r_ctl.rc_min_to;
2226                 if (to > TCPTV_REXMTMAX) {
2227                         /*
2228                          * If the TLP time works out to be larger than the max
2229                          * RTO, let's not do TLP; just RTO.
2230                          */
2231                         goto activate_rxt;
2232                 }
2233                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2234                         /*
2235                          * The tail is no longer the last one I did a probe
2236                          * on
2237                          */
2238                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2239                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2240                 }
2241         }
2242         if (is_tlp_timer == 0) {
2243                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2244         } else {
2245                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2246                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2247                         /*
2248                          * We have exceeded how many times we can retransmit the
2249                          * current TLP, so switch to the RTO timer.
2250                          */
2251                         goto activate_rxt;
2252                 } else {
2253                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2254                 }
2255         }
2256         if (to == 0)
2257                 to = 1;
2258         return (to);
2259 }
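
/*
 * Example (illustrative): if the oldest un-acked segment has
 * RACK_SACK_PASSED set, was last sent 30 ms before cts and
 * rack_calc_thresh_rack() returns 51 ms, the RACK timer is armed for
 * exp - cts = 21 ms (subject to rc_min_to); when nothing has been
 * SACK-passed, a TLP timer of thresh - time_since_sent is armed instead,
 * falling back to the RXT timer if it would exceed TCPTV_REXMTMAX or the
 * TLP resend limits.
 */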
2260
2261 static void
2262 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2263 {
2264         if (rack->rc_in_persist == 0) {
2265                 if (((tp->t_flags & TF_SENTFIN) == 0) &&
2266                     (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
2267                         /* Must need to send more data to enter persist */
2268                         return;
2269                 rack->r_ctl.rc_went_idle_time = cts;
2270                 rack_timer_cancel(tp, rack, cts, __LINE__);
2271                 tp->t_rxtshift = 0;
2272                 rack->rc_in_persist = 1;
2273         }
2274 }
2275
2276 static void
2277 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2278 {
2279         if (rack->rc_inp->inp_in_hpts)  {
2280                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2281                 rack->r_ctl.rc_hpts_flags  = 0;
2282         }
2283         rack->rc_in_persist = 0;
2284         rack->r_ctl.rc_went_idle_time = 0;
2285         tp->t_flags &= ~TF_FORCEDATA;
2286         tp->t_rxtshift = 0;
2287 }
2288
2289 static void
2290 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
2291     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
2292 {
2293         struct inpcb *inp;
2294         uint32_t delayed_ack = 0;
2295         uint32_t hpts_timeout;
2296         uint8_t stopped;
2297         uint32_t left = 0;
2298
2299         inp = tp->t_inpcb;
2300         if (inp->inp_in_hpts) {
2301                 /* A previous call is already set up */
2302                 return;
2303         }
2304         if (tp->t_state == TCPS_CLOSED) {
2305                 return;
2306         }
2307         stopped = rack->rc_tmr_stopped;
2308         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2309                 left = rack->r_ctl.rc_timer_exp - cts;
2310         }
2311         rack->r_ctl.rc_timer_exp = 0;
2312         if (rack->rc_inp->inp_in_hpts == 0) {
2313                 rack->r_ctl.rc_hpts_flags = 0;
2314         } 
2315         if (slot) {
2316                 /* We are hptsi too */
2317                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2318         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2319                 /* 
2320                  * We are still left on the hpts; when the timeout goes
2321                  * off it will be for output.
2322                  */
2323                 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
2324                         slot = cts - rack->r_ctl.rc_last_output_to;
2325                 else
2326                         slot = 1;
2327         }
2328         if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2329                 /* No send window.. we must enter persist */
2330                 rack_enter_persist(tp, rack, cts);
2331         } else if ((frm_out_sbavail &&
2332                     (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
2333                     (tp->snd_wnd < tp->t_maxseg)) &&
2334             TCPS_HAVEESTABLISHED(tp->t_state)) {
2335                 /*
2336                  * If we have no window or we can't send a segment (and have
2337                  * data to send; we cheat here and frm_out_sbavail is
2338                  * passed in with the sbavail(sb) only from bbr_output) and
2339                  * we are established, then we must enter persist (if not
2340                  * already in persist).
2341                  */
2342                 rack_enter_persist(tp, rack, cts);
2343         }
2344         hpts_timeout = rack_timer_start(tp, rack, cts);
2345         if (tp->t_flags & TF_DELACK) {
2346                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2347                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2348         }
2349         if (delayed_ack && ((hpts_timeout == 0) ||
2350                             (delayed_ack < hpts_timeout)))
2351                 hpts_timeout = delayed_ack;
2352         else 
2353                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2354         /*
2355          * If no timers are going to run and we will fall off the hptsi
2356          * wheel, we resort to a keep-alive timer if it's configured.
2357          */
2358         if ((hpts_timeout == 0) &&
2359             (slot == 0)) {
2360                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2361                     (tp->t_state <= TCPS_CLOSING)) {
2362                         /*
2363                          * Ok we have no timer (persists, rack, tlp, rxt  or
2364                          * del-ack), we don't have segments being paced. So
2365                          * all that is left is the keepalive timer.
2366                          */
2367                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2368                                 /* Get the established keep-alive time */
2369                                 hpts_timeout = TP_KEEPIDLE(tp);
2370                         } else {
2371                                 /* Get the initial setup keep-alive time */
2372                                 hpts_timeout = TP_KEEPINIT(tp);
2373                         }
2374                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2375                 }
2376         }
2377         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2378             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2379                 /*
2380                  * RACK, TLP, persist and RXT timers are all restartable
2381                  * based on input actions, i.e. we received a packet (ack
2382                  * or sack) and that changes things (rwnd, snd_una, etc.).
2383                  * Thus we can restart them with a new value. For
2384                  * keep-alive and delayed_ack we keep track of what was left
2385                  * and restart the timer with a smaller value.
2386                  */
2387                 if (left < hpts_timeout)
2388                         hpts_timeout = left;
2389         }
2390         if (hpts_timeout) {
2391                 /*
2392                  * Hack alert: for now we can't time out beyond 2,147,483
2393                  * seconds (a bit more than 596 hours), which is probably ok
2394                  * :).
2395                  */
2396                 if (hpts_timeout > 0x7ffffffe)
2397                         hpts_timeout = 0x7ffffffe;
2398                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2399         }
2400         if (slot) {
2401                 rack->r_ctl.rc_last_output_to = cts + slot;
2402                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2403                         if (rack->rc_inp->inp_in_hpts == 0)
2404                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2405                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2406                 } else {
2407                         /*
2408                          * Arrange for the hpts to kick back in after the
2409                          * t-o if the t-o does not cause a send.
2410                          */
2411                         if (rack->rc_inp->inp_in_hpts == 0)
2412                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2413                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2414                 }
2415         } else if (hpts_timeout) {
2416                 if (rack->rc_inp->inp_in_hpts == 0)
2417                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2418                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2419         } else {
2420                 /* No timer starting */
2421 #ifdef INVARIANTS
2422                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2423                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2424                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2425                 }
2426 #endif
2427         }
2428         rack->rc_tmr_stopped = 0;
2429         if (slot)
2430                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2431 }
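
/*
 * Example (illustrative): with a pacing slot of 10 ms and an hpts_timeout
 * of 40 ms from rack_timer_start(), the connection is put on the hpts
 * wheel for the 10 ms send slot while rc_timer_exp records cts + 40 so the
 * timer can still expire later; with slot = 0 only the 40 ms timeout is
 * scheduled on the wheel.
 */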
2432
2433 /*
2434  * RACK Timer: here we simply do logging and housekeeping.
2435  * The normal rack_output() function will call the
2436  * appropriate thing to check if we need to do a RACK retransmit.
2437  * We return 1, saying don't proceed with rack_output only
2438  * when all timers have been stopped (destroyed PCB?).
2439  */
2440 static int
2441 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2442 {
2443         /*
2444          * This timer simply provides an internal trigger to send out data.
2445          * The check_recovery_mode call will see if there are needed
2446          * retransmissions, if so we will enter fast-recovery. The output
2447          * call may or may not do the same thing depending on sysctl
2448          * settings.
2449          */
2450         struct rack_sendmap *rsm;
2451         int32_t recovery;
2452
2453         if (tp->t_timers->tt_flags & TT_STOPPED) {
2454                 return (1);
2455         }
2456         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2457                 /* It's not time yet */
2458                 return (0);
2459         }
2460         rack_log_to_event(rack, RACK_TO_FRM_RACK);
2461         recovery = IN_RECOVERY(tp->t_flags);
2462         counter_u64_add(rack_to_tot, 1);
2463         if (rack->r_state && (rack->r_state != tp->t_state))
2464                 rack_set_state(tp, rack);
2465         rsm = rack_check_recovery_mode(tp, cts);
2466         if (rsm) {
2467                 uint32_t rtt;
2468
2469                 rtt = rack->rc_rack_rtt;
2470                 if (rtt == 0)
2471                         rtt = 1;
2472                 if ((recovery == 0) &&
2473                     (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
2474                         /*
2475                          * The rack-timeout that enters us into recovery
2476                          * will force out one MSS and set us up so that we
2477                          * can do one more send in 2*rtt (transitioning the
2478                          * rack timeout into a rack-tlp).
2479                          */
2480                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2481                 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
2482                     ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
2483                         /*
2484                          * When a rack timer goes off, we have to send at
2485                          * least one segment. They will be paced a minimum of 1ms
2486                          * apart via the next rack timer (or further
2487                          * if the rack timer dictates it).
2488                          */
2489                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2490                 }
2491         } else {
2492                 /* This is a case that should happen rarely if ever */
2493                 counter_u64_add(rack_tlp_does_nada, 1);
2494 #ifdef TCP_BLACKBOX
2495                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2496 #endif
2497                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2498         }
2499         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2500         return (0);
2501 }
2502
2503 /*
2504  * TLP Timer: here we simply set up what segment we want to
2505  * have the TLP expire on; the normal rack_output() will then
2506  * send it out.
2507  *
2508  * We return 1, saying don't proceed with rack_output only
2509  * when all timers have been stopped (destroyed PCB?).
2510  */
2511 static int
2512 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2513 {
2514         /*
2515          * Tail Loss Probe.
2516          */
2517         struct rack_sendmap *rsm = NULL;
2518         struct socket *so;
2519         uint32_t amm, old_prr_snd = 0;
2520         uint32_t out, avail;
2521
2522         if (tp->t_timers->tt_flags & TT_STOPPED) {
2523                 return (1);
2524         }
2525         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2526                 /* It's not time yet */
2527                 return (0);
2528         }
2529         if (rack_progress_timeout_check(tp)) {
2530                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2531                 return (1);
2532         }
2533         /*
2534          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2535          * need to figure out how to force a full MSS segment out.
2536          */
2537         rack_log_to_event(rack, RACK_TO_FRM_TLP);
2538         counter_u64_add(rack_tlp_tot, 1);
2539         if (rack->r_state && (rack->r_state != tp->t_state))
2540                 rack_set_state(tp, rack);
2541         so = tp->t_inpcb->inp_socket;
2542         avail = sbavail(&so->so_snd);
2543         out = tp->snd_max - tp->snd_una;
2544         rack->rc_timer_up = 1;
2545         /*
2546          * If we are in recovery we can jazz out a segment if new data is
2547          * present simply by setting rc_prr_sndcnt to a segment.
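              * A sketch of the new-data path below (example values
              * assumed, not taken from this code): with avail = 5000
              * bytes in the socket buffer, out = 3000 bytes already in
              * flight and a 1460 byte t_maxseg, amm = 2000 is clamped
              * to 1460; if that fits inside snd_wnd it is credited to
              * rc_prr_sndcnt (in recovery) or rc_tlp_new_data, so that
              * rack_output() can send one new segment as the probe.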
2548          */
2549         if ((avail > out) &&
2550             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2551                 /* New data is available */
2552                 amm = avail - out;
2553                 if (amm > tp->t_maxseg) {
2554                         amm = tp->t_maxseg;
2555                 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
2556                         /* not enough to fill an MTU and no-delay is off */
2557                         goto need_retran;
2558                 }
2559                 if (IN_RECOVERY(tp->t_flags)) {
2560                         /* Unlikely */
2561                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2562                         if (out + amm <= tp->snd_wnd)
2563                                 rack->r_ctl.rc_prr_sndcnt = amm;
2564                         else
2565                                 goto need_retran;
2566                 } else {
2567                         /* Set the send-new override */
2568                         if (out + amm <= tp->snd_wnd)
2569                                 rack->r_ctl.rc_tlp_new_data = amm;
2570                         else
2571                                 goto need_retran;
2572                 }
2573                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2574                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2575                 rack->r_ctl.rc_tlpsend = NULL;
2576                 counter_u64_add(rack_tlp_newdata, 1);
2577                 goto send;
2578         }
2579 need_retran:
2580         /*
2581          * Ok we need to arrange the last un-acked segment to be re-sent, or
2582          * optionally the first un-acked segment.
2583          */
2584         if (rack_always_send_oldest)
2585                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2586         else {
2587                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
2588                 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2589                         rsm = rack_find_high_nonack(rack, rsm);
2590                 }
2591         }
2592         if (rsm == NULL) {
2593                 counter_u64_add(rack_tlp_does_nada, 1);
2594 #ifdef TCP_BLACKBOX
2595                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2596 #endif
2597                 goto out;
2598         }
2599         if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
2600                 /*
2601                  * We need to split this last segment in two.
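                      * For example (numbers assumed): if this rsm covers
                      * [1000, 5000) and t_maxseg is 1460, nrsm is carved
                      * out as [3540, 5000) and rsm shrinks to [1000, 3540),
                      * so the TLP probe below retransmits only the trailing
                      * MSS worth of data.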
2602                  */
2603                 int32_t idx;
2604                 struct rack_sendmap *nrsm;
2605
2606                 nrsm = rack_alloc(rack);
2607                 if (nrsm == NULL) {
2608                         /*
2609                          * No memory to split, we will just exit and punt
2610                          * off to the RXT timer.
2611                          */
2612                         counter_u64_add(rack_tlp_does_nada, 1);
2613                         goto out;
2614                 }
2615                 nrsm->r_start = (rsm->r_end - tp->t_maxseg);
2616                 nrsm->r_end = rsm->r_end;
2617                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2618                 nrsm->r_flags = rsm->r_flags;
2619                 nrsm->r_sndcnt = rsm->r_sndcnt;
2620                 nrsm->r_rtr_bytes = 0;
2621                 rsm->r_end = nrsm->r_start;
2622                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2623                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2624                 }
2625                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
2626                 if (rsm->r_in_tmap) {
2627                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2628                         nrsm->r_in_tmap = 1;
2629                 }
2630                 rsm->r_flags &= (~RACK_HAS_FIN);
2631                 rsm = nrsm;
2632         }
2633         rack->r_ctl.rc_tlpsend = rsm;
2634         rack->r_ctl.rc_tlp_rtx_out = 1;
2635         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2636                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2637                 tp->t_rxtshift++;
2638         } else {
2639                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2640                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2641         }
2642 send:
2643         rack->r_ctl.rc_tlp_send_cnt++;
2644         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2645                 /*
2646                  * Can't [re]transmit a segment that we have gotten no
2647                  * response to from the peer the maximum number of times.
2648                  * We need the retransmit timer to take over.
2649                  */
2650 restore:
2651                 rack->r_ctl.rc_tlpsend = NULL;
2652                 if (rsm)
2653                         rsm->r_flags &= ~RACK_TLP;
2654                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2655                 counter_u64_add(rack_tlp_retran_fail, 1);
2656                 goto out;
2657         } else if (rsm) {
2658                 rsm->r_flags |= RACK_TLP;
2659         }
2660         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2661             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2662                 /*
2663                  * We don't want to send a single segment more than the max
2664                  * either.
2665                  */
2666                 goto restore;
2667         }
2668         rack->r_timer_override = 1;
2669         rack->r_tlp_running = 1;
2670         rack->rc_tlp_in_progress = 1;
2671         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2672         return (0);
2673 out:
2674         rack->rc_timer_up = 0;
2675         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2676         return (0);
2677 }
2678
2679 /*
2680  * Delayed ack Timer, here we simply need to setup the
2681  * ACK_NOW flag and remove the DELACK flag. From there
2682  * the output routine will send the ack out.
2683  *
2684  * We only return 1, saying don't proceed, if all timers
2685  * are stopped (destroyed PCB?).
2686  */
2687 static int
2688 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2689 {
2690         if (tp->t_timers->tt_flags & TT_STOPPED) {
2691                 return (1);
2692         }
2693         rack_log_to_event(rack, RACK_TO_FRM_DELACK);
2694         tp->t_flags &= ~TF_DELACK;
2695         tp->t_flags |= TF_ACKNOW;
2696         TCPSTAT_INC(tcps_delack);
2697         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2698         return (0);
2699 }
2700
2701 /*
2702  * Persists timer, here we simply need to set up the
2703  * FORCE-DATA flag; the output routine will then do
2704  * the one byte send.
2705  *
2706  * We only return 1, saying don't proceed, if all timers
2707  * are stopped (destroyed PCB?).
2708  */
2709 static int
2710 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2711 {
2712         struct inpcb *inp;
2713         int32_t retval = 0;
2714
2715         inp = tp->t_inpcb;
2716
2717         if (tp->t_timers->tt_flags & TT_STOPPED) {
2718                 return (1);
2719         }
2720         if (rack->rc_in_persist == 0)
2721                 return (0);
2722         if (rack_progress_timeout_check(tp)) {
2723                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2724                 return (1);
2725         }
2726         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2727         /*
2728          * Persistence timer into zero window. Force a byte to be output, if
2729          * possible.
2730          */
2731         TCPSTAT_INC(tcps_persisttimeo);
2732         /*
2733          * Hack: if the peer is dead/unreachable, we do not time out if the
2734          * window is closed.  After a full backoff, drop the connection if
2735          * the idle time (no responses to probes) reaches the maximum
2736          * backoff that we would use if retransmitting.
2737          */
2738         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2739             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2740             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2741                 TCPSTAT_INC(tcps_persistdrop);
2742                 retval = 1;
2743                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2744                 goto out;
2745         }
2746         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2747             tp->snd_una == tp->snd_max)
2748                 rack_exit_persist(tp, rack);
2749         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2750         /*
2751          * If the user has closed the socket then drop a persisting
2752          * connection after a much reduced timeout.
2753          */
2754         if (tp->t_state > TCPS_CLOSE_WAIT &&
2755             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2756                 retval = 1;
2757                 TCPSTAT_INC(tcps_persistdrop);
2758                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2759                 goto out;
2760         }
2761         tp->t_flags |= TF_FORCEDATA;
2762 out:
2763         rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
2764         return (retval);
2765 }
2766
2767 /*
2768  * If a keepalive goes off, we had no other timers
2769  * happening. We always return 1 here since this
2770  * routine either drops the connection or sends
2771  * out a segment to elicit a response.
2772  */
2773 static int
2774 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2775 {
2776         struct tcptemp *t_template;
2777         struct inpcb *inp;
2778
2779         if (tp->t_timers->tt_flags & TT_STOPPED) {
2780                 return (1);
2781         }
2782         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
2783         inp = tp->t_inpcb;
2784         rack_log_to_event(rack, RACK_TO_FRM_KEEP);
2785         /*
2786          * Keep-alive timer went off; send something or drop connection if
2787          * idle for too long.
2788          */
2789         TCPSTAT_INC(tcps_keeptimeo);
2790         if (tp->t_state < TCPS_ESTABLISHED)
2791                 goto dropit;
2792         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2793             tp->t_state <= TCPS_CLOSING) {
2794                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
2795                         goto dropit;
2796                 /*
2797                  * Send a packet designed to force a response if the peer is
2798                  * up and reachable: either an ACK if the connection is
2799                  * still alive, or an RST if the peer has closed the
2800                  * connection due to timeout or reboot. Using sequence
2801                  * number tp->snd_una-1 causes the transmitted zero-length
2802                  * segment to lie outside the receive window; by the
2803                  * protocol spec, this requires the correspondent TCP to
2804                  * respond.
2805                  */
2806                 TCPSTAT_INC(tcps_keepprobe);
2807                 t_template = tcpip_maketemplate(inp);
2808                 if (t_template) {
2809                         tcp_respond(tp, t_template->tt_ipgen,
2810                             &t_template->tt_t, (struct mbuf *)NULL,
2811                             tp->rcv_nxt, tp->snd_una - 1, 0);
2812                         free(t_template, M_TEMP);
2813                 }
2814         }
2815         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
2816         return (1);
2817 dropit:
2818         TCPSTAT_INC(tcps_keepdrops);
2819         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2820         return (1);
2821 }
2822
2823 /*
2824  * Retransmit helper function, clears up all the ack
2825  * flags and takes care of important bookkeeping.
2826  */
2827 static void
2828 rack_remxt_tmr(struct tcpcb *tp)
2829 {
2830         /*
2831          * The retransmit timer went off, all sack'd blocks must be
2832          * un-acked.
2833          */
2834         struct rack_sendmap *rsm, *trsm = NULL;
2835         struct tcp_rack *rack;
2836         int32_t cnt = 0;
2837
2838         rack = (struct tcp_rack *)tp->t_fb_ptr;
2839         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
2840         rack_log_to_event(rack, RACK_TO_FRM_TMR);
2841         if (rack->r_state && (rack->r_state != tp->t_state))
2842                 rack_set_state(tp, rack);
2843         /*
2844          * Ideally we would like to be able to
2845          * mark SACK-PASS on anything not acked here.
2846          * However, if we do that we would burst out
2847          * all that data 1ms apart. This would be unwise,
2848          * so for now we will just let the normal rxt timer
2849          * and tlp timer take care of it.
2850          */
2851         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
2852                 if (rsm->r_flags & RACK_ACKED) {
2853                         cnt++;
2854                         rsm->r_sndcnt = 0;
2855                         if (rsm->r_in_tmap == 0) {
2856                                 /* We must add it back to the tlist */
2857                                 if (trsm == NULL) {
2858                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
2859                                 } else {
2860                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
2861                                 }
2862                                 rsm->r_in_tmap = 1;
2863                                 trsm = rsm;
2864                         }
2865                 }
2866                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
2867         }
2868         /* Clear the count (we just un-acked them) */
2869         rack->r_ctl.rc_sacked = 0;
2870         /* Clear the tlp rtx mark */
2871         rack->r_ctl.rc_tlp_rtx_out = 0;
2872         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2873         rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
2874         /* Setup so we send one segment */
2875         if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
2876                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2877         rack->r_timer_override = 1;
2878 }
2879
2880 /*
2881  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
2882  * we will setup to retransmit the lowest seq number outstanding.
2883  */
2884 static int
2885 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2886 {
2887         int32_t rexmt;
2888         struct inpcb *inp;
2889         int32_t retval = 0;
2890
2891         inp = tp->t_inpcb;
2892         if (tp->t_timers->tt_flags & TT_STOPPED) {
2893                 return (1);
2894         }
2895         if (rack_progress_timeout_check(tp)) {
2896                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2897                 return (1);
2898         }
2899         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
2900         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2901             (tp->snd_una == tp->snd_max)) {
2902                 /* Nothing outstanding .. nothing to do */
2903                 return (0);
2904         }
2905         /*
2906          * Retransmission timer went off.  Message has not been acked within
2907          * retransmit interval.  Back off to a longer retransmit interval
2908          * and retransmit one segment.
2909          */
2910         if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
2911                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
2912                 TCPSTAT_INC(tcps_timeoutdrop);
2913                 retval = 1;
2914                 tcp_set_inp_to_drop(rack->rc_inp,
2915                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
2916                 goto out;
2917         }
2918         rack_remxt_tmr(tp);
2919         if (tp->t_state == TCPS_SYN_SENT) {
2920                 /*
2921                  * If the SYN was retransmitted, indicate CWND to be limited
2922                  * to 1 segment in cc_conn_init().
2923                  */
2924                 tp->snd_cwnd = 1;
2925         } else if (tp->t_rxtshift == 1) {
2926                 /*
2927                  * first retransmit; record ssthresh and cwnd so they can be
2928                  * recovered if this turns out to be a "bad" retransmit. A
2929                  * retransmit is considered "bad" if an ACK for this segment
2930                  * is received within RTT/2 interval; the assumption here is
2931                  * that the ACK was already in flight.  See "On Estimating
2932                  * End-to-End Network Path Properties" by Allman and Paxson
2933                  * for more details.
2934                  */
2935                 tp->snd_cwnd_prev = tp->snd_cwnd;
2936                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
2937                 tp->snd_recover_prev = tp->snd_recover;
2938                 if (IN_FASTRECOVERY(tp->t_flags))
2939                         tp->t_flags |= TF_WASFRECOVERY;
2940                 else
2941                         tp->t_flags &= ~TF_WASFRECOVERY;
2942                 if (IN_CONGRECOVERY(tp->t_flags))
2943                         tp->t_flags |= TF_WASCRECOVERY;
2944                 else
2945                         tp->t_flags &= ~TF_WASCRECOVERY;
2946                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
2947                 tp->t_flags |= TF_PREVVALID;
2948         } else
2949                 tp->t_flags &= ~TF_PREVVALID;
2950         TCPSTAT_INC(tcps_rexmttimeo);
2951         if ((tp->t_state == TCPS_SYN_SENT) ||
2952             (tp->t_state == TCPS_SYN_RECEIVED))
2953                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
2954         else
2955                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
2956         TCPT_RANGESET(tp->t_rxtcur, rexmt,
2957            max(MSEC_2_TICKS(rack_rto_min), rexmt),
2958            MSEC_2_TICKS(rack_rto_max));
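             /*
              * Illustrative backoff arithmetic (values assumed): with a
              * computed TCP_REXMTVAL of 200ms and t_rxtshift of 3,
              * tcp_backoff[3] is 8, giving a 1.6s rexmt before it is
              * clamped into the [rack_rto_min, rack_rto_max] range by
              * the TCPT_RANGESET above.
              */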
2959         /*
2960          * We enter the path for PLPMTUD if the connection is established
2961          * or in FIN_WAIT_1 state; the reason for the latter is that if the
2962          * amount of data we send is very small, we could send it in a couple
2963          * of packets and proceed straight to FIN. In that case we won't
2964          * catch the ESTABLISHED state.
2965          */
2966         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
2967             || (tp->t_state == TCPS_FIN_WAIT_1))) {
2968 #ifdef INET6
2969                 int32_t isipv6;
2970 #endif
2971
2972                 /*
2973                  * The idea here is that each stage of the mtu probe (usually
2974                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
2975                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
2976                  * should take care of that.
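                      * With the checks below the MSS gets clamped when
                      * t_rxtshift is 2 and again when it is 4; once it reaches
                      * 6 the else branch further down restores the saved MSS
                      * and gives up on blackhole detection.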
2977                  */
2978                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
2979                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
2980                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
2981                     tp->t_rxtshift % 2 == 0)) {
2982                         /*
2983                          * Enter Path MTU Black-hole Detection mechanism:
2984                          *  - Disable Path MTU Discovery (IP "DF" bit).
2985                          *  - Reduce MTU to a lower value than what we
2986                          *    negotiated with the peer.
2987                          */
2988                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
2989                                 /* Record that we may have found a black hole. */
2990                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
2991                                 /* Keep track of previous MSS. */
2992                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
2993                         }
2994
2995                         /*
2996                          * Reduce the MSS to blackhole value or to the
2997                          * default in an attempt to retransmit.
2998                          */
2999 #ifdef INET6
3000                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
3001                         if (isipv6 &&
3002                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3003                                 /* Use the sysctl tuneable blackhole MSS. */
3004                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3005                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3006                         } else if (isipv6) {
3007                                 /* Use the default MSS. */
3008                                 tp->t_maxseg = V_tcp_v6mssdflt;
3009                                 /*
3010                                  * Disable Path MTU Discovery when we switch
3011                                  * to minmss.
3012                                  */
3013                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3014                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3015                         }
3016 #endif
3017 #if defined(INET6) && defined(INET)
3018                         else
3019 #endif
3020 #ifdef INET
3021                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3022                                 /* Use the sysctl tuneable blackhole MSS. */
3023                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3024                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3025                         } else {
3026                                 /* Use the default MSS. */
3027                                 tp->t_maxseg = V_tcp_mssdflt;
3028                                 /*
3029                                  * Disable Path MTU Discovery when we switch
3030                                  * to minmss.
3031                                  */
3032                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3033                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3034                         }
3035 #endif
3036                 } else {
3037                         /*
3038                          * If further retransmissions are still unsuccessful
3039                          * with a lowered MTU, maybe this isn't a blackhole
3040                          * and we restore the previous MSS and blackhole
3041                          * detection flags. The limit '6' is determined by
3042                          * giving each probe stage (1448, 1188, 524) 2
3043                          * chances to recover.
3044                          */
3045                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3046                             (tp->t_rxtshift >= 6)) {
3047                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3048                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3049                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3050                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3051                         }
3052                 }
3053         }
3054         /*
3055          * Disable RFC1323 and SACK if we haven't got any response to our
3056          * third SYN to work-around some broken terminal servers (most of
3057          * which have hopefully been retired) that have bad VJ header
3058          * compression code which trashes TCP segments containing
3059          * unknown-to-them TCP options.
3060          */
3061         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
3062             (tp->t_rxtshift == 3))
3063                 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
3064         /*
3065          * If we backed off this far, our srtt estimate is probably bogus.
3066          * Clobber it so we'll take the next rtt measurement as our srtt;
3067          * move the current srtt into rttvar to keep the current retransmit
3068          * times until then.
3069          */
3070         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3071 #ifdef INET6
3072                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3073                         in6_losing(tp->t_inpcb);
3074                 else
3075 #endif
3076                         in_losing(tp->t_inpcb);
3077                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3078                 tp->t_srtt = 0;
3079         }
3080         if (rack_use_sack_filter)
3081                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3082         tp->snd_recover = tp->snd_max;
3083         tp->t_flags |= TF_ACKNOW;
3084         tp->t_rtttime = 0;
3085         rack_cong_signal(tp, NULL, CC_RTO);
3086 out:
3087         return (retval);
3088 }
3089
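     /*
      * Process an expired rack-managed timer. Decide which timer
      * (delack, rack, tlp, rxt, persist or keepalive) this wakeup
      * belongs to and run its handler. If the expiry time has not
      * actually been reached yet we either treat it as a false alarm
      * or re-insert ourselves into the hpts and go back to sleep.
      */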
3090 static int
3091 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3092 {
3093         int32_t ret = 0;
3094         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3095
3096         if (timers == 0) {
3097                 return (0);
3098         }
3099         if (tp->t_state == TCPS_LISTEN) {
3100                 /* no timers on listen sockets */
3101                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3102                         return (0);
3103                 return (1);
3104         }
3105         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3106                 uint32_t left;
3107
3108                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3109                         ret = -1;
3110                         rack_log_to_processing(rack, cts, ret, 0);
3111                         return (0);
3112                 }
3113                 if (hpts_calling == 0) {
3114                         ret = -2;
3115                         rack_log_to_processing(rack, cts, ret, 0);
3116                         return (0);
3117                 }
3118                 /*
3119                  * Ok, our timer went off early and we are not being paced;
3120                  * it's a false alarm, go back to sleep.
3121                  */
3122                 ret = -3;
3123                 left = rack->r_ctl.rc_timer_exp - cts;
3124                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3125                 rack_log_to_processing(rack, cts, ret, left);
3126                 rack->rc_last_pto_set = 0;
3127                 return (1);
3128         }
3129         rack->rc_tmr_stopped = 0;
3130         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3131         if (timers & PACE_TMR_DELACK) {
3132                 ret = rack_timeout_delack(tp, rack, cts);
3133         } else if (timers & PACE_TMR_RACK) {
3134                 ret = rack_timeout_rack(tp, rack, cts);
3135         } else if (timers & PACE_TMR_TLP) {
3136                 ret = rack_timeout_tlp(tp, rack, cts);
3137         } else if (timers & PACE_TMR_RXT) {
3138                 ret = rack_timeout_rxt(tp, rack, cts);
3139         } else if (timers & PACE_TMR_PERSIT) {
3140                 ret = rack_timeout_persist(tp, rack, cts);
3141         } else if (timers & PACE_TMR_KEEP) {
3142                 ret = rack_timeout_keepalive(tp, rack, cts);
3143         }
3144         rack_log_to_processing(rack, cts, ret, timers);
3145         return (ret);
3146 }
3147
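     /*
      * Cancel any pending rack timer. If we are sitting on the hpts
      * only for a timer (no paced output pending) we must also remove
      * ourselves from the hpts; the canceled timer mask is saved in
      * rc_tmr_stopped.
      */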
3148 static void
3149 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3150 {
3151         uint8_t hpts_removed = 0;
3152
3153         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3154             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3155                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3156                 hpts_removed = 1;
3157         }
3158         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3159                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3160                 if (rack->rc_inp->inp_in_hpts &&
3161                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3162                         /*
3163                          * Canceling timers when we have no output being
3164                          * paced. We must also remove ourselves from the
3165                          * hpts.
3166                          */
3167                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3168                         hpts_removed = 1;
3169                 }
3170                 rack_log_to_cancel(rack, hpts_removed, line);
3171                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3172         }
3173 }
3174
3175 static void
3176 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3177 {
3178         return;
3179 }
3180
3181 static int
3182 rack_stopall(struct tcpcb *tp)
3183 {
3184         struct tcp_rack *rack;
3185         rack = (struct tcp_rack *)tp->t_fb_ptr;
3186         rack->t_timers_stopped = 1;
3187         return (0);
3188 }
3189
3190 static void
3191 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3192 {
3193         return;
3194 }
3195
3196 static int
3197 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3198 {
3199         return (0);
3200 }
3201
3202 static void
3203 rack_stop_all_timers(struct tcpcb *tp)
3204 {
3205         struct tcp_rack *rack;
3206
3207         /*
3208          * Assure no timers are running.
3209          */
3210         if (tcp_timer_active(tp, TT_PERSIST)) {
3211                 /* We are entering persist, set the flag appropriately */
3212                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3213                 rack->rc_in_persist = 1;
3214         }
3215         tcp_timer_suspend(tp, TT_PERSIST);
3216         tcp_timer_suspend(tp, TT_REXMT);
3217         tcp_timer_suspend(tp, TT_KEEP);
3218         tcp_timer_suspend(tp, TT_DELACK);
3219 }
3220
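     /*
      * Account for a (re)transmission of the given rsm: bump the
      * retransmit counts (capping r_rtr_cnt at RACK_NUM_OF_RETRANS),
      * record the new send time, credit the retransmitted bytes,
      * clear any stale ACKED/SACK_PASSED state and move the entry to
      * the tail of the transmit map so it is once again the most
      * recently sent data.
      */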
3221 static void
3222 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3223     struct rack_sendmap *rsm, uint32_t ts)
3224 {
3225         int32_t idx;
3226
3227         rsm->r_rtr_cnt++;
3228         rsm->r_sndcnt++;
3229         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3230                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3231                 rsm->r_flags |= RACK_OVERMAX;
3232         }
3233         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3234                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3235                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3236         }
3237         idx = rsm->r_rtr_cnt - 1;
3238         rsm->r_tim_lastsent[idx] = ts;
3239         if (rsm->r_flags & RACK_ACKED) {
3240                 /* Probably MTU discovery messing with us */
3241                 rsm->r_flags &= ~RACK_ACKED;
3242                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3243         }
3244         if (rsm->r_in_tmap) {
3245                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3246         }
3247         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3248         rsm->r_in_tmap = 1;
3249         if (rsm->r_flags & RACK_SACK_PASSED) {
3250                 /* We have retransmitted due to the SACK pass */
3251                 rsm->r_flags &= ~RACK_SACK_PASSED;
3252                 rsm->r_flags |= RACK_WAS_SACKPASS;
3253         }
3254         /* Update memory for next rtr */
3255         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3256 }
3257
3258
3259 static uint32_t
3260 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3261     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
3262 {
3263         /*
3264          * We (re-)transmitted starting at rsm->r_start for some length
3265          * (possibly not reaching all the way to r_end).
3266          */
3267         struct rack_sendmap *nrsm;
3268         uint32_t c_end;
3269         int32_t len;
3270         int32_t idx;
3271
3272         len = *lenp;
3273         c_end = rsm->r_start + len;
3274         if (SEQ_GEQ(c_end, rsm->r_end)) {
3275                 /*
3276                  * We retransmitted the whole piece, or more than the whole
3277                  * piece, slopping over into the next rsm.
3278                  */
3279                 rack_update_rsm(tp, rack, rsm, ts);
3280                 if (c_end == rsm->r_end) {
3281                         *lenp = 0;
3282                         return (0);
3283                 } else {
3284                         int32_t act_len;
3285
3286                         /* Hangs over the end, return what's left */
3287                         act_len = rsm->r_end - rsm->r_start;
3288                         *lenp = (len - act_len);
3289                         return (rsm->r_end);
3290                 }
3291                 /* We don't get out of this block. */
3292         }
3293         /*
3294          * Here we retransmitted less than the whole thing which means we
3295          * have to split this into what was transmitted and what was not.
3296          */
3297         nrsm = rack_alloc(rack);
3298         if (nrsm == NULL) {
3299                 /*
3300                  * We can't get memory, so lets not proceed.
3301                  */
3302                 *lenp = 0;
3303                 return (0);
3304         }
3305         /*
3306          * So here we are going to take the original rsm and make it what we
3307          * retransmitted. nrsm will be the tail portion we did not
3308          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3309          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3310          * 1, 6 and the new piece will be 6, 11.
3311          */
3312         nrsm->r_start = c_end;
3313         nrsm->r_end = rsm->r_end;
3314         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3315         nrsm->r_flags = rsm->r_flags;
3316         nrsm->r_sndcnt = rsm->r_sndcnt;
3317         nrsm->r_rtr_bytes = 0;
3318         rsm->r_end = c_end;
3319         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3320                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3321         }
3322         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3323         if (rsm->r_in_tmap) {
3324                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3325                 nrsm->r_in_tmap = 1;
3326         }
3327         rsm->r_flags &= (~RACK_HAS_FIN);
3328         rack_update_rsm(tp, rack, rsm, ts);
3329         *lenp = 0;
3330         return (0);
3331 }
3332
3333
3334 static void
3335 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3336     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3337     uint8_t pass, struct rack_sendmap *hintrsm)
3338 {
3339         struct tcp_rack *rack;
3340         struct rack_sendmap *rsm, *nrsm;
3341         register uint32_t snd_max, snd_una;
3342         int32_t idx;
3343
3344         /*
3345          * Add to the RACK log of packets in flight or retransmitted. If
3346          * there is a TS option we will use the TS echoed, if not we will
3347          * grab a TS.
3348          *
3349          * Retransmissions will increment the count and move the ts to its
3350          * proper place. Note that if options do not include TS's then we
3351          * won't be able to effectively use the ACK for an RTT on a retran.
3352          *
3353          * Notes about r_start and r_end. Let's consider a send starting at
3354          * sequence 1 for 10 bytes. In such an example the r_start would be
3355          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3356          * This means that r_end is actually the first sequence for the next
3357          * slot (11).
3358          *
3359          */
3360         /*
3361          * If err is set what do we do XXXrrs? should we not add the thing?
3362          * -- i.e. return if err != 0 or should we pretend we sent it? --
3363          * i.e. proceed with add ** do this for now.
3364          */
3365         INP_WLOCK_ASSERT(tp->t_inpcb);
3366         if (err)
3367                 /*
3368                  * We don't log errors -- we could but snd_max does not
3369                  * advance in this case either.
3370                  */
3371                 return;
3372
3373         if (th_flags & TH_RST) {
3374                 /*
3375                  * We don't log resets and we return immediately from
3376                  * sending
3377                  */
3378                 return;
3379         }
3380         rack = (struct tcp_rack *)tp->t_fb_ptr;
3381         snd_una = tp->snd_una;
3382         if (SEQ_LEQ((seq_out + len), snd_una)) {
3383                 /* Are we sending an old segment to induce an ack (keep-alive)? */
3384                 return;
3385         }
3386         if (SEQ_LT(seq_out, snd_una)) {
3387                 /* huh? should we panic? */
3388                 uint32_t end;
3389
3390                 end = seq_out + len;
3391                 seq_out = snd_una;
3392                 len = end - seq_out;
3393         }
3394         snd_max = tp->snd_max;
3395         if (th_flags & (TH_SYN | TH_FIN)) {
3396                 /*
3397                  * The call to rack_log_output is made before bumping
3398                  * snd_max. This means we can record one extra byte on a SYN
3399                  * or FIN if seq_out is adding more on and a FIN is present
3400                  * (and we are not resending).
3401                  */
3402                 if (th_flags & TH_SYN)
3403                         len++;
3404                 if (th_flags & TH_FIN)
3405                         len++;
3406                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3407                         /*
3408                          * The add/update has not been done for the FIN/SYN
3409                          * yet.
3410                          */
3411                         snd_max = tp->snd_nxt;
3412                 }
3413         }
3414         if (len == 0) {
3415                 /* We don't log zero window probes */
3416                 return;
3417         }
3418         rack->r_ctl.rc_time_last_sent = ts;
3419         if (IN_RECOVERY(tp->t_flags)) {
3420                 rack->r_ctl.rc_prr_out += len;
3421         }
3422         /* First question is it a retransmission? */
3423         if (seq_out == snd_max) {
3424 again:
3425                 rsm = rack_alloc(rack);
3426                 if (rsm == NULL) {
3427                         /*
3428                          * Hmm out of memory and the tcb got destroyed while
3429                          * we tried to wait.
3430                          */
3431 #ifdef INVARIANTS
3432                         panic("Out of memory when we should not be rack:%p", rack);
3433 #endif
3434                         return;
3435                 }
3436                 if (th_flags & TH_FIN) {
3437                         rsm->r_flags = RACK_HAS_FIN;
3438                 } else {
3439                         rsm->r_flags = 0;
3440                 }
3441                 rsm->r_tim_lastsent[0] = ts;
3442                 rsm->r_rtr_cnt = 1;
3443                 rsm->r_rtr_bytes = 0;
3444                 if (th_flags & TH_SYN) {
3445                         /* The data space is one beyond snd_una */
3446                         rsm->r_start = seq_out + 1;
3447                         rsm->r_end = rsm->r_start + (len - 1);
3448                 } else {
3449                         /* Normal case */
3450                         rsm->r_start = seq_out;
3451                         rsm->r_end = rsm->r_start + len;
3452                 }
3453                 rsm->r_sndcnt = 0;
3454                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
3455                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3456                 rsm->r_in_tmap = 1;
3457                 return;
3458         }
3459         /*
3460          * If we reach here it's a retransmission and we need to find it.
3461          */
3462 more:
3463         if (hintrsm && (hintrsm->r_start == seq_out)) {
3464                 rsm = hintrsm;
3465                 hintrsm = NULL;
3466         } else if (rack->r_ctl.rc_next) {
3467                 /* We have a hint from a previous run */
3468                 rsm = rack->r_ctl.rc_next;
3469         } else {
3470                 /* No hints sorry */
3471                 rsm = NULL;
3472         }
3473         if ((rsm) && (rsm->r_start == seq_out)) {
3474                 /*
3475                  * We used rc_next or hintrsm to retransmit, hopefully the
3476                  * likely case.
3477                  */
3478                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3479                 if (len == 0) {
3480                         return;
3481                 } else {
3482                         goto more;
3483                 }
3484         }
3485         /* Ok, it was not the last pointer; go through the map the hard way. */
3486         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3487                 if (rsm->r_start == seq_out) {
3488                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3489                         rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3490                         if (len == 0) {
3491                                 return;
3492                         } else {
3493                                 continue;
3494                         }
3495                 }
3496                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3497                         /* Transmitted within this piece */
3498                         /*
3499                          * Ok we must split off the front and then let the
3500                          * update do the rest
3501                          */
3502                         nrsm = rack_alloc(rack);
3503                         if (nrsm == NULL) {
3504 #ifdef INVARIANTS
3505                                 panic("Ran out of memory that was preallocated? rack:%p", rack);
3506 #endif
3507                                 rack_update_rsm(tp, rack, rsm, ts);
3508                                 return;
3509                         }
3510                         /*
3511                          * copy rsm to nrsm and then trim the front of rsm
3512                          * to not include this part.
3513                          */
3514                         nrsm->r_start = seq_out;
3515                         nrsm->r_end = rsm->r_end;
3516                         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3517                         nrsm->r_flags = rsm->r_flags;
3518                         nrsm->r_sndcnt = rsm->r_sndcnt;
3519                         nrsm->r_rtr_bytes = 0;
3520                         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3521                                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3522                         }
3523                         rsm->r_end = nrsm->r_start;
3524                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3525                         if (rsm->r_in_tmap) {
3526                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3527                                 nrsm->r_in_tmap = 1;
3528                         }
3529                         rsm->r_flags &= (~RACK_HAS_FIN);
3530                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3531                         if (len == 0) {
3532                                 return;
3533                         }
3534                 }
3535         }
3536         /*
3537          * Hmm, not found in the map; did they retransmit both old data
3538          * and on into the new?
3539          */
3540         if (seq_out == tp->snd_max) {
3541                 goto again;
3542         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3543 #ifdef INVARIANTS
3544                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3545                     seq_out, len, tp->snd_una, tp->snd_max);
3546                 printf("Starting Dump of all rack entries\n");
3547                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3548                         printf("rsm:%p start:%u end:%u\n",
3549                             rsm, rsm->r_start, rsm->r_end);
3550                 }
3551                 printf("Dump complete\n");
3552                 panic("seq_out not found rack:%p tp:%p",
3553                     rack, tp);
3554 #endif
3555         } else {
3556 #ifdef INVARIANTS
3557                 /*
3558                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
3559                  * flag)
3560                  */
3561                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3562                     seq_out, len, tp->snd_max, tp);
3563 #endif
3564         }
3565 }
3566
3567 /*
3568  * Record one of the RTT updates from an ack into
3569  * our sample structure.
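      * A single ack can cover several rack_sendmap entries and thus
      * yield several samples; we keep the lowest, the highest, a
      * running total and a count so tcp_rack_xmit_timer_commit() can
      * later pick one according to rc_rate_sample_method (lowest,
      * highest or average).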
3570  */
3571 static void
3572 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3573 {
3574         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3575             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3576                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3577         }
3578         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3579             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3580                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3581         }
3582         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3583         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3584         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3585 }
3586
3587 /*
3588  * Collect new round-trip time estimate
3589  * and update averages and current timeout.
3590  */
3591 static void
3592 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3593 {
3594         int32_t delta;
3595         uint32_t o_srtt, o_var;
3596         int32_t rtt;
3597
3598         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3599                 /* No valid sample */
3600                 return;
3601         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3602                 /* We are to use the lowest RTT seen in a single ack */
3603                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3604         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3605                 /* We are to use the highest RTT seen in a single ack */
3606                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3607         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3608                 /* We are to use the average RTT seen in a single ack */
3609                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3610                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3611         } else {
3612 #ifdef INVARIANTS
3613                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3614 #endif          
3615                 return;
3616         }
3617         if (rtt == 0)
3618                 rtt = 1;
3619         rack_log_rtt_sample(rack, rtt);
3620         o_srtt = tp->t_srtt;
3621         o_var = tp->t_rttvar;
3622         rack = (struct tcp_rack *)tp->t_fb_ptr;
3623         if (tp->t_srtt != 0) {
3624                 /*
3625                  * srtt is stored as fixed point with 5 bits after the
3626                  * binary point (i.e., scaled by 32).  The following magic is
3627                  * equivalent to the smoothing algorithm in rfc793 with an
3628                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3629                  * Adjust rtt to origin 0.
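                      * Worked example (values assumed, with TCP_RTT_SHIFT of 5
                      * and TCP_DELTA_SHIFT of 2): if t_srtt holds 6400
                      * (200 ticks << 5) and the new rtt is 100 ticks, then
                      * delta = (99 << 2) - (6400 >> 3) = 396 - 800 = -404 and
                      * t_srtt becomes 5996, i.e. about 187 ticks -- srtt moved
                      * roughly 1/8 of the way toward the new sample.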
3630                  */
3631                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3632                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3633
3634                 tp->t_srtt += delta;
3635                 if (tp->t_srtt <= 0)
3636                         tp->t_srtt = 1;
3637
3638                 /*
3639                  * We accumulate a smoothed rtt variance (actually, a
3640                  * smoothed mean difference), then set the retransmit timer
3641                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3642                  * is stored as fixed point with 4 bits after the binary
3643                  * point (scaled by 16).  The following is equivalent to
3644                  * rfc793 smoothing with an alpha of .75 (rttvar =
3645                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3646                  * wired-in beta.
3647                  */
3648                 if (delta < 0)
3649                         delta = -delta;
3650                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3651                 tp->t_rttvar += delta;
3652                 if (tp->t_rttvar <= 0)
3653                         tp->t_rttvar = 1;
3654                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3655                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3656         } else {
3657                 /*
3658                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3659                  * variance to half the rtt (so our first retransmit happens
3660                  * at 3*rtt).
3661                  */
3662                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3663                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3664                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3665         }
3666         TCPSTAT_INC(tcps_rttupdated);
3667         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3668         tp->t_rttupdated++;
3669 #ifdef NETFLIX_STATS
3670         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3671 #endif
3672         tp->t_rxtshift = 0;
3673
3674         /*
3675          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3676          * way we do the smoothing, srtt and rttvar will each average +1/2
3677          * tick of bias.  When we compute the retransmit timer, we want 1/2
3678          * tick of rounding and 1 extra tick because of +-1/2 tick
3679          * uncertainty in the firing of the timer.  The bias will give us
3680          * exactly the 1.5 tick we need.  But, because the bias is
3681          * statistical, we have to test that we don't drop below the minimum
3682          * feasible timer (which is 2 ticks).
3683          */
3684         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3685            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3686         tp->t_softerror = 0;
3687 }
3688
3689 static void
3690 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3691     uint32_t t, uint32_t cts)
3692 {
3693         /*
3694          * For this RSM, we acknowledged the data from a previous
3695          * transmission, not the last one we made. This means we did a false
3696          * retransmit.
3697          */
3698         struct tcp_rack *rack;
3699
3700         if (rsm->r_flags & RACK_HAS_FIN) {
3701                 /*
3702                  * The FIN often ends up being sent multiple times when we
3703                  * have everything outstanding ack'd. We ignore this case
3704                  * since it's over now.
3705                  */
3706                 return;
3707         }
3708         if (rsm->r_flags & RACK_TLP) {
3709                 /*
3710                  * We expect TLPs to have this occur.
3711                  */
3712                 return;
3713         }
3714         rack = (struct tcp_rack *)tp->t_fb_ptr;
3715         /* should we undo cc changes and exit recovery? */
3716         if (IN_RECOVERY(tp->t_flags)) {
3717                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3718                         /*
3719                          * Undo what we ratcheted down and exit recovery if
3720                          * possible
3721                          */
3722                         EXIT_RECOVERY(tp->t_flags);
3723                         tp->snd_recover = tp->snd_una;
3724                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3725                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3726                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3727                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3728                 }
3729         }
3730         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3731                 /*
3732                  * We retransmitted based on a sack and the earlier
3733                  * retransmission ack'd it - re-ordering is occurring.
3734                  */
3735                 counter_u64_add(rack_reorder_seen, 1);
3736                 rack->r_ctl.rc_reorder_ts = cts;
3737         }
3738         counter_u64_add(rack_badfr, 1);
3739         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3740 }
3741
3742
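     /*
      * Try to derive an RTT for this rsm from the ack that covered it.
      * When the ack can be matched to the most recent transmission we
      * feed the sample to tcp_rack_xmit_timer(), update the rack
      * minimum rtt and the most recent transmit time and return 1; an
      * rsm that is already marked RACK_ACKED is skipped and returns 0.
      */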
3743 static int
3744 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3745     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3746 {
3747         int32_t i;
3748         uint32_t t;
3749
3750         if (rsm->r_flags & RACK_ACKED)
3751                 /* Already done */
3752                 return (0);
3753
3754
3755         if ((rsm->r_rtr_cnt == 1) ||
3756             ((ack_type == CUM_ACKED) &&
3757             (to->to_flags & TOF_TS) &&
3758             (to->to_tsecr) &&
3759             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3760             ) {
3761                 /*
3762                  * We will only find a matching timestamp if it's cum-acked.
3763                  * But if it's only one (re)transmission it's for-sure matching
3764                  * :-)
3765                  */
3766                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3767                 if ((int)t <= 0)
3768                         t = 1;
3769                 if (!tp->t_rttlow || tp->t_rttlow > t)
3770                         tp->t_rttlow = t;
3771                 if (!rack->r_ctl.rc_rack_min_rtt ||
3772                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3773                         rack->r_ctl.rc_rack_min_rtt = t;
3774                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3775                                 rack->r_ctl.rc_rack_min_rtt = 1;
3776                         }
3777                 }
3778                 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
3779                 if ((rsm->r_flags & RACK_TLP) &&
3780                     (!IN_RECOVERY(tp->t_flags))) {
3781                         /* Segment was a TLP and our retrans matched */
3782                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
3783                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
3784                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
3785                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
3786                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
3787                                 /*
3788                                  * When we enter recovery we need to assure
3789                                  * we send one packet.
3790                                  */
3791                                 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
3792                         } else
3793                                 rack->r_ctl.rc_tlp_rtx_out = 0;
3794                 }
3795                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3796                         /* New more recent rack_tmit_time */
3797                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3798                         rack->rc_rack_rtt = t;
3799                 }
3800                 return (1);
3801         }
3802         /* 
3803          * We clear the soft/rxtshift since we got an ack. 
3804          * There is no assurance we will call the commit() function
3805          * so we need to clear these to avoid incorrect handling.
3806          */
3807         tp->t_rxtshift = 0;
3808         tp->t_softerror = 0;
3809         if ((to->to_flags & TOF_TS) &&
3810             (ack_type == CUM_ACKED) &&
3811             (to->to_tsecr) &&
3812             ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
3813                 /*
3814                  * Now which timestamp does it match? In this block the ACK
3815                  * must be coming from a previous transmission.
3816                  */
3817                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
3818                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
3819                                 t = cts - rsm->r_tim_lastsent[i];
3820                                 if ((int)t <= 0)
3821                                         t = 1;
3822                                 if ((i + 1) < rsm->r_rtr_cnt) {
3823                                         /* Likely */
3824                                         rack_earlier_retran(tp, rsm, t, cts);
3825                                 }
3826                                 if (!tp->t_rttlow || tp->t_rttlow > t)
3827                                         tp->t_rttlow = t;
3828                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3829                                         rack->r_ctl.rc_rack_min_rtt = t;
3830                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
3831                                                 rack->r_ctl.rc_rack_min_rtt = 1;
3832                                         }
3833                                 }
3834                                 /*
3835                                  * Note the following calls to
3836                                  * tcp_rack_xmit_timer() are being commented
3837                                  * out for now. They give us no more accuracy
3838                                  * and often lead to a wrong choice. We have
3839                                  * enough samples that have not been 
3840                                  * retransmitted. I leave the commented out
3841                                  * code in here in case in the future we
3842                                  * decide to add it back (though I can't foresee
3843                                  * doing that). That way we will easily see
3844                                  * where they need to be placed.
3845                                  */
3846                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
3847                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3848                                         /* New more recent rack_tmit_time */
3849                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3850                                         rack->rc_rack_rtt = t;
3851                                 }
3852                                 return (1);
3853                         }
3854                 }
3855                 goto ts_not_found;
3856         } else {
3857                 /*
3858                  * Ok, it's a SACK block that we retransmitted, or a Windows
3859                  * machine without timestamps. We can tell nothing from the
3860                  * time-stamp since it's not there, nor from the time the peer last
3861                  * received a segment that moved forward its cum-ack point.
3862                  */
3863 ts_not_found:
3864                 i = rsm->r_rtr_cnt - 1;
3865                 t = cts - rsm->r_tim_lastsent[i];
3866                 if ((int)t <= 0)
3867                         t = 1;
3868                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3869                         /*
3870                          * We retransmitted and the ack came back in less
3871                          * than the smallest rtt we have observed. We most
3872                          * likely did an improper retransmit as outlined in
3873                          * 4.2 Step 3 point 2 in the rack-draft.
3874                          */
3875                         i = rsm->r_rtr_cnt - 2;
3876                         t = cts - rsm->r_tim_lastsent[i];
3877                         rack_earlier_retran(tp, rsm, t, cts);
3878                 } else if (rack->r_ctl.rc_rack_min_rtt) {
3879                         /*
3880                          * We retransmitted it and the retransmit did the
3881                          * job.
3882                          */
3883                         if (!rack->r_ctl.rc_rack_min_rtt ||
3884                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3885                                 rack->r_ctl.rc_rack_min_rtt = t;
3886                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
3887                                         rack->r_ctl.rc_rack_min_rtt = 1;
3888                                 }
3889                         }
3890                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
3891                                 /* New more recent rack_tmit_time */
3892                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
3893                                 rack->rc_rack_rtt = t;
3894                         }
3895                         return (1);
3896                 }
3897         }
3898         return (0);
3899 }
3900
3901 /*
3902  * Mark the SACK_PASSED flag on all entries sent prior to rsm (send wise).
3903  */
3904 static void
3905 rack_log_sack_passed(struct tcpcb *tp,
3906     struct tcp_rack *rack, struct rack_sendmap *rsm)
3907 {
3908         struct rack_sendmap *nrsm;
3909         uint32_t ts;
3910         int32_t idx;
3911
3912         idx = rsm->r_rtr_cnt - 1;
3913         ts = rsm->r_tim_lastsent[idx];
3914         nrsm = rsm;
3915         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
3916             rack_head, r_tnext) {
3917                 if (nrsm == rsm) {
3918                         /* Skip the original segment, it is the one acked */
3919                         continue;
3920                 }
3921                 if (nrsm->r_flags & RACK_ACKED) {
3922                         /* Skip ack'd segments */
3923                         continue;
3924                 }
3925                 idx = nrsm->r_rtr_cnt - 1;
3926                 if (ts == nrsm->r_tim_lastsent[idx]) {
3927                         /*
3928                          * For this case let's use the seq no; if we sent in a
3929                          * big block (TSO) we would have a bunch of segments
3930                          * sent at the same time.
3931                          *
3932                          * We would only get a report if its SEQ is earlier.
3933                          * If we have done multiple retransmits the times
3934                          * would not be equal.
3935                          */
3936                         if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
3937                                 nrsm->r_flags |= RACK_SACK_PASSED;
3938                                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3939                         }
3940                 } else {
3941                         /*
3942                          * Here they were sent at different times, not a big
3943                          * block. Since we transmitted this one later and
3944                          * see it sack'd then this must also be missing (or
3945                          * we would have gotten a sack block for it)
3946                          */
3947                         nrsm->r_flags |= RACK_SACK_PASSED;
3948                         nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3949                 }
3950         }
3951 }
3952
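     /*
      * Process a single SACK block against the send map.  Map entries are
      * split as needed so the block's edges fall on rsm boundaries, newly
      * covered entries are marked RACK_ACKED (and pulled off the timed
      * transmit map), and rack_log_sack_passed() flags anything sent
      * earlier that is now presumed missing.  Returns the number of newly
      * SACKed bytes so the caller can drive recovery and PRR accounting.
      */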
3953 static uint32_t
3954 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
3955     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
3956 {
3957         int32_t idx;
3958         int32_t times = 0;
3959         uint32_t start, end, changed = 0;
3960         struct rack_sendmap *rsm, *nrsm;
3961         int32_t used_ref = 1;
3962
3963         start = sack->start;
3964         end = sack->end;
3965         rsm = *prsm;
3966         if (rsm && SEQ_LT(start, rsm->r_start)) {
3967                 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
3968                         if (SEQ_GEQ(start, rsm->r_start) &&
3969                             SEQ_LT(start, rsm->r_end)) {
3970                                 goto do_rest_ofb;
3971                         }
3972                 }
3973         }
3974         if (rsm == NULL) {
3975 start_at_beginning:
3976                 rsm = NULL;
3977                 used_ref = 0;
3978         }
3979         /* First lets locate the block where this guy is */
3980         TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
3981                 if (SEQ_GEQ(start, rsm->r_start) &&
3982                     SEQ_LT(start, rsm->r_end)) {
3983                         break;
3984                 }
3985         }
3986 do_rest_ofb:
3987         if (rsm == NULL) {
3988                 /*
3989                  * This happens when we get duplicate sack blocks with the
3990                  * same end. For example SACK 4: 100 SACK 3: 100. The sort
3991                  * will not change their location so we would just start at
3992                  * the end of the first one and get lost.
3993                  */
3994                 if (tp->t_flags & TF_SENTFIN) {
3995                         /*
3996                          * Check to see if we have not logged the FIN that
3997                          * went out.
3998                          */
3999                         nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4000                         if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
4001                                 /*
4002                                  * Ok we did not get the FIN logged.
4003                                  */
4004                                 nrsm->r_end++;
4005                                 rsm = nrsm;
4006                                 goto do_rest_ofb;
4007                         }
4008                 }
4009                 if (times == 1) {
4010 #ifdef INVARIANTS
4011                         panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
4012                             tp, rack, sack, to, prsm);
4013 #else
4014                         goto out;
4015 #endif
4016                 }
4017                 times++;
4018                 counter_u64_add(rack_sack_proc_restart, 1);
4019                 goto start_at_beginning;
4020         }
4021         /* Ok we have an ACK for some piece of rsm */
4022         if (rsm->r_start != start) {
4023                 /*
4024                  * Need to split this into two pieces: the before and after.
4025                  */
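                     /*
                      * Illustrative example: if this rsm covers [1000, 2000)
                      * and the SACK block starts at 1500, the rsm is trimmed
                      * to [1000, 1500) and the new nrsm below takes over
                      * [1500, 2000); the SACK is then applied to nrsm.
                      */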
4026                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4027                 if (nrsm == NULL) {
4028                         /*
4029                          * failed XXXrrs what can we do but lose the sack
4030                          * info?
4031                          */
4032                         goto out;
4033                 }
4034                 nrsm->r_start = start;
4035                 nrsm->r_rtr_bytes = 0;
4036                 nrsm->r_end = rsm->r_end;
4037                 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4038                 nrsm->r_flags = rsm->r_flags;
4039                 nrsm->r_sndcnt = rsm->r_sndcnt;
4040                 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4041                         nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4042                 }
4043                 rsm->r_end = nrsm->r_start;
4044                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4045                 if (rsm->r_in_tmap) {
4046                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4047                         nrsm->r_in_tmap = 1;
4048                 }
4049                 rsm->r_flags &= (~RACK_HAS_FIN);
4050                 rsm = nrsm;
4051         }
4052         if (SEQ_GEQ(end, rsm->r_end)) {
4053                 /*
4054                  * The end of this block is either beyond this guy or right
4055                  * at this guy.
4056                  */
4057
4058                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4059                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4060                         changed += (rsm->r_end - rsm->r_start);
4061                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4062                         rack_log_sack_passed(tp, rack, rsm);
4063                         /* Is Reordering occurring? */
4064                         if (rsm->r_flags & RACK_SACK_PASSED) {
4065                                 counter_u64_add(rack_reorder_seen, 1);
4066                                 rack->r_ctl.rc_reorder_ts = cts;
4067                         }
4068                         rsm->r_flags |= RACK_ACKED;
4069                         rsm->r_flags &= ~RACK_TLP;
4070                         if (rsm->r_in_tmap) {
4071                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4072                                 rsm->r_in_tmap = 0;
4073                         }
4074                 }
4075                 if (end == rsm->r_end) {
4076                         /* This block only - done */
4077                         goto out;
4078                 }
4079                 /* There is more not covered by this rsm, move on */
4080                 start = rsm->r_end;
4081                 nrsm = TAILQ_NEXT(rsm, r_next);
4082                 rsm = nrsm;
4083                 times = 0;
4084                 goto do_rest_ofb;
4085         }
4086         /* Ok we need to split off this one at the tail */
4087         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4088         if (nrsm == NULL) {
4089                 /* failed rrs what can we do but lose the sack info? */
4090                 goto out;
4091         }
4092         /* Clone it */
4093         nrsm->r_start = end;
4094         nrsm->r_end = rsm->r_end;
4095         nrsm->r_rtr_bytes = 0;
4096         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4097         nrsm->r_flags = rsm->r_flags;
4098         nrsm->r_sndcnt = rsm->r_sndcnt;
4099         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4100                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4101         }
4102         /* The sack block does not cover this guy fully */
4103         rsm->r_flags &= (~RACK_HAS_FIN);
4104         rsm->r_end = end;
4105         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4106         if (rsm->r_in_tmap) {
4107                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4108                 nrsm->r_in_tmap = 1;
4109         }
4110         if (rsm->r_flags & RACK_ACKED) {
4111                 /* Been here done that */
4112                 goto out;
4113         }
4114         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4115         changed += (rsm->r_end - rsm->r_start);
4116         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4117         rack_log_sack_passed(tp, rack, rsm);
4118         /* Is Reordering occurring? */
4119         if (rsm->r_flags & RACK_SACK_PASSED) {
4120                 counter_u64_add(rack_reorder_seen, 1);
4121                 rack->r_ctl.rc_reorder_ts = cts;
4122         }
4123         rsm->r_flags |= RACK_ACKED;
4124         rsm->r_flags &= ~RACK_TLP;
4125         if (rsm->r_in_tmap) {
4126                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4127                 rsm->r_in_tmap = 0;
4128         }
4129 out:
4130         if (used_ref == 0) {
4131                 counter_u64_add(rack_sack_proc_all, 1);
4132         } else {
4133                 counter_u64_add(rack_sack_proc_short, 1);
4134         }
4135         /* Save off where we last were */
4136         if (rsm)
4137                 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
4138         else
4139                 rack->r_ctl.rc_sacklast = NULL;
4140         *prsm = rsm;
4141         return (changed);
4142 }
4143
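     /*
      * The peer has reneged: its cum-ack stopped at the start of data we
      * had previously marked SACKed.  Walk forward from rsm, strip the
      * ACKED/SACK_PASSED markings, put the entries back on the timed
      * transmit map in order so they are candidates for retransmission
      * again, and (optionally) clear the SACK filter from th_ack forward
      * so fresh SACKs covering this range are not suppressed.
      */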
4144 static void inline 
4145 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4146 {
4147         struct rack_sendmap *tmap;
4148
4149         tmap = NULL;
4150         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4151                 /* It's no longer sacked, mark it so */
4152                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4153 #ifdef INVARIANTS
4154                 if (rsm->r_in_tmap) {
4155                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4156                               rack, rsm, rsm->r_flags);
4157                 }
4158 #endif
4159                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4160                 /* Rebuild it into our tmap */
4161                 if (tmap == NULL) {
4162                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4163                         tmap = rsm;
4164                 } else {
4165                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4166                         tmap = rsm;
4167                 }
4168                 tmap->r_in_tmap = 1;
4169                 rsm = TAILQ_NEXT(rsm, r_next);
4170         }
4171         /* 
4172          * Now lets possibly clear the sack filter so we start 
4173          * recognizing sacks that cover this area.
4174          */
4175         if (rack_use_sack_filter)
4176                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4177
4178 }
4179
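     /*
      * Process the ACK field and any SACK options of an incoming segment
      * against the rack send map: free entries that are now cum-acked
      * (taking RTT measurements along the way), detect peer reneging,
      * filter/sort/de-duplicate the SACK blocks and update the scoreboard.
      * Depending on what changed this may enter recovery and recompute
      * the PRR send count.
      */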
4180 static void
4181 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4182 {
4183         uint32_t changed, last_seq, entered_recovery = 0;
4184         struct tcp_rack *rack;
4185         struct rack_sendmap *rsm;
4186         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4187         register uint32_t th_ack;
4188         int32_t i, j, k, num_sack_blks = 0;
4189         uint32_t cts, acked, ack_point, sack_changed = 0;
4190
4191         INP_WLOCK_ASSERT(tp->t_inpcb);
4192         if (th->th_flags & TH_RST) {
4193                 /* We don't log resets */
4194                 return;
4195         }
4196         rack = (struct tcp_rack *)tp->t_fb_ptr;
4197         cts = tcp_ts_getticks();
4198         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4199         changed = 0;
4200         th_ack = th->th_ack;
4201
4202         if (SEQ_GT(th_ack, tp->snd_una)) {
4203                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4204                 tp->t_acktime = ticks;
4205         }
4206         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4207                 changed = th_ack - rsm->r_start;
4208         if (changed) {
4209                 /*
4210                  * The ACK point is advancing to th_ack, we must drop off
4211                  * the packets in the rack log and calculate any eligible
4212                  * RTTs.
4213                  */
4214                 rack->r_wanted_output++;
4215 more:
4216                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4217                 if (rsm == NULL) {
4218                         if ((th_ack - 1) == tp->iss) {
4219                                 /*
4220                                  * For the SYN incoming case we will not
4221                                  * have called tcp_output for the sending of
4222                                  * the SYN, so there will be no map. All
4223                                  * other cases should probably be a panic.
4224                                  */
4225                                 goto proc_sack;
4226                         }
4227                         if (tp->t_flags & TF_SENTFIN) {
4228                                 /* if we sent a FIN we will not have a map */
4229                                 goto proc_sack;
4230                         }
4231 #ifdef INVARIANTS
4232                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4233                             tp,
4234                             th, tp->t_state, rack,
4235                             tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4236 #endif
4237                         goto proc_sack;
4238                 }
4239                 if (SEQ_LT(th_ack, rsm->r_start)) {
4240                         /* Huh map is missing this */
4241 #ifdef INVARIANTS
4242                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4243                             rsm->r_start,
4244                             th_ack, tp->t_state, rack->r_state);
4245 #endif
4246                         goto proc_sack;
4247                 }
4248                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4249                 /* Now do we consume the whole thing? */
4250                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4251                         /* It's all consumed. */
4252                         uint32_t left;
4253
4254                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4255                         rsm->r_rtr_bytes = 0;
4256                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
4257                         if (rsm->r_in_tmap) {
4258                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4259                                 rsm->r_in_tmap = 0;
4260                         }
4261                         if (rack->r_ctl.rc_next == rsm) {
4262                                 /* scoot along the marker */
4263                                 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
4264                         }
4265                         if (rsm->r_flags & RACK_ACKED) {
4266                                 /*
4267                                  * It was acked on the scoreboard -- remove
4268                                  * it from total
4269                                  */
4270                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4271                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4272                                 /*
4273                                  * There are segments ACKED on the
4274                                  * scoreboard further up. We are seeing
4275                                  * reordering.
4276                                  */
4277                                 counter_u64_add(rack_reorder_seen, 1);
4278                                 rsm->r_flags |= RACK_ACKED;
4279                                 rack->r_ctl.rc_reorder_ts = cts;
4280                         }
4281                         left = th_ack - rsm->r_end;
4282                         if (rsm->r_rtr_cnt > 1) {
4283                                 /*
4284                                  * Technically we should make r_rtr_cnt be
4285                                  * monotonically increasing and just mod it to
4286                                  * the timestamp it is replacing.. that way
4287                                  * we would have the last 3 retransmits. Now
4288                                  * rc_loss_count will be wrong if we
4289                                  * retransmit something more than 2 times in
4290                                  * recovery :(
4291                                  */
4292                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4293                         }
4294                         /* Free back to zone */
4295                         rack_free(rack, rsm);
4296                         if (left) {
4297                                 goto more;
4298                         }
4299                         goto proc_sack;
4300                 }
4301                 if (rsm->r_flags & RACK_ACKED) {
4302                         /*
4303                          * It was acked on the scoreboard -- remove it from
4304                          * total for the part being cum-acked.
4305                          */
4306                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4307                 }
4308                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4309                 rsm->r_rtr_bytes = 0;
4310                 rsm->r_start = th_ack;
4311         }
4312 proc_sack:
4313         /* Check for reneging */
4314         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4315         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4316                 /*
4317                  * The peer has moved snd_una up to
4318                  * the edge of this send, i.e. one
4319                  * that it had previously acked. The only
4320                  * way that can be true is if the peer threw
4321                  * away data (space issues) that it had
4322                  * previously sacked (else it would have
4323                  * given us snd_una up to rsm->r_end).
4324                  * We need to undo the acked markings here.
4325                  *
4326                  * Note we have to look to make sure th_ack is
4327                  * our rsm->r_start in case we get an old ack
4328                  * where th_ack is behind snd_una.
4329                  */
4330                 rack_peer_reneges(rack, rsm, th->th_ack);
4331         }
4332         if ((to->to_flags & TOF_SACK) == 0) {
4333                 /* We are done, nothing left to log */
4334                 goto out;
4335         }
4336         rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4337         if (rsm) {
4338                 last_seq = rsm->r_end;
4339         } else {
4340                 last_seq = tp->snd_max;
4341         }
4342         /* Sack block processing */
4343         if (SEQ_GT(th_ack, tp->snd_una))
4344                 ack_point = th_ack;
4345         else
4346                 ack_point = tp->snd_una;
4347         for (i = 0; i < to->to_nsacks; i++) {
4348                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4349                     &sack, sizeof(sack));
4350                 sack.start = ntohl(sack.start);
4351                 sack.end = ntohl(sack.end);
4352                 if (SEQ_GT(sack.end, sack.start) &&
4353                     SEQ_GT(sack.start, ack_point) &&
4354                     SEQ_LT(sack.start, tp->snd_max) &&
4355                     SEQ_GT(sack.end, ack_point) &&
4356                     SEQ_LEQ(sack.end, tp->snd_max)) {
4357                         if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
4358                             (SEQ_LT(sack.end, last_seq)) &&
4359                             ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
4360                                 /*
4361                                  * Not the last piece and it's smaller than
4362                                  * 1/8th of a MSS. We ignore this.
4363                                  */
4364                                 counter_u64_add(rack_runt_sacks, 1);
4365                                 continue;
4366                         }
4367                         sack_blocks[num_sack_blks] = sack;
4368                         num_sack_blks++;
4369 #ifdef NETFLIX_STATS
4370                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4371                            SEQ_LEQ(sack.end, th_ack)) {
4372                         /*
4373                          * It's a D-SACK block.
4374                          */
4375                         tcp_record_dsack(sack.start, sack.end);
4376 #endif
4377                 }
4378
4379         }
4380         if (num_sack_blks == 0)
4381                 goto out;
4382         /*
4383          * Sort the SACK blocks so we can update the rack scoreboard with
4384          * just one pass.
4385          */
4386         if (rack_use_sack_filter) {
4387                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
4388         }
4389         if (num_sack_blks < 2) {
4390                 goto do_sack_work;
4391         }
4392         /* Sort the sacks */
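             /*
              * A simple O(n^2) exchange sort on the block end is fine
              * here; the array holds at most TCP_MAX_SACK + 1 entries.
              */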
4393         for (i = 0; i < num_sack_blks; i++) {
4394                 for (j = i + 1; j < num_sack_blks; j++) {
4395                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4396                                 sack = sack_blocks[i];
4397                                 sack_blocks[i] = sack_blocks[j];
4398                                 sack_blocks[j] = sack;
4399                         }
4400                 }
4401         }
4402         /*
4403          * Now are any of the sack block ends the same (yes some
4404          * implementations send these)?
4405          */
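             /*
              * Illustrative example: blocks (200,400) and (100,400) share
              * the end 400; we keep the lower start (100), which covers
              * more, slide the remaining blocks down one slot, drop the
              * count and re-run the scan.
              */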
4406 again:
4407         if (num_sack_blks > 1) {
4408                 for (i = 0; i < num_sack_blks; i++) {
4409                         for (j = i + 1; j < num_sack_blks; j++) {
4410                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4411                                         /*
4412                                          * Ok these two have the same end, we
4413                                          * want the smallest start and then
4414                                          * throw away the other one and start
4415                                          * again.
4416                                          */
4417                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4418                                                 /*
4419                                                  * The second block covers
4420                                                  * more area, use that
4421                                                  */
4422                                                 sack_blocks[i].start = sack_blocks[j].start;
4423                                         }
4424                                         /*
4425                                          * Now collapse out the dup-sack and
4426                                          * lower the count
4427                                          */
4428                                         for (k = (j + 1); k < num_sack_blks; k++) {
4429                                                 sack_blocks[j].start = sack_blocks[k].start;
4430                                                 sack_blocks[j].end = sack_blocks[k].end;
4431                                                 j++;
4432                                         }
4433                                         num_sack_blks--;
4434                                         goto again;
4435                                 }
4436                         }
4437                 }
4438         }
4439 do_sack_work:
4440         rsm = rack->r_ctl.rc_sacklast;
4441         for (i = 0; i < num_sack_blks; i++) {
4442                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
4443                 if (acked) {
4444                         rack->r_wanted_output++;
4445                         changed += acked;
4446                         sack_changed += acked;
4447                 }
4448         }
4449 out:
4450         if (changed) {
4451                 /* Something changed, cancel the rack timer */
4452                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4453         }
4454         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
4455                 /*
4456                  * Ok we have a high probability that we need to go into
4457                  * recovery since we have data sack'd
4458                  */
4459                 struct rack_sendmap *rsm;
4460                 uint32_t tsused;
4461
4462                 tsused = tcp_ts_getticks();
4463                 rsm = tcp_rack_output(tp, rack, tsused);
4464                 if (rsm) {
4465                         /* Enter recovery */
4466                         rack->r_ctl.rc_rsm_start = rsm->r_start;
4467                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4468                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4469                         entered_recovery = 1;
4470                         rack_cong_signal(tp, NULL, CC_NDUPACK);
4471                         /*
4472                          * When we enter recovery we need to assure we send
4473                          * one packet.
4474                          */
4475                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
4476                         rack->r_timer_override = 1;
4477                 }
4478         }
4479         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
4480                 /* Deal with changed and PRR here (in recovery only) */
4481                 uint32_t pipe, snd_una;
4482
4483                 rack->r_ctl.rc_prr_delivered += changed;
4484                 /* Compute prr_sndcnt */
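                     /*
                      * This follows Proportional Rate Reduction (RFC 6937).
                      * While the estimated pipe exceeds ssthresh we may send
                      * roughly prr_delivered * ssthresh / recovery_fs bytes,
                      * less what has already gone out in this recovery.
                      * Illustrative numbers: prr_delivered 3000, ssthresh
                      * 5000, recovery_fs 10000 gives 3000 * 5000 / 10000 =
                      * 1500 bytes (plus one), minus prr_out.  Once pipe is
                      * at or below ssthresh, the else branch below instead
                      * bounds sndcnt by the newly delivered data (or
                      * "changed") plus one MSS, capped at ssthresh - pipe.
                      */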
4485                 if (SEQ_GT(tp->snd_una, th_ack)) {
4486                         snd_una = tp->snd_una;
4487                 } else {
4488                         snd_una = th_ack;
4489                 }
4490                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
4491                 if (pipe > tp->snd_ssthresh) {
4492                         long sndcnt;
4493
4494                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
4495                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
4496                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
4497                         else {
4498                                 rack->r_ctl.rc_prr_sndcnt = 0;
4499                                 sndcnt = 0;
4500                         }
4501                         sndcnt++;
4502                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
4503                                 sndcnt -= rack->r_ctl.rc_prr_out;
4504                         else
4505                                 sndcnt = 0;
4506                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
4507                 } else {
4508                         uint32_t limit;
4509
4510                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
4511                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
4512                         else
4513                                 limit = 0;
4514                         if (changed > limit)
4515                                 limit = changed;
4516                         limit += tp->t_maxseg;
4517                         if (tp->snd_ssthresh > pipe) {
4518                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
4519                         } else {
4520                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
4521                         }
4522                 }
4523                 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
4524                         rack->r_timer_override = 1;
4525                 }
4526         }
4527 }
4528
4529 /*
4530  * Return value of 1, we do not need to call rack_process_data().
4531  * return value of 0, rack_process_data can be called.
4532  * For ret_val if its 0 the TCP is locked, if its non-zero
4533  * its unlocked and probably unsafe to touch the TCB.
4534  */
4535 static int
4536 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
4537     struct tcpcb *tp, struct tcpopt *to,
4538     uint32_t tiwin, int32_t tlen,
4539     int32_t * ofia, int32_t thflags, int32_t * ret_val)
4540 {
4541         int32_t ourfinisacked = 0;
4542         int32_t nsegs, acked_amount;
4543         int32_t acked;
4544         struct mbuf *mfree;
4545         struct tcp_rack *rack;
4546         int32_t recovery = 0;
4547
4548         rack = (struct tcp_rack *)tp->t_fb_ptr;
4549         if (SEQ_GT(th->th_ack, tp->snd_max)) {
4550                 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
4551                 return (1);
4552         }
4553         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
4554                 rack_log_ack(tp, to, th);
4555         }
4556         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
4557                 /*
4558                  * Old ack, behind (or duplicate to) the last one rcv'd
4559                  * Note: Should mark that reordering is occurring! We should also
4560                  * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
4561                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
4562                  * retran and> ack 3
4563                  */
4564                 return (0);
4565         }
4566         /*
4567          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
4568          * something we sent.
4569          */
4570         if (tp->t_flags & TF_NEEDSYN) {
4571                 /*
4572                  * T/TCP: Connection was half-synchronized, and our SYN has
4573                  * been ACK'd (so connection is now fully synchronized).  Go
4574                  * to non-starred state, increment snd_una for ACK of SYN,
4575                  * and check if we can do window scaling.
4576                  */
4577                 tp->t_flags &= ~TF_NEEDSYN;
4578                 tp->snd_una++;
4579                 /* Do window scaling? */
4580                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
4581                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
4582                         tp->rcv_scale = tp->request_r_scale;
4583                         /* Send window already scaled. */
4584                 }
4585         }
4586         nsegs = max(1, m->m_pkthdr.lro_nsegs);
4587         INP_WLOCK_ASSERT(tp->t_inpcb);
4588
4589         acked = BYTES_THIS_ACK(tp, th);
4590         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
4591         TCPSTAT_ADD(tcps_rcvackbyte, acked);
4592
4593         /*
4594          * If we just performed our first retransmit, and the ACK arrives
4595          * within our recovery window, then it was a mistake to do the
4596          * retransmit in the first place.  Recover our original cwnd and
4597          * ssthresh, and proceed to transmit where we left off.
4598          */
4599         if (tp->t_flags & TF_PREVVALID) {
4600                 tp->t_flags &= ~TF_PREVVALID;
4601                 if (tp->t_rxtshift == 1 &&
4602                     (int)(ticks - tp->t_badrxtwin) < 0)
4603                         rack_cong_signal(tp, th, CC_RTO_ERR);
4604         }
4605         /*
4606          * If we have a timestamp reply, update smoothed round trip time. If
4607          * no timestamp is present but transmit timer is running and timed
4608          * sequence number was acked, update smoothed round trip time. Since
4609          * we now have an rtt measurement, cancel the timer backoff (cf.,
4610          * Phil Karn's retransmit alg.). Recompute the initial retransmit
4611          * timer.
4612          *
4613          * Some boxes send broken timestamp replies during the SYN+ACK
4614          * phase, ignore timestamps of 0 or we could calculate a huge RTT
4615          * and blow up the retransmit timer.
4616          */
4617         /*
4618          * If all outstanding data is acked, stop retransmit timer and
4619          * remember to restart (more output or persist). If there is more
4620          * data to be acked, restart retransmit timer, using current
4621          * (possibly backed-off) value.
4622          */
4623         if (th->th_ack == tp->snd_max) {
4624                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4625                 rack->r_wanted_output++;
4626         }
4627         /*
4628          * If no data (only SYN) was ACK'd, skip rest of ACK processing.
4629          */
4630         if (acked == 0) {
4631                 if (ofia)
4632                         *ofia = ourfinisacked;
4633                 return (0);
4634         }
4635         if (rack->r_ctl.rc_early_recovery) {
4636                 if (IN_FASTRECOVERY(tp->t_flags)) {
4637                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4638                                 tcp_rack_partialack(tp, th);
4639                         } else {
4640                                 rack_post_recovery(tp, th);
4641                                 recovery = 1;
4642                         }
4643                 }
4644         }
4645         /*
4646          * Let the congestion control algorithm update congestion control
4647          * related information. This typically means increasing the
4648          * congestion window.
4649          */
4650         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
4651         SOCKBUF_LOCK(&so->so_snd);
4652         acked_amount = min(acked, (int)sbavail(&so->so_snd));
4653         tp->snd_wnd -= acked_amount;
4654         mfree = sbcut_locked(&so->so_snd, acked_amount);
4655         if ((sbused(&so->so_snd) == 0) &&
4656             (acked > acked_amount) &&
4657             (tp->t_state >= TCPS_FIN_WAIT_1)) {
4658                 ourfinisacked = 1;
4659         }
4660         /* NB: sowwakeup_locked() does an implicit unlock. */
4661         sowwakeup_locked(so);
4662         m_freem(mfree);
4663         if (rack->r_ctl.rc_early_recovery == 0) {
4664                 if (IN_FASTRECOVERY(tp->t_flags)) {
4665                         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4666                                 tcp_rack_partialack(tp, th);
4667                         } else {
4668                                 rack_post_recovery(tp, th);
4669                         }
4670                 }
4671         }
4672         tp->snd_una = th->th_ack;
4673         if (SEQ_GT(tp->snd_una, tp->snd_recover))
4674                 tp->snd_recover = tp->snd_una;
4675
4676         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4677                 tp->snd_nxt = tp->snd_una;
4678         }
4679         if (tp->snd_una == tp->snd_max) {
4680                 /* Nothing left outstanding */
4681                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
4682                 tp->t_acktime = 0;
4683                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4684                 /* Set need output so persist might get set */
4685                 rack->r_wanted_output++;
4686                 if (rack_use_sack_filter)
4687                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
4688                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
4689                     (sbavail(&so->so_snd) == 0) &&
4690                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
4691                         /* 
4692                          * The socket was gone and the
4693                          * peer sent data, time to
4694                          * reset him.
4695                          */
4696                         *ret_val = 1;
4697                         tp = tcp_close(tp);
4698                         rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
4699                         return (1);
4700                 }
4701         }
4702         if (ofia)
4703                 *ofia = ourfinisacked;
4704         return (0);
4705 }
4706
4707
4708 /*
4709  * Return value of 1, the TCB is unlocked and most
4710  * likely gone, return value of 0, the TCP is still
4711  * locked.
4712  */
4713 static int
4714 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
4715     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
4716     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
4717 {
4718         /*
4719          * Update window information. Don't look at window if no ACK: TAC's
4720          * send garbage on first SYN.
4721          */
4722         int32_t nsegs;
4723         int32_t tfo_syn;
4724         struct tcp_rack *rack;
4725
4726         rack = (struct tcp_rack *)tp->t_fb_ptr;
4727         INP_WLOCK_ASSERT(tp->t_inpcb);
4728         nsegs = max(1, m->m_pkthdr.lro_nsegs);
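             /*
              * Window update acceptance below: take the advertised window
              * when this segment is newer (higher seq), or same seq with a
              * newer ack, or same seq/ack with a larger window; the second
              * branch only lets the window shrink when the ack matches
              * snd_wl2.
              */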
4729         if ((thflags & TH_ACK) &&
4730             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4731             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4732             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4733                 /* keep track of pure window updates */
4734                 if (tlen == 0 &&
4735                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4736                         TCPSTAT_INC(tcps_rcvwinupd);
4737                 tp->snd_wnd = tiwin;
4738                 tp->snd_wl1 = th->th_seq;
4739                 tp->snd_wl2 = th->th_ack;
4740                 if (tp->snd_wnd > tp->max_sndwnd)
4741                         tp->max_sndwnd = tp->snd_wnd;
4742                 rack->r_wanted_output++;
4743         } else if (thflags & TH_ACK) {
4744                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
4745                         tp->snd_wnd = tiwin;
4746                         tp->snd_wl1 = th->th_seq;
4747                         tp->snd_wl2 = th->th_ack;
4748                 }
4749         }
4750         /* Was persist timer active and now we have window space? */
4751         if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
4752                 rack_exit_persist(tp, rack);
4753                 tp->snd_nxt = tp->snd_max;
4754                 /* Make sure we output to start the timer */
4755                 rack->r_wanted_output++;
4756         }
4757         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
4758                 m_freem(m);
4759                 return (0);
4760         }
4761         /*
4762          * Process segments with URG.
4763          */
4764         if ((thflags & TH_URG) && th->th_urp &&
4765             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4766                 /*
4767                  * This is a kludge, but if we receive and accept random
4768                  * urgent pointers, we'll crash in soreceive.  It's hard to
4769                  * imagine someone actually wanting to send this much urgent
4770                  * data.
4771                  */
4772                 SOCKBUF_LOCK(&so->so_rcv);
4773                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
4774                         th->th_urp = 0; /* XXX */
4775                         thflags &= ~TH_URG;     /* XXX */
4776                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
4777                         goto dodata;    /* XXX */
4778                 }
4779                 /*
4780                  * If this segment advances the known urgent pointer, then
4781                  * mark the data stream.  This should not happen in
4782                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
4783                  * FIN has been received from the remote side. In these
4784                  * states we ignore the URG.
4785                  *
4786                  * According to RFC961 (Assigned Protocols), the urgent
4787                  * pointer points to the last octet of urgent data.  We
4788                  * continue, however, to consider it to indicate the first
4789                  * octet of data past the urgent section as the original
4790                  * spec states (in one of two places).
4791                  */
4792                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
4793                         tp->rcv_up = th->th_seq + th->th_urp;
4794                         so->so_oobmark = sbavail(&so->so_rcv) +
4795                             (tp->rcv_up - tp->rcv_nxt) - 1;
4796                         if (so->so_oobmark == 0)
4797                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
4798                         sohasoutofband(so);
4799                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4800                 }
4801                 SOCKBUF_UNLOCK(&so->so_rcv);
4802                 /*
4803                  * Remove out of band data so doesn't get presented to user.
4804                  * This can happen independent of advancing the URG pointer,
4805                  * but if two URG's are pending at once, some out-of-band
4806                  * data may creep in... ick.
4807                  */
4808                 if (th->th_urp <= (uint32_t) tlen &&
4809                     !(so->so_options & SO_OOBINLINE)) {
4810                         /* hdr drop is delayed */
4811                         tcp_pulloutofband(so, th, m, drop_hdrlen);
4812                 }
4813         } else {
4814                 /*
4815                  * If no out of band data is expected, pull receive urgent
4816                  * pointer along with the receive window.
4817                  */
4818                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4819                         tp->rcv_up = tp->rcv_nxt;
4820         }
4821 dodata:                         /* XXX */
4822         INP_WLOCK_ASSERT(tp->t_inpcb);
4823
4824         /*
4825          * Process the segment text, merging it into the TCP sequencing
4826          * queue, and arranging for acknowledgment of receipt if necessary.
4827          * This process logically involves adjusting tp->rcv_wnd as data is
4828          * presented to the user (this happens in tcp_usrreq.c, case
4829          * PRU_RCVD).  If a FIN has already been received on this connection
4830          * then we just ignore the text.
4831          */
4832         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
4833                    IS_FASTOPEN(tp->t_flags));
4834         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
4835             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4836                 tcp_seq save_start = th->th_seq;
4837                 tcp_seq save_rnxt  = tp->rcv_nxt;
4838                 int     save_tlen  = tlen;
4839
4840                 m_adj(m, drop_hdrlen);  /* delayed header drop */
4841                 /*
4842                  * Insert segment which includes th into TCP reassembly
4843                  * queue with control block tp.  Set thflags to whether
4844                  * reassembly now includes a segment with FIN.  This handles
4845                  * the common case inline (segment is the next to be
4846                  * received on an established connection, and the queue is
4847                  * empty), avoiding linkage into and removal from the queue
4848                  * and repetition of various conversions. Set DELACK for
4849                  * segments received in order, but ack immediately when
4850                  * segments are out of order (so fast retransmit can work).
4851                  */
4852                 if (th->th_seq == tp->rcv_nxt &&
4853                     SEGQ_EMPTY(tp) &&
4854                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
4855                     tfo_syn)) {
4856                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
4857                                 rack_timer_cancel(tp, rack,
4858                                     rack->r_ctl.rc_rcvtime, __LINE__);
4859                                 tp->t_flags |= TF_DELACK;
4860                         } else {
4861                                 rack->r_wanted_output++;
4862                                 tp->t_flags |= TF_ACKNOW;
4863                         }
4864                         tp->rcv_nxt += tlen;
4865                         thflags = th->th_flags & TH_FIN;
4866                         TCPSTAT_ADD(tcps_rcvpack, nsegs);
4867                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
4868                         SOCKBUF_LOCK(&so->so_rcv);
4869                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
4870                                 m_freem(m);
4871                         else
4872                                 sbappendstream_locked(&so->so_rcv, m, 0);
4873                         /* NB: sorwakeup_locked() does an implicit unlock. */
4874                         sorwakeup_locked(so);
4875                 } else {
4876                         /*
4877                          * XXX: Due to the header drop above "th" is
4878                          * theoretically invalid by now.  Fortunately
4879                          * m_adj() doesn't actually free any mbufs when
4880                          * trimming from the head.
4881                          */
4882                         tcp_seq temp = save_start;
4883                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
4884                         tp->t_flags |= TF_ACKNOW;
4885                 }
4886                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
4887                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
4888                                 /*
4889                                  * DSACK actually handled in the fastpath
4890                                  * above.
4891                                  */
4892                                 tcp_update_sack_list(tp, save_start,
4893                                     save_start + save_tlen);
4894                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
4895                                 if ((tp->rcv_numsacks >= 1) &&
4896                                     (tp->sackblks[0].end == save_start)) {
4897                                         /*
4898                                          * Partial overlap, recorded at todrop
4899                                          * above.
4900                                          */
4901                                         tcp_update_sack_list(tp,
4902                                             tp->sackblks[0].start,
4903                                             tp->sackblks[0].end);
4904                                 } else {
4905                                         tcp_update_dsack_list(tp, save_start,
4906                                             save_start + save_tlen);
4907                                 }
4908                         } else if (tlen >= save_tlen) {
4909                                 /* Update of sackblks. */
4910                                 tcp_update_dsack_list(tp, save_start,
4911                                     save_start + save_tlen);
4912                         } else if (tlen > 0) {
4913                                 tcp_update_dsack_list(tp, save_start,
4914                                     save_start + tlen);
4915                         }
4916                 }
4917         } else {
4918                 m_freem(m);
4919                 thflags &= ~TH_FIN;
4920         }
4921
4922         /*
4923          * If FIN is received ACK the FIN and let the user know that the
4924          * connection is closing.
4925          */
4926         if (thflags & TH_FIN) {
4927                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4928                         socantrcvmore(so);
4929                         /*
4930                          * If connection is half-synchronized (ie NEEDSYN
4931                          * flag on) then delay ACK, so it may be piggybacked
4932                          * when SYN is sent. Otherwise, since we received a
4933                          * FIN then no more input can be expected, send ACK
4934                          * now.
4935                          */
4936                         if (tp->t_flags & TF_NEEDSYN) {
4937                                 rack_timer_cancel(tp, rack,
4938                                     rack->r_ctl.rc_rcvtime, __LINE__);
4939                                 tp->t_flags |= TF_DELACK;
4940                         } else {
4941                                 tp->t_flags |= TF_ACKNOW;
4942                         }
4943                         tp->rcv_nxt++;
4944                 }
4945                 switch (tp->t_state) {
4946
4947                         /*
4948                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
4949                          * CLOSE_WAIT state.
4950                          */
4951                 case TCPS_SYN_RECEIVED:
4952                         tp->t_starttime = ticks;
4953                         /* FALLTHROUGH */
4954                 case TCPS_ESTABLISHED:
4955                         rack_timer_cancel(tp, rack,
4956                             rack->r_ctl.rc_rcvtime, __LINE__);
4957                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
4958                         break;
4959
4960                         /*
4961                          * If still in FIN_WAIT_1 STATE FIN has not been
4962                          * acked so enter the CLOSING state.
4963                          */
4964                 case TCPS_FIN_WAIT_1:
4965                         rack_timer_cancel(tp, rack,
4966                             rack->r_ctl.rc_rcvtime, __LINE__);
4967                         tcp_state_change(tp, TCPS_CLOSING);
4968                         break;
4969
4970                         /*
4971                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
4972                          * starting the time-wait timer, turning off the
4973                          * other standard timers.
4974                          */
4975                 case TCPS_FIN_WAIT_2:
4976                         rack_timer_cancel(tp, rack,
4977                             rack->r_ctl.rc_rcvtime, __LINE__);
4978                         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
4979                         tcp_twstart(tp);
4980                         return (1);
4981                 }
4982         }
4983         /*
4984          * Return any desired output.
4985          */
4986         if ((tp->t_flags & TF_ACKNOW) ||
4987             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
4988                 rack->r_wanted_output++;
4989         }
4990         INP_WLOCK_ASSERT(tp->t_inpcb);
4991         return (0);
4992 }
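
/*
 * The SACK/DSACK decisions above lean on SEQ_LT()/SEQ_GT()-style comparisons
 * of 32-bit sequence numbers that wrap.  A minimal stand-alone sketch of that
 * serial-number arithmetic is below; the helpers are illustrative only and
 * are not referenced anywhere in this stack.
 */
static inline int
example_seq_lt(uint32_t a, uint32_t b)
{
        /* "a" precedes "b" in sequence space, modulo 2^32. */
        return ((int32_t)(a - b) < 0);
}

static inline int
example_seq_geq(uint32_t a, uint32_t b)
{
        /* "a" is at or beyond "b" in sequence space, modulo 2^32. */
        return ((int32_t)(a - b) >= 0);
}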
4993
4994 /*
4995  * Here nothing is really faster, it's just that we
4996  * have broken out the fast-data path also just like
4997  * the fast-ack.
4998  */
4999 static int
5000 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
5001     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5002     uint32_t tiwin, int32_t nxt_pkt)
5003 {
5004         int32_t nsegs;
5005         int32_t newsize = 0;    /* automatic sockbuf scaling */
5006         struct tcp_rack *rack;
5007 #ifdef TCPDEBUG
5008         /*
5009          * The size of tcp_saveipgen must be the size of the max ip header,
5010          * now IPv6.
5011          */
5012         u_char tcp_saveipgen[IP6_HDR_LEN];
5013         struct tcphdr tcp_savetcp;
5014         short ostate = 0;
5015
5016 #endif
5017         /*
5018          * If last ACK falls within this segment's sequence numbers, record
5019          * the timestamp. NOTE that the test is modified according to the
5020          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5021          */
5022         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5023                 return (0);
5024         }
5025         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5026                 return (0);
5027         }
5028         if (tiwin && tiwin != tp->snd_wnd) {
5029                 return (0);
5030         }
5031         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5032                 return (0);
5033         }
5034         if (__predict_false((to->to_flags & TOF_TS) &&
5035             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5036                 return (0);
5037         }
5038         if (__predict_false((th->th_ack != tp->snd_una))) {
5039                 return (0);
5040         }
5041         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5042                 return (0);
5043         }
5044         if ((to->to_flags & TOF_TS) != 0 &&
5045             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5046                 tp->ts_recent_age = tcp_ts_getticks();
5047                 tp->ts_recent = to->to_tsval;
5048         }
5049         rack = (struct tcp_rack *)tp->t_fb_ptr;
5050         /*
5051          * This is a pure, in-sequence data packet with nothing on the
5052          * reassembly queue and we have enough buffer space to take it.
5053          */
5054         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5055
5056
5057         /* Clean receiver SACK report if present */
5058         if (tp->rcv_numsacks)
5059                 tcp_clean_sackreport(tp);
5060
5061         TCPSTAT_INC(tcps_preddat);
5062         tp->rcv_nxt += tlen;
5063         /*
5064          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5065          */
5066         tp->snd_wl1 = th->th_seq;
5067         /*
5068          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5069          */
5070         tp->rcv_up = tp->rcv_nxt;
5071         TCPSTAT_ADD(tcps_rcvpack, nsegs);
5072         TCPSTAT_ADD(tcps_rcvbyte, tlen);
5073 #ifdef TCPDEBUG
5074         if (so->so_options & SO_DEBUG)
5075                 tcp_trace(TA_INPUT, ostate, tp,
5076                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
5077 #endif
5078         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5079
5080         /* Add data to socket buffer. */
5081         SOCKBUF_LOCK(&so->so_rcv);
5082         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5083                 m_freem(m);
5084         } else {
5085                 /*
5086                  * Set new socket buffer size. Give up when limit is
5087                  * reached.
5088                  */
5089                 if (newsize)
5090                         if (!sbreserve_locked(&so->so_rcv,
5091                             newsize, so, NULL))
5092                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5093                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5094                 sbappendstream_locked(&so->so_rcv, m, 0);
5095                 rack_calc_rwin(so, tp);
5096         }
5097         /* NB: sorwakeup_locked() does an implicit unlock. */
5098         sorwakeup_locked(so);
5099         if (DELAY_ACK(tp, tlen)) {
5100                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5101                 tp->t_flags |= TF_DELACK;
5102         } else {
5103                 tp->t_flags |= TF_ACKNOW;
5104                 rack->r_wanted_output++;
5105         }
5106         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
5107                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5108         return (1);
5109 }
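
/*
 * A condensed sketch of the admission checks at the top of
 * rack_do_fastnewdata(), expressed over plain scalars.  The helper and its
 * parameter names are hypothetical; the real path also consults the
 * timestamp option and the pending SYN/FIN flags shown above.
 */
static inline int
example_fastdata_ok(uint32_t th_seq, uint32_t rcv_nxt,
    uint32_t th_ack, uint32_t snd_una,
    uint32_t snd_nxt, uint32_t snd_max,
    uint32_t tiwin, uint32_t snd_wnd,
    int32_t tlen, int32_t rcv_space)
{
        if (th_seq != rcv_nxt)          /* must be the next in-order byte */
                return (0);
        if (snd_nxt != snd_max)         /* not while retransmitting */
                return (0);
        if (tiwin && tiwin != snd_wnd)  /* no window change allowed */
                return (0);
        if (th_ack != snd_una)          /* must be a pure ack of snd_una */
                return (0);
        if (tlen > rcv_space)           /* must fit in the receive buffer */
                return (0);
        return (1);
}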
5110
5111 /*
5112  * This subfunction attempts to optimize the fast path as much as
5113  * possible. We again allow window updates that are in sequence
5114  * to remain in the fast-path. We also add in the __predict's to
5115  * attempt to help the compiler.
5116  * Note that if we return 0, then we could *not* process the
5117  * segment and the caller should push the packet into the
5118  * slow-path.
5119  */
5120 static int
5121 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5122     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5123     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
5124 {
5125         int32_t acked;
5126         int32_t nsegs;
5127
5128 #ifdef TCPDEBUG
5129         /*
5130          * The size of tcp_saveipgen must be the size of the max ip header,
5131          * now IPv6.
5132          */
5133         u_char tcp_saveipgen[IP6_HDR_LEN];
5134         struct tcphdr tcp_savetcp;
5135         short ostate = 0;
5136
5137 #endif
5138         struct tcp_rack *rack;
5139
5140         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5141                 /* Old ack, behind (or duplicate to) the last one rcv'd */
5142                 return (0);
5143         }
5144         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
5145                 /* Above what we have sent? */
5146                 return (0);
5147         }
5148         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5149                 /* We are retransmitting */
5150                 return (0);
5151         }
5152         if (__predict_false(tiwin == 0)) {
5153                 /* zero window */
5154                 return (0);
5155         }
5156         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
5157                 /* We need a SYN or a FIN, unlikely.. */
5158                 return (0);
5159         }
5160         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
5161                 /* Timestamp is behind .. old ack with seq wrap? */
5162                 return (0);
5163         }
5164         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
5165                 /* Still recovering */
5166                 return (0);
5167         }
5168         rack = (struct tcp_rack *)tp->t_fb_ptr;
5169         if (rack->r_ctl.rc_sacked) {
5170                 /* We have sack holes on our scoreboard */
5171                 return (0);
5172         }
5173         /* Ok if we reach here, we can process a fast-ack */
5174         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5175         rack_log_ack(tp, to, th);
5176         /* Did the window get updated? */
5177         if (tiwin != tp->snd_wnd) {
5178                 tp->snd_wnd = tiwin;
5179                 tp->snd_wl1 = th->th_seq;
5180                 if (tp->snd_wnd > tp->max_sndwnd)
5181                         tp->max_sndwnd = tp->snd_wnd;
5182         }
5183         if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
5184                 rack_exit_persist(tp, rack);
5185         }
5186         /*
5187          * If last ACK falls within this segment's sequence numbers, record
5188          * the timestamp. NOTE that the test is modified according to the
5189          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5190          */
5191         if ((to->to_flags & TOF_TS) != 0 &&
5192             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5193                 tp->ts_recent_age = tcp_ts_getticks();
5194                 tp->ts_recent = to->to_tsval;
5195         }
5196         /*
5197          * This is a pure ack for outstanding data.
5198          */
5199         TCPSTAT_INC(tcps_predack);
5200
5201         /*
5202          * "bad retransmit" recovery.
5203          */
5204         if (tp->t_flags & TF_PREVVALID) {
5205                 tp->t_flags &= ~TF_PREVVALID;
5206                 if (tp->t_rxtshift == 1 &&
5207                     (int)(ticks - tp->t_badrxtwin) < 0)
5208                         rack_cong_signal(tp, th, CC_RTO_ERR);
5209         }
5210         /*
5211          * Recalculate the transmit timer / rtt.
5212          *
5213          * Some boxes send broken timestamp replies during the SYN+ACK
5214          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5215          * and blow up the retransmit timer.
5216          */
5217         acked = BYTES_THIS_ACK(tp, th);
5218
5219 #ifdef TCP_HHOOK
5220         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
5221         hhook_run_tcp_est_in(tp, th, to);
5222 #endif
5223
5224         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5225         TCPSTAT_ADD(tcps_rcvackbyte, acked);
5226         sbdrop(&so->so_snd, acked);
5227         /*
5228          * Let the congestion control algorithm update congestion control
5229          * related information. This typically means increasing the
5230          * congestion window.
5231          */
5232         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
5233
5234         tp->snd_una = th->th_ack;
5235         /*
5236          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
5237          */
5238         tp->snd_wl2 = th->th_ack;
5239         tp->t_dupacks = 0;
5240         m_freem(m);
5241         /* ND6_HINT(tp);         *//* Some progress has been made. */
5242
5243         /*
5244          * If all outstanding data are acked, stop retransmit timer,
5245          * otherwise restart timer using current (possibly backed-off)
5246          * value. If process is waiting for space, wakeup/selwakeup/signal.
5247          * If data are ready to send, let tcp_output decide between more
5248          * output or persist.
5249          */
5250 #ifdef TCPDEBUG
5251         if (so->so_options & SO_DEBUG)
5252                 tcp_trace(TA_INPUT, ostate, tp,
5253                     (void *)tcp_saveipgen,
5254                     &tcp_savetcp, 0);
5255 #endif
5256         if (tp->snd_una == tp->snd_max) {
5257                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5258                 tp->t_acktime = 0;
5259                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5260         }
5261         /* Wake up the socket if we have room to write more */
5262         sowwakeup(so);
5263         if (sbavail(&so->so_snd)) {
5264                 rack->r_wanted_output++;
5265         }
5266         return (1);
5267 }
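
/*
 * A minimal sketch of the byte accounting a pure fast ack performs above,
 * assuming BYTES_THIS_ACK() reduces to (th_ack - snd_una).  The structure
 * and helper below are hypothetical and only illustrate the arithmetic;
 * the real path also runs congestion control and cancels timers.
 */
struct example_fastack_state {
        uint32_t snd_una;       /* oldest unacknowledged sequence number */
        uint32_t snd_wl2;       /* ack used for the last window update */
        uint32_t sb_cc;         /* bytes sitting in the send buffer */
};

static inline uint32_t
example_fastack_apply(struct example_fastack_state *st, uint32_t th_ack)
{
        uint32_t acked;

        acked = th_ack - st->snd_una;   /* bytes newly acknowledged */
        st->sb_cc -= acked;             /* drop them from the send buffer */
        st->snd_una = th_ack;           /* advance the left edge */
        st->snd_wl2 = th_ack;           /* prevent seq wrap on window updates */
        return (acked);
}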
5268
5269 /*
5270  * Return value of 1, the TCB is unlocked and most
5271  * likely gone, return value of 0, the TCP is still
5272  * locked.
5273  */
5274 static int
5275 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
5276     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5277     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5278 {
5279         int32_t ret_val = 0;
5280         int32_t todrop;
5281         int32_t ourfinisacked = 0;
5282
5283         rack_calc_rwin(so, tp);
5284         /*
5285          * If the state is SYN_SENT:
5286          *   if seg contains an ACK, but not for our SYN, drop the input;
5287          *   if seg contains a RST, then drop the connection;
5288          *   if seg does not contain SYN, then drop it.
5289          * Otherwise this is an acceptable SYN segment: initialize tp->rcv_nxt
5290          * and tp->irs; if seg contains an ACK then advance tp->snd_una; if seg
5291          * contains an ECE and ECN support is enabled, the stream is ECN capable;
5292          * if SYN has been acked change to ESTABLISHED else SYN_RCVD state; arrange
5293          * for segment to be acked (eventually); continue processing rest of data/controls, beginning with URG.
5294          */
5295         if ((thflags & TH_ACK) &&
5296             (SEQ_LEQ(th->th_ack, tp->iss) ||
5297             SEQ_GT(th->th_ack, tp->snd_max))) {
5298                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5299                 return (1);
5300         }
5301         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
5302                 TCP_PROBE5(connect__refused, NULL, tp,
5303                     mtod(m, const char *), tp, th);
5304                 tp = tcp_drop(tp, ECONNREFUSED);
5305                 rack_do_drop(m, tp);
5306                 return (1);
5307         }
5308         if (thflags & TH_RST) {
5309                 rack_do_drop(m, tp);
5310                 return (1);
5311         }
5312         if (!(thflags & TH_SYN)) {
5313                 rack_do_drop(m, tp);
5314                 return (1);
5315         }
5316         tp->irs = th->th_seq;
5317         tcp_rcvseqinit(tp);
5318         if (thflags & TH_ACK) {
5319                 int tfo_partial = 0;
5320                 
5321                 TCPSTAT_INC(tcps_connects);
5322                 soisconnected(so);
5323 #ifdef MAC
5324                 mac_socketpeer_set_from_mbuf(m, so);
5325 #endif
5326                 /* Do window scaling on this connection? */
5327                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5328                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5329                         tp->rcv_scale = tp->request_r_scale;
5330                 }
5331                 tp->rcv_adv += min(tp->rcv_wnd,
5332                     TCP_MAXWIN << tp->rcv_scale);
5333                 /*
5334                  * If not all the data that was sent in the TFO SYN
5335                  * has been acked, resend the remainder right away.
5336                  */
5337                 if (IS_FASTOPEN(tp->t_flags) &&
5338                     (tp->snd_una != tp->snd_max)) {
5339                         tp->snd_nxt = th->th_ack;
5340                         tfo_partial = 1;
5341                 }
5342                 /*
5343                  * If there's data, delay ACK; if there's also a FIN ACKNOW
5344                  * will be turned on later.
5345                  */
5346                 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
5347                         rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
5348                                           ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
5349                         tp->t_flags |= TF_DELACK;
5350                 } else {
5351                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
5352                         tp->t_flags |= TF_ACKNOW;
5353                 }
5354
5355                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
5356                     V_tcp_do_ecn) {
5357                         tp->t_flags |= TF_ECN_PERMIT;
5358                         TCPSTAT_INC(tcps_ecn_shs);
5359                 }
5360                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
5361                         /* 
5362                          * We advance snd_una for the 
5363                          * fast open case. If th_ack is
5364                          * acknowledging data beyond 
5365                          * snd_una we can't just call
5366                          * ack-processing since the 
5367                          * data stream in our send-map
5368                          * will start at snd_una + 1 (one
5369                          * beyond the SYN). If it's just
5370                          * equal we don't need to do that
5371                          * and there is no send_map.
5372                          */
5373                         tp->snd_una++;
5374                 }
5375                 /*
5376                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
5377                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
5378                  */
5379                 tp->t_starttime = ticks;
5380                 if (tp->t_flags & TF_NEEDFIN) {
5381                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
5382                         tp->t_flags &= ~TF_NEEDFIN;
5383                         thflags &= ~TH_SYN;
5384                 } else {
5385                         tcp_state_change(tp, TCPS_ESTABLISHED);
5386                         TCP_PROBE5(connect__established, NULL, tp,
5387                             mtod(m, const char *), tp, th);
5388                         cc_conn_init(tp);
5389                 }
5390         } else {
5391                 /*
5392                  * Received initial SYN in SYN-SENT[*] state => simultaneous
5393                  * open.  If segment contains CC option and there is a
5394                          * cached CC, apply TAO test. If it succeeds, connection is
5395                          * half-synchronized. Otherwise, do 3-way handshake:
5396                          * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
5397                  * there was no CC option, clear cached CC value.
5398                  */
5399                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
5400                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
5401         }
5402         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5403         INP_WLOCK_ASSERT(tp->t_inpcb);
5404         /*
5405          * Advance th->th_seq to correspond to first data byte. If data,
5406          * trim to stay within window, dropping FIN if necessary.
5407          */
5408         th->th_seq++;
5409         if (tlen > tp->rcv_wnd) {
5410                 todrop = tlen - tp->rcv_wnd;
5411                 m_adj(m, -todrop);
5412                 tlen = tp->rcv_wnd;
5413                 thflags &= ~TH_FIN;
5414                 TCPSTAT_INC(tcps_rcvpackafterwin);
5415                 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
5416         }
5417         tp->snd_wl1 = th->th_seq - 1;
5418         tp->rcv_up = th->th_seq;
5419         /*
5420          * Client side of transaction: already sent SYN and data. If the
5421          * remote host used T/TCP to validate the SYN, our data will be
5422          * ACK'd; if so, enter normal data segment processing in the middle
5423          * of step 5, ack processing. Otherwise, goto step 6.
5424          */
5425         if (thflags & TH_ACK) {
5426                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
5427                         return (ret_val);
5428                 /* We may have changed to FIN_WAIT_1 above */
5429                 if (tp->t_state == TCPS_FIN_WAIT_1) {
5430                         /*
5431                          * In FIN_WAIT_1 STATE in addition to the processing
5432                          * for the ESTABLISHED state if our FIN is now
5433                          * acknowledged then enter FIN_WAIT_2.
5434                          */
5435                         if (ourfinisacked) {
5436                                 /*
5437                                  * If we can't receive any more data, then
5438                                  * closing user can proceed. Starting the
5439                                  * timer is contrary to the specification,
5440                                  * but if we don't get a FIN we'll hang
5441                                  * forever.
5442                                  *
5443                                  * XXXjl: we should release the tp also, and
5444                                  * use a compressed state.
5445                                  */
5446                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5447                                         soisdisconnected(so);
5448                                         tcp_timer_activate(tp, TT_2MSL,
5449                                             (tcp_fast_finwait2_recycle ?
5450                                             tcp_finwait2_timeout :
5451                                             TP_MAXIDLE(tp)));
5452                                 }
5453                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5454                         }
5455                 }
5456         }
5457         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5458            tiwin, thflags, nxt_pkt));
5459 }
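
/*
 * A small sketch of the ECN handshake test applied to the SYN|ACK above,
 * assuming the standard TH_ECE/TH_CWR bit semantics from RFC 3168.  The
 * helper is hypothetical and only mirrors the TF_ECN_PERMIT condition.
 */
static inline int
example_synack_ecn_ok(int thflags)
{
        /*
         * On the SYN|ACK, ECE set without CWR means the peer agreed to
         * ECN for this connection.
         */
        return ((thflags & (TH_CWR | TH_ECE)) == TH_ECE);
}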
5460
5461 /*
5462  * Return value of 1, the TCB is unlocked and most
5463  * likely gone, return value of 0, the TCP is still
5464  * locked.
5465  */
5466 static int
5467 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
5468     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5469     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5470 {
5471         int32_t ret_val = 0;
5472         int32_t ourfinisacked = 0;
5473
5474         rack_calc_rwin(so, tp);
5475
5476         if ((thflags & TH_ACK) &&
5477             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
5478             SEQ_GT(th->th_ack, tp->snd_max))) {
5479                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5480                 return (1);
5481         }
5482         if (IS_FASTOPEN(tp->t_flags)) {
5483                 /*
5484                  * When a TFO connection is in SYN_RECEIVED, the
5485                  * only valid packets are the initial SYN, a
5486                  * retransmit/copy of the initial SYN (possibly with
5487                  * a subset of the original data), a valid ACK, a
5488                  * FIN, or a RST.
5489                  */
5490                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
5491                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5492                         return (1);
5493                 } else if (thflags & TH_SYN) {
5494                         /* non-initial SYN is ignored */
5495                         struct tcp_rack *rack;
5496
5497                         rack = (struct tcp_rack *)tp->t_fb_ptr;
5498                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
5499                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
5500                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
5501                                 rack_do_drop(m, NULL);
5502                                 return (0);
5503                         }
5504                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
5505                         rack_do_drop(m, NULL);
5506                         return (0);
5507                 }
5508         }
5509         if (thflags & TH_RST)
5510                 return (rack_process_rst(m, th, so, tp));
5511         /*
5512          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5513          * it's less than ts_recent, drop it.
5514          */
5515         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5516             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5517                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5518                         return (ret_val);
5519         }
5520         /*
5521          * In the SYN-RECEIVED state, validate that the packet belongs to
5522          * this connection before trimming the data to fit the receive
5523          * window.  Check the sequence number versus IRS since we know the
5524          * sequence numbers haven't wrapped.  This is a partial fix for the
5525          * "LAND" DoS attack.
5526          */
5527         if (SEQ_LT(th->th_seq, tp->irs)) {
5528                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5529                 return (1);
5530         }
5531         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5532                 return (ret_val);
5533         }
5534         /*
5535          * If last ACK falls within this segment's sequence numbers, record
5536          * its timestamp. NOTE: 1) That the test incorporates suggestions
5537          * from the latest proposal of the tcplw@cray.com list (Braden
5538          * 1993/04/26). 2) That updating only on newer timestamps interferes
5539          * with our earlier PAWS tests, so this check should be solely
5540          * predicated on the sequence space of this segment. 3) That we
5541          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5542          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5543          * SEG.Len, This modified check allows us to overcome RFC1323's
5544          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5545          * p.869. In such cases, we can still calculate the RTT correctly
5546          * when RCV.NXT == Last.ACK.Sent.
5547          */
5548         if ((to->to_flags & TOF_TS) != 0 &&
5549             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5550             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5551             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5552                 tp->ts_recent_age = tcp_ts_getticks();
5553                 tp->ts_recent = to->to_tsval;
5554         }
5555         tp->snd_wnd = tiwin;
5556         /*
5557          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5558          * is on (half-synchronized state), then queue data for later
5559          * processing; else drop segment and return.
5560          */
5561         if ((thflags & TH_ACK) == 0) {
5562                 if (IS_FASTOPEN(tp->t_flags)) {
5563                         cc_conn_init(tp);
5564                 }
5565                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5566                     tiwin, thflags, nxt_pkt));
5567         }
5568         TCPSTAT_INC(tcps_connects);
5569         soisconnected(so);
5570         /* Do window scaling? */
5571         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5572             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5573                 tp->rcv_scale = tp->request_r_scale;
5574         }
5575         /*
5576          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
5577          * FIN-WAIT-1
5578          */
5579         tp->t_starttime = ticks;
5580         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
5581                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
5582                 tp->t_tfo_pending = NULL;
5583
5584                 /*
5585                  * Account for the ACK of our SYN prior to
5586                  * regular ACK processing below.
5587                  */ 
5588                 tp->snd_una++;
5589         }
5590         if (tp->t_flags & TF_NEEDFIN) {
5591                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
5592                 tp->t_flags &= ~TF_NEEDFIN;
5593         } else {
5594                 tcp_state_change(tp, TCPS_ESTABLISHED);
5595                 TCP_PROBE5(accept__established, NULL, tp,
5596                     mtod(m, const char *), tp, th);
5597                 /*
5598                  * TFO connections call cc_conn_init() during SYN
5599                  * processing.  Calling it again here for such connections
5600                  * is not harmless as it would undo the snd_cwnd reduction
5601                  * that occurs when a TFO SYN|ACK is retransmitted.
5602                  */
5603                 if (!IS_FASTOPEN(tp->t_flags))
5604                         cc_conn_init(tp);
5605         }
5606         /*
5607          * If segment contains data or ACK, will call tcp_reass() later; if
5608          * not, do so now to pass queued data to user.
5609          */
5610         if (tlen == 0 && (thflags & TH_FIN) == 0)
5611                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
5612                     (struct mbuf *)0);
5613         tp->snd_wl1 = th->th_seq - 1;
5614         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5615                 return (ret_val);
5616         }
5617         if (tp->t_state == TCPS_FIN_WAIT_1) {
5618                 /* We could have gone to FIN_WAIT_1 (or EST) above */
5619                 /*
5620                  * In FIN_WAIT_1 STATE in addition to the processing for the
5621                  * ESTABLISHED state if our FIN is now acknowledged then
5622                  * enter FIN_WAIT_2.
5623                  */
5624                 if (ourfinisacked) {
5625                         /*
5626                          * If we can't receive any more data, then closing
5627                          * user can proceed. Starting the timer is contrary
5628                          * to the specification, but if we don't get a FIN
5629                          * we'll hang forever.
5630                          *
5631                          * XXXjl: we should release the tp also, and use a
5632                          * compressed state.
5633                          */
5634                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5635                                 soisdisconnected(so);
5636                                 tcp_timer_activate(tp, TT_2MSL,
5637                                     (tcp_fast_finwait2_recycle ?
5638                                     tcp_finwait2_timeout :
5639                                     TP_MAXIDLE(tp)));
5640                         }
5641                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
5642                 }
5643         }
5644         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5645             tiwin, thflags, nxt_pkt));
5646 }
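
/*
 * The long comment above describes the modified RFC 1323 window
 * Last.ACK.Sent <= SEG.SEQ + SEG.LEN inside which ts_recent is updated.
 * A compact predicate sketch follows, assuming SEQ_LEQ-style serial
 * comparison; the helper is illustrative only and unused by this stack.
 */
static inline int
example_should_record_tsval(uint32_t last_ack_sent, uint32_t seg_seq,
    int32_t seg_len, int syn_or_fin)
{
        /*
         * Record ts_recent only when our last ACK falls inside
         * [SEG.SEQ, SEG.SEQ + SEG.LEN], counting a SYN or FIN as one byte.
         */
        return ((int32_t)(seg_seq - last_ack_sent) <= 0 &&
            (int32_t)(last_ack_sent -
            (seg_seq + seg_len + (syn_or_fin ? 1 : 0))) <= 0);
}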
5647
5648 /*
5649  * Return value of 1, the TCB is unlocked and most
5650  * likely gone, return value of 0, the TCP is still
5651  * locked.
5652  */
5653 static int
5654 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5655     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5656     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5657 {
5658         int32_t ret_val = 0;
5659
5660         /*
5661          * Header prediction: check for the two common cases of a
5662          * uni-directional data xfer.  If the packet has no control flags,
5663          * is in-sequence, the window didn't change and we're not
5664          * retransmitting, it's a candidate.  If the length is zero and the
5665          * ack moved forward, we're the sender side of the xfer.  Just free
5666          * the data acked & wake any higher level process that was blocked
5667          * waiting for space.  If the length is non-zero and the ack didn't
5668          * move, we're the receiver side.  If we're getting packets in-order
5669          * (the reassembly queue is empty), add the data to the socket
5670          * buffer and note that we need a delayed ack. Make sure that the
5671          * hidden state-flags are also off. Since we check for
5672          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
5673          */
5674         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
5675             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
5676             __predict_true(SEGQ_EMPTY(tp)) &&
5677             __predict_true(th->th_seq == tp->rcv_nxt)) {
5678                 struct tcp_rack *rack;
5679
5680                 rack = (struct tcp_rack *)tp->t_fb_ptr;
5681                 if (tlen == 0) {
5682                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
5683                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
5684                                 return (0);
5685                         }
5686                 } else {
5687                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
5688                             tiwin, nxt_pkt)) {
5689                                 return (0);
5690                         }
5691                 }
5692         }
5693         rack_calc_rwin(so, tp);
5694
5695         if (thflags & TH_RST)
5696                 return (rack_process_rst(m, th, so, tp));
5697
5698         /*
5699          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5700          * synchronized state.
5701          */
5702         if (thflags & TH_SYN) {
5703                 rack_challenge_ack(m, th, tp, &ret_val);
5704                 return (ret_val);
5705         }
5706         /*
5707          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5708          * it's less than ts_recent, drop it.
5709          */
5710         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5711             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5712                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5713                         return (ret_val);
5714         }
5715         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5716                 return (ret_val);
5717         }
5718         /*
5719          * If last ACK falls within this segment's sequence numbers, record
5720          * its timestamp. NOTE: 1) That the test incorporates suggestions
5721          * from the latest proposal of the tcplw@cray.com list (Braden
5722          * 1993/04/26). 2) That updating only on newer timestamps interferes
5723          * with our earlier PAWS tests, so this check should be solely
5724          * predicated on the sequence space of this segment. 3) That we
5725          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5726          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5727          * SEG.Len, This modified check allows us to overcome RFC1323's
5728          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5729          * p.869. In such cases, we can still calculate the RTT correctly
5730          * when RCV.NXT == Last.ACK.Sent.
5731          */
5732         if ((to->to_flags & TOF_TS) != 0 &&
5733             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5734             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5735             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5736                 tp->ts_recent_age = tcp_ts_getticks();
5737                 tp->ts_recent = to->to_tsval;
5738         }
5739         /*
5740          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5741          * is on (half-synchronized state), then queue data for later
5742          * processing; else drop segment and return.
5743          */
5744         if ((thflags & TH_ACK) == 0) {
5745                 if (tp->t_flags & TF_NEEDSYN) {
5746
5747                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5748                             tiwin, thflags, nxt_pkt));
5749
5750                 } else if (tp->t_flags & TF_ACKNOW) {
5751                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5752                         return (ret_val);
5753                 } else {
5754                         rack_do_drop(m, NULL);
5755                         return (0);
5756                 }
5757         }
5758         /*
5759          * Ack processing.
5760          */
5761         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5762                 return (ret_val);
5763         }
5764         if (sbavail(&so->so_snd)) {
5765                 if (rack_progress_timeout_check(tp)) {
5766                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5767                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5768                         return (1);
5769                 }
5770         }
5771         /* State changes only happen in rack_process_data() */
5772         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5773             tiwin, thflags, nxt_pkt));
5774 }
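
/*
 * Header prediction above requires that, of the control flags, only ACK be
 * set (and that no SACK option is present).  A tiny sketch of that flag
 * mask; the helper is hypothetical and mirrors the test used above.
 */
static inline int
example_hdr_predict_flags_ok(int thflags)
{
        /*
         * Of SYN/FIN/RST/URG/ACK, only ACK may be set for the segment to
         * be a header-prediction candidate.
         */
        return ((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) ==
            TH_ACK);
}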
5775
5776 /*
5777  * Return value of 1, the TCB is unlocked and most
5778  * likely gone, return value of 0, the TCP is still
5779  * locked.
5780  */
5781 static int
5782 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
5783     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5784     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5785 {
5786         int32_t ret_val = 0;
5787
5788         rack_calc_rwin(so, tp);
5789         if (thflags & TH_RST)
5790                 return (rack_process_rst(m, th, so, tp));
5791         /*
5792          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5793          * synchronized state.
5794          */
5795         if (thflags & TH_SYN) {
5796                 rack_challenge_ack(m, th, tp, &ret_val);
5797                 return (ret_val);
5798         }
5799         /*
5800          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5801          * it's less than ts_recent, drop it.
5802          */
5803         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5804             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5805                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5806                         return (ret_val);
5807         }
5808         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5809                 return (ret_val);
5810         }
5811         /*
5812          * If last ACK falls within this segment's sequence numbers, record
5813          * its timestamp. NOTE: 1) That the test incorporates suggestions
5814          * from the latest proposal of the tcplw@cray.com list (Braden
5815          * 1993/04/26). 2) That updating only on newer timestamps interferes
5816          * with our earlier PAWS tests, so this check should be solely
5817          * predicated on the sequence space of this segment. 3) That we
5818          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5819          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5820          * SEG.Len, This modified check allows us to overcome RFC1323's
5821          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5822          * p.869. In such cases, we can still calculate the RTT correctly
5823          * when RCV.NXT == Last.ACK.Sent.
5824          */
5825         if ((to->to_flags & TOF_TS) != 0 &&
5826             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5827             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5828             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5829                 tp->ts_recent_age = tcp_ts_getticks();
5830                 tp->ts_recent = to->to_tsval;
5831         }
5832         /*
5833          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5834          * is on (half-synchronized state), then queue data for later
5835          * processing; else drop segment and return.
5836          */
5837         if ((thflags & TH_ACK) == 0) {
5838                 if (tp->t_flags & TF_NEEDSYN) {
5839                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5840                             tiwin, thflags, nxt_pkt));
5841
5842                 } else if (tp->t_flags & TF_ACKNOW) {
5843                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5844                         return (ret_val);
5845                 } else {
5846                         rack_do_drop(m, NULL);
5847                         return (0);
5848                 }
5849         }
5850         /*
5851          * Ack processing.
5852          */
5853         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5854                 return (ret_val);
5855         }
5856         if (sbavail(&so->so_snd)) {
5857                 if (rack_progress_timeout_check(tp)) {
5858                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5859                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5860                         return (1);
5861                 }
5862         }
5863         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5864             tiwin, thflags, nxt_pkt));
5865 }
5866
5867 static int
5868 rack_check_data_after_close(struct mbuf *m, 
5869     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
5870 {
5871         struct tcp_rack *rack;
5872
5873         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5874         rack = (struct tcp_rack *)tp->t_fb_ptr;
5875         if (rack->rc_allow_data_af_clo == 0) {
5876         close_now:
5877                 tp = tcp_close(tp);
5878                 TCPSTAT_INC(tcps_rcvafterclose);
5879                 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
5880                 return (1);
5881         }
5882         if (sbavail(&so->so_snd) == 0)
5883                 goto close_now;
5884         /* Ok we allow data that is ignored and a followup reset */
5885         tp->rcv_nxt = th->th_seq + *tlen;
5886         tp->t_flags2 |= TF2_DROP_AF_DATA;
5887         rack->r_wanted_output = 1;
5888         *tlen = 0;
5889         return (0);
5890 }
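
/*
 * A compact sketch of the decision rack_check_data_after_close() makes:
 * reset the peer immediately unless late data is tolerated and we still
 * have queued data of our own to send.  The helper is hypothetical and
 * returns a boolean rather than performing the side effects above.
 */
static inline int
example_reset_on_data_after_close(int allow_data_af_clo, int snd_bytes_queued)
{
        if (allow_data_af_clo == 0)
                return (1);     /* late data not tolerated: reset now */
        if (snd_bytes_queued == 0)
                return (1);     /* nothing left to send: reset now */
        return (0);             /* swallow the data, reset later */
}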
5891
5892 /*
5893  * Return value of 1, the TCB is unlocked and most
5894  * likely gone, return value of 0, the TCP is still
5895  * locked.
5896  */
5897 static int
5898 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
5899     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5900     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5901 {
5902         int32_t ret_val = 0;
5903         int32_t ourfinisacked = 0;
5904
5905         rack_calc_rwin(so, tp);
5906
5907         if (thflags & TH_RST)
5908                 return (rack_process_rst(m, th, so, tp));
5909         /*
5910          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5911          * synchronized state.
5912          */
5913         if (thflags & TH_SYN) {
5914                 rack_challenge_ack(m, th, tp, &ret_val);
5915                 return (ret_val);
5916         }
5917         /*
5918          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5919          * it's less than ts_recent, drop it.
5920          */
5921         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5922             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5923                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5924                         return (ret_val);
5925         }
5926         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5927                 return (ret_val);
5928         }
5929         /*
5930          * If new data are received on a connection after the user processes
5931          * are gone, then RST the other end.
5932          */
5933         if ((so->so_state & SS_NOFDREF) && tlen) {
5934                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
5935                         return (1);
5936         }
5937         /*
5938          * If last ACK falls within this segment's sequence numbers, record
5939          * its timestamp. NOTE: 1) That the test incorporates suggestions
5940          * from the latest proposal of the tcplw@cray.com list (Braden
5941          * 1993/04/26). 2) That updating only on newer timestamps interferes
5942          * with our earlier PAWS tests, so this check should be solely
5943          * predicated on the sequence space of this segment. 3) That we
5944          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5945          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5946          * SEG.Len, This modified check allows us to overcome RFC1323's
5947          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5948          * p.869. In such cases, we can still calculate the RTT correctly
5949          * when RCV.NXT == Last.ACK.Sent.
5950          */
5951         if ((to->to_flags & TOF_TS) != 0 &&
5952             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5953             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5954             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5955                 tp->ts_recent_age = tcp_ts_getticks();
5956                 tp->ts_recent = to->to_tsval;
5957         }
5958         /*
5959          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5960          * is on (half-synchronized state), then queue data for later
5961          * processing; else drop segment and return.
5962          */
5963         if ((thflags & TH_ACK) == 0) {
5964                 if (tp->t_flags & TF_NEEDSYN) {
5965                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5966                             tiwin, thflags, nxt_pkt));
5967                 } else if (tp->t_flags & TF_ACKNOW) {
5968                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5969                         return (ret_val);
5970                 } else {
5971                         rack_do_drop(m, NULL);
5972                         return (0);
5973                 }
5974         }
5975         /*
5976          * Ack processing.
5977          */
5978         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5979                 return (ret_val);
5980         }
5981         if (ourfinisacked) {
5982                 /*
5983                  * If we can't receive any more data, then closing user can
5984                  * proceed. Starting the timer is contrary to the
5985                  * specification, but if we don't get a FIN we'll hang
5986                  * forever.
5987                  *
5988                  * XXXjl: we should release the tp also, and use a
5989                  * compressed state.
5990                  */
5991                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5992                         soisdisconnected(so);
5993                         tcp_timer_activate(tp, TT_2MSL,
5994                             (tcp_fast_finwait2_recycle ?
5995                             tcp_finwait2_timeout :
5996                             TP_MAXIDLE(tp)));
5997                 }
5998                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5999         }
6000         if (sbavail(&so->so_snd)) {
6001                 if (rack_progress_timeout_check(tp)) {
6002                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6003                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6004                         return (1);
6005                 }
6006         }
6007         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6008             tiwin, thflags, nxt_pkt));
6009 }
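
/*
 * When our FIN is acked and the receive side is already shut down, the code
 * above arms the 2MSL timer with either the short finwait2 timeout or the
 * idle maximum.  A small sketch of that selection, assuming tick-based
 * timeout values; the helper is illustrative only.
 */
static inline int
example_finwait2_timeout(int fast_recycle, int finwait2_timeout, int maxidle)
{
        /* Recycle FIN_WAIT_2 quickly when enabled, else use the idle max. */
        return (fast_recycle ? finwait2_timeout : maxidle);
}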
6010
6011 /*
6012  * Return value of 1, the TCB is unlocked and most
6013  * likely gone, return value of 0, the TCP is still
6014  * locked.
6015  */
6016 static int
6017 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
6018     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6019     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6020 {
6021         int32_t ret_val = 0;
6022         int32_t ourfinisacked = 0;
6023
6024         rack_calc_rwin(so, tp);
6025
6026         if (thflags & TH_RST)
6027                 return (rack_process_rst(m, th, so, tp));
6028         /*
6029          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6030          * synchronized state.
6031          */
6032         if (thflags & TH_SYN) {
6033                 rack_challenge_ack(m, th, tp, &ret_val);
6034                 return (ret_val);
6035         }
6036         /*
6037          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6038          * it's less than ts_recent, drop it.
6039          */
6040         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6041             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6042                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6043                         return (ret_val);
6044         }
6045         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6046                 return (ret_val);
6047         }
6048         /*
6049          * If new data are received on a connection after the user processes
6050          * are gone, then RST the other end.
6051          */
6052         if ((so->so_state & SS_NOFDREF) && tlen) {
6053                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6054                         return (1);
6055         }
6056         /*
6057          * If last ACK falls within this segment's sequence numbers, record
6058          * its timestamp. NOTE: 1) That the test incorporates suggestions
6059          * from the latest proposal of the tcplw@cray.com list (Braden
6060          * 1993/04/26). 2) That updating only on newer timestamps interferes
6061          * with our earlier PAWS tests, so this check should be solely
6062          * predicated on the sequence space of this segment. 3) That we
6063          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6064          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6065          * SEG.Len, This modified check allows us to overcome RFC1323's
6066          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6067          * p.869. In such cases, we can still calculate the RTT correctly
6068          * when RCV.NXT == Last.ACK.Sent.
6069          */
6070         if ((to->to_flags & TOF_TS) != 0 &&
6071             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6072             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6073             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6074                 tp->ts_recent_age = tcp_ts_getticks();
6075                 tp->ts_recent = to->to_tsval;
6076         }
6077         /*
6078          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6079          * is on (half-synchronized state), then queue data for later
6080          * processing; else drop segment and return.
6081          */
6082         if ((thflags & TH_ACK) == 0) {
6083                 if (tp->t_flags & TF_NEEDSYN) {
6084                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6085                             tiwin, thflags, nxt_pkt));
6086                 } else if (tp->t_flags & TF_ACKNOW) {
6087                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6088                         return (ret_val);
6089                 } else {
6090                         rack_do_drop(m, NULL);
6091                         return (0);
6092                 }
6093         }
6094         /*
6095          * Ack processing.
6096          */
6097         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6098                 return (ret_val);
6099         }
6100         if (ourfinisacked) {
6101                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6102                 tcp_twstart(tp);
6103                 m_freem(m);
6104                 return (1);
6105         }
6106         if (sbavail(&so->so_snd)) {
6107                 if (rack_progress_timeout_check(tp)) {
6108                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6109                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6110                         return (1);
6111                 }
6112         }
6113         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6114             tiwin, thflags, nxt_pkt));
6115 }
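
/*
 * The PAWS test repeated in each handler above compares 32-bit timestamps
 * that wrap, just like sequence numbers.  A minimal sketch of the rejection
 * condition, assuming ts_recent of zero means "no timestamp recorded yet";
 * the helper is hypothetical and unused by this stack.
 */
static inline int
example_paws_reject(int has_ts_opt, uint32_t tsval, uint32_t ts_recent)
{
        /*
         * When a timestamp option is present and its value is older than
         * ts_recent, the segment is a candidate for rejection.
         */
        return (has_ts_opt && ts_recent != 0 &&
            (int32_t)(tsval - ts_recent) < 0);
}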
6116
6117 /*
6118  * Return value of 1, the TCB is unlocked and most
6119  * likely gone, return value of 0, the TCP is still
6120  * locked.
6121  */
6122 static int
6123 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6124     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6125     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6126 {
6127         int32_t ret_val = 0;
6128         int32_t ourfinisacked = 0;
6129
6130         rack_calc_rwin(so, tp);
6131
6132         if (thflags & TH_RST)
6133                 return (rack_process_rst(m, th, so, tp));
6134         /*
6135          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6136          * synchronized state.
6137          */
6138         if (thflags & TH_SYN) {
6139                 rack_challenge_ack(m, th, tp, &ret_val);
6140                 return (ret_val);
6141         }
6142         /*
6143          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6144          * it's less than ts_recent, drop it.
6145          */
6146         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6147             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6148                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6149                         return (ret_val);
6150         }
6151         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6152                 return (ret_val);
6153         }
6154         /*
6155          * If new data are received on a connection after the user processes
6156          * are gone, then RST the other end.
6157          */
6158         if ((so->so_state & SS_NOFDREF) && tlen) {
6159                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6160                         return (1);
6161         }
6162         /*
6163          * If last ACK falls within this segment's sequence numbers, record
6164          * its timestamp. NOTE: 1) That the test incorporates suggestions
6165          * from the latest proposal of the tcplw@cray.com list (Braden
6166          * 1993/04/26). 2) That updating only on newer timestamps interferes
6167          * with our earlier PAWS tests, so this check should be solely
6168          * predicated on the sequence space of this segment. 3) That we
6169          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6170          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6171          * SEG.Len, This modified check allows us to overcome RFC1323's
6172          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6173          * p.869. In such cases, we can still calculate the RTT correctly
6174          * when RCV.NXT == Last.ACK.Sent.
6175          */
6176         if ((to->to_flags & TOF_TS) != 0 &&
6177             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6178             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6179             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6180                 tp->ts_recent_age = tcp_ts_getticks();
6181                 tp->ts_recent = to->to_tsval;
6182         }
6183         /*
6184          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6185          * is on (half-synchronized state), then queue data for later
6186          * processing; else drop segment and return.
6187          */
6188         if ((thflags & TH_ACK) == 0) {
6189                 if (tp->t_flags & TF_NEEDSYN) {
6190                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6191                             tiwin, thflags, nxt_pkt));
6192                 } else if (tp->t_flags & TF_ACKNOW) {
6193                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6194                         return (ret_val);
6195                 } else {
6196                         rack_do_drop(m, NULL);
6197                         return (0);
6198                 }
6199         }
6200         /*
6201          * case TCPS_LAST_ACK: Ack processing.
6202          */
6203         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6204                 return (ret_val);
6205         }
6206         if (ourfinisacked) {
6207                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6208                 tp = tcp_close(tp);
6209                 rack_do_drop(m, tp);
6210                 return (1);
6211         }
6212         if (sbavail(&so->so_snd)) {
6213                 if (rack_progress_timeout_check(tp)) {
6214                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6215                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6216                         return (1);
6217                 }
6218         }
6219         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6220             tiwin, thflags, nxt_pkt));
6221 }
6222
6223
6224 /*
6225  * A return value of 1 means the TCB is unlocked and most
6226  * likely gone; a return value of 0 means the TCB is still
6227  * locked.
6228  */
6229 static int
6230 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
6231     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6232     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6233 {
6234         int32_t ret_val = 0;
6235         int32_t ourfinisacked = 0;
6236
6237         rack_calc_rwin(so, tp);
6238
6239         /* Reset receive buffer auto scaling when not in bulk receive mode. */
6240         if (thflags & TH_RST)
6241                 return (rack_process_rst(m, th, so, tp));
6242         /*
6243          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6244          * synchronized state.
6245          */
6246         if (thflags & TH_SYN) {
6247                 rack_challenge_ack(m, th, tp, &ret_val);
6248                 return (ret_val);
6249         }
6250         /*
6251          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6252          * it's less than ts_recent, drop it.
6253          */
6254         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6255             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6256                 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6257                         return (ret_val);
6258         }
6259         if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6260                 return (ret_val);
6261         }
6262         /*
6263          * If new data are received on a connection after the user processes
6264          * are gone, then RST the other end.
6265          */
6266         if ((so->so_state & SS_NOFDREF) &&
6267             tlen) {
6268                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6269                         return (1);
6270         }
6271         /*
6272          * If last ACK falls within this segment's sequence numbers, record
6273          * its timestamp. NOTE: 1) That the test incorporates suggestions
6274          * from the latest proposal of the tcplw@cray.com list (Braden
6275          * 1993/04/26). 2) That updating only on newer timestamps interferes
6276          * with our earlier PAWS tests, so this check should be solely
6277          * predicated on the sequence space of this segment. 3) That we
6278          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6279          * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6280          * SEG.Len. This modified check allows us to overcome RFC1323's
6281          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6282          * p.869. In such cases, we can still calculate the RTT correctly
6283          * when RCV.NXT == Last.ACK.Sent.
6284          */
6285         if ((to->to_flags & TOF_TS) != 0 &&
6286             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6287             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6288             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6289                 tp->ts_recent_age = tcp_ts_getticks();
6290                 tp->ts_recent = to->to_tsval;
6291         }
6292         /*
6293          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6294          * is on (half-synchronized state), then queue data for later
6295          * processing; else drop segment and return.
6296          */
6297         if ((thflags & TH_ACK) == 0) {
6298                 if (tp->t_flags & TF_NEEDSYN) {
6299                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6300                             tiwin, thflags, nxt_pkt));
6301                 } else if (tp->t_flags & TF_ACKNOW) {
6302                         rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6303                         return (ret_val);
6304                 } else {
6305                         rack_do_drop(m, NULL);
6306                         return (0);
6307                 }
6308         }
6309         /*
6310          * Ack processing.
6311          */
6312         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6313                 return (ret_val);
6314         }
6315         if (sbavail(&so->so_snd)) {
6316                 if (rack_progress_timeout_check(tp)) {
6317                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6318                         rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6319                         return (1);
6320                 }
6321         }
6322         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6323             tiwin, thflags, nxt_pkt));
6324 }
6325
6326
6327 static void inline
6328 rack_clear_rate_sample(struct tcp_rack *rack)
6329 {
6330         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6331         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6332         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6333 }
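/*
 * Illustrative sketch only (not part of this stack; any name other than the
 * rs_* fields above is hypothetical): the sample cleared above is normally
 * refilled in the opposite direction by folding individual RTT measurements
 * into the running total, roughly like this:
 *
 *	rack->r_ctl.rack_rs.rs_rtt_tot += measured_rtt;
 *	rack->r_ctl.rack_rs.rs_rtt_cnt++;
 *
 * so that an average RTT for the sampling window can later be derived as
 * rs_rtt_tot / rs_rtt_cnt once rs_flags no longer carries RACK_RTT_EMPTY.
 */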
6334
6335 static int
6336 rack_init(struct tcpcb *tp)
6337 {
6338         struct tcp_rack *rack = NULL;
6339
6340         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6341         if (tp->t_fb_ptr == NULL) {
6342                 /*
6343                  * We need to allocate memory but can't. The INP and INP_INFO
6344                  * locks are held and they are recursive (this happens during
6345                  * setup), so a scheme to drop the locks fails :(
6346                  *
6347                  */
6348                 return (ENOMEM);
6349         }
6350         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6351
6352         rack = (struct tcp_rack *)tp->t_fb_ptr;
6353         TAILQ_INIT(&rack->r_ctl.rc_map);
6354         TAILQ_INIT(&rack->r_ctl.rc_free);
6355         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6356         rack->rc_tp = tp;
6357         if (tp->t_inpcb) {
6358                 rack->rc_inp = tp->t_inpcb;
6359         }
6360         /* Probably not needed but let's be sure */
6361         rack_clear_rate_sample(rack);
6362         rack->r_cpu = 0;
6363         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6364         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6365         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6366         rack->rc_pace_reduce = rack_slot_reduction;
6367         if (V_tcp_delack_enabled)
6368                 tp->t_delayed_ack = 1;
6369         else
6370                 tp->t_delayed_ack = 0;
6371         rack->rc_pace_max_segs = rack_hptsi_segments;
6372         rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6373         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6374         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6375         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6376         rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
6377         rack->r_enforce_min_pace = rack_min_pace_time;
6378         rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6379         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6380         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6381         rack->r_ctl.rc_early_recovery = rack_early_recovery;
6382         rack->rc_always_pace = rack_pace_every_seg;
6383         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6384         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6385         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6386         rack->r_ctl.rc_min_to = rack_min_to;
6387         rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6388         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6389         if (tp->snd_una != tp->snd_max) {
6390                 /* Create a send map for the current outstanding data */
6391                 struct rack_sendmap *rsm;
6392
6393                 rsm = rack_alloc(rack);
6394                 if (rsm == NULL) {
6395                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6396                         tp->t_fb_ptr = NULL;
6397                         return (ENOMEM);
6398                 }
6399                 rsm->r_flags = RACK_OVERMAX;
6400                 rsm->r_tim_lastsent[0] = tcp_ts_getticks();
6401                 rsm->r_rtr_cnt = 1;
6402                 rsm->r_rtr_bytes = 0;
6403                 rsm->r_start = tp->snd_una;
6404                 rsm->r_end = tp->snd_max;
6405                 rsm->r_sndcnt = 0;
6406                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
6407                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6408                 rsm->r_in_tmap = 1;
6409         }
6410         return (0);
6411 }
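/*
 * Worked example (illustrative numbers only): if rack_init() is entered with
 * snd_una = 1000 and snd_max = 4000, a single rack_sendmap entry is created
 * covering [1000, 4000), time-stamped with the current tick count, and placed
 * on both rc_map and rc_tmap, so the bytes already outstanding are tracked
 * for loss detection just like freshly sent data.
 */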
6412
6413 static int
6414 rack_handoff_ok(struct tcpcb *tp)
6415 {
6416         if ((tp->t_state == TCPS_CLOSED) ||
6417             (tp->t_state == TCPS_LISTEN)) {
6418                 /* Sure no problem though it may not stick */
6419                 return (0);
6420         }
6421         if ((tp->t_state == TCPS_SYN_SENT) ||
6422             (tp->t_state == TCPS_SYN_RECEIVED)) {
6423                 /*
6424                  * We really don't know; you have to get to ESTAB or beyond
6425                  * to tell.
6426                  */
6427                 return (EAGAIN);
6428         }
6429         if (tp->t_flags & TF_SACK_PERMIT) {
6430                 return (0);
6431         }
6432         /*
6433          * If we reach here we don't do SACK on this connection so we can
6434          * never do rack.
6435          */
6436         return (EINVAL);
6437 }
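/*
 * Hedged usage sketch (hypothetical, unused helper; not part of the stack):
 * how a caller might interpret the return codes above when deciding whether
 * a switch to rack can proceed.
 */
static int __unused
rack_handoff_ok_example(struct tcpcb *tp)
{
	int err;

	err = rack_handoff_ok(tp);
	if (err == EAGAIN) {
		/* Too early to tell (SYN_SENT/SYN_RECEIVED); retry later. */
		return (err);
	}
	if (err == EINVAL) {
		/* SACK was never negotiated, so rack can never be used here. */
		return (err);
	}
	/* err == 0: CLOSED/LISTEN, or an established SACK connection. */
	return (0);
}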
6438
6439 static void
6440 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
6441 {
6442         if (tp->t_fb_ptr) {
6443                 struct tcp_rack *rack;
6444                 struct rack_sendmap *rsm;
6445
6446                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6447 #ifdef TCP_BLACKBOX
6448                 tcp_log_flowend(tp);
6449 #endif
6450                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6451                 while (rsm) {
6452                         TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
6453                         uma_zfree(rack_zone, rsm);
6454                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6455                 }
6456                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6457                 while (rsm) {
6458                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
6459                         uma_zfree(rack_zone, rsm);
6460                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6461                 }
6462                 rack->rc_free_cnt = 0;
6463                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6464                 tp->t_fb_ptr = NULL;
6465         }
6466 }
6467
6468 static void
6469 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
6470 {
6471         switch (tp->t_state) {
6472         case TCPS_SYN_SENT:
6473                 rack->r_state = TCPS_SYN_SENT;
6474                 rack->r_substate = rack_do_syn_sent;
6475                 break;
6476         case TCPS_SYN_RECEIVED:
6477                 rack->r_state = TCPS_SYN_RECEIVED;
6478                 rack->r_substate = rack_do_syn_recv;
6479                 break;
6480         case TCPS_ESTABLISHED:
6481                 rack->r_state = TCPS_ESTABLISHED;
6482                 rack->r_substate = rack_do_established;
6483                 break;
6484         case TCPS_CLOSE_WAIT:
6485                 rack->r_state = TCPS_CLOSE_WAIT;
6486                 rack->r_substate = rack_do_close_wait;
6487                 break;
6488         case TCPS_FIN_WAIT_1:
6489                 rack->r_state = TCPS_FIN_WAIT_1;
6490                 rack->r_substate = rack_do_fin_wait_1;
6491                 break;
6492         case TCPS_CLOSING:
6493                 rack->r_state = TCPS_CLOSING;
6494                 rack->r_substate = rack_do_closing;
6495                 break;
6496         case TCPS_LAST_ACK:
6497                 rack->r_state = TCPS_LAST_ACK;
6498                 rack->r_substate = rack_do_lastack;
6499                 break;
6500         case TCPS_FIN_WAIT_2:
6501                 rack->r_state = TCPS_FIN_WAIT_2;
6502                 rack->r_substate = rack_do_fin_wait_2;
6503                 break;
6504         case TCPS_LISTEN:
6505         case TCPS_CLOSED:
6506         case TCPS_TIME_WAIT:
6507         default:
6508 #ifdef INVARIANTS
6509                 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
6510 #endif
6511                 break;
6512         }
6513 }
6514
6515
6516 static void
6517 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
6518 {
6519         /*
6520          * We received an ack, and then did not
6521          * call send or were bounced out because the
6522          * hpts was running. Now a timer is up as well; is
6523          * it the right timer?
6524          */
6525         struct rack_sendmap *rsm;
6526         int tmr_up;
6527         
6528         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
6529         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
6530                 return;
6531         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6532         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
6533             (tmr_up == PACE_TMR_RXT)) {
6534                 /* Should be an RXT */
6535                 return;
6536         }
6537         if (rsm == NULL) {
6538                 /* Nothing outstanding? */
6539                 if (tp->t_flags & TF_DELACK) {
6540                         if (tmr_up == PACE_TMR_DELACK)
6541                                 /* We are supposed to have delayed ack up and we do */
6542                                 return;
6543                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
6544                         /* 
6545                          * If we hit ENOBUFS then we would expect the possibility
6546                          * of nothing outstanding and the RXT up (and the hptsi timer).
6547                          */
6548                         return;
6549                 } else if (((V_tcp_always_keepalive ||
6550                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6551                             (tp->t_state <= TCPS_CLOSING)) &&
6552                            (tmr_up == PACE_TMR_KEEP) &&
6553                            (tp->snd_max == tp->snd_una)) {
6554                         /* We should have keep alive up and we do */
6555                         return;
6556                 }
6557         }
6558         if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
6559                 if ((tp->t_flags & TF_SENTFIN) &&
6560                     ((tp->snd_max - tp->snd_una) == 1) &&
6561                     (rsm->r_flags & RACK_HAS_FIN)) {
6562                         /* needs to be a RXT */
6563                         if (tmr_up == PACE_TMR_RXT)
6564                                 return;
6565                 } else if (tmr_up == PACE_TMR_RACK)
6566                         return;
6567         } else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
6568                    ((tmr_up == PACE_TMR_TLP) ||
6569                     (tmr_up == PACE_TMR_RXT))) {
6570                 /* 
6571                  * Either a TLP or RXT is fine if no sack-passed
6572                  * segment is in place and data is outstanding.
6573                  */
6574                 return;
6575         } else if (tmr_up == PACE_TMR_DELACK) {
6576                 /*
6577                  * If the delayed ack was going to go off
6578                  * before the rtx/tlp/rack timer were going to
6579                  * expire, then that would be the timer in control.
6580                  * Note we don't check the time here trusting the
6581                  * code is correct.
6582                  */
6583                 return;
6584         }
6585         /* 
6586          * OK, the timer originally started is not what we want now.
6587          * We will force the hpts to be stopped, if running, and restart
6588          * with the slot set to what was in the saved slot.
6589          */
6590         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6591         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6592 }
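/*
 * Worked example of the audit above (illustrative): with data outstanding
 * (snd_max > snd_una) and no segment marked RACK_SACK_PASSED, either a TLP
 * or an RXT timer is acceptable and the audit returns early; once a SACK has
 * passed some segment, a rack timer is expected instead (or an RXT when only
 * the FIN remains unacknowledged). Any other mismatch falls through to the
 * cancel-and-restart at the bottom of rack_timer_audit().
 */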
6593
6594 static void
6595 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6596     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
6597     int32_t nxt_pkt, struct timeval *tv)
6598 {
6599         int32_t thflags, retval, did_out = 0;
6600         int32_t way_out = 0;
6601         uint32_t cts;
6602         uint32_t tiwin;
6603         struct tcpopt to;
6604         struct tcp_rack *rack;
6605         struct rack_sendmap *rsm;
6606         int32_t prev_state = 0;
6607
6608         cts = tcp_tv_to_mssectick(tv);
6609         rack = (struct tcp_rack *)tp->t_fb_ptr;
6610
6611         kern_prefetch(rack, &prev_state);
6612         prev_state = 0;
6613         thflags = th->th_flags;
6614         /*
6615          * If this is either a state-changing packet or current state isn't
6616          * established, we require a read lock on tcbinfo.  Otherwise, we
6617          * allow the tcbinfo to be in either locked or unlocked, as the
6618          * caller may have unnecessarily acquired a lock due to a race.
6619          */
6620         if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
6621             tp->t_state != TCPS_ESTABLISHED) {
6622                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6623         }
6624         INP_WLOCK_ASSERT(tp->t_inpcb);
6625         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
6626             __func__));
6627         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
6628             __func__));
6629         {
6630                 union tcp_log_stackspecific log;
6631
6632                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6633                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
6634                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
6635                 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
6636                     tlen, &log, true);
6637         }
6638         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
6639                 way_out = 4;
6640                 goto done_with_input;
6641         }
6642         /*
6643          * If a segment with the ACK-bit set arrives in the SYN-SENT state
6644          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
6645          */
6646         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
6647             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
6648                 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6649                 return;
6650         }
6651         /*
6652          * Segment received on connection. Reset idle time and keep-alive
6653          * timer. XXX: This should be done after segment validation to
6654          * ignore broken/spoofed segs.
6655          */
6656         if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
6657 #ifdef NETFLIX_CWV
6658                 if ((tp->cwv_enabled) &&
6659                     ((tp->cwv_cwnd_valid == 0) &&
6660                      TCPS_HAVEESTABLISHED(tp->t_state) &&
6661                      (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
6662                         tcp_newcwv_nvp_closedown(tp);
6663                 } else 
6664 #endif
6665                        if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
6666                         counter_u64_add(rack_input_idle_reduces, 1);
6667                         rack_cc_after_idle(tp,
6668                             (rack->r_idle_reduce_largest ? 1 :0));
6669                 }
6670         }
6671         rack->r_ctl.rc_rcvtime = cts;
6672         tp->t_rcvtime = ticks;
6673
6674 #ifdef NETFLIX_CWV
6675         if (tp->cwv_enabled) {
6676                 if ((tp->cwv_cwnd_valid == 0) &&
6677                     TCPS_HAVEESTABLISHED(tp->t_state) &&
6678                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
6679                         tcp_newcwv_nvp_closedown(tp);
6680         }
6681 #endif
6682         /*
6683          * Unscale the window into a 32-bit value. For the SYN_SENT state
6684          * the scale is zero.
6685          */
6686         tiwin = th->th_win << tp->snd_scale;
6687 #ifdef NETFLIX_STATS
6688         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
6689 #endif
6690         /*
6691          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
6692          * this to occur after we've validated the segment.
6693          */
6694         if (tp->t_flags & TF_ECN_PERMIT) {
6695                 if (thflags & TH_CWR)
6696                         tp->t_flags &= ~TF_ECN_SND_ECE;
6697                 switch (iptos & IPTOS_ECN_MASK) {
6698                 case IPTOS_ECN_CE:
6699                         tp->t_flags |= TF_ECN_SND_ECE;
6700                         TCPSTAT_INC(tcps_ecn_ce);
6701                         break;
6702                 case IPTOS_ECN_ECT0:
6703                         TCPSTAT_INC(tcps_ecn_ect0);
6704                         break;
6705                 case IPTOS_ECN_ECT1:
6706                         TCPSTAT_INC(tcps_ecn_ect1);
6707                         break;
6708                 }
6709                 /* Congestion experienced. */
6710                 if (thflags & TH_ECE) {
6711                         rack_cong_signal(tp, th, CC_ECN);
6712                 }
6713         }
6714         /*
6715          * Parse options on any incoming segment.
6716          */
6717         tcp_dooptions(&to, (u_char *)(th + 1),
6718             (th->th_off << 2) - sizeof(struct tcphdr),
6719             (thflags & TH_SYN) ? TO_SYN : 0);
6720
6721         /*
6722          * If echoed timestamp is later than the current time, fall back to
6723          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
6724          * were used when this connection was established.
6725          */
6726         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
6727                 to.to_tsecr -= tp->ts_offset;
6728                 if (TSTMP_GT(to.to_tsecr, cts))
6729                         to.to_tsecr = 0;
6730         }
6731         /*
6732          * If it's the first time in, we need to take care of options and
6733          * verify we can do SACK for rack!
6734          */
6735         if (rack->r_state == 0) {
6736                 /* Should be init'd by rack_init() */
6737                 KASSERT(rack->rc_inp != NULL,
6738                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
6739                 if (rack->rc_inp == NULL) {
6740                         rack->rc_inp = tp->t_inpcb;
6741                 }
6742
6743                 /*
6744                  * Process options only when we get SYN/ACK back. The SYN
6745                  * case for incoming connections is handled in tcp_syncache.
6746                  * According to RFC1323 the window field in a SYN (i.e., a
6747                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
6748                  * this is traditional behavior, may need to be cleaned up.
6749                  */
6750                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
6751                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
6752                         if ((to.to_flags & TOF_SCALE) &&
6753                             (tp->t_flags & TF_REQ_SCALE)) {
6754                                 tp->t_flags |= TF_RCVD_SCALE;
6755                                 tp->snd_scale = to.to_wscale;
6756                         }
6757                         /*
6758                          * Initial send window.  It will be updated with the
6759                          * next incoming segment to the scaled value.
6760                          */
6761                         tp->snd_wnd = th->th_win;
6762                         if (to.to_flags & TOF_TS) {
6763                                 tp->t_flags |= TF_RCVD_TSTMP;
6764                                 tp->ts_recent = to.to_tsval;
6765                                 tp->ts_recent_age = cts;
6766                         }
6767                         if (to.to_flags & TOF_MSS)
6768                                 tcp_mss(tp, to.to_mss);
6769                         if ((tp->t_flags & TF_SACK_PERMIT) &&
6770                             (to.to_flags & TOF_SACKPERM) == 0)
6771                                 tp->t_flags &= ~TF_SACK_PERMIT;
6772                         if (IS_FASTOPEN(tp->t_flags)) {
6773                                 if (to.to_flags & TOF_FASTOPEN) {
6774                                         uint16_t mss;
6775
6776                                         if (to.to_flags & TOF_MSS)
6777                                                 mss = to.to_mss;
6778                                         else
6779                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
6780                                                         mss = TCP6_MSS;
6781                                                 else
6782                                                         mss = TCP_MSS;
6783                                         tcp_fastopen_update_cache(tp, mss,
6784                                             to.to_tfo_len, to.to_tfo_cookie);
6785                                 } else
6786                                         tcp_fastopen_disable_path(tp);
6787                         }
6788                 }
6789                 /*
6790                  * At this point we are at the initial call. Here we decide
6791                  * if we are doing RACK or not. We do this by seeing if
6792                  * TF_SACK_PERMIT is set, if not rack is *not* possible and
6793                  * we switch to the default code.
6794                  */
6795                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
6796                         tcp_switch_back_to_default(tp);
6797                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
6798                             tlen, iptos);
6799                         return;
6800                 }
6801                 /* Set the flag */
6802                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
6803                 tcp_set_hpts(tp->t_inpcb);
6804                 rack_stop_all_timers(tp);
6805                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
6806         }
6807         /*
6808          * This is the one exception case where we set the rack state
6809          * always. All other times (timers etc) we must have a rack-state
6810          * set (so we assure we have done the checks above for SACK).
6811          */
6812         if (rack->r_state != tp->t_state)
6813                 rack_set_state(tp, rack);
6814         if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
6815                 kern_prefetch(rsm, &prev_state);
6816         prev_state = rack->r_state;
6817         rack->r_ctl.rc_tlp_send_cnt = 0;
6818         rack_clear_rate_sample(rack);
6819         retval = (*rack->r_substate) (m, th, so,
6820             tp, &to, drop_hdrlen,
6821             tlen, tiwin, thflags, nxt_pkt);
6822 #ifdef INVARIANTS
6823         if ((retval == 0) &&
6824             (tp->t_inpcb == NULL)) {
6825                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
6826                     retval, tp, prev_state);
6827         }
6828 #endif
6829         if (retval == 0) {
6830                 /*
6831                  * If retval is 1 the tcb is unlocked and most likely the tp
6832                  * is gone.
6833                  */
6834                 INP_WLOCK_ASSERT(tp->t_inpcb);
6835                 tcp_rack_xmit_timer_commit(rack, tp);
6836                 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
6837                     (rack->rc_in_persist == 0)){
6838                         /* 
6839                          * The peer shrunk its window on us to the point
6840                          * where we have sent too much. The only thing
6841                          * we can do here is stop any timers and
6842                          * enter persist. We most likely lost the last
6843                          * bytes we sent but oh well, we will have to
6844                          * retransmit them after the peer is caught up.
6845                          */
6846                         if (rack->rc_inp->inp_in_hpts)
6847                                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6848                         rack_timer_cancel(tp, rack, cts, __LINE__);
6849                         rack_enter_persist(tp, rack, cts);
6850                         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6851                         way_out = 3;
6852                         goto done_with_input;
6853                 }
6854                 if (nxt_pkt == 0) {
6855                         if (rack->r_wanted_output != 0) {
6856                                 did_out = 1;
6857                                 (void)tp->t_fb->tfb_tcp_output(tp);
6858                         }
6859                         rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
6860                 }
6861                 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
6862                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
6863                      (tp->t_flags & TF_DELACK) ||
6864                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6865                       (tp->t_state <= TCPS_CLOSING)))) {
6866                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
6867                         if ((tp->snd_max == tp->snd_una) &&
6868                             ((tp->t_flags & TF_DELACK) == 0) &&
6869                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
6870                                 /* keep-alive not needed if we still have hptsi output pending */
6871                                 ;
6872                         } else {
6873                                 if (rack->rc_inp->inp_in_hpts)
6874                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6875                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6876                         }
6877                         way_out = 1;
6878                 } else {
6879                         /* Do we have the correct timer running? */
6880                         rack_timer_audit(tp, rack, &so->so_snd);
6881                         way_out = 2;
6882                 }
6883         done_with_input:
6884                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
6885                 if (did_out)
6886                         rack->r_wanted_output = 0;
6887 #ifdef INVARIANTS
6888                 if (tp->t_inpcb == NULL) {
6889                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
6890                               did_out,
6891                               retval, tp, prev_state);
6892                 }
6893 #endif
6894                 INP_WUNLOCK(tp->t_inpcb);
6895         }
6896 }
6897
6898 void
6899 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6900     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
6901 {
6902         struct timeval tv;
6903 #ifdef RSS
6904         struct tcp_function_block *tfb;
6905         struct tcp_rack *rack;
6906         struct epoch_tracker et;
6907
6908         rack = (struct tcp_rack *)tp->t_fb_ptr;
6909         if (rack->r_state == 0) {
6910                 /*
6911                  * Initial input (ACK to SYN-ACK etc.); let's go ahead and get
6912                  * it processed
6913                  */
6914                 INP_INFO_RLOCK_ET(&V_tcbinfo, et);
6915                 tcp_get_usecs(&tv);
6916                 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6917                     tlen, iptos, 0, &tv);
6918                 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
6919                 return;
6920         }
6921         tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
6922         INP_WUNLOCK(tp->t_inpcb);
6923 #else
6924         tcp_get_usecs(&tv);
6925         rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6926             tlen, iptos, 0, &tv);
6927 #endif
6928 }
6929
6930 struct rack_sendmap *
6931 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
6932 {
6933         struct rack_sendmap *rsm = NULL;
6934         int32_t idx;
6935         uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
6936
6937         /* Return the next guy to be re-transmitted */
6938         if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
6939                 return (NULL);
6940         }
6941         if (tp->t_flags & TF_SENTFIN) {
6942                 /* retransmit the end FIN? */
6943                 return (NULL);
6944         }
6945         /* OK, let's look at this one */
6946         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6947         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
6948                 goto check_it;
6949         }
6950         rsm = rack_find_lowest_rsm(rack);
6951         if (rsm == NULL) {
6952                 return (NULL);
6953         }
6954 check_it:
6955         srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
6956         srtt = TICKS_2_MSEC(srtt_cur);
6957         if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
6958                 srtt = rack->rc_rack_rtt;
6959         if (rsm->r_flags & RACK_ACKED) {
6960                 return (NULL);
6961         }
6962         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
6963                 /* It's not yet ready */
6964                 return (NULL);
6965         }
6966         idx = rsm->r_rtr_cnt - 1;
6967         ts_low = rsm->r_tim_lastsent[idx];
6968         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
6969         if (tsused <= ts_low) {
6970                 return (NULL);
6971         }
6972         if ((tsused - ts_low) >= thresh) {
6973                 return (rsm);
6974         }
6975         return (NULL);
6976 }
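/*
 * Worked example (illustrative numbers only, assuming rack_calc_thresh_rack()
 * yields 50ms for the current srtt and reordering settings): a segment whose
 * last transmission time ts_low is 1000 and which carries RACK_SACK_PASSED
 * becomes eligible above once tsused reaches 1050, i.e. when
 * (tsused - ts_low) >= thresh; before that tcp_rack_output() returns NULL and
 * the rack timer is expected to fire again later.
 */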
6977
6978 static int
6979 rack_output(struct tcpcb *tp)
6980 {
6981         struct socket *so;
6982         uint32_t recwin, sendwin;
6983         uint32_t sb_offset;
6984         int32_t len, flags, error = 0;
6985         struct mbuf *m;
6986         struct mbuf *mb;
6987         uint32_t if_hw_tsomaxsegcount = 0;
6988         uint32_t if_hw_tsomaxsegsize;
6989         long tot_len_this_send = 0;
6990         struct ip *ip = NULL;
6991 #ifdef TCPDEBUG
6992         struct ipovly *ipov = NULL;
6993 #endif
6994         struct udphdr *udp = NULL;
6995         struct tcp_rack *rack;
6996         struct tcphdr *th;
6997         uint8_t pass = 0;
6998         uint8_t wanted_cookie = 0;
6999         u_char opt[TCP_MAXOLEN];
7000         unsigned ipoptlen, optlen, hdrlen, ulen=0;
7001         uint32_t rack_seq;
7002
7003 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7004         unsigned ipsec_optlen = 0;
7005
7006 #endif
7007         int32_t idle, sendalot;
7008         int32_t sub_from_prr = 0;
7009         volatile int32_t sack_rxmit;
7010         struct rack_sendmap *rsm = NULL;
7011         int32_t tso, mtu, would_have_fin = 0;
7012         struct tcpopt to;
7013         int32_t slot = 0;
7014         uint32_t cts;
7015         uint8_t hpts_calling, doing_tlp = 0;
7016         int32_t do_a_prefetch;
7017         int32_t prefetch_rsm = 0;
7018         int32_t prefetch_so_done = 0;
7019         struct tcp_log_buffer *lgb = NULL;
7020         struct inpcb *inp;
7021         struct sockbuf *sb;
7022 #ifdef INET6
7023         struct ip6_hdr *ip6 = NULL;
7024         int32_t isipv6;
7025 #endif
7026         /* setup and take the cache hits here */
7027         rack = (struct tcp_rack *)tp->t_fb_ptr;
7028         inp = rack->rc_inp;
7029         so = inp->inp_socket;
7030         sb = &so->so_snd;
7031         kern_prefetch(sb, &do_a_prefetch);
7032         do_a_prefetch = 1;
7033         
7034         INP_WLOCK_ASSERT(inp);
7035 #ifdef TCP_OFFLOAD
7036         if (tp->t_flags & TF_TOE)
7037                 return (tcp_offload_output(tp));
7038 #endif
7039 #ifdef INET6
7040         if (rack->r_state) {
7041                 /* Use the cache line loaded if possible */
7042                 isipv6 = rack->r_is_v6;
7043         } else {
7044                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
7045         }
7046 #endif
7047         cts = tcp_ts_getticks();
7048         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
7049             inp->inp_in_hpts) {
7050                 /*
7051                  * We are on the hpts for some timer but not hptsi output.
7052                  * Remove from the hpts unconditionally.
7053                  */
7054                 rack_timer_cancel(tp, rack, cts, __LINE__);
7055         }
7056         /* Mark that we have called rack_output(). */
7057         if ((rack->r_timer_override) ||
7058             (tp->t_flags & TF_FORCEDATA) ||
7059             (tp->t_state < TCPS_ESTABLISHED)) {
7060                 if (tp->t_inpcb->inp_in_hpts)
7061                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
7062         } else if (tp->t_inpcb->inp_in_hpts) {
7063                 /*
7064                  * On the hpts you can't pass even if ACKNOW is on, we will
7065                  * when the hpts fires.
7066                  */
7067                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
7068                 return (0);
7069         }
7070         hpts_calling = inp->inp_hpts_calls;
7071         inp->inp_hpts_calls = 0;
7072         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7073                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
7074                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
7075                         return (0);
7076                 }
7077         }
7078         rack->r_wanted_output = 0;
7079         rack->r_timer_override = 0;
7080         /*
7081          * For TFO connections in SYN_SENT or SYN_RECEIVED,
7082          * only allow the initial SYN or SYN|ACK and those sent
7083          * by the retransmit timer.
7084          */
7085         if (IS_FASTOPEN(tp->t_flags) &&
7086             ((tp->t_state == TCPS_SYN_RECEIVED) ||
7087              (tp->t_state == TCPS_SYN_SENT)) &&
7088             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
7089             (tp->t_rxtshift == 0))              /* not a retransmit */
7090                 return (0);
7091         /*
7092          * Determine length of data that should be transmitted, and flags
7093          * that will be used. If there is some data or critical controls
7094          * (SYN, RST) to send, then transmit; otherwise, investigate
7095          * further.
7096          */
7097         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
7098 #ifdef NETFLIX_CWV
7099         if (tp->cwv_enabled) {
7100                 if ((tp->cwv_cwnd_valid == 0) &&
7101                     TCPS_HAVEESTABLISHED(tp->t_state) &&
7102                     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
7103                         tcp_newcwv_nvp_closedown(tp);
7104         } else
7105 #endif
7106         if (tp->t_idle_reduce) {
7107                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
7108                         rack_cc_after_idle(tp,
7109                             (rack->r_idle_reduce_largest ? 1 :0));
7110         }
7111         tp->t_flags &= ~TF_LASTIDLE;
7112         if (idle) {
7113                 if (tp->t_flags & TF_MORETOCOME) {
7114                         tp->t_flags |= TF_LASTIDLE;
7115                         idle = 0;
7116                 }
7117         }
7118 again:
7119         /*
7120          * If we've recently taken a timeout, snd_max will be greater than
7121          * snd_nxt.  There may be SACK information that allows us to avoid
7122          * resending already delivered data.  Adjust snd_nxt accordingly.
7123          */
7124         sendalot = 0;
7125         cts = tcp_ts_getticks();
7126         tso = 0;
7127         mtu = 0;
7128         sb_offset = tp->snd_max - tp->snd_una;
7129         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
7130
7131         flags = tcp_outflags[tp->t_state];
7132         /*
7133          * Send any SACK-generated retransmissions.  If we're explicitly
7134          * trying to send out new data (when sendalot is 1), bypass this
7135          * function. If we retransmit in fast recovery mode, decrement
7136          * snd_cwnd, since we're replacing a (future) new transmission with
7137          * a retransmission now, and we previously incremented snd_cwnd in
7138          * tcp_input().
7139          */
7140         /*
7141          * Still in sack recovery, reset rxmit flag to zero.
7142          */
7143         while (rack->rc_free_cnt < rack_free_cache) {
7144                 rsm = rack_alloc(rack);
7145                 if (rsm == NULL) {
7146                         if (inp->inp_hpts_calls)
7147                                 /* Retry in a ms */
7148                                 slot = 1;
7149                         goto just_return_nolock;
7150                 }
7151                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
7152                 rack->rc_free_cnt++;
7153                 rsm = NULL;
7154         }
7155         if (inp->inp_hpts_calls)
7156                 inp->inp_hpts_calls = 0;
7157         sack_rxmit = 0;
7158         len = 0;
7159         rsm = NULL;
7160         if (flags & TH_RST) {
7161                 SOCKBUF_LOCK(sb);
7162                 goto send;
7163         }
7164         if (rack->r_ctl.rc_tlpsend) {
7165                 /* Tail loss probe */
7166                 long cwin;
7167                 long tlen;
7168
7169                 doing_tlp = 1;
7170                 rsm = rack->r_ctl.rc_tlpsend;
7171                 rack->r_ctl.rc_tlpsend = NULL;
7172                 sack_rxmit = 1;
7173                 tlen = rsm->r_end - rsm->r_start;
7174                 if (tlen > tp->t_maxseg)
7175                         tlen = tp->t_maxseg;
7176                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7177                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7178                     __func__, __LINE__,
7179                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7180                 sb_offset = rsm->r_start - tp->snd_una;
7181                 cwin = min(tp->snd_wnd, tlen);
7182                 len = cwin;
7183         } else if (rack->r_ctl.rc_resend) {
7184                 /* Retransmit timer */
7185                 rsm = rack->r_ctl.rc_resend;
7186                 rack->r_ctl.rc_resend = NULL;
7187                 len = rsm->r_end - rsm->r_start;
7188                 sack_rxmit = 1;
7189                 sendalot = 0;
7190                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7191                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7192                     __func__, __LINE__,
7193                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7194                 sb_offset = rsm->r_start - tp->snd_una;
7195                 if (len >= tp->t_maxseg) {
7196                         len = tp->t_maxseg;
7197                 }
7198         } else if ((rack->rc_in_persist == 0) &&
7199             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
7200                 long tlen;
7201
7202                 if ((!IN_RECOVERY(tp->t_flags)) &&
7203                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
7204                         /* Enter recovery if not induced by a time-out */
7205                         rack->r_ctl.rc_rsm_start = rsm->r_start;
7206                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
7207                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
7208                         rack_cong_signal(tp, NULL, CC_NDUPACK);
7209                         /*
7210                          * When we enter recovery we need to assure we send
7211                          * one packet.
7212                          */
7213                         rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
7214                 }
7215 #ifdef INVARIANTS
7216                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
7217                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
7218                             tp, rack, rsm, rsm->r_start, tp->snd_una);
7219                 }
7220 #endif
7221                 tlen = rsm->r_end - rsm->r_start;
7222                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7223                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7224                     __func__, __LINE__,
7225                     rsm->r_start, tp->snd_una, tp, rack, rsm));
7226                 sb_offset = rsm->r_start - tp->snd_una;
7227                 if (tlen > rack->r_ctl.rc_prr_sndcnt) {
7228                         len = rack->r_ctl.rc_prr_sndcnt;
7229                 } else {
7230                         len = tlen;
7231                 }
7232                 if (len >= tp->t_maxseg) {
7233                         sendalot = 1;
7234                         len = tp->t_maxseg;
7235                 } else {
7236                         sendalot = 0;
7237                         if ((rack->rc_timer_up == 0) &&
7238                             (len < tlen)) {
7239                                 /*
7240                                  * If its not a timer don't send a partial
7241                                  * segment.
7242                                  */
7243                                 len = 0;
7244                                 goto just_return_nolock;
7245                         }
7246                 }
7247                 if (len > 0) {
7248                         sub_from_prr = 1;
7249                         sack_rxmit = 1;
7250                         TCPSTAT_INC(tcps_sack_rexmits);
7251                         TCPSTAT_ADD(tcps_sack_rexmit_bytes,
7252                             min(len, tp->t_maxseg));
7253                         counter_u64_add(rack_rtm_prr_retran, 1);
7254                 }
7255         }
7256         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
7257                 /* we are retransmitting the fin */
7258                 len--;
7259                 if (len) {
7260                         /*
7261                          * When retransmitting data do *not* include the
7262                          * FIN. This could happen from a TLP probe.
7263                          */
7264                         flags &= ~TH_FIN;
7265                 }
7266         }
7267 #ifdef INVARIANTS
7268         /* For debugging */
7269         rack->r_ctl.rc_rsm_at_retran = rsm;
7270 #endif
7271         /*
7272          * Get standard flags, and add SYN or FIN if requested by 'hidden'
7273          * state flags.
7274          */
7275         if (tp->t_flags & TF_NEEDFIN)
7276                 flags |= TH_FIN;
7277         if (tp->t_flags & TF_NEEDSYN)
7278                 flags |= TH_SYN;
7279         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
7280                 void *end_rsm;
7281                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
7282                 if (end_rsm)
7283                         kern_prefetch(end_rsm, &prefetch_rsm);
7284                 prefetch_rsm = 1;
7285         }
7286         SOCKBUF_LOCK(sb);
7287         /*
7288          * If in persist timeout with window of 0, send 1 byte. Otherwise,
7289          * if window is small but nonzero and the persist timer expired, we
7290          * will send what we can and go to transmit state.
7291          */
7292         if (tp->t_flags & TF_FORCEDATA) {
7293                 if (sendwin == 0) {
7294                         /*
7295                          * If we still have some data to send, then clear
7296                          * the FIN bit.  Usually this would happen below
7297                          * when it realizes that we aren't sending all the
7298                          * data.  However, if we have exactly 1 byte of
7299                          * unsent data, then it won't clear the FIN bit
7300                          * below, and if we are in persist state, we wind up
7301                          * sending the packet without recording that we sent
7302                          * the FIN bit.
7303                          *
7304                          * We can't just blindly clear the FIN bit, because
7305                          * if we don't have any more data to send then the
7306                          * probe will be the FIN itself.
7307                          */
7308                         if (sb_offset < sbused(sb))
7309                                 flags &= ~TH_FIN;
7310                         sendwin = 1;
7311                 } else {
7312                         if (rack->rc_in_persist)
7313                                 rack_exit_persist(tp, rack);
7314                         /*
7315                          * If we are dropping persist mode then we need to
7316                          * correct snd_nxt/snd_max and off.
7317                          */
7318                         tp->snd_nxt = tp->snd_max;
7319                         sb_offset = tp->snd_nxt - tp->snd_una;
7320                 }
7321         }
7322         /*
7323          * If snd_nxt == snd_max and we have transmitted a FIN, the
7324          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
7325          * negative length.  This can also occur when TCP opens up its
7326          * congestion window while receiving additional duplicate acks after
7327          * fast-retransmit because TCP will reset snd_nxt to snd_max after
7328          * the fast-retransmit.
7329          *
7330          * In the normal retransmit-FIN-only case, however, snd_nxt will be
7331          * set to snd_una, the sb_offset will be 0, and the length may wind
7332          * up 0.
7333          *
7334          * If sack_rxmit is true we are retransmitting from the scoreboard
7335          * in which case len is already set.
7336          */
7337         if (sack_rxmit == 0) {
7338                 uint32_t avail;
7339
7340                 avail = sbavail(sb);
7341                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
7342                         sb_offset = tp->snd_nxt - tp->snd_una;
7343                 else
7344                         sb_offset = 0;
7345                 if (IN_RECOVERY(tp->t_flags) == 0) {
7346                         if (rack->r_ctl.rc_tlp_new_data) {
7347                                 /* TLP is forcing out new data */
7348                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
7349                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
7350                                 }
7351                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
7352                                         len = tp->snd_wnd;
7353                                 else
7354                                         len = rack->r_ctl.rc_tlp_new_data;
7355                                 rack->r_ctl.rc_tlp_new_data = 0;
7356                                 doing_tlp = 1;
7357                         } else {
7358                                 if (sendwin > avail) {
7359                                         /* use the available */
7360                                         if (avail > sb_offset) {
7361                                                 len = (int32_t)(avail - sb_offset);
7362                                         } else {
7363                                                 len = 0;
7364                                         }
7365                                 } else {
7366                                         if (sendwin > sb_offset) {
7367                                                 len = (int32_t)(sendwin - sb_offset);
7368                                         } else {
7369                                                 len = 0;
7370                                         }
7371                                 }
7372                         }
7373                 } else {
7374                         uint32_t outstanding;
7375
7376                         /*
7377                          * We are inside of a SACK recovery episode and are
7378                          * sending new data, having retransmitted all the
7379                          * data possible so far in the scoreboard.
7380                          */
7381                         outstanding = tp->snd_max - tp->snd_una;
7382                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
7383                                 len = 0;
7384                         else if (avail > sb_offset)
7385                                 len = avail - sb_offset;
7386                         else
7387                                 len = 0;
7388                         if (len > 0) {
7389                                 if (len > rack->r_ctl.rc_prr_sndcnt)
7390                                         len = rack->r_ctl.rc_prr_sndcnt;
7391
7392                                 if (len > 0) {
7393                                         sub_from_prr = 1;
7394                                         counter_u64_add(rack_rtm_prr_newdata, 1);
7395                                 }
7396                         }
7397                         if (len > tp->t_maxseg) {
7398                                 /*
7399                                  * We should never send more than a MSS when
7400                                  * retransmitting or sending new data in prr
7401                                  * mode unless the override flag is on. Most
7402                                  * likely the PRR algorithm is not going to
7403                                  * let us send a lot as well :-)
7404                                  */
7405                                 if (rack->r_ctl.rc_prr_sendalot == 0)
7406                                         len = tp->t_maxseg;
7407                         } else if (len < tp->t_maxseg) {
7408                                 /*
7409                                  * Do we send any? The idea here is if the
7410                                  * Do we send any? The idea here is that if the
7411                                  * send empties the socket buffer we want to
7412                                  * do it. If not, let's just wait for our
7413                                  * prr_sndcnt to get bigger.
7414                                 long leftinsb;
7415
7416                                 leftinsb = sbavail(sb) - sb_offset;
7417                                 if (leftinsb > len) {
7418                                         /* This send does not empty the sb */
7419                                         len = 0;
7420                                 }
7421                         }
7422                 }
7423         }
7424         if (prefetch_so_done == 0) {
7425                 kern_prefetch(so, &prefetch_so_done);
7426                 prefetch_so_done = 1;
7427         }
7428         /*
7429          * Lop off SYN bit if it has already been sent.  However, if this is
7430          * SYN-SENT state and if segment contains data and if we don't know
7431          * that foreign host supports TAO, suppress sending segment.
7432          */
7433         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
7434             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
7435                 if (tp->t_state != TCPS_SYN_RECEIVED)
7436                         flags &= ~TH_SYN;
7437                 /*
7438                  * When sending additional segments following a TFO SYN|ACK,
7439                  * do not include the SYN bit.
7440                  */
7441                 if (IS_FASTOPEN(tp->t_flags) &&
7442                     (tp->t_state == TCPS_SYN_RECEIVED))
7443                         flags &= ~TH_SYN;
7444                 sb_offset--, len++;
7445         }
7446         /*
7447          * Be careful not to send data and/or FIN on SYN segments. This
7448          * measure is needed to prevent interoperability problems with not
7449          * fully conformant TCP implementations.
7450          */
7451         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
7452                 len = 0;
7453                 flags &= ~TH_FIN;
7454         }
7455         /*
7456          * On TFO sockets, ensure no data is sent in the following cases:
7457          *
7458          *  - When retransmitting SYN|ACK on a passively-created socket
7459          *
7460          *  - When retransmitting SYN on an actively created socket
7461          *
7462          *  - When sending a zero-length cookie (cookie request) on an
7463          *    actively created socket
7464          *
7465          *  - When the socket is in the CLOSED state (RST is being sent)
7466          */
7467         if (IS_FASTOPEN(tp->t_flags) &&
7468             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
7469              ((tp->t_state == TCPS_SYN_SENT) &&
7470               (tp->t_tfo_client_cookie_len == 0)) ||
7471              (flags & TH_RST))) {
7472                 sack_rxmit = 0;
7473                 len = 0;
7474         }
7475         /* Without fast-open there should never be data sent on a SYN */
7476         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
7477                 len = 0;
7478         if (len <= 0) {
7479                 /*
7480                  * If FIN has been sent but not acked, but we haven't been
7481                  * called to retransmit, len will be < 0.  Otherwise, window
7482                  * shrank after we sent into it.  If window shrank to 0,
7483                  * cancel pending retransmit, pull snd_nxt back to (closed)
7484                  * window, and set the persist timer if it isn't already
7485                  * going.  If the window didn't close completely, just wait
7486                  * for an ACK.
7487                  *
7488                  * We also do a general check here to ensure that we will
7489                  * set the persist timer when we have data to send, but a
7490                  * 0-byte window. This makes sure the persist timer is set
7491                  * even if the packet hits one of the "goto send" lines
7492                  * below.
7493                  */
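                /*
                 * For example: if the peer has advertised a zero window
                 * while data is still queued in the send buffer on an
                 * established connection, we fall into the branch below,
                 * pull snd_nxt back and arm the persist timer so that
                 * window probes keep being generated.
                 */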
7494                 len = 0;
7495                 if ((tp->snd_wnd == 0) &&
7496                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
7497                     (sb_offset < (int)sbavail(sb))) {
7498                         tp->snd_nxt = tp->snd_una;
7499                         rack_enter_persist(tp, rack, cts);
7500                 }
7501         }
7502         /* len will be >= 0 after this point. */
7503         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7504         tcp_sndbuf_autoscale(tp, so, sendwin);
7505         /*
7506          * Decide if we can use TCP Segmentation Offloading (if supported by
7507          * hardware).
7508          *
7509          * TSO may only be used if we are in a pure bulk sending state.  The
7510          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
7511          * options prevents using TSO.  With TSO the TCP header is the same
7512          * (except for the sequence number) for all generated packets.  This
7513          * makes it impossible to transmit any options which vary per
7514          * generated segment or packet.
7515          *
7516          * IPv4 handling has a clear separation of ip options and ip header
7517          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
7518          * the right thing below to provide length of just ip options and thus
7519          * checking for ipoptlen is enough to decide if ip options are present.
7520          */
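        /*
         * As a rough example (illustrative numbers only): with a 1460-byte
         * MSS and 64 kB of bulk data queued, the checks below allow a
         * single TSO send covering roughly 44 MSS-sized segments, provided
         * there is no TCP-MD5 signature, no SACK retransmit and no IP
         * options.
         */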
7521
7522 #ifdef INET6
7523         if (isipv6)
7524                 ipoptlen = ip6_optlen(tp->t_inpcb);
7525         else
7526 #endif
7527                 if (tp->t_inpcb->inp_options)
7528                         ipoptlen = tp->t_inpcb->inp_options->m_len -
7529                             offsetof(struct ipoption, ipopt_list);
7530                 else
7531                         ipoptlen = 0;
7532 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7533         /*
7534          * Pre-calculate here as we save another lookup into the darknesses
7535          * of IPsec that way and can actually decide if TSO is ok.
7536          */
7537 #ifdef INET6
7538         if (isipv6 && IPSEC_ENABLED(ipv6))
7539                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
7540 #ifdef INET
7541         else
7542 #endif
7543 #endif                          /* INET6 */
7544 #ifdef INET
7545         if (IPSEC_ENABLED(ipv4))
7546                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
7547 #endif                          /* INET */
7548 #endif
7549
7550 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7551         ipoptlen += ipsec_optlen;
7552 #endif
7553         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
7554             (tp->t_port == 0) &&
7555             ((tp->t_flags & TF_SIGNATURE) == 0) &&
7556             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
7557             ipoptlen == 0)
7558                 tso = 1;
7559         {
7560                 uint32_t outstanding;
7561
7562                 outstanding = tp->snd_max - tp->snd_una;
7563                 if (tp->t_flags & TF_SENTFIN) {
7564                         /*
7565                          * If we sent a fin, snd_max is 1 higher than
7566                          * snd_una
7567                          */
7568                         outstanding--;
7569                 }
7570                 if (outstanding > 0) {
7571                         /*
7572                          * This is sub-optimal. We only send a standalone
7573                          * FIN on its own segment.
7574                          */
7575                         if (flags & TH_FIN) {
7576                                 flags &= ~TH_FIN;
7577                                 would_have_fin = 1;
7578                         }
7579                 } else if (sack_rxmit) {
7580                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
7581                                 flags &= ~TH_FIN;
7582                 } else {
7583                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
7584                             sbused(sb)))
7585                                 flags &= ~TH_FIN;
7586                 }
7587         }
7588         recwin = sbspace(&so->so_rcv);
7589
7590         /*
7591          * Sender silly window avoidance.   We transmit under the following
7592          * conditions when len is non-zero:
7593          *
7594          * - We have a full segment (or more with TSO)
7595          * - This is the last buffer in a write()/send() and we are either idle or running NODELAY
7596          * - we've timed out (e.g. persist timer)
7597          * - we have more than 1/2 the maximum send window's worth of data (the receiver may be limiting the window size)
7598          * - we need to retransmit
7599          */
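        /*
         * For example (illustrative numbers only): with t_maxseg = 1460,
         * Nagle enabled (no TF_NODELAY), data already outstanding and only
         * 200 bytes newly queued, none of the tests below fire and we fall
         * through to the window-update/ACK checks instead of sending a
         * tiny segment.
         */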
7600         if (len) {
7601                 if (len >= tp->t_maxseg) {
7602                         pass = 1;
7603                         goto send;
7604                 }
7605                 /*
7606                  * NOTE! on localhost connections an 'ack' from the remote
7607                  * end may occur synchronously with the output and cause us
7608                  * to flush a buffer queued with moretocome.  XXX
7609                  *
7610                  */
7611                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
7612                     (idle || (tp->t_flags & TF_NODELAY)) &&
7613                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
7614                     (tp->t_flags & TF_NOPUSH) == 0) {
7615                         pass = 2;
7616                         goto send;
7617                 }
7618                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
7619                         pass = 3;
7620                         goto send;
7621                 }
7622                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
7623                         goto send;
7624                 }
7625                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
7626                         pass = 4;
7627                         goto send;
7628                 }
7629                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
7630                         pass = 5;
7631                         goto send;
7632                 }
7633                 if (sack_rxmit) {
7634                         pass = 6;
7635                         goto send;
7636                 }
7637         }
7638         /*
7639          * Sending of standalone window updates.
7640          *
7641          * Window updates are important when we close our window due to a
7642          * full socket buffer and are opening it again after the application
7643          * reads data from it.  Once the window has opened again and the
7644          * remote end starts to send again the ACK clock takes over and
7645          * provides the most current window information.
7646          *
7647          * We must avoid the silly window syndrome, where every read from
7648          * the receive buffer, no matter how small, causes a window update
7649          * to be sent.  We also should avoid sending a flurry of window
7650          * updates when the socket buffer had queued a lot of data and the
7651          * application is doing small reads.
7652          *
7653          * Prevent a flurry of pointless window updates by only sending an
7654          * update when we can increase the advertized window by more than
7655          * 1/4th of the socket buffer capacity.  When the buffer is getting
7656          * full or is very small, be more aggressive and send an update
7657          * whenever we can increase by two MSS-sized segments. In all other
7658          * situations the ACK's to new incoming data will carry further
7659          * window increases.
7660          *
7661          * Don't send an independent window update if a delayed ACK is
7662          * pending (it will get piggy-backed on it) or the remote side
7663          * already has done a half-close and won't send more data.  Skip
7664          * this if the connection is in T/TCP half-open state.
7665          */
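        /*
         * Worked example (illustrative numbers only): with sb_hiwat = 64 kB
         * and t_maxseg = 1460, a standalone update is sent once the window
         * can grow by at least 16 kB (1/4 of the buffer); if the buffer is
         * nearly full (recwin <= 8 kB) or tiny (sb_hiwat <= 8 * MSS), the
         * weaker 2 * MSS threshold alone is enough.
         */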
7666         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
7667             !(tp->t_flags & TF_DELACK) &&
7668             !TCPS_HAVERCVDFIN(tp->t_state)) {
7669                 /*
7670                  * "adv" is the amount we could increase the window, taking
7671                  * into account that we are limited by TCP_MAXWIN <<
7672                  * tp->rcv_scale.
7673                  */
7674                 int32_t adv;
7675                 int oldwin;
7676
7677                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
7678                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
7679                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
7680                         adv -= oldwin;
7681                 } else
7682                         oldwin = 0;
7683
7684                 /*
7685                  * If the new window size ends up being the same as the old
7686                  * size when it is scaled, then don't force a window update.
7687                  */
7688                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
7689                         goto dontupdate;
7690
7691                 if (adv >= (int32_t)(2 * tp->t_maxseg) &&
7692                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
7693                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
7694                     so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
7695                         pass = 7;
7696                         goto send;
7697                 }
7698                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
7699                         goto send;
7700         }
7701 dontupdate:
7702
7703         /*
7704          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
7705          * is also a catch-all for the retransmit timer timeout case.
7706          */
7707         if (tp->t_flags & TF_ACKNOW) {
7708                 pass = 8;
7709                 goto send;
7710         }
7711         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
7712                 pass = 9;
7713                 goto send;
7714         }
7715         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
7716                 pass = 10;
7717                 goto send;
7718         }
7719         /*
7720          * If our state indicates that FIN should be sent and we have not
7721          * yet done so, then we need to send.
7722          */
7723         if ((flags & TH_FIN) &&
7724             (tp->snd_nxt == tp->snd_una)) {
7725                 pass = 11;
7726                 goto send;
7727         }
7728         /*
7729          * No reason to send a segment, just return.
7730          */
7731 just_return:
7732         SOCKBUF_UNLOCK(sb);
7733 just_return_nolock:
7734         if (tot_len_this_send == 0)
7735                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
7736         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
7737         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
7738         tp->t_flags &= ~TF_FORCEDATA;
7739         return (0);
7740
7741 send:
7742         if (doing_tlp == 0) {
7743                 /*
7744                  * Data not a TLP, and it's not the rxt firing. If it is the
7745                  * rxt firing, we want to leave the tlp_in_progress flag on
7746                  * so we don't send another TLP. It has to be a rack timer
7747                  * or normal send (response to acked data) to clear the tlp
7748                  * in progress flag.
7749                  */
7750                 rack->rc_tlp_in_progress = 0;
7751         }
7752         SOCKBUF_LOCK_ASSERT(sb);
7753         if (len > 0) {
7754                 if (len >= tp->t_maxseg)
7755                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
7756                 else
7757                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
7758         }
7759         /*
7760          * Before ESTABLISHED, force sending of initial options unless TCP
7761          * set not to do any options. NOTE: we assume that the IP/TCP header
7762          * plus TCP options always fit in a single mbuf, leaving room for a
7763          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
7764          * + optlen <= MCLBYTES
7765          */
7766         optlen = 0;
7767 #ifdef INET6
7768         if (isipv6)
7769                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
7770         else
7771 #endif
7772                 hdrlen = sizeof(struct tcpiphdr);
7773
7774         /*
7775          * Compute options for segment. We only have to care about SYN and
7776          * established connection segments.  Options for SYN-ACK segments
7777          * are handled in TCP syncache.
7778          */
7779         to.to_flags = 0;
7780         if ((tp->t_flags & TF_NOOPT) == 0) {
7781                 /* Maximum segment size. */
7782                 if (flags & TH_SYN) {
7783                         tp->snd_nxt = tp->iss;
7784                         to.to_mss = tcp_mssopt(&inp->inp_inc);
7785 #ifdef NETFLIX_TCPOUDP
7786                         if (tp->t_port)
7787                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
7788 #endif
7789                         to.to_flags |= TOF_MSS;
7790
7791                         /*
7792                          * On SYN or SYN|ACK transmits on TFO connections,
7793                          * only include the TFO option if it is not a
7794                          * retransmit, as the presence of the TFO option may
7795                          * have caused the original SYN or SYN|ACK to have
7796                          * been dropped by a middlebox.
7797                          */
7798                         if (IS_FASTOPEN(tp->t_flags) &&
7799                             (tp->t_rxtshift == 0)) {
7800                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
7801                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
7802                                         to.to_tfo_cookie =
7803                                             (u_int8_t *)&tp->t_tfo_cookie.server;
7804                                         to.to_flags |= TOF_FASTOPEN;
7805                                         wanted_cookie = 1;
7806                                 } else if (tp->t_state == TCPS_SYN_SENT) {
7807                                         to.to_tfo_len =
7808                                             tp->t_tfo_client_cookie_len;
7809                                         to.to_tfo_cookie =
7810                                             tp->t_tfo_cookie.client;
7811                                         to.to_flags |= TOF_FASTOPEN;
7812                                         wanted_cookie = 1;
7813                                         /*
7814                                          * If we wind up having more data to
7815                                          * send with the SYN than can fit in
7816                                          * one segment, don't send any more
7817                                          * until the SYN|ACK comes back from
7818                                          * the other end.
7819                                          */
7820                                         sendalot = 0;
7821                                 }
7822                         }
7823                 }
7824                 /* Window scaling. */
7825                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
7826                         to.to_wscale = tp->request_r_scale;
7827                         to.to_flags |= TOF_SCALE;
7828                 }
7829                 /* Timestamps. */
7830                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
7831                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
7832                         to.to_tsval = cts + tp->ts_offset;
7833                         to.to_tsecr = tp->ts_recent;
7834                         to.to_flags |= TOF_TS;
7835                 }
7836                 /* Set receive buffer autosizing timestamp. */
7837                 if (tp->rfbuf_ts == 0 &&
7838                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
7839                         tp->rfbuf_ts = tcp_ts_getticks();
7840                 /* Selective ACK's. */
7841                 if (flags & TH_SYN)
7842                         to.to_flags |= TOF_SACKPERM;
7843                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7844                     tp->rcv_numsacks > 0) {
7845                         to.to_flags |= TOF_SACK;
7846                         to.to_nsacks = tp->rcv_numsacks;
7847                         to.to_sacks = (u_char *)tp->sackblks;
7848                 }
7849 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
7850                 /* TCP-MD5 (RFC2385). */
7851                 if (tp->t_flags & TF_SIGNATURE)
7852                         to.to_flags |= TOF_SIGNATURE;
7853 #endif                          /* TCP_SIGNATURE */
7854
7855                 /* Processing the options. */
7856                 hdrlen += optlen = tcp_addoptions(&to, opt);
7857                 /*
7858                  * If we wanted a TFO option to be added, but it was unable
7859                  * to fit, ensure no data is sent.
7860                  */
7861                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
7862                     !(to.to_flags & TOF_FASTOPEN))
7863                         len = 0;
7864         }
7865 #ifdef NETFLIX_TCPOUDP
7866         if (tp->t_port) {
7867                 if (V_tcp_udp_tunneling_port == 0) {
7868                         /* The port was removed?? */
7869                         SOCKBUF_UNLOCK(&so->so_snd);
7870                         return (EHOSTUNREACH);
7871                 }
7872                 hdrlen += sizeof(struct udphdr);
7873         }
7874 #endif
7875 #ifdef INET6
7876         if (isipv6)
7877                 ipoptlen = ip6_optlen(tp->t_inpcb);
7878         else
7879 #endif
7880         if (tp->t_inpcb->inp_options)
7881                 ipoptlen = tp->t_inpcb->inp_options->m_len -
7882                     offsetof(struct ipoption, ipopt_list);
7883         else
7884                 ipoptlen = 0;
7885 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7886         ipoptlen += ipsec_optlen;
7887 #endif
7888
7889         /*
7890          * Adjust data length if insertion of options will bump the packet
7891          * length beyond the t_maxseg length. Clear the FIN bit because we
7892          * cut off the tail of the segment.
7893          */
7894         if (len + optlen + ipoptlen > tp->t_maxseg) {
7895                 if (flags & TH_FIN) {
7896                         would_have_fin = 1;
7897                         flags &= ~TH_FIN;
7898                 }
7899                 if (tso) {
7900                         uint32_t if_hw_tsomax;
7901                         uint32_t moff;
7902                         int32_t max_len;
7903
7904                         /* extract TSO information */
7905                         if_hw_tsomax = tp->t_tsomax;
7906                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
7907                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
7908                         KASSERT(ipoptlen == 0,
7909                             ("%s: TSO can't do IP options", __func__));
7910
7911                         /*
7912                          * Check if we should limit by maximum payload
7913                          * length:
7914                          */
7915                         if (if_hw_tsomax != 0) {
7916                                 /* compute maximum TSO length */
7917                                 max_len = (if_hw_tsomax - hdrlen -
7918                                     max_linkhdr);
7919                                 if (max_len <= 0) {
7920                                         len = 0;
7921                                 } else if (len > max_len) {
7922                                         sendalot = 1;
7923                                         len = max_len;
7924                                 }
7925                         }
7926                         /*
7927                          * Prevent the last segment from being fractional
7928                          * unless the send sockbuf can be emptied:
7929                          */
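                        /*
                         * For example (illustrative numbers only): with
                         * max_len = 1448 (1460-byte MSS minus 12 bytes of
                         * timestamp options) and len = 10000, moff becomes
                         * 10000 % 1448 = 1312, so len is trimmed to 8688
                         * (six full segments) and sendalot is set to come
                         * back for the remainder.
                         */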
7930                         max_len = (tp->t_maxseg - optlen);
7931                         if ((sb_offset + len) < sbavail(sb)) {
7932                                 moff = len % (u_int)max_len;
7933                                 if (moff != 0) {
7934                                         len -= moff;
7935                                         sendalot = 1;
7936                                 }
7937                         }
7938                         /*
7939                          * In case there are too many small fragments don't
7940                          * use TSO:
7941                          */
7942                         if (len <= max_len) {
7943                                 len = max_len;
7944                                 sendalot = 1;
7945                                 tso = 0;
7946                         }
7947                         /*
7948                          * Send the FIN in a separate segment after the bulk
7949                          * sending is done. We don't trust the TSO
7950                          * implementations to clear the FIN flag on all but
7951                          * the last segment.
7952                          */
7953                         if (tp->t_flags & TF_NEEDFIN)
7954                                 sendalot = 1;
7955
7956                 } else {
7957                         if (optlen + ipoptlen >= tp->t_maxseg) {
7958                                 /*
7959                                  * Since we don't have enough space to put
7960                                  * the IP header chain and the TCP header in
7961                                  * one packet as required by RFC 7112, don't
7962                                  * send it. Also ensure that at least one
7963                                  * byte of the payload can be put into the
7964                                  * TCP segment.
7965                                  */
7966                                 SOCKBUF_UNLOCK(&so->so_snd);
7967                                 error = EMSGSIZE;
7968                                 sack_rxmit = 0;
7969                                 goto out;
7970                         }
7971                         len = tp->t_maxseg - optlen - ipoptlen;
7972                         sendalot = 1;
7973                 }
7974         } else
7975                 tso = 0;
7976         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
7977             ("%s: len > IP_MAXPACKET", __func__));
7978 #ifdef DIAGNOSTIC
7979 #ifdef INET6
7980         if (max_linkhdr + hdrlen > MCLBYTES)
7981 #else
7982         if (max_linkhdr + hdrlen > MHLEN)
7983 #endif
7984                 panic("tcphdr too big");
7985 #endif
7986
7987         /*
7988          * This KASSERT is here to catch edge cases at a well defined place.
7989          * Before, those had triggered (random) panic conditions further
7990          * down.
7991          */
7992         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7993         if ((len == 0) &&
7994             (flags & TH_FIN) &&
7995             (sbused(sb))) {
7996                 /*
7997                  * We have outstanding data; don't send a FIN by itself!
7998                  */
7999                 goto just_return;
8000         }
8001         /*
8002          * Grab a header mbuf, attaching a copy of data to be transmitted,
8003          * and initialize the header from the template for sends on this
8004          * connection.
8005          */
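        /*
         * Roughly what happens below: if the TCP/IP headers plus the
         * maximum link header do not fit in a plain mbuf (MHLEN), a
         * cluster mbuf is allocated instead; the payload is then either
         * copied inline for small sends or chained on via tcp_m_copym().
         */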
8006         if (len) {
8007                 uint32_t max_val;
8008                 uint32_t moff;
8009
8010                 if (rack->rc_pace_max_segs)
8011                         max_val = rack->rc_pace_max_segs * tp->t_maxseg;
8012                 else
8013                         max_val = len;
8014                 /*
8015                  * We allow a limit on sending with hptsi.
8016                  */
8017                 if (len > max_val) {
8018                         len = max_val;
8019                 }
8020 #ifdef INET6
8021                 if (MHLEN < hdrlen + max_linkhdr)
8022                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
8023                 else
8024 #endif
8025                         m = m_gethdr(M_NOWAIT, MT_DATA);
8026
8027                 if (m == NULL) {
8028                         SOCKBUF_UNLOCK(sb);
8029                         error = ENOBUFS;
8030                         sack_rxmit = 0;
8031                         goto out;
8032                 }
8033                 m->m_data += max_linkhdr;
8034                 m->m_len = hdrlen;
8035
8036                 /*
8037                  * Start the m_copy functions from the closest mbuf to the
8038                  * sb_offset in the socket buffer chain.
8039                  */
8040                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
8041                 if (len <= MHLEN - hdrlen - max_linkhdr) {
8042                         m_copydata(mb, moff, (int)len,
8043                             mtod(m, caddr_t)+hdrlen);
8044                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8045                                 sbsndptr_adv(sb, mb, len);
8046                         m->m_len += len;
8047                 } else {
8048                         struct sockbuf *msb;
8049
8050                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8051                                 msb = NULL;
8052                         else
8053                                 msb = sb;
8054                         m->m_next = tcp_m_copym(mb, moff, &len,
8055                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
8056                         if (len <= (tp->t_maxseg - optlen)) {
8057                                 /* 
8058                                  * Must have run out of mbufs for the copy;
8059                                  * shorten it so it no longer needs TSO. Let's
8060                                  * not set sendalot since we are low on
8061                                  * mbufs.
8062                                  */
8063                                 tso = 0;
8064                         }
8065                         if (m->m_next == NULL) {
8066                                 SOCKBUF_UNLOCK(sb);
8067                                 (void)m_free(m);
8068                                 error = ENOBUFS;
8069                                 sack_rxmit = 0;
8070                                 goto out;
8071                         }
8072                 }
8073                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
8074                         TCPSTAT_INC(tcps_sndprobe);
8075 #ifdef NETFLIX_STATS
8076                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8077                                 stats_voi_update_abs_u32(tp->t_stats,
8078                                     VOI_TCP_RETXPB, len);
8079                         else
8080                                 stats_voi_update_abs_u64(tp->t_stats,
8081                                     VOI_TCP_TXPB, len);
8082 #endif
8083                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
8084                         if (rsm && (rsm->r_flags & RACK_TLP)) {
8085                                 /*
8086                                  * TLP should not count in retran count, but
8087                                  * in its own bin
8088                                  */
8089                                 counter_u64_add(rack_tlp_retran, 1);
8090                                 counter_u64_add(rack_tlp_retran_bytes, len);
8091                         } else {
8092                                 tp->t_sndrexmitpack++;
8093                                 TCPSTAT_INC(tcps_sndrexmitpack);
8094                                 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
8095                         }
8096 #ifdef NETFLIX_STATS
8097                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
8098                             len);
8099 #endif
8100                 } else {
8101                         TCPSTAT_INC(tcps_sndpack);
8102                         TCPSTAT_ADD(tcps_sndbyte, len);
8103 #ifdef NETFLIX_STATS
8104                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
8105                             len);
8106 #endif
8107                 }
8108                 /*
8109                  * If we're sending everything we've got, set PUSH. (This
8110                  * will keep happy those implementations which only give
8111                  * data to the user when a buffer fills or a PUSH comes in.)
8112                  */
8113                 if (sb_offset + len == sbused(sb) &&
8114                     sbused(sb) &&
8115                     !(flags & TH_SYN))
8116                         flags |= TH_PUSH;
8117
8118                 /*
8119                  * Are we doing hptsi? If so we must calculate the slot. We
8120                  * only do hptsi in ESTABLISHED, with no RESET being
8121                  * sent, and where we have data to send.
8122                  */
8123                 if (((tp->t_state == TCPS_ESTABLISHED) ||
8124                     (tp->t_state == TCPS_CLOSE_WAIT) ||
8125                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
8126                     ((tp->t_flags & TF_SENTFIN) == 0) &&
8127                     ((flags & TH_FIN) == 0))) &&
8128                     ((flags & TH_RST) == 0) &&
8129                     (rack->rc_always_pace)) {
8130                         /*
8131                          * We use the most optimistic possible cwnd/srtt for
8132                          * sending calculations. This will make our
8133                          * calculation anticipate getting more through
8134                          * quicker than is possible. But that's OK; we don't want
8135                          * the peer to have a gap in data sending.
8136                          */
8137                         uint32_t srtt, cwnd, tr_perms = 0;
8138         
8139                         if (rack->r_ctl.rc_rack_min_rtt)
8140                                 srtt = rack->r_ctl.rc_rack_min_rtt;
8141                         else
8142                                 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8143                         if (rack->r_ctl.rc_rack_largest_cwnd)
8144                                 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8145                         else
8146                                 cwnd = tp->snd_cwnd;
8147                         tr_perms = cwnd / srtt;
8148                         if (tr_perms == 0) {
8149                                 tr_perms = tp->t_maxseg;
8150                         }
8151                         tot_len_this_send += len;
8152                         /*
8153                          * Calculate how long this will take to drain. If
8154                          * the calculation comes out to zero, that's OK; we
8155                          * will use sendalot to possibly spin around for
8156                          * more, increasing tot_len_this_send to the point
8157                          * that it is going to require a pace, or we hit the
8158                          * cwnd, in which case we are just waiting for
8159                          * an ACK.
8160                          */
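                        /*
                         * Worked example (illustrative numbers only): with
                         * cwnd = 120000 bytes and srtt = 12, tr_perms is
                         * 10000 bytes per unit of time; sending 80000 bytes
                         * this pass gives slot = 8, and with rc_pace_reduce
                         * set to 4 it is trimmed by 8 / 4 = 2 down to 6.
                         */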
8161                         slot = tot_len_this_send / tr_perms;
8162                         /* Now do we reduce the time so we don't run dry? */
8163                         if (slot && rack->rc_pace_reduce) {
8164                                 int32_t reduce;
8165
8166                                 reduce = (slot / rack->rc_pace_reduce);
8167                                 if (reduce < slot) {
8168                                         slot -= reduce;
8169                                 } else
8170                                         slot = 0;
8171                         }
8172                         if (rack->r_enforce_min_pace &&
8173                             (slot == 0) &&
8174                             (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
8175                                 /* We are enforcing a minimum pace time of 1ms */
8176                                 slot = rack->r_enforce_min_pace;
8177                         }
8178                 }
8179                 SOCKBUF_UNLOCK(sb);
8180         } else {
8181                 SOCKBUF_UNLOCK(sb);
8182                 if (tp->t_flags & TF_ACKNOW)
8183                         TCPSTAT_INC(tcps_sndacks);
8184                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
8185                         TCPSTAT_INC(tcps_sndctrl);
8186                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
8187                         TCPSTAT_INC(tcps_sndurg);
8188                 else
8189                         TCPSTAT_INC(tcps_sndwinup);
8190
8191                 m = m_gethdr(M_NOWAIT, MT_DATA);
8192                 if (m == NULL) {
8193                         error = ENOBUFS;
8194                         sack_rxmit = 0;
8195                         goto out;
8196                 }
8197 #ifdef INET6
8198                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
8199                     MHLEN >= hdrlen) {
8200                         M_ALIGN(m, hdrlen);
8201                 } else
8202 #endif
8203                         m->m_data += max_linkhdr;
8204                 m->m_len = hdrlen;
8205         }
8206         SOCKBUF_UNLOCK_ASSERT(sb);
8207         m->m_pkthdr.rcvif = (struct ifnet *)0;
8208 #ifdef MAC
8209         mac_inpcb_create_mbuf(inp, m);
8210 #endif
8211 #ifdef INET6
8212         if (isipv6) {
8213                 ip6 = mtod(m, struct ip6_hdr *);
8214 #ifdef NETFLIX_TCPOUDP
8215                 if (tp->t_port) {
8216                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
8217                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8218                         udp->uh_dport = tp->t_port;
8219                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
8220                         udp->uh_ulen = htons(ulen);
8221                         th = (struct tcphdr *)(udp + 1);
8222                 } else
8223 #endif
8224                         th = (struct tcphdr *)(ip6 + 1);
8225                 tcpip_fillheaders(inp, ip6, th);
8226         } else
8227 #endif                          /* INET6 */
8228         {
8229                 ip = mtod(m, struct ip *);
8230 #ifdef TCPDEBUG
8231                 ipov = (struct ipovly *)ip;
8232 #endif
8233 #ifdef NETFLIX_TCPOUDP
8234                 if (tp->t_port) {
8235                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
8236                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8237                         udp->uh_dport = tp->t_port;
8238                         ulen = hdrlen + len - sizeof(struct ip);
8239                         udp->uh_ulen = htons(ulen);
8240                         th = (struct tcphdr *)(udp + 1);
8241                 } else
8242 #endif
8243                         th = (struct tcphdr *)(ip + 1);
8244                 tcpip_fillheaders(inp, ip, th);
8245         }
8246         /*
8247          * Fill in fields, remembering maximum advertised window for use in
8248          * delaying messages about window sizes. If resending a FIN, be sure
8249          * not to use a new sequence number.
8250          */
8251         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
8252             tp->snd_nxt == tp->snd_max)
8253                 tp->snd_nxt--;
8254         /*
8255          * If we are starting a connection, send ECN setup SYN packet. If we
8256          * are on a retransmit, we may resend those bits a number of times
8257          * as per RFC 3168.
8258          */
8259         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
8260                 if (tp->t_rxtshift >= 1) {
8261                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
8262                                 flags |= TH_ECE | TH_CWR;
8263                 } else
8264                         flags |= TH_ECE | TH_CWR;
8265         }
8266         if (tp->t_state == TCPS_ESTABLISHED &&
8267             (tp->t_flags & TF_ECN_PERMIT)) {
8268                 /*
8269                  * If the peer has ECN, mark data packets with ECN capable
8270                  * transmission (ECT). Ignore pure ack packets,
8271                  * retransmissions and window probes.
8272                  */
8273                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
8274                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
8275 #ifdef INET6
8276                         if (isipv6)
8277                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
8278                         else
8279 #endif
8280                                 ip->ip_tos |= IPTOS_ECN_ECT0;
8281                         TCPSTAT_INC(tcps_ecn_ect0);
8282                 }
8283                 /*
8284                  * Reply with proper ECN notifications.
8285                  */
8286                 if (tp->t_flags & TF_ECN_SND_CWR) {
8287                         flags |= TH_CWR;
8288                         tp->t_flags &= ~TF_ECN_SND_CWR;
8289                 }
8290                 if (tp->t_flags & TF_ECN_SND_ECE)
8291                         flags |= TH_ECE;
8292         }
8293         /*
8294          * If we are doing retransmissions, then snd_nxt will not reflect
8295          * the first unsent octet.  For ACK only packets, we do not want the
8296          * sequence number of the retransmitted packet, we want the sequence
8297          * number of the next unsent octet.  So, if there is no data (and no
8298          * SYN or FIN), use snd_max instead of snd_nxt when filling in
8299          * ti_seq.  But if we are in persist state, snd_max might reflect
8300          * one byte beyond the right edge of the window, so use snd_nxt in
8301          * that case, since we know we aren't doing a retransmission.
8302          * (retransmit and persist are mutually exclusive...)
8303          */
8304         if (sack_rxmit == 0) {
8305                 if (len || (flags & (TH_SYN | TH_FIN)) ||
8306                     rack->rc_in_persist) {
8307                         th->th_seq = htonl(tp->snd_nxt);
8308                         rack_seq = tp->snd_nxt;
8309                 } else if (flags & TH_RST) {
8310                         /*
8311                          * For a Reset send the last cumulative ack in sequence
8312                          * (this, like any other choice, may still generate a
8313                          * challenge ack if an ack-update packet is in
8314                          * flight).
8315                          */
8316                         th->th_seq = htonl(tp->snd_una);
8317                         rack_seq = tp->snd_una;
8318                 } else {
8319                         th->th_seq = htonl(tp->snd_max);
8320                         rack_seq = tp->snd_max;
8321                 }
8322         } else {
8323                 th->th_seq = htonl(rsm->r_start);
8324                 rack_seq = rsm->r_start;
8325         }
8326         th->th_ack = htonl(tp->rcv_nxt);
8327         if (optlen) {
8328                 bcopy(opt, th + 1, optlen);
8329                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
8330         }
8331         th->th_flags = flags;
8332         /*
8333          * Calculate receive window.  Don't shrink window, but avoid silly
8334          * window syndrome.
8335          * If a RST segment is sent, advertise a window of zero.
8336          */
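        /*
         * For example (illustrative numbers only): with sb_hiwat = 64 kB and
         * t_maxseg = 1460, any computed recwin below 1460 bytes is rounded
         * down to zero rather than advertising a silly window, and recwin is
         * never allowed to drop below what was already advertised
         * (rcv_adv - rcv_nxt) nor to exceed TCP_MAXWIN << rcv_scale.
         */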
8337         if (flags & TH_RST) {
8338                 recwin = 0;
8339         } else {
8340                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
8341                     recwin < (long)tp->t_maxseg)
8342                         recwin = 0;
8343                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
8344                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
8345                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
8346                 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
8347                         recwin = (long)TCP_MAXWIN << tp->rcv_scale;
8348         }
8349
8350         /*
8351          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
8352          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
8353          * handled in syncache.
8354          */
8355         if (flags & TH_SYN)
8356                 th->th_win = htons((u_short)
8357                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
8358         else
8359                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
8360         /*
8361          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
8362          * window.  This may cause the remote transmitter to stall.  This
8363          * flag tells soreceive() to disable delayed acknowledgements when
8364          * draining the buffer.  This can occur if the receiver is
8365          * attempting to read more data than can be buffered prior to
8366          * transmitting on the connection.
8367          */
8368         if (th->th_win == 0) {
8369                 tp->t_sndzerowin++;
8370                 tp->t_flags |= TF_RXWIN0SENT;
8371         } else
8372                 tp->t_flags &= ~TF_RXWIN0SENT;
8373         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8374                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
8375                 th->th_flags |= TH_URG;
8376         } else
8377                 /*
8378                  * If no urgent pointer to send, then we pull the urgent
8379                  * pointer to the left edge of the send window so that it
8380                  * doesn't drift into the send window on sequence number
8381                  * wraparound.
8382                  */
8383                 tp->snd_up = tp->snd_una;       /* drag it along */
8384
8385 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
8386         if (to.to_flags & TOF_SIGNATURE) {
8387                 /*
8388                  * Calculate MD5 signature and put it into the place
8389                  * determined before.
8390                  * NOTE: since TCP options buffer doesn't point into
8391                  * mbuf's data, calculate offset and use it.
8392                  */
8393                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
8394                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
8395                         /*
8396                          * Do not send segment if the calculation of MD5
8397                          * digest has failed.
8398                          */
8399                         goto out;
8400                 }
8401         }
8402 #endif
8403
8404         /*
8405          * Put TCP length in extended header, and then checksum extended
8406          * header and data.
8407          */
8408         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
8409 #ifdef INET6
8410         if (isipv6) {
8411                 /*
8412                  * ip6_plen does not need to be filled in now; it will be filled
8413                  * in ip6_output.
8414                  */
8415                 if (tp->t_port) {
8416                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
8417                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8418                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
8419                         th->th_sum = htons(0);
8420                 } else {
8421                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
8422                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8423                         th->th_sum = in6_cksum_pseudo(ip6,
8424                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
8425                             0);
8426                 }
8427         }
8428 #endif
8429 #if defined(INET6) && defined(INET)
8430         else
8431 #endif
8432 #ifdef INET
8433         {
8434                 if (tp->t_port) {
8435                         m->m_pkthdr.csum_flags = CSUM_UDP;
8436                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8437                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
8438                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
8439                         th->th_sum = htons(0);
8440                 } else {
8441                         m->m_pkthdr.csum_flags = CSUM_TCP;
8442                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8443                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
8444                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
8445                             IPPROTO_TCP + len + optlen));
8446                 }
8447                 /* IP version must be set here for ipv4/ipv6 checking later */
8448                 KASSERT(ip->ip_v == IPVERSION,
8449                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
8450         }
8451 #endif
8452
8453         /*
8454          * Enable TSO and specify the size of the segments. The TCP pseudo
8455          * header checksum is always provided. XXX: Fixme: This is currently
8456          * not the case for IPv6.
8457          */
8458         if (tso) {
8459                 KASSERT(len > tp->t_maxseg - optlen,
8460                     ("%s: len <= tso_segsz", __func__));
8461                 m->m_pkthdr.csum_flags |= CSUM_TSO;
8462                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
8463         }
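        /*
         * For example (illustrative numbers only): with a 1460-byte MSS and
         * 12 bytes of timestamp options, tso_segsz above comes out to 1448,
         * which is the per-segment payload size the hardware will use when
         * carving this burst into individual segments.
         */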
8464         KASSERT(len + hdrlen == m_length(m, NULL),
8465             ("%s: mbuf chain different than expected: %d + %u != %u",
8466             __func__, len, hdrlen, m_length(m, NULL)));
8467
8468 #ifdef TCP_HHOOK
8469         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
8470         hhook_run_tcp_est_out(tp, th, &to, len, tso);
8471 #endif
8472
8473 #ifdef TCPDEBUG
8474         /*
8475          * Trace.
8476          */
8477         if (so->so_options & SO_DEBUG) {
8478                 u_short save = 0;
8479
8480 #ifdef INET6
8481                 if (!isipv6)
8482 #endif
8483                 {
8484                         save = ipov->ih_len;
8485                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
8486                               * (th->th_off << 2) */ );
8487                 }
8488                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
8489 #ifdef INET6
8490                 if (!isipv6)
8491 #endif
8492                         ipov->ih_len = save;
8493         }
8494 #endif                          /* TCPDEBUG */
8495
8496         /* We're getting ready to send; log now. */
8497         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
8498                 union tcp_log_stackspecific log;
8499
8500                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
8501                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
8502                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
8503                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
8504                 if (rsm || sack_rxmit) {
8505                         log.u_bbr.flex8 = 1;
8506                 } else {
8507                         log.u_bbr.flex8 = 0;
8508                 }
8509                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
8510                     len, &log, false, NULL, NULL, 0, NULL);
8511         } else
8512                 lgb = NULL;
8513
8514         /*
8515          * Fill in IP length and desired time to live and send to IP level.
8516          * There should be a better way to handle ttl and tos; we could keep
8517          * them in the template, but need a way to checksum without them.
8518          */
8519         /*
8520          * m->m_pkthdr.len should have been set before checksum calculation,
8521          * because in6_cksum() needs it.
8522          */
8523 #ifdef INET6
8524         if (isipv6) {
8525                 /*
8526                  * we separately set hoplimit for every segment, since the
8527                  * user might want to change the value via setsockopt. Also,
8528                  * desired default hop limit might be changed via Neighbor
8529                  * Discovery.
8530                  */
8531                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
8532
8533                 /*
8534                  * Set the packet size here for the benefit of DTrace
8535                  * probes. ip6_output() will set it properly; it's supposed
8536                  * to include the option header lengths as well.
8537                  */
8538                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
8539
8540                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
8541                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8542                 else
8543                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8544
8545                 if (tp->t_state == TCPS_SYN_SENT)
8546                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
8547
8548                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
8549                 /* TODO: IPv6 IP6TOS_ECT bit on */
8550                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
8551                     &inp->inp_route6,
8552                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
8553                     NULL, NULL, inp);
8554
8555                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
8556                         mtu = inp->inp_route6.ro_rt->rt_mtu;
8557         }
8558 #endif                          /* INET6 */
8559 #if defined(INET) && defined(INET6)
8560         else
8561 #endif
8562 #ifdef INET
8563         {
8564                 ip->ip_len = htons(m->m_pkthdr.len);
8565 #ifdef INET6
8566                 if (inp->inp_vflag & INP_IPV6PROTO)
8567                         ip->ip_ttl = in6_selecthlim(inp, NULL);
8568 #endif                          /* INET6 */
8569                 /*
8570                  * If we do path MTU discovery, then we set DF on every
8571                  * packet. This might not be the best thing to do according
8572                  * to RFC3390 Section 2. However the tcp hostcache mitigates
8573                  * the problem so it affects only the first tcp connection
8574                  * with a host.
8575                  *
8576                  * NB: Don't set DF on small MTU/MSS to have a safe
8577                  * fallback.
8578                  */
8579                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
8580                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8581                         if (tp->t_port == 0 || len < V_tcp_minmss) {
8582                                 ip->ip_off |= htons(IP_DF);
8583                         }
8584                 } else {
8585                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8586                 }
8587
8588                 if (tp->t_state == TCPS_SYN_SENT)
8589                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
8590
8591                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
8592
8593                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
8594                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
8595                     inp);
8596                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
8597                         mtu = inp->inp_route.ro_rt->rt_mtu;
8598         }
8599 #endif                          /* INET */
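        /*
         * Note: if ip_output()/ip6_output() failed with EMSGSIZE, the
         * route's MTU was captured in 'mtu' above; the EMSGSIZE case in
         * the error handling below feeds it to tcp_mss_update() and
         * retries the send with the smaller segment size.
         */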
8600
8601 out:
8602         if (lgb) {
8603                 lgb->tlb_errno = error;
8604                 lgb = NULL;
8605         }
8606         /*
8607          * In transmit state, time the transmission and arrange for the
8608          * retransmit.  In persist state, just set snd_max.
8609          */
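        /*
         * On a successful send we also account for its size: a zero-length
         * send is counted as a pure ACK, a one-byte send as a persist
         * probe, and larger sends are bucketed by the number of MSS-sized
         * segments they carry (offset by 3), with everything at or beyond
         * the last bucket collapsed into it.
         */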
8610         if (error == 0) {
8611                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
8612                     (tp->t_flags & TF_SACK_PERMIT) &&
8613                     tp->rcv_numsacks > 0)
8614                     tcp_clean_dsack_blocks(tp);
8615                 if (len == 0)
8616                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
8617                 else if (len == 1) {
8618                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
8619                 } else if (len > 1) {
8620                         int idx;
8621
8622                         idx = (len / tp->t_maxseg) + 3;
8623                         if (idx >= TCP_MSS_ACCT_ATIMER)
8624                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
8625                         else
8626                                 counter_u64_add(rack_out_size[idx], 1);
8627                 }
8628         }
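        /*
         * If this send drew from the PRR (proportional rate reduction)
         * allowance and succeeded, charge its length against the
         * remaining allowance now.
         */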
8629         if (sub_from_prr && (error == 0)) {
8630                 rack->r_ctl.rc_prr_sndcnt -= len;
8631         }
8632         sub_from_prr = 0;
8633         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
8634             pass, rsm);
8635         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
8636             (rack->rc_in_persist == 0)) {
8637                 tcp_seq startseq = tp->snd_nxt;
8638
8639                 /*
8640                  * Advance snd_nxt over sequence space of this segment.
8641                  */
8642                 if (error)
8643                         /* We don't log or do anything with errors */
8644                         goto timer;
8645
8646                 if (flags & (TH_SYN | TH_FIN)) {
8647                         if (flags & TH_SYN)
8648                                 tp->snd_nxt++;
8649                         if (flags & TH_FIN) {
8650                                 tp->snd_nxt++;
8651                                 tp->t_flags |= TF_SENTFIN;
8652                         }
8653                 }
8654                 /* In the ENOBUFS case we do *not* update snd_max */
8655                 if (sack_rxmit)
8656                         goto timer;
8657
8658                 tp->snd_nxt += len;
8659                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
8660                         if (tp->snd_una == tp->snd_max) {
8661                                 /*
8662                                  * We just added data and none was outstanding;
8663                                  * record the time for progress tracking.
8664                                  */
8665                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8666                                 tp->t_acktime = ticks;
8667                         }
8668                         tp->snd_max = tp->snd_nxt;
8669                         /*
8670                          * Time this transmission if not a retransmission and
8671                          * not currently timing anything.
8672                          * This is only relevant in case of switching back to
8673                          * the base stack.
8674                          */
8675                         if (tp->t_rtttime == 0) {
8676                                 tp->t_rtttime = ticks;
8677                                 tp->t_rtseq = startseq;
8678                                 TCPSTAT_INC(tcps_segstimed);
8679                         }
8680 #ifdef NETFLIX_STATS
8681                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
8682                                 tp->t_flags |= TF_GPUTINPROG;
8683                                 tp->gput_seq = startseq;
8684                                 tp->gput_ack = startseq +
8685                                     ulmin(sbavail(sb) - sb_offset, sendwin);
8686                                 tp->gput_ts = tcp_ts_getticks();
8687                         }
8688 #endif
8689                 }
8690                 /*
8691                  * Set retransmit timer if not currently set, and not doing
8692                  * a pure ack or a keep-alive probe. Initial value for
8693                  * retransmit timer is smoothed round-trip time + 2 *
8694                  * round-trip time variance. Initialize shift counter which
8695                  * is used for backoff of retransmit time.
8696                  */
8697 timer:
8698                 if ((tp->snd_wnd == 0) &&
8699                     TCPS_HAVEESTABLISHED(tp->t_state)) {
8700                         /*
8701                          * The persist timer may have been set above (right
8702                          * before the goto send) and may still need to be on.
8703                          * If we are not already in persist mode, enter it
8704                          * now so the persist timer gets started.
8705                          */
8706                         if (rack->rc_in_persist == 0) {
8707                                 rack_enter_persist(tp, rack, cts);
8708                         }
8709                 }
8710         } else {
8711                 /*
8712                  * Persist case: update snd_max, but since we are in persist
8713                  * mode (no window) we do not update snd_nxt.
8714                  */
8715                 int32_t xlen = len;
8716
8717                 if (error)
8718                         goto nomore;
8719
8720                 if (flags & TH_SYN)
8721                         ++xlen;
8722                 if (flags & TH_FIN) {
8723                         ++xlen;
8724                         tp->t_flags |= TF_SENTFIN;
8725                 }
8726                 /* In the ENOBUFS case we do *not* update snd_max */
8727                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
8728                         if (tp->snd_una == tp->snd_max) {
8729                                 /*
8730                                  * We just added data and none was outstanding;
8731                                  * record the time for progress tracking.
8732                                  */
8733                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8734                                 tp->t_acktime = ticks;
8735                         }
8736                         tp->snd_max = tp->snd_nxt + len;
8737                 }
8738         }
8739 nomore:
8740         if (error) {
8741                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
8742                 /*
8743                  * Failures do not advance the seq counter above. For the
8744                  * case of ENOBUFS we will fall out and retry in 1ms with
8745                  * the hpts. Everything else will just have to retransmit
8746                  * with the timer.
8747                  *
8748                  * In any case, we do not want to loop around for another
8749                  * send without a good reason.
8750                  */
8751                 sendalot = 0;
8752                 switch (error) {
8753                 case EPERM:
8754                         tp->t_flags &= ~TF_FORCEDATA;
8755                         tp->t_softerror = error;
8756                         return (error);
8757                 case ENOBUFS:
8758                         if (slot == 0) {
8759                                 /*
8760                                  * Pace ourselves to retry again in a short
8761                                  * time, backing off on consecutive ENOBUFS.
8762                                  */
8763                                 slot = 1 + rack->rc_enobuf;
8764                                 if (rack->rc_enobuf < 255)
8765                                         rack->rc_enobuf++;
8766                                 if (slot > (rack->rc_rack_rtt / 2)) {
8767                                         slot = rack->rc_rack_rtt / 2;
8768                                 }
8769                                 if (slot < 10)
8770                                         slot = 10;
8771                         }
8772                         counter_u64_add(rack_saw_enobuf, 1);
8773                         error = 0;
8774                         goto enobufs;
8775                 case EMSGSIZE:
8776                         /*
8777                          * For some reason the interface we used initially
8778                          * to send segments changed to another one, or it
8779                          * lowered its MTU. If TSO was active we either got
8780                          * an interface without TSO capabilities or TSO was
8781                          * turned off. If we obtained the MTU from ip_output()
8782                          * then update it and try again.
8783                          */
8784                         if (tso)
8785                                 tp->t_flags &= ~TF_TSO;
8786                         if (mtu != 0) {
8787                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
8788                                 goto again;
8789                         }
8790                         slot = 10;
8791                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8792                         tp->t_flags &= ~TF_FORCEDATA;
8793                         return (error);
8794                 case ENETUNREACH:
8795                         counter_u64_add(rack_saw_enetunreach, 1);
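                        /* FALLTHROUGH */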
8796                 case EHOSTDOWN:
8797                 case EHOSTUNREACH:
8798                 case ENETDOWN:
8799                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
8800                                 tp->t_softerror = error;
8801                         }
8802                         /* FALLTHROUGH */
8803                 default:
8804                         slot = 10;
8805                         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8806                         tp->t_flags &= ~TF_FORCEDATA;
8807                         return (error);
8808                 }
8809         } else {
8810                 rack->rc_enobuf = 0;
8811         }
8812         TCPSTAT_INC(tcps_sndtotal);
8813
8814         /*
8815          * Data sent (as far as we can tell). If this advertises a larger
8816          * window than any other segment, then remember the size of the
8817          * advertised window. Any pending ACK has now been sent.
8818          */
8819         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8820                 tp->rcv_adv = tp->rcv_nxt + recwin;
8821         tp->last_ack_sent = tp->rcv_nxt;
8822         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
8823 enobufs:
8824         rack->r_tlp_running = 0;
8825         if ((flags & TH_RST) || (would_have_fin == 1)) {
8826                 /*
8827                  * We don't send again after a RST. We also do *not* send
8828                  * again if we would have had a FIN, but now have
8829                  * outstanding data.
8830                  */
8831                 slot = 0;
8832                 sendalot = 0;
8833         }
8834         if (slot) {
8835                 /* set the rack tcb into the slot N */
8836                 counter_u64_add(rack_paced_segments, 1);
8837         } else if (sendalot) {
8838                 if (len)
8839                         counter_u64_add(rack_unpaced_segments, 1);
8840                 sack_rxmit = 0;
8841                 tp->t_flags &= ~TF_FORCEDATA;
8842                 goto again;
8843         } else if (len) {
8844                 counter_u64_add(rack_unpaced_segments, 1);
8845         }
8846         tp->t_flags &= ~TF_FORCEDATA;
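        /*
         * Hand the connection back to hpts, pacing the next send by
         * 'slot' when it is non-zero and (re)arming whichever rack
         * timer is appropriate.
         */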
8847         rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
8848         return (error);
8849 }
8850
8851 /*
8852  * rack_ctloutput() must drop the inpcb lock before performing copyin on
8853  * socket option arguments.  When it re-acquires the lock after the copy, it
8854  * has to revalidate that the connection is still valid for the socket
8855  * option.
8856  */
8857 static int
8858 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
8859     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8860 {
8861         int32_t error = 0, optval;
8862
8863         switch (sopt->sopt_name) {
8864         case TCP_RACK_PROP_RATE:
8865         case TCP_RACK_PROP:
8866         case TCP_RACK_TLP_REDUCE:
8867         case TCP_RACK_EARLY_RECOV:
8868         case TCP_RACK_PACE_ALWAYS:
8869         case TCP_DELACK:
8870         case TCP_RACK_PACE_REDUCE:
8871         case TCP_RACK_PACE_MAX_SEG:
8872         case TCP_RACK_PRR_SENDALOT:
8873         case TCP_RACK_MIN_TO:
8874         case TCP_RACK_EARLY_SEG:
8875         case TCP_RACK_REORD_THRESH:
8876         case TCP_RACK_REORD_FADE:
8877         case TCP_RACK_TLP_THRESH:
8878         case TCP_RACK_PKT_DELAY:
8879         case TCP_RACK_TLP_USE:
8880         case TCP_RACK_TLP_INC_VAR:
8881         case TCP_RACK_IDLE_REDUCE_HIGH:
8882         case TCP_RACK_MIN_PACE:
8883         case TCP_RACK_MIN_PACE_SEG:
8884         case TCP_BBR_RACK_RTT_USE:
8885         case TCP_DATA_AFTER_CLOSE:
8886                 break;
8887         default:
8888                 return (tcp_default_ctloutput(so, sopt, inp, tp));
8889                 break;
8890         }
8891         INP_WUNLOCK(inp);
8892         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
8893         if (error)
8894                 return (error);
8895         INP_WLOCK(inp);
8896         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
8897                 INP_WUNLOCK(inp);
8898                 return (ECONNRESET);
8899         }
8900         tp = intotcpcb(inp);
8901         rack = (struct tcp_rack *)tp->t_fb_ptr;
8902         switch (sopt->sopt_name) {
8903         case TCP_RACK_PROP_RATE:
8904                 if ((optval <= 0) || (optval >= 100)) {
8905                         error = EINVAL;
8906                         break;
8907                 }
8908                 RACK_OPTS_INC(tcp_rack_prop_rate);
8909                 rack->r_ctl.rc_prop_rate = optval;
8910                 break;
8911         case TCP_RACK_TLP_USE:
8912                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
8913                         error = EINVAL;
8914                         break;
8915                 }
8916                 RACK_OPTS_INC(tcp_tlp_use);
8917                 rack->rack_tlp_threshold_use = optval;
8918                 break;
8919         case TCP_RACK_PROP:
8920                 /* RACK proportional rate reduction (bool) */
8921                 RACK_OPTS_INC(tcp_rack_prop);
8922                 rack->r_ctl.rc_prop_reduce = optval;
8923                 break;
8924         case TCP_RACK_TLP_REDUCE:
8925                 /* RACK TLP cwnd reduction (bool) */
8926                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
8927                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
8928                 break;
8929         case TCP_RACK_EARLY_RECOV:
8930                 /* Should recovery happen early (bool) */
8931                 RACK_OPTS_INC(tcp_rack_early_recov);
8932                 rack->r_ctl.rc_early_recovery = optval;
8933                 break;
8934         case TCP_RACK_PACE_ALWAYS:
8935                 /* Use the always pace method (bool)  */
8936                 RACK_OPTS_INC(tcp_rack_pace_always);
8937                 if (optval > 0)
8938                         rack->rc_always_pace = 1;
8939                 else
8940                         rack->rc_always_pace = 0;
8941                 break;
8942         case TCP_RACK_PACE_REDUCE:
8943                 /* RACK Hptsi reduction factor (divisor) */
8944                 RACK_OPTS_INC(tcp_rack_pace_reduce);
8945                 if (optval)
8946                         /* Must be non-zero */
8947                         rack->rc_pace_reduce = optval;
8948                 else
8949                         error = EINVAL;
8950                 break;
8951         case TCP_RACK_PACE_MAX_SEG:
8952                 /* Max segments in a pace */
8953                 RACK_OPTS_INC(tcp_rack_max_seg);
8954                 rack->rc_pace_max_segs = optval;
8955                 break;
8956         case TCP_RACK_PRR_SENDALOT:
8957                 /* Allow PRR to send more than one seg */
8958                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
8959                 rack->r_ctl.rc_prr_sendalot = optval;
8960                 break;
8961         case TCP_RACK_MIN_TO:
8962                 /* Minimum time between rack t-o's in ms */
8963                 RACK_OPTS_INC(tcp_rack_min_to);
8964                 rack->r_ctl.rc_min_to = optval;
8965                 break;
8966         case TCP_RACK_EARLY_SEG:
8967                 /* Max segments to send when doing early recovery */
8968                 RACK_OPTS_INC(tcp_rack_early_seg);
8969                 rack->r_ctl.rc_early_recovery_segs = optval;
8970                 break;
8971         case TCP_RACK_REORD_THRESH:
8972                 /* RACK reorder threshold (shift amount) */
8973                 RACK_OPTS_INC(tcp_rack_reord_thresh);
8974                 if ((optval > 0) && (optval < 31))
8975                         rack->r_ctl.rc_reorder_shift = optval;
8976                 else
8977                         error = EINVAL;
8978                 break;
8979         case TCP_RACK_REORD_FADE:
8980                 /* Does the reordering state fade after a time in ms */
8981                 RACK_OPTS_INC(tcp_rack_reord_fade);
8982                 rack->r_ctl.rc_reorder_fade = optval;
8983                 break;
8984         case TCP_RACK_TLP_THRESH:
8985                 /* RACK TLP threshold, i.e. srtt + (srtt/N) */
8986                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
8987                 if (optval)
8988                         rack->r_ctl.rc_tlp_threshold = optval;
8989                 else
8990                         error = EINVAL;
8991                 break;
8992         case TCP_RACK_PKT_DELAY:
8993                 /* RACK added ms i.e. rack-rtt + reord + N */
8994                 RACK_OPTS_INC(tcp_rack_pkt_delay);
8995                 rack->r_ctl.rc_pkt_delay = optval;
8996                 break;
8997         case TCP_RACK_TLP_INC_VAR:
8998                 /* Does TLP include rtt variance in t-o */
8999                 RACK_OPTS_INC(tcp_rack_tlp_inc_var);
9000                 rack->r_ctl.rc_prr_inc_var = optval;
9001                 break;
9002         case TCP_RACK_IDLE_REDUCE_HIGH:
9003                 RACK_OPTS_INC(tcp_rack_idle_reduce_high);
9004                 if (optval)
9005                         rack->r_idle_reduce_largest = 1;
9006                 else
9007                         rack->r_idle_reduce_largest = 0;
9008                 break;
9009         case TCP_DELACK:
9010                 if (optval == 0)
9011                         tp->t_delayed_ack = 0;
9012                 else
9013                         tp->t_delayed_ack = 1;
9014                 if (tp->t_flags & TF_DELACK) {
9015                         tp->t_flags &= ~TF_DELACK;
9016                         tp->t_flags |= TF_ACKNOW;
9017                         rack_output(tp);
9018                 }
9019                 break;
9020         case TCP_RACK_MIN_PACE:
9021                 RACK_OPTS_INC(tcp_rack_min_pace);
9022                 if (optval > 3)
9023                         rack->r_enforce_min_pace = 3;
9024                 else
9025                         rack->r_enforce_min_pace = optval;
9026                 break;
9027         case TCP_RACK_MIN_PACE_SEG:
9028                 RACK_OPTS_INC(tcp_rack_min_pace_seg);
9029                 if (optval >= 16)
9030                         rack->r_min_pace_seg_thresh = 15;
9031                 else
9032                         rack->r_min_pace_seg_thresh = optval;
9033                 break;
9034         case TCP_BBR_RACK_RTT_USE:
9035                 if ((optval != USE_RTT_HIGH) &&
9036                     (optval != USE_RTT_LOW) &&
9037                     (optval != USE_RTT_AVG))
9038                         error = EINVAL;
9039                 else
9040                         rack->r_ctl.rc_rate_sample_method = optval;
9041                 break;
9042         case TCP_DATA_AFTER_CLOSE:
9043                 if (optval)
9044                         rack->rc_allow_data_af_clo = 1;
9045                 else
9046                         rack->rc_allow_data_af_clo = 0;
9047                 break;
9048         default:
9049                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9050                 break;
9051         }
9052 #ifdef NETFLIX_STATS
9053         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
9054 #endif
9055         INP_WUNLOCK(inp);
9056         return (error);
9057 }
9058
9059 static int
9060 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
9061     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
9062 {
9063         int32_t error, optval;
9064
9065         /*
9066          * Because all our options are either boolean or an int, we can just
9067          * pull everything into optval and then unlock and copy. If we ever
9068          * add an option that is not an int, then this will have quite an
9069          * impact on this routine.
9070          */
9071         switch (sopt->sopt_name) {
9072         case TCP_RACK_PROP_RATE:
9073                 optval = rack->r_ctl.rc_prop_rate;
9074                 break;
9075         case TCP_RACK_PROP:
9076                 /* RACK proportional rate reduction (bool) */
9077                 optval = rack->r_ctl.rc_prop_reduce;
9078                 break;
9079         case TCP_RACK_TLP_REDUCE:
9080                 /* RACK TLP cwnd reduction (bool) */
9081                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
9082                 break;
9083         case TCP_RACK_EARLY_RECOV:
9084                 /* Should recovery happen early (bool) */
9085                 optval = rack->r_ctl.rc_early_recovery;
9086                 break;
9087         case TCP_RACK_PACE_REDUCE:
9088                 /* RACK Hptsi reduction factor (divisor) */
9089                 optval = rack->rc_pace_reduce;
9090                 break;
9091         case TCP_RACK_PACE_MAX_SEG:
9092                 /* Max segments in a pace */
9093                 optval = rack->rc_pace_max_segs;
9094                 break;
9095         case TCP_RACK_PACE_ALWAYS:
9096                 /* Use the always pace method */
9097                 optval = rack->rc_always_pace;
9098                 break;
9099         case TCP_RACK_PRR_SENDALOT:
9100                 /* Allow PRR to send more than one seg */
9101                 optval = rack->r_ctl.rc_prr_sendalot;
9102                 break;
9103         case TCP_RACK_MIN_TO:
9104                 /* Minimum time between rack t-o's in ms */
9105                 optval = rack->r_ctl.rc_min_to;
9106                 break;
9107         case TCP_RACK_EARLY_SEG:
9108                 /* Max segments to send when doing early recovery */
9109                 optval = rack->r_ctl.rc_early_recovery_segs;
9110                 break;
9111         case TCP_RACK_REORD_THRESH:
9112                 /* RACK reorder threshold (shift amount) */
9113                 optval = rack->r_ctl.rc_reorder_shift;
9114                 break;
9115         case TCP_RACK_REORD_FADE:
9116                 /* Does the reordering state fade after a time in ms */
9117                 optval = rack->r_ctl.rc_reorder_fade;
9118                 break;
9119         case TCP_RACK_TLP_THRESH:
9120                 /* RACK TLP threshold, i.e. srtt + (srtt/N) */
9121                 optval = rack->r_ctl.rc_tlp_threshold;
9122                 break;
9123         case TCP_RACK_PKT_DELAY:
9124                 /* RACK added ms i.e. rack-rtt + reord + N */
9125                 optval = rack->r_ctl.rc_pkt_delay;
9126                 break;
9127         case TCP_RACK_TLP_USE:
9128                 optval = rack->rack_tlp_threshold_use;
9129                 break;
9130         case TCP_RACK_TLP_INC_VAR:
9131                 /* Does TLP include rtt variance in t-o */
9132                 optval = rack->r_ctl.rc_prr_inc_var;
9133                 break;
9134         case TCP_RACK_IDLE_REDUCE_HIGH:
9135                 optval = rack->r_idle_reduce_largest;
9136                 break;
9137         case TCP_RACK_MIN_PACE:
9138                 optval = rack->r_enforce_min_pace;
9139                 break;
9140         case TCP_RACK_MIN_PACE_SEG:
9141                 optval = rack->r_min_pace_seg_thresh;
9142                 break;
9143         case TCP_BBR_RACK_RTT_USE:
9144                 optval = rack->r_ctl.rc_rate_sample_method;
9145                 break;
9146         case TCP_DELACK:
9147                 optval = tp->t_delayed_ack;
9148                 break;
9149         case TCP_DATA_AFTER_CLOSE:
9150                 optval = rack->rc_allow_data_af_clo;
9151                 break;
9152         default:
9153                 return (tcp_default_ctloutput(so, sopt, inp, tp));
9154                 break;
9155         }
9156         INP_WUNLOCK(inp);
9157         error = sooptcopyout(sopt, &optval, sizeof optval);
9158         return (error);
9159 }
9160
9161 static int
9162 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
9163 {
9164         int32_t error = EINVAL;
9165         struct tcp_rack *rack;
9166
9167         rack = (struct tcp_rack *)tp->t_fb_ptr;
9168         if (rack == NULL) {
9169                 /* Huh? */
9170                 goto out;
9171         }
9172         if (sopt->sopt_dir == SOPT_SET) {
9173                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
9174         } else if (sopt->sopt_dir == SOPT_GET) {
9175                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
9176         }
9177 out:
9178         INP_WUNLOCK(inp);
9179         return (error);
9180 }
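
/*
 * Illustrative usage sketch (userland): once a connection is attached to
 * this stack, the options handled above are reached through the normal
 * setsockopt(2)/getsockopt(2) path at the IPPROTO_TCP level, e.g.:
 *
 *	int one = 1;
 *
 *	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *	    &one, sizeof(one)) == -1)
 *		err(1, "TCP_RACK_PACE_ALWAYS");
 *
 * This assumes 's' is a connected TCP socket already using the rack stack
 * and that the application is built against headers that provide the
 * TCP_RACK_* option names.
 */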
9181
9182
9183 struct tcp_function_block __tcp_rack = {
9184         .tfb_tcp_block_name = __XSTRING(STACKNAME),
9185         .tfb_tcp_output = rack_output,
9186         .tfb_tcp_do_segment = rack_do_segment,
9187         .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
9188         .tfb_tcp_ctloutput = rack_ctloutput,
9189         .tfb_tcp_fb_init = rack_init,
9190         .tfb_tcp_fb_fini = rack_fini,
9191         .tfb_tcp_timer_stop_all = rack_stopall,
9192         .tfb_tcp_timer_activate = rack_timer_activate,
9193         .tfb_tcp_timer_active = rack_timer_active,
9194         .tfb_tcp_timer_stop = rack_timer_stop,
9195         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
9196         .tfb_tcp_handoff_ok = rack_handoff_ok
9197 };
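
/*
 * Usage sketch (assumptions noted): with the module loaded, a socket can be
 * switched onto this stack either globally via the
 * net.inet.tcp.functions_default sysctl or per connection with the
 * TCP_FUNCTION_BLK socket option, e.g.:
 *
 *	struct tcp_function_set tfs;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *
 * The name "rack" assumes the stack was registered under the default
 * STACKNAME; an alias may also be available when STACKALIAS is defined
 * (see rack_stack_names below).
 */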
9198
9199 static const char *rack_stack_names[] = {
9200         __XSTRING(STACKNAME),
9201 #ifdef STACKALIAS
9202         __XSTRING(STACKALIAS),
9203 #endif
9204 };
9205
9206 static int
9207 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
9208 {
9209         memset(mem, 0, size);
9210         return (0);
9211 }
9212
9213 static void
9214 rack_dtor(void *mem, int32_t size, void *arg)
9215 {
9216
9217 }
9218
9219 static bool rack_mod_inited = false;
9220
9221 static int
9222 tcp_addrack(module_t mod, int32_t type, void *data)
9223 {
9224         int32_t err = 0;
9225         int num_stacks;
9226
9227         switch (type) {
9228         case MOD_LOAD:
9229                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
9230                     sizeof(struct rack_sendmap),
9231                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
9232
9233                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
9234                     sizeof(struct tcp_rack),
9235                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
9236
9237                 sysctl_ctx_init(&rack_sysctl_ctx);
9238                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
9239                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
9240                     OID_AUTO,
9241                     __XSTRING(STACKNAME),
9242                     CTLFLAG_RW, 0,
9243                     "");
9244                 if (rack_sysctl_root == NULL) {
9245                         printf("Failed to add sysctl node\n");
9246                         err = EFAULT;
9247                         goto free_uma;
9248                 }
9249                 rack_init_sysctls();
9250                 num_stacks = nitems(rack_stack_names);
9251                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
9252                     rack_stack_names, &num_stacks);
9253                 if (err) {
9254                         printf("Failed to register %s stack name for "
9255                             "%s module\n", rack_stack_names[num_stacks],
9256                             __XSTRING(MODNAME));
9257                         sysctl_ctx_free(&rack_sysctl_ctx);
9258 free_uma:
9259                         uma_zdestroy(rack_zone);
9260                         uma_zdestroy(rack_pcb_zone);
9261                         rack_counter_destroy();
9262                         printf("Failed to register rack module -- err:%d\n", err);
9263                         return (err);
9264                 }
9265                 rack_mod_inited = true;
9266                 break;
9267         case MOD_QUIESCE:
9268                 err = deregister_tcp_functions(&__tcp_rack, true, false);
9269                 break;
9270         case MOD_UNLOAD:
9271                 err = deregister_tcp_functions(&__tcp_rack, false, true);
9272                 if (err == EBUSY)
9273                         break;
9274                 if (rack_mod_inited) {
9275                         uma_zdestroy(rack_zone);
9276                         uma_zdestroy(rack_pcb_zone);
9277                         sysctl_ctx_free(&rack_sysctl_ctx);
9278                         rack_counter_destroy();
9279                         rack_mod_inited = false;
9280                 }
9281                 err = 0;
9282                 break;
9283         default:
9284                 return (EOPNOTSUPP);
9285         }
9286         return (err);
9287 }
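
/*
 * Note on the unload path above: if connections are still using this stack,
 * deregister_tcp_functions() returns EBUSY and the UMA zones, counters and
 * sysctl tree are left in place; otherwise they are torn down and the
 * module can go away.
 */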
9288
9289 static moduledata_t tcp_rack = {
9290         .name = __XSTRING(MODNAME),
9291         .evhand = tcp_addrack,
9292         .priv = 0
9293 };
9294
9295 MODULE_VERSION(MODNAME, 1);
9296 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
9297 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);