2 * Copyright (c) 2016-9 Netflix, Inc.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 #include "opt_ratelimit.h"
35 #include "opt_kern_tls.h"
36 #include <sys/param.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
41 #include <sys/hhook.h>
44 #include <sys/malloc.h>
46 #include <sys/mutex.h>
48 #include <sys/proc.h> /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
54 #include <sys/sysctl.h>
55 #include <sys/systm.h>
57 #include <sys/qmath.h>
59 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
61 #include <sys/refcount.h>
63 #include <sys/queue.h>
65 #include <sys/kthread.h>
66 #include <sys/kern_prefetch.h>
70 #include <net/route.h>
73 #define TCPSTATES /* for logging */
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_icmp.h> /* required for icmp_var.h */
80 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
81 #include <netinet/ip_var.h>
82 #include <netinet/ip6.h>
83 #include <netinet6/in6_pcb.h>
84 #include <netinet6/ip6_var.h>
85 #include <netinet/tcp.h>
87 #include <netinet/tcp_fsm.h>
88 #include <netinet/tcp_log_buf.h>
89 #include <netinet/tcp_seq.h>
90 #include <netinet/tcp_timer.h>
91 #include <netinet/tcp_var.h>
92 #include <netinet/tcp_hpts.h>
93 #include <netinet/tcpip.h>
94 #include <netinet/cc/cc.h>
95 #include <netinet/tcp_fastopen.h>
96 #include <netinet/tcp_lro.h>
98 #include <netinet/tcp_debug.h>
101 #include <netinet/tcp_offload.h>
104 #include <netinet6/tcp6_var.h>
107 #include <netipsec/ipsec_support.h>
109 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
110 #include <netipsec/ipsec.h>
111 #include <netipsec/ipsec6.h>
114 #include <netinet/udp.h>
115 #include <netinet/udp_var.h>
116 #include <machine/in_cksum.h>
119 #include <security/mac/mac_framework.h>
121 #include "sack_filter.h"
122 #include "tcp_rack.h"
123 #include "rack_bbr_common.h"
125 uma_zone_t rack_zone;
126 uma_zone_t rack_pcb_zone;
129 #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
132 struct sysctl_ctx_list rack_sysctl_ctx;
133 struct sysctl_oid *rack_sysctl_root;
139 * The RACK module incorporates a number of
140 * TCP ideas that have been put out into the IETF
141 * over the last few years:
142 * - Matt Mathis's Rate Halving which slowly drops
143 * the congestion window so that the ack clock can
144 * be maintained during a recovery.
145 * - Yuchung Cheng's RACK TCP (for which it is named) that
146 * will stop us using the number of dup acks and instead
147 * use time as the gauge of when we retransmit.
148 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
149 * of Dukkipati et al.
150 * RACK depends on SACK, so if an endpoint arrives that
151 * cannot do SACK the state machine below will shuttle the
152 * connection back to using the "default" TCP stack that is
155 * To implement RACK the original TCP stack was first decomposed
156 * into a functional state machine with individual states
157 * for each of the possible TCP connection states. The do_segment
158 * function's role in life is to mandate that the connection supports SACK
159 * initially and then ensure that the RACK state matches the connection
160 * state before calling the state's do_segment function. Each
161 * state is simplified due to the fact that the original do_segment
162 * has been decomposed and we *know* what state we are in (no
163 * switches on the state) and all tests for SACK are gone. This
164 * greatly simplifies what each state does.
166 * TCP output is also over-written with a new version since it
167 * must maintain the new rack scoreboard.
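 *
 * An illustrative sketch (kept out of the build) of the dispatch the
 * comment above describes, using the per-state handlers declared later
 * in this file. In the real stack the handler is meant to be installed
 * when the connection changes state (see rack_set_state()) rather than
 * selected with a switch on every segment:
 */
#if 0
	switch (tp->t_state) {
	case TCPS_ESTABLISHED:
		retval = rack_do_established(m, th, so, tp, &to, drop_hdrlen,
		    tlen, tiwin, thflags, nxt_pkt, iptos);
		break;
	case TCPS_FIN_WAIT_1:
		retval = rack_do_fin_wait_1(m, th, so, tp, &to, drop_hdrlen,
		    tlen, tiwin, thflags, nxt_pkt, iptos);
		break;
	/*
	 * ... one case per TCP state, each handler free of SACK checks
	 * and state switches as described above ...
	 */
	}
#endif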
170 static int32_t rack_tlp_thresh = 1;
171 static int32_t rack_reorder_thresh = 2;
172 static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000
174 /* Attack threshold detections */
175 static uint32_t rack_highest_sack_thresh_seen = 0;
176 static uint32_t rack_highest_move_thresh_seen = 0;
178 static int32_t rack_pkt_delay = 1;
179 static int32_t rack_min_pace_time = 0;
180 static int32_t rack_early_recovery = 1;
181 static int32_t rack_send_a_lot_in_prr = 1;
182 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
183 static int32_t rack_verbose_logging = 0;
184 static int32_t rack_ignore_data_after_close = 1;
185 static int32_t use_rack_cheat = 1;
186 static int32_t rack_persist_min = 250; /* 250ms */
187 static int32_t rack_persist_max = 1000; /* 1 Second */
188 static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */
189 static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */
192 * Currently regular TCP has an rto_min of 30ms;
193 * the backoff goes 12 times, so that ends up
194 * being a total of 122.850 seconds before a
195 * connection is killed.
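 * (i.e. 30ms * (1 + 2 + 4 + ... + 2^11) = 30ms * 4095 = 122,850ms)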
197 static int32_t rack_tlp_min = 10;
198 static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */
199 static int32_t rack_rto_max = 4000; /* 4 seconds */
200 static const int32_t rack_free_cache = 2;
201 static int32_t rack_hptsi_segments = 40;
202 static int32_t rack_rate_sample_method = USE_RTT_LOW;
203 static int32_t rack_pace_every_seg = 0;
204 static int32_t rack_delayed_ack_time = 200; /* 200ms */
205 static int32_t rack_slot_reduction = 4;
206 static int32_t rack_lower_cwnd_at_tlp = 0;
207 static int32_t rack_use_proportional_reduce = 0;
208 static int32_t rack_proportional_rate = 10;
209 static int32_t rack_tlp_max_resend = 2;
210 static int32_t rack_limited_retran = 0;
211 static int32_t rack_always_send_oldest = 0;
212 static int32_t rack_use_sack_filter = 1;
213 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
214 static int32_t rack_per_of_gp = 50;
216 /* Rack specific counters */
217 counter_u64_t rack_badfr;
218 counter_u64_t rack_badfr_bytes;
219 counter_u64_t rack_rtm_prr_retran;
220 counter_u64_t rack_rtm_prr_newdata;
221 counter_u64_t rack_timestamp_mismatch;
222 counter_u64_t rack_reorder_seen;
223 counter_u64_t rack_paced_segments;
224 counter_u64_t rack_unpaced_segments;
225 counter_u64_t rack_calc_zero;
226 counter_u64_t rack_calc_nonzero;
227 counter_u64_t rack_saw_enobuf;
228 counter_u64_t rack_saw_enetunreach;
229 counter_u64_t rack_per_timer_hole;
231 /* Tail loss probe counters */
232 counter_u64_t rack_tlp_tot;
233 counter_u64_t rack_tlp_newdata;
234 counter_u64_t rack_tlp_retran;
235 counter_u64_t rack_tlp_retran_bytes;
236 counter_u64_t rack_tlp_retran_fail;
237 counter_u64_t rack_to_tot;
238 counter_u64_t rack_to_arm_rack;
239 counter_u64_t rack_to_arm_tlp;
240 counter_u64_t rack_to_alloc;
241 counter_u64_t rack_to_alloc_hard;
242 counter_u64_t rack_to_alloc_emerg;
243 counter_u64_t rack_to_alloc_limited;
244 counter_u64_t rack_alloc_limited_conns;
245 counter_u64_t rack_split_limited;
247 counter_u64_t rack_sack_proc_all;
248 counter_u64_t rack_sack_proc_short;
249 counter_u64_t rack_sack_proc_restart;
250 counter_u64_t rack_sack_attacks_detected;
251 counter_u64_t rack_sack_attacks_reversed;
252 counter_u64_t rack_sack_used_next_merge;
253 counter_u64_t rack_sack_splits;
254 counter_u64_t rack_sack_used_prev_merge;
255 counter_u64_t rack_sack_skipped_acked;
256 counter_u64_t rack_ack_total;
257 counter_u64_t rack_express_sack;
258 counter_u64_t rack_sack_total;
259 counter_u64_t rack_move_none;
260 counter_u64_t rack_move_some;
262 counter_u64_t rack_used_tlpmethod;
263 counter_u64_t rack_used_tlpmethod2;
264 counter_u64_t rack_enter_tlp_calc;
265 counter_u64_t rack_input_idle_reduces;
266 counter_u64_t rack_collapsed_win;
267 counter_u64_t rack_tlp_does_nada;
269 /* Counters for HW TLS */
270 counter_u64_t rack_tls_rwnd;
271 counter_u64_t rack_tls_cwnd;
272 counter_u64_t rack_tls_app;
273 counter_u64_t rack_tls_other;
274 counter_u64_t rack_tls_filled;
275 counter_u64_t rack_tls_rxt;
276 counter_u64_t rack_tls_tlp;
278 /* Temp CPU counters */
279 counter_u64_t rack_find_high;
281 counter_u64_t rack_progress_drops;
282 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
283 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
286 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
289 rack_process_ack(struct mbuf *m, struct tcphdr *th,
290 struct socket *so, struct tcpcb *tp, struct tcpopt *to,
291 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
293 rack_process_data(struct mbuf *m, struct tcphdr *th,
294 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
295 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
297 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
298 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
299 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
300 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
302 static struct rack_sendmap *
303 rack_check_recovery_mode(struct tcpcb *tp,
306 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
308 static void rack_counter_destroy(void);
310 rack_ctloutput(struct socket *so, struct sockopt *sopt,
311 struct inpcb *inp, struct tcpcb *tp);
312 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
314 rack_do_segment(struct mbuf *m, struct tcphdr *th,
315 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
317 static void rack_dtor(void *mem, int32_t size, void *arg);
319 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
320 uint32_t t, uint32_t cts);
321 static struct rack_sendmap *
322 rack_find_high_nonack(struct tcp_rack *rack,
323 struct rack_sendmap *rsm);
324 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
325 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
326 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
328 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
329 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
330 static int32_t rack_handoff_ok(struct tcpcb *tp);
331 static int32_t rack_init(struct tcpcb *tp);
332 static void rack_init_sysctls(void);
334 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
337 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
338 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
339 uint8_t pass, struct rack_sendmap *hintrsm);
341 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
342 struct rack_sendmap *rsm);
343 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
344 static int32_t rack_output(struct tcpcb *tp);
347 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
348 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
349 uint32_t cts, int *moved_two);
350 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
351 static void rack_remxt_tmr(struct tcpcb *tp);
353 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
354 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
355 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
356 static int32_t rack_stopall(struct tcpcb *tp);
358 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
360 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
361 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
362 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
364 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
365 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
367 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
368 struct rack_sendmap *rsm, uint32_t ts);
370 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
371 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
372 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
374 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
375 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
376 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
378 rack_do_closing(struct mbuf *m, struct tcphdr *th,
379 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
380 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
382 rack_do_established(struct mbuf *m, struct tcphdr *th,
383 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
384 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
386 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
387 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
388 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
390 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
391 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
392 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
394 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
395 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
396 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
398 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
399 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
400 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
402 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
403 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
404 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
406 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
407 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
408 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
409 struct rack_sendmap *
410 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
412 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
414 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
416 int32_t rack_clear_counter=0;
420 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
425 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
426 if (error || req->newptr == NULL)
429 error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
434 printf("Clearing RACK counters\n");
436 counter_u64_zero(rack_badfr);
437 counter_u64_zero(rack_badfr_bytes);
438 counter_u64_zero(rack_rtm_prr_retran);
439 counter_u64_zero(rack_rtm_prr_newdata);
440 counter_u64_zero(rack_timestamp_mismatch);
441 counter_u64_zero(rack_reorder_seen);
442 counter_u64_zero(rack_tlp_tot);
443 counter_u64_zero(rack_tlp_newdata);
444 counter_u64_zero(rack_tlp_retran);
445 counter_u64_zero(rack_tlp_retran_bytes);
446 counter_u64_zero(rack_tlp_retran_fail);
447 counter_u64_zero(rack_to_tot);
448 counter_u64_zero(rack_to_arm_rack);
449 counter_u64_zero(rack_to_arm_tlp);
450 counter_u64_zero(rack_paced_segments);
451 counter_u64_zero(rack_calc_zero);
452 counter_u64_zero(rack_calc_nonzero);
453 counter_u64_zero(rack_unpaced_segments);
454 counter_u64_zero(rack_saw_enobuf);
455 counter_u64_zero(rack_saw_enetunreach);
456 counter_u64_zero(rack_per_timer_hole);
457 counter_u64_zero(rack_to_alloc_hard);
458 counter_u64_zero(rack_to_alloc_emerg);
459 counter_u64_zero(rack_sack_proc_all);
460 counter_u64_zero(rack_sack_proc_short);
461 counter_u64_zero(rack_sack_proc_restart);
462 counter_u64_zero(rack_to_alloc);
463 counter_u64_zero(rack_to_alloc_limited);
464 counter_u64_zero(rack_alloc_limited_conns);
465 counter_u64_zero(rack_split_limited);
466 counter_u64_zero(rack_find_high);
467 counter_u64_zero(rack_tls_rwnd);
468 counter_u64_zero(rack_tls_cwnd);
469 counter_u64_zero(rack_tls_app);
470 counter_u64_zero(rack_tls_other);
471 counter_u64_zero(rack_tls_filled);
472 counter_u64_zero(rack_tls_rxt);
473 counter_u64_zero(rack_tls_tlp);
474 counter_u64_zero(rack_sack_attacks_detected);
475 counter_u64_zero(rack_sack_attacks_reversed);
476 counter_u64_zero(rack_sack_used_next_merge);
477 counter_u64_zero(rack_sack_used_prev_merge);
478 counter_u64_zero(rack_sack_splits);
479 counter_u64_zero(rack_sack_skipped_acked);
480 counter_u64_zero(rack_ack_total);
481 counter_u64_zero(rack_express_sack);
482 counter_u64_zero(rack_sack_total);
483 counter_u64_zero(rack_move_none);
484 counter_u64_zero(rack_move_some);
485 counter_u64_zero(rack_used_tlpmethod);
486 counter_u64_zero(rack_used_tlpmethod2);
487 counter_u64_zero(rack_enter_tlp_calc);
488 counter_u64_zero(rack_progress_drops);
489 counter_u64_zero(rack_tlp_does_nada);
490 counter_u64_zero(rack_collapsed_win);
493 rack_clear_counter = 0;
500 rack_init_sysctls(void)
502 struct sysctl_oid *rack_counters;
503 struct sysctl_oid *rack_attack;
505 SYSCTL_ADD_S32(&rack_sysctl_ctx,
506 SYSCTL_CHILDREN(rack_sysctl_root),
507 OID_AUTO, "rate_sample_method", CTLFLAG_RW,
508 &rack_rate_sample_method , USE_RTT_LOW,
509 "What method should we use for rate sampling 0=high, 1=low ");
510 SYSCTL_ADD_S32(&rack_sysctl_ctx,
511 SYSCTL_CHILDREN(rack_sysctl_root),
512 OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
513 &rack_hw_tls_max_seg , 0,
514 "Do we have a multplier of TLS records we can send as a max (0=1 TLS record)? ");
515 SYSCTL_ADD_S32(&rack_sysctl_ctx,
516 SYSCTL_CHILDREN(rack_sysctl_root),
517 OID_AUTO, "data_after_close", CTLFLAG_RW,
518 &rack_ignore_data_after_close, 0,
519 "Do we hold off sending a RST until all pending data is ack'd");
520 SYSCTL_ADD_S32(&rack_sysctl_ctx,
521 SYSCTL_CHILDREN(rack_sysctl_root),
522 OID_AUTO, "cheat_rxt", CTLFLAG_RW,
524 "Do we use the rxt cheat for rack?");
526 SYSCTL_ADD_U32(&rack_sysctl_ctx,
527 SYSCTL_CHILDREN(rack_sysctl_root),
528 OID_AUTO, "persmin", CTLFLAG_RW,
529 &rack_persist_min, 250,
530 "What is the minimum time in milliseconds between persists");
531 SYSCTL_ADD_U32(&rack_sysctl_ctx,
532 SYSCTL_CHILDREN(rack_sysctl_root),
533 OID_AUTO, "persmax", CTLFLAG_RW,
534 &rack_persist_max, 1000,
535 "What is the largest delay in milliseconds between persists");
536 SYSCTL_ADD_S32(&rack_sysctl_ctx,
537 SYSCTL_CHILDREN(rack_sysctl_root),
538 OID_AUTO, "no_sack_needed", CTLFLAG_RW,
539 &rack_sack_not_required, 0,
540 "Do we allow rack to run on connections not supporting SACK?");
541 SYSCTL_ADD_S32(&rack_sysctl_ctx,
542 SYSCTL_CHILDREN(rack_sysctl_root),
543 OID_AUTO, "tlpmethod", CTLFLAG_RW,
544 &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
545 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
546 SYSCTL_ADD_S32(&rack_sysctl_ctx,
547 SYSCTL_CHILDREN(rack_sysctl_root),
548 OID_AUTO, "gp_percentage", CTLFLAG_RW,
550 "Do we pace to percentage of goodput (0=old method)?");
551 SYSCTL_ADD_S32(&rack_sysctl_ctx,
552 SYSCTL_CHILDREN(rack_sysctl_root),
553 OID_AUTO, "min_pace_time", CTLFLAG_RW,
554 &rack_min_pace_time, 0,
555 "Should we enforce a minimum pace time of 1ms");
556 SYSCTL_ADD_S32(&rack_sysctl_ctx,
557 SYSCTL_CHILDREN(rack_sysctl_root),
558 OID_AUTO, "bb_verbose", CTLFLAG_RW,
559 &rack_verbose_logging, 0,
560 "Should RACK black box logging be verbose");
561 SYSCTL_ADD_S32(&rack_sysctl_ctx,
562 SYSCTL_CHILDREN(rack_sysctl_root),
563 OID_AUTO, "sackfiltering", CTLFLAG_RW,
564 &rack_use_sack_filter, 1,
565 "Do we use sack filtering?");
566 SYSCTL_ADD_S32(&rack_sysctl_ctx,
567 SYSCTL_CHILDREN(rack_sysctl_root),
568 OID_AUTO, "delayed_ack", CTLFLAG_RW,
569 &rack_delayed_ack_time, 200,
570 "Delayed ack time (200ms)");
571 SYSCTL_ADD_S32(&rack_sysctl_ctx,
572 SYSCTL_CHILDREN(rack_sysctl_root),
573 OID_AUTO, "tlpminto", CTLFLAG_RW,
575 "TLP minimum timeout per the specification (10ms)");
576 SYSCTL_ADD_S32(&rack_sysctl_ctx,
577 SYSCTL_CHILDREN(rack_sysctl_root),
578 OID_AUTO, "send_oldest", CTLFLAG_RW,
579 &rack_always_send_oldest, 1,
580 "Should we always send the oldest TLP and RACK-TLP");
581 SYSCTL_ADD_S32(&rack_sysctl_ctx,
582 SYSCTL_CHILDREN(rack_sysctl_root),
583 OID_AUTO, "rack_tlimit", CTLFLAG_RW,
584 &rack_limited_retran, 0,
585 "How many times can a rack timeout drive out sends");
586 SYSCTL_ADD_S32(&rack_sysctl_ctx,
587 SYSCTL_CHILDREN(rack_sysctl_root),
588 OID_AUTO, "minrto", CTLFLAG_RW,
590 "Minimum RTO in ms -- set with caution below 1000 due to TLP");
591 SYSCTL_ADD_S32(&rack_sysctl_ctx,
592 SYSCTL_CHILDREN(rack_sysctl_root),
593 OID_AUTO, "maxrto", CTLFLAG_RW,
595 "Maxiumum RTO in ms -- should be at least as large as min_rto");
596 SYSCTL_ADD_S32(&rack_sysctl_ctx,
597 SYSCTL_CHILDREN(rack_sysctl_root),
598 OID_AUTO, "tlp_retry", CTLFLAG_RW,
599 &rack_tlp_max_resend, 2,
600 "How many times does TLP retry a single segment or multiple with no ACK");
601 SYSCTL_ADD_S32(&rack_sysctl_ctx,
602 SYSCTL_CHILDREN(rack_sysctl_root),
603 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
604 &rack_use_proportional_reduce, 0,
605 "Should we proportionaly reduce cwnd based on the number of losses ");
606 SYSCTL_ADD_S32(&rack_sysctl_ctx,
607 SYSCTL_CHILDREN(rack_sysctl_root),
608 OID_AUTO, "recovery_prop", CTLFLAG_RW,
609 &rack_proportional_rate, 10,
610 "What percent reduction per loss");
611 SYSCTL_ADD_S32(&rack_sysctl_ctx,
612 SYSCTL_CHILDREN(rack_sysctl_root),
613 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
614 &rack_lower_cwnd_at_tlp, 0,
615 "When a TLP completes a retran should we enter recovery?");
616 SYSCTL_ADD_S32(&rack_sysctl_ctx,
617 SYSCTL_CHILDREN(rack_sysctl_root),
618 OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
619 &rack_slot_reduction, 4,
620 "When setting a slot should we reduce by divisor");
621 SYSCTL_ADD_S32(&rack_sysctl_ctx,
622 SYSCTL_CHILDREN(rack_sysctl_root),
623 OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
624 &rack_pace_every_seg, 0,
625 "Should we use the original pacing mechanism that did not pace much?");
626 SYSCTL_ADD_S32(&rack_sysctl_ctx,
627 SYSCTL_CHILDREN(rack_sysctl_root),
628 OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
629 &rack_hptsi_segments, 40,
630 "Should we pace out only a limited size of segments");
631 SYSCTL_ADD_S32(&rack_sysctl_ctx,
632 SYSCTL_CHILDREN(rack_sysctl_root),
633 OID_AUTO, "prr_sendalot", CTLFLAG_RW,
634 &rack_send_a_lot_in_prr, 1,
635 "Send a lot in prr");
636 SYSCTL_ADD_S32(&rack_sysctl_ctx,
637 SYSCTL_CHILDREN(rack_sysctl_root),
638 OID_AUTO, "minto", CTLFLAG_RW,
640 "Minimum rack timeout in milliseconds");
641 SYSCTL_ADD_S32(&rack_sysctl_ctx,
642 SYSCTL_CHILDREN(rack_sysctl_root),
643 OID_AUTO, "earlyrecovery", CTLFLAG_RW,
644 &rack_early_recovery, 1,
645 "Do we do early recovery with rack");
646 SYSCTL_ADD_S32(&rack_sysctl_ctx,
647 SYSCTL_CHILDREN(rack_sysctl_root),
648 OID_AUTO, "reorder_thresh", CTLFLAG_RW,
649 &rack_reorder_thresh, 2,
650 "What factor for rack will be added when seeing reordering (shift right)");
651 SYSCTL_ADD_S32(&rack_sysctl_ctx,
652 SYSCTL_CHILDREN(rack_sysctl_root),
653 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
655 "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
656 SYSCTL_ADD_S32(&rack_sysctl_ctx,
657 SYSCTL_CHILDREN(rack_sysctl_root),
658 OID_AUTO, "reorder_fade", CTLFLAG_RW,
659 &rack_reorder_fade, 0,
660 "Does reorder detection fade, if so how many ms (0 means never)");
661 SYSCTL_ADD_S32(&rack_sysctl_ctx,
662 SYSCTL_CHILDREN(rack_sysctl_root),
663 OID_AUTO, "pktdelay", CTLFLAG_RW,
665 "Extra RACK time (in ms) besides reordering thresh");
667 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
668 SYSCTL_CHILDREN(rack_sysctl_root),
671 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
673 rack_badfr = counter_u64_alloc(M_WAITOK);
674 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
675 SYSCTL_CHILDREN(rack_counters),
676 OID_AUTO, "badfr", CTLFLAG_RD,
677 &rack_badfr, "Total number of bad FRs");
678 rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
679 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
680 SYSCTL_CHILDREN(rack_counters),
681 OID_AUTO, "badfr_bytes", CTLFLAG_RD,
682 &rack_badfr_bytes, "Total number of bad FRs");
683 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
684 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
685 SYSCTL_CHILDREN(rack_counters),
686 OID_AUTO, "prrsndret", CTLFLAG_RD,
687 &rack_rtm_prr_retran,
688 "Total number of prr based retransmits");
689 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
690 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
691 SYSCTL_CHILDREN(rack_counters),
692 OID_AUTO, "prrsndnew", CTLFLAG_RD,
693 &rack_rtm_prr_newdata,
694 "Total number of prr based new transmits");
695 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
696 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
697 SYSCTL_CHILDREN(rack_counters),
698 OID_AUTO, "tsnf", CTLFLAG_RD,
699 &rack_timestamp_mismatch,
700 "Total number of timestamps that we could not find the reported ts");
701 rack_find_high = counter_u64_alloc(M_WAITOK);
702 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
703 SYSCTL_CHILDREN(rack_counters),
704 OID_AUTO, "findhigh", CTLFLAG_RD,
706 "Total number of FIN causing find-high");
707 rack_reorder_seen = counter_u64_alloc(M_WAITOK);
708 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
709 SYSCTL_CHILDREN(rack_counters),
710 OID_AUTO, "reordering", CTLFLAG_RD,
712 "Total number of times we added delay due to reordering");
713 rack_tlp_tot = counter_u64_alloc(M_WAITOK);
714 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
715 SYSCTL_CHILDREN(rack_counters),
716 OID_AUTO, "tlp_to_total", CTLFLAG_RD,
718 "Total number of tail loss probe expirations");
719 rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
720 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
721 SYSCTL_CHILDREN(rack_counters),
722 OID_AUTO, "tlp_new", CTLFLAG_RD,
724 "Total number of tail loss probe sending new data");
726 rack_tlp_retran = counter_u64_alloc(M_WAITOK);
727 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
728 SYSCTL_CHILDREN(rack_counters),
729 OID_AUTO, "tlp_retran", CTLFLAG_RD,
731 "Total number of tail loss probe sending retransmitted data");
732 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
733 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
734 SYSCTL_CHILDREN(rack_counters),
735 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
736 &rack_tlp_retran_bytes,
737 "Total bytes of tail loss probe sending retransmitted data");
738 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
739 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
740 SYSCTL_CHILDREN(rack_counters),
741 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
742 &rack_tlp_retran_fail,
743 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
744 rack_to_tot = counter_u64_alloc(M_WAITOK);
745 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
746 SYSCTL_CHILDREN(rack_counters),
747 OID_AUTO, "rack_to_tot", CTLFLAG_RD,
749 "Total number of times the rack to expired?");
750 rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
751 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
752 SYSCTL_CHILDREN(rack_counters),
753 OID_AUTO, "arm_rack", CTLFLAG_RD,
755 "Total number of times the rack timer armed?");
756 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
757 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
758 SYSCTL_CHILDREN(rack_counters),
759 OID_AUTO, "arm_tlp", CTLFLAG_RD,
761 "Total number of times the tlp timer armed?");
763 rack_calc_zero = counter_u64_alloc(M_WAITOK);
764 rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
765 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
766 SYSCTL_CHILDREN(rack_counters),
767 OID_AUTO, "calc_zero", CTLFLAG_RD,
769 "Total number of times pacing time worked out to zero?");
770 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
771 SYSCTL_CHILDREN(rack_counters),
772 OID_AUTO, "calc_nonzero", CTLFLAG_RD,
774 "Total number of times pacing time worked out to non-zero?");
775 rack_paced_segments = counter_u64_alloc(M_WAITOK);
776 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
777 SYSCTL_CHILDREN(rack_counters),
778 OID_AUTO, "paced", CTLFLAG_RD,
779 &rack_paced_segments,
780 "Total number of times a segment send caused hptsi");
781 rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
782 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
783 SYSCTL_CHILDREN(rack_counters),
784 OID_AUTO, "unpaced", CTLFLAG_RD,
785 &rack_unpaced_segments,
786 "Total number of times a segment did not cause hptsi");
787 rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
788 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
789 SYSCTL_CHILDREN(rack_counters),
790 OID_AUTO, "saw_enobufs", CTLFLAG_RD,
792 "Total number of times a segment did not cause hptsi");
793 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
794 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
795 SYSCTL_CHILDREN(rack_counters),
796 OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
797 &rack_saw_enetunreach,
798 "Total number of times a segment did not cause hptsi");
799 rack_to_alloc = counter_u64_alloc(M_WAITOK);
800 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
801 SYSCTL_CHILDREN(rack_counters),
802 OID_AUTO, "allocs", CTLFLAG_RD,
804 "Total allocations of tracking structures");
805 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
806 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
807 SYSCTL_CHILDREN(rack_counters),
808 OID_AUTO, "allochard", CTLFLAG_RD,
810 "Total allocations done with sleeping the hard way");
811 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
812 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
813 SYSCTL_CHILDREN(rack_counters),
814 OID_AUTO, "allocemerg", CTLFLAG_RD,
815 &rack_to_alloc_emerg,
816 "Total allocations done from emergency cache");
817 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
818 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
819 SYSCTL_CHILDREN(rack_counters),
820 OID_AUTO, "alloc_limited", CTLFLAG_RD,
821 &rack_to_alloc_limited,
822 "Total allocations dropped due to limit");
823 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
824 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
825 SYSCTL_CHILDREN(rack_counters),
826 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
827 &rack_alloc_limited_conns,
828 "Connections with allocations dropped due to limit");
829 rack_split_limited = counter_u64_alloc(M_WAITOK);
830 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
831 SYSCTL_CHILDREN(rack_counters),
832 OID_AUTO, "split_limited", CTLFLAG_RD,
834 "Split allocations dropped due to limit");
835 rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
836 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
837 SYSCTL_CHILDREN(rack_counters),
838 OID_AUTO, "sack_long", CTLFLAG_RD,
840 "Total times we had to walk whole list for sack processing");
842 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
843 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
844 SYSCTL_CHILDREN(rack_counters),
845 OID_AUTO, "sack_restart", CTLFLAG_RD,
846 &rack_sack_proc_restart,
847 "Total times we had to walk whole list due to a restart");
848 rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
849 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
850 SYSCTL_CHILDREN(rack_counters),
851 OID_AUTO, "sack_short", CTLFLAG_RD,
852 &rack_sack_proc_short,
853 "Total times we took shortcut for sack processing");
854 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
855 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
856 SYSCTL_CHILDREN(rack_counters),
857 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
858 &rack_enter_tlp_calc,
859 "Total times we called calc-tlp");
860 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
861 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
862 SYSCTL_CHILDREN(rack_counters),
863 OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
864 &rack_used_tlpmethod,
865 "Total number of runt sacks");
866 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
867 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
868 SYSCTL_CHILDREN(rack_counters),
869 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
870 &rack_used_tlpmethod2,
871 "Total number of times we hit TLP method 2");
872 /* Sack Attacker detection stuff */
873 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
874 SYSCTL_CHILDREN(rack_sysctl_root),
877 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
878 "Rack Sack Attack Counters and Controls");
879 SYSCTL_ADD_U32(&rack_sysctl_ctx,
880 SYSCTL_CHILDREN(rack_attack),
881 OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
882 &rack_highest_sack_thresh_seen, 0,
883 "Highest sack to ack ratio seen");
884 SYSCTL_ADD_U32(&rack_sysctl_ctx,
885 SYSCTL_CHILDREN(rack_attack),
886 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
887 &rack_highest_move_thresh_seen, 0,
888 "Highest move to non-move ratio seen");
889 rack_ack_total = counter_u64_alloc(M_WAITOK);
890 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
891 SYSCTL_CHILDREN(rack_attack),
892 OID_AUTO, "acktotal", CTLFLAG_RD,
894 "Total number of Ack's");
896 rack_express_sack = counter_u64_alloc(M_WAITOK);
897 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
898 SYSCTL_CHILDREN(rack_attack),
899 OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
901 "Total expresss number of Sack's");
902 rack_sack_total = counter_u64_alloc(M_WAITOK);
903 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
904 SYSCTL_CHILDREN(rack_attack),
905 OID_AUTO, "sacktotal", CTLFLAG_RD,
907 "Total number of SACK's");
908 rack_move_none = counter_u64_alloc(M_WAITOK);
909 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
910 SYSCTL_CHILDREN(rack_attack),
911 OID_AUTO, "move_none", CTLFLAG_RD,
913 "Total number of SACK index reuse of postions under threshold");
914 rack_move_some = counter_u64_alloc(M_WAITOK);
915 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
916 SYSCTL_CHILDREN(rack_attack),
917 OID_AUTO, "move_some", CTLFLAG_RD,
919 "Total number of SACK index reuse of postions over threshold");
920 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
921 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
922 SYSCTL_CHILDREN(rack_attack),
923 OID_AUTO, "attacks", CTLFLAG_RD,
924 &rack_sack_attacks_detected,
925 "Total number of SACK attackers that had sack disabled");
926 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
927 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
928 SYSCTL_CHILDREN(rack_attack),
929 OID_AUTO, "reversed", CTLFLAG_RD,
930 &rack_sack_attacks_reversed,
931 "Total number of SACK attackers that were later determined false positive");
932 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
933 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
934 SYSCTL_CHILDREN(rack_attack),
935 OID_AUTO, "nextmerge", CTLFLAG_RD,
936 &rack_sack_used_next_merge,
937 "Total number of times we used the next merge");
938 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
939 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
940 SYSCTL_CHILDREN(rack_attack),
941 OID_AUTO, "prevmerge", CTLFLAG_RD,
942 &rack_sack_used_prev_merge,
943 "Total number of times we used the prev merge");
944 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
945 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
946 SYSCTL_CHILDREN(rack_attack),
947 OID_AUTO, "skipacked", CTLFLAG_RD,
948 &rack_sack_skipped_acked,
949 "Total number of times we skipped previously sacked");
950 rack_sack_splits = counter_u64_alloc(M_WAITOK);
951 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
952 SYSCTL_CHILDREN(rack_attack),
953 OID_AUTO, "ofsplit", CTLFLAG_RD,
955 "Total number of times we did the old fashion tree split");
956 rack_progress_drops = counter_u64_alloc(M_WAITOK);
957 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
958 SYSCTL_CHILDREN(rack_counters),
959 OID_AUTO, "prog_drops", CTLFLAG_RD,
960 &rack_progress_drops,
961 "Total number of progress drops");
962 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
963 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
964 SYSCTL_CHILDREN(rack_counters),
965 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
966 &rack_input_idle_reduces,
967 "Total number of idle reductions on input");
968 rack_collapsed_win = counter_u64_alloc(M_WAITOK);
969 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
970 SYSCTL_CHILDREN(rack_counters),
971 OID_AUTO, "collapsed_win", CTLFLAG_RD,
973 "Total number of collapsed windows");
974 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
975 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
976 SYSCTL_CHILDREN(rack_counters),
977 OID_AUTO, "tlp_nada", CTLFLAG_RD,
979 "Total number of nada tlp calls");
981 rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
982 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
983 SYSCTL_CHILDREN(rack_counters),
984 OID_AUTO, "tls_rwnd", CTLFLAG_RD,
986 "Total hdwr tls rwnd limited");
988 rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
989 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
990 SYSCTL_CHILDREN(rack_counters),
991 OID_AUTO, "tls_cwnd", CTLFLAG_RD,
993 "Total hdwr tls cwnd limited");
995 rack_tls_app = counter_u64_alloc(M_WAITOK);
996 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
997 SYSCTL_CHILDREN(rack_counters),
998 OID_AUTO, "tls_app", CTLFLAG_RD,
1000 "Total hdwr tls app limited");
1002 rack_tls_other = counter_u64_alloc(M_WAITOK);
1003 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1004 SYSCTL_CHILDREN(rack_counters),
1005 OID_AUTO, "tls_other", CTLFLAG_RD,
1007 "Total hdwr tls other limited");
1009 rack_tls_filled = counter_u64_alloc(M_WAITOK);
1010 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1011 SYSCTL_CHILDREN(rack_counters),
1012 OID_AUTO, "tls_filled", CTLFLAG_RD,
1014 "Total hdwr tls filled");
1016 rack_tls_rxt = counter_u64_alloc(M_WAITOK);
1017 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1018 SYSCTL_CHILDREN(rack_counters),
1019 OID_AUTO, "tls_rxt", CTLFLAG_RD,
1023 rack_tls_tlp = counter_u64_alloc(M_WAITOK);
1024 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1025 SYSCTL_CHILDREN(rack_counters),
1026 OID_AUTO, "tls_tlp", CTLFLAG_RD,
1028 "Total hdwr tls tlp");
1029 rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1030 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1031 SYSCTL_CHILDREN(rack_counters),
1032 OID_AUTO, "timer_hole", CTLFLAG_RD,
1033 &rack_per_timer_hole,
1034 "Total persists start in timer hole");
1036 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1037 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1038 OID_AUTO, "outsize", CTLFLAG_RD,
1039 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1040 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1041 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1042 OID_AUTO, "opts", CTLFLAG_RD,
1043 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1044 SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1045 SYSCTL_CHILDREN(rack_sysctl_root),
1046 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1047 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1051 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1053 if (SEQ_GEQ(b->r_start, a->r_start) &&
1054 SEQ_LT(b->r_start, a->r_end)) {
1056 * The entry b is within the
1058 * a -- |-------------|
1063 * b -- |-----------|
1066 } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1068 * b starts at or after the end
1069 * of a, so a is said to be
1070 * smaller than b.
1080 * What's left is where a is
1081 * larger than b, i.e.:
1085 * b -- |--------------|
1090 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1091 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
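/*
 * A minimal lookup sketch (kept out of the build), assuming the sendmap
 * tree is rooted at rack->r_ctl.rc_mtree as declared in tcp_rack.h. A
 * probe entry whose r_start is the sequence number of interest compares
 * equal, per rb_map_cmp() above, to the rsm that covers that sequence.
 */
#if 0
static struct rack_sendmap *
rack_example_find_rsm(struct tcp_rack *rack, tcp_seq seq)
{
	struct rack_sendmap fe;

	fe.r_start = seq;
	return (RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe));
}
#endif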
1093 static inline int32_t
1094 rack_progress_timeout_check(struct tcpcb *tp)
1096 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
1097 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
1099 * There is an assumption that the caller
1100 * will drop the connection so we will
1101 * increment the counters here.
1103 struct tcp_rack *rack;
1104 rack = (struct tcp_rack *)tp->t_fb_ptr;
1105 counter_u64_add(rack_progress_drops, 1);
1106 #ifdef NETFLIX_STATS
1107 TCPSTAT_INC(tcps_progdrops);
1109 rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
1119 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
1121 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1122 union tcp_log_stackspecific log;
1124 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1125 log.u_bbr.flex1 = tsused;
1126 log.u_bbr.flex2 = thresh;
1127 log.u_bbr.flex3 = rsm->r_flags;
1128 log.u_bbr.flex4 = rsm->r_dupack;
1129 log.u_bbr.flex5 = rsm->r_start;
1130 log.u_bbr.flex6 = rsm->r_end;
1131 log.u_bbr.flex8 = mod;
1132 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1133 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1134 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1135 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1136 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1137 &rack->rc_inp->inp_socket->so_rcv,
1138 &rack->rc_inp->inp_socket->so_snd,
1139 BBR_LOG_SETTINGS_CHG, 0,
1140 0, &log, false, &tv);
1147 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
1149 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1150 union tcp_log_stackspecific log;
1153 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1154 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
1155 log.u_bbr.flex2 = to;
1156 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1157 log.u_bbr.flex4 = slot;
1158 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
1159 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1160 log.u_bbr.flex7 = rack->rc_in_persist;
1161 log.u_bbr.flex8 = which;
1162 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1163 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1164 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1165 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1166 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1167 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1168 &rack->rc_inp->inp_socket->so_rcv,
1169 &rack->rc_inp->inp_socket->so_snd,
1170 BBR_LOG_TIMERSTAR, 0,
1171 0, &log, false, &tv);
1176 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no)
1178 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1179 union tcp_log_stackspecific log;
1182 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1183 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1184 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1185 log.u_bbr.flex8 = to_num;
1186 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
1187 log.u_bbr.flex2 = rack->rc_rack_rtt;
1188 log.u_bbr.flex3 = no;
1189 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1190 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1191 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1192 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1193 &rack->rc_inp->inp_socket->so_rcv,
1194 &rack->rc_inp->inp_socket->so_snd,
1196 0, &log, false, &tv);
1201 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
1202 uint32_t o_srtt, uint32_t o_var)
1204 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1205 union tcp_log_stackspecific log;
1208 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1209 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1210 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1211 log.u_bbr.flex1 = t;
1212 log.u_bbr.flex2 = o_srtt;
1213 log.u_bbr.flex3 = o_var;
1214 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
1215 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
1216 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
1217 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
1218 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
1219 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1220 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1221 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1222 TCP_LOG_EVENTP(tp, NULL,
1223 &rack->rc_inp->inp_socket->so_rcv,
1224 &rack->rc_inp->inp_socket->so_snd,
1226 0, &log, false, &tv);
1231 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
1234 * Log the rtt sample we are
1235 * applying to the srtt algorithm in
1238 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1239 union tcp_log_stackspecific log;
1242 /* Convert our ms to microseconds */
1243 memset(&log, 0, sizeof(log));
1244 log.u_bbr.flex1 = rtt * 1000;
1245 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1246 log.u_bbr.flex3 = rack->r_ctl.sack_count;
1247 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1248 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
1249 log.u_bbr.flex8 = rack->sack_attack_disable;
1250 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1251 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1252 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1253 &rack->rc_inp->inp_socket->so_rcv,
1254 &rack->rc_inp->inp_socket->so_snd,
1256 0, &log, false, &tv);
1262 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
1264 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
1265 union tcp_log_stackspecific log;
1268 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1269 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1270 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1271 log.u_bbr.flex1 = line;
1272 log.u_bbr.flex2 = tick;
1273 log.u_bbr.flex3 = tp->t_maxunacktime;
1274 log.u_bbr.flex4 = tp->t_acktime;
1275 log.u_bbr.flex8 = event;
1276 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1277 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1278 TCP_LOG_EVENTP(tp, NULL,
1279 &rack->rc_inp->inp_socket->so_rcv,
1280 &rack->rc_inp->inp_socket->so_snd,
1281 BBR_LOG_PROGRESS, 0,
1282 0, &log, false, &tv);
1287 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1289 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1290 union tcp_log_stackspecific log;
1293 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1294 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1295 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1296 log.u_bbr.flex1 = slot;
1297 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
1298 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1299 log.u_bbr.flex8 = rack->rc_in_persist;
1300 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1301 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1302 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1303 &rack->rc_inp->inp_socket->so_rcv,
1304 &rack->rc_inp->inp_socket->so_snd,
1306 0, &log, false, &tv);
1311 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1313 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1314 union tcp_log_stackspecific log;
1317 memset(&log, 0, sizeof(log));
1318 log.u_bbr.flex1 = did_out;
1319 log.u_bbr.flex2 = nxt_pkt;
1320 log.u_bbr.flex3 = way_out;
1321 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1322 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1323 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
1324 log.u_bbr.flex7 = rack->r_wanted_output;
1325 log.u_bbr.flex8 = rack->rc_in_persist;
1326 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1327 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1328 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1329 &rack->rc_inp->inp_socket->so_rcv,
1330 &rack->rc_inp->inp_socket->so_snd,
1331 BBR_LOG_DOSEG_DONE, 0,
1332 0, &log, false, &tv);
1337 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
1339 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1340 union tcp_log_stackspecific log;
1344 memset(&log, 0, sizeof(log));
1345 cts = tcp_get_usecs(&tv);
1346 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
1347 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
1348 log.u_bbr.flex4 = len;
1349 log.u_bbr.flex5 = orig_len;
1350 log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
1351 log.u_bbr.flex7 = mod;
1352 log.u_bbr.flex8 = frm;
1353 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1354 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1355 TCP_LOG_EVENTP(tp, NULL,
1356 &tp->t_inpcb->inp_socket->so_rcv,
1357 &tp->t_inpcb->inp_socket->so_snd,
1359 0, &log, false, &tv);
1364 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1366 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1367 union tcp_log_stackspecific log;
1370 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1371 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1372 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1373 log.u_bbr.flex1 = slot;
1374 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1375 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1376 log.u_bbr.flex7 = hpts_calling;
1377 log.u_bbr.flex8 = rack->rc_in_persist;
1378 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1379 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1380 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1381 &rack->rc_inp->inp_socket->so_rcv,
1382 &rack->rc_inp->inp_socket->so_snd,
1384 tlen, &log, false, &tv);
1389 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1391 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1392 union tcp_log_stackspecific log;
1395 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1396 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1397 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1398 log.u_bbr.flex1 = line;
1399 log.u_bbr.flex2 = 0;
1400 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1401 log.u_bbr.flex4 = 0;
1402 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1403 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1404 log.u_bbr.flex8 = hpts_removed;
1405 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1406 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1407 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1408 &rack->rc_inp->inp_socket->so_rcv,
1409 &rack->rc_inp->inp_socket->so_snd,
1410 BBR_LOG_TIMERCANC, 0,
1411 0, &log, false, &tv);
1416 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1418 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1419 union tcp_log_stackspecific log;
1422 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1423 log.u_bbr.flex1 = timers;
1424 log.u_bbr.flex2 = ret;
1425 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1426 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1427 log.u_bbr.flex5 = cts;
1428 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
1429 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1430 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1431 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1432 &rack->rc_inp->inp_socket->so_rcv,
1433 &rack->rc_inp->inp_socket->so_snd,
1434 BBR_LOG_TO_PROCESS, 0,
1435 0, &log, false, &tv);
1440 rack_log_to_prr(struct tcp_rack *rack, int frm)
1442 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1443 union tcp_log_stackspecific log;
1446 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1447 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
1448 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
1449 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
1450 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
1451 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
1452 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
1453 log.u_bbr.flex8 = frm;
1454 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1455 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1456 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1457 &rack->rc_inp->inp_socket->so_rcv,
1458 &rack->rc_inp->inp_socket->so_snd,
1460 0, &log, false, &tv);
1464 #ifdef NETFLIX_EXP_DETECTION
1466 rack_log_sad(struct tcp_rack *rack, int event)
1468 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1469 union tcp_log_stackspecific log;
1472 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1473 log.u_bbr.flex1 = rack->r_ctl.sack_count;
1474 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1475 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
1476 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1477 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
1478 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
1479 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
1480 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
1481 log.u_bbr.lt_epoch |= rack->do_detection;
1482 log.u_bbr.applimited = tcp_map_minimum;
1483 log.u_bbr.flex7 = rack->sack_attack_disable;
1484 log.u_bbr.flex8 = event;
1485 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1486 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1487 log.u_bbr.delivered = tcp_sad_decay_val;
1488 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1489 &rack->rc_inp->inp_socket->so_rcv,
1490 &rack->rc_inp->inp_socket->so_snd,
1491 TCP_SAD_DETECTION, 0,
1492 0, &log, false, &tv);
1498 rack_counter_destroy(void)
1500 counter_u64_free(rack_badfr);
1501 counter_u64_free(rack_badfr_bytes);
1502 counter_u64_free(rack_rtm_prr_retran);
1503 counter_u64_free(rack_rtm_prr_newdata);
1504 counter_u64_free(rack_timestamp_mismatch);
1505 counter_u64_free(rack_reorder_seen);
1506 counter_u64_free(rack_tlp_tot);
1507 counter_u64_free(rack_tlp_newdata);
1508 counter_u64_free(rack_tlp_retran);
1509 counter_u64_free(rack_tlp_retran_bytes);
1510 counter_u64_free(rack_tlp_retran_fail);
1511 counter_u64_free(rack_to_tot);
1512 counter_u64_free(rack_to_arm_rack);
1513 counter_u64_free(rack_to_arm_tlp);
1514 counter_u64_free(rack_paced_segments);
1515 counter_u64_free(rack_unpaced_segments);
1516 counter_u64_free(rack_saw_enobuf);
1517 counter_u64_free(rack_saw_enetunreach);
1518 counter_u64_free(rack_to_alloc_hard);
1519 counter_u64_free(rack_to_alloc_emerg);
1520 counter_u64_free(rack_sack_proc_all);
1521 counter_u64_free(rack_sack_proc_short);
1522 counter_u64_free(rack_sack_proc_restart);
1523 counter_u64_free(rack_to_alloc);
1524 counter_u64_free(rack_to_alloc_limited);
1525 counter_u64_free(rack_alloc_limited_conns);
1526 counter_u64_free(rack_split_limited);
1527 counter_u64_free(rack_find_high);
1528 counter_u64_free(rack_enter_tlp_calc);
1529 counter_u64_free(rack_used_tlpmethod);
1530 counter_u64_free(rack_used_tlpmethod2);
1531 counter_u64_free(rack_progress_drops);
1532 counter_u64_free(rack_input_idle_reduces);
1533 counter_u64_free(rack_collapsed_win);
1534 counter_u64_free(rack_tlp_does_nada);
1535 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1536 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1539 static struct rack_sendmap *
1540 rack_alloc(struct tcp_rack *rack)
1542 struct rack_sendmap *rsm;
1544 rsm = uma_zalloc(rack_zone, M_NOWAIT);
1546 rack->r_ctl.rc_num_maps_alloced++;
1547 counter_u64_add(rack_to_alloc, 1);
1550 if (rack->rc_free_cnt) {
1551 counter_u64_add(rack_to_alloc_emerg, 1);
1552 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1553 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
1554 rack->rc_free_cnt--;
1560 static struct rack_sendmap *
1561 rack_alloc_full_limit(struct tcp_rack *rack)
1563 if ((V_tcp_map_entries_limit > 0) &&
1564 (rack->do_detection == 0) &&
1565 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
1566 counter_u64_add(rack_to_alloc_limited, 1);
1567 if (!rack->alloc_limit_reported) {
1568 rack->alloc_limit_reported = 1;
1569 counter_u64_add(rack_alloc_limited_conns, 1);
1573 return (rack_alloc(rack));
1576 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1577 static struct rack_sendmap *
1578 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1580 struct rack_sendmap *rsm;
1583 /* currently there is only one limit type */
1584 if (V_tcp_map_split_limit > 0 &&
1585 (rack->do_detection == 0) &&
1586 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
1587 counter_u64_add(rack_split_limited, 1);
1588 if (!rack->alloc_limit_reported) {
1589 rack->alloc_limit_reported = 1;
1590 counter_u64_add(rack_alloc_limited_conns, 1);
1596 /* allocate and mark in the limit type, if set */
1597 rsm = rack_alloc(rack);
1598 if (rsm != NULL && limit_type) {
1599 rsm->r_limit_type = limit_type;
1600 rack->r_ctl.rc_num_split_allocs++;
1606 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1608 if (rsm->r_limit_type) {
1609 /* currently there is only one limit type */
1610 rack->r_ctl.rc_num_split_allocs--;
1612 if (rack->r_ctl.rc_tlpsend == rsm)
1613 rack->r_ctl.rc_tlpsend = NULL;
1614 if (rack->r_ctl.rc_sacklast == rsm)
1615 rack->r_ctl.rc_sacklast = NULL;
1616 if (rack->rc_free_cnt < rack_free_cache) {
1617 memset(rsm, 0, sizeof(struct rack_sendmap));
1618 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
1619 rsm->r_limit_type = 0;
1620 rack->rc_free_cnt++;
1623 rack->r_ctl.rc_num_maps_alloced--;
1624 uma_zfree(rack_zone, rsm);
1628 * CC wrapper hook functions
1631 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1632 uint16_t type, int32_t recovery)
1638 INP_WLOCK_ASSERT(tp->t_inpcb);
1639 tp->ccv->nsegs = nsegs;
1640 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1641 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1644 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
1645 if (tp->ccv->bytes_this_ack > max) {
1646 tp->ccv->bytes_this_ack = max;
1649 if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
1650 (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
1651 (tp->snd_cwnd < (ctf_flight_size(tp, rack->r_ctl.rc_sacked) * 2))))
1652 tp->ccv->flags |= CCF_CWND_LIMITED;
1654 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1656 if (type == CC_ACK) {
1658 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1659 ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1660 if ((tp->t_flags & TF_GPUTINPROG) &&
1661 SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1662 gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1663 max(1, tcp_ts_getticks() - tp->gput_ts);
1664 /* We store it in bytes per ms (or kbytes per sec) */
1665 rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8;
1666 rack->r_ctl.rc_gp_hist_idx++;
1667 if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST)
1668 rack->r_ctl.rc_gp_hist_filled = 1;
1669 rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST;
1670 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1673 * XXXLAS: This is a temporary hack, and should be
1674 * chained off VOI_TCP_GPUT when stats(9) grows an
1675 * API to deal with chained VOIs.
1677 if (tp->t_stats_gput_prev > 0)
1678 stats_voi_update_abs_s32(tp->t_stats,
1680 ((gput - tp->t_stats_gput_prev) * 100) /
1681 tp->t_stats_gput_prev);
1682 tp->t_flags &= ~TF_GPUTINPROG;
1683 tp->t_stats_gput_prev = gput;
1684 #ifdef NETFLIX_PEAKRATE
1685 if (tp->t_maxpeakrate) {
1687 * We update t_peakrate_thr. This gives us roughly
1688 * one update per round trip time.
1690 tcp_update_peakrate_thr(tp);
1695 if (tp->snd_cwnd > tp->snd_ssthresh) {
1696 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1697 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
1698 if (tp->t_bytes_acked >= tp->snd_cwnd) {
1699 tp->t_bytes_acked -= tp->snd_cwnd;
1700 tp->ccv->flags |= CCF_ABC_SENTAWND;
1703 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1704 tp->t_bytes_acked = 0;
1707 if (CC_ALGO(tp)->ack_received != NULL) {
1708 /* XXXLAS: Find a way to live without this */
1709 tp->ccv->curack = th->th_ack;
1710 CC_ALGO(tp)->ack_received(tp->ccv, type);
1713 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1715 if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1716 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1718 /* we enforce max peak rate if it is set. */
1719 if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1720 tp->snd_cwnd = tp->t_peakrate_thr;
1725 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1727 struct tcp_rack *rack;
1729 rack = (struct tcp_rack *)tp->t_fb_ptr;
1730 INP_WLOCK_ASSERT(tp->t_inpcb);
1731 if (rack->r_ctl.rc_prr_sndcnt > 0)
1732 rack->r_wanted_output++;
1736 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1738 struct tcp_rack *rack;
1740 INP_WLOCK_ASSERT(tp->t_inpcb);
1741 rack = (struct tcp_rack *)tp->t_fb_ptr;
1742 if (CC_ALGO(tp)->post_recovery != NULL) {
1743 tp->ccv->curack = th->th_ack;
1744 CC_ALGO(tp)->post_recovery(tp->ccv);
1747 * Here we can in theory adjust cwnd to be based on the number of
1748 * losses in the window (rack->r_ctl.rc_loss_count). This is done
1749 * based on the rack_use_proportional flag.
1751 if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1754 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1758 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1760 if (tp->snd_cwnd > tp->snd_ssthresh) {
1761 /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1762 tp->snd_cwnd = tp->snd_ssthresh;
1765 if (rack->r_ctl.rc_prr_sndcnt > 0) {
1766 /* Suck the next prr cnt back into cwnd */
1767 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1768 rack->r_ctl.rc_prr_sndcnt = 0;
1769 rack_log_to_prr(rack, 1);
1771 tp->snd_recover = tp->snd_una;
1772 EXIT_RECOVERY(tp->t_flags);
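/*
 * Illustrative numbers for the proportional reduction above (assumed
 * values, not defaults): with rc_loss_count = 3 and rc_prop_rate = 10,
 * reduce = 30, i.e. snd_cwnd is cut by roughly 30% on exiting recovery.
 */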
1778 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1780 struct tcp_rack *rack;
1782 INP_WLOCK_ASSERT(tp->t_inpcb);
1784 rack = (struct tcp_rack *)tp->t_fb_ptr;
1787 tp->t_flags &= ~TF_WASFRECOVERY;
1788 tp->t_flags &= ~TF_WASCRECOVERY;
1789 if (!IN_FASTRECOVERY(tp->t_flags)) {
1790 rack->r_ctl.rc_tlp_rtx_out = 0;
1791 rack->r_ctl.rc_prr_delivered = 0;
1792 rack->r_ctl.rc_prr_out = 0;
1793 rack->r_ctl.rc_loss_count = 0;
1794 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
1795 rack_log_to_prr(rack, 2);
1796 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1797 tp->snd_recover = tp->snd_max;
1798 if (tp->t_flags2 & TF2_ECN_PERMIT)
1799 tp->t_flags2 |= TF2_ECN_SND_CWR;
1803 if (!IN_CONGRECOVERY(tp->t_flags)) {
1804 TCPSTAT_INC(tcps_ecn_rcwnd);
1805 tp->snd_recover = tp->snd_max;
1806 if (tp->t_flags2 & TF2_ECN_PERMIT)
1807 tp->t_flags2 |= TF2_ECN_SND_CWR;
1812 tp->t_bytes_acked = 0;
1813 EXIT_RECOVERY(tp->t_flags);
1814 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1815 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
1816 tp->snd_cwnd = ctf_fixed_maxseg(tp);
1817 if (tp->t_flags2 & TF2_ECN_PERMIT)
1818 tp->t_flags2 |= TF2_ECN_SND_CWR;
1821 TCPSTAT_INC(tcps_sndrexmitbad);
1822 /* RTO was unnecessary, so reset everything. */
1823 tp->snd_cwnd = tp->snd_cwnd_prev;
1824 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1825 tp->snd_recover = tp->snd_recover_prev;
1826 if (tp->t_flags & TF_WASFRECOVERY) {
1827 ENTER_FASTRECOVERY(tp->t_flags);
1828 tp->t_flags &= ~TF_WASFRECOVERY;
1830 if (tp->t_flags & TF_WASCRECOVERY) {
1831 ENTER_CONGRECOVERY(tp->t_flags);
1832 tp->t_flags &= ~TF_WASCRECOVERY;
1834 tp->snd_nxt = tp->snd_max;
1835 tp->t_badrxtwin = 0;
1839 if (CC_ALGO(tp)->cong_signal != NULL) {
1841 tp->ccv->curack = th->th_ack;
1842 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1849 rack_cc_after_idle(struct tcpcb *tp)
1853 INP_WLOCK_ASSERT(tp->t_inpcb);
1855 #ifdef NETFLIX_STATS
1856 TCPSTAT_INC(tcps_idle_restarts);
1857 if (tp->t_state == TCPS_ESTABLISHED)
1858 TCPSTAT_INC(tcps_idle_estrestarts);
1860 if (CC_ALGO(tp)->after_idle != NULL)
1861 CC_ALGO(tp)->after_idle(tp->ccv);
1863 if (tp->snd_cwnd == 1)
1864 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
1866 i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
1869 * Being idle is no different than the initial window. If the cc
1870 * clamps it down below the initial window, raise it to the initial
1873 if (tp->snd_cwnd < i_cwnd) {
1874 tp->snd_cwnd = i_cwnd;
1880 * Indicate whether this ack should be delayed. We can delay the ack if
1881 * the following conditions are met:
1882 * - There is no delayed ack timer in progress.
1883 * - Our last ack wasn't a 0-sized window. We never want to delay
1884 * the ack that opens up a 0-sized window.
1885 * - LRO wasn't used for this segment. We make sure by checking that the
1886 * segment size is not larger than the MSS.
1887 * - Delayed acks are enabled or this is a half-synchronized T/TCP
1890 #define DELAY_ACK(tp, tlen) \
1891 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1892 ((tp->t_flags & TF_DELACK) == 0) && \
1893 (tlen <= tp->t_maxseg) && \
1894 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
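/*
 * Sketch of how DELAY_ACK() is typically consumed in the input path
 * (illustrative only; the real call sites carry additional conditions):
 *
 *	if (DELAY_ACK(tp, tlen))
 *		tp->t_flags |= TF_DELACK;
 *	else
 *		tp->t_flags |= TF_ACKNOW;
 */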
1896 static struct rack_sendmap *
1897 rack_find_lowest_rsm(struct tcp_rack *rack)
1899 struct rack_sendmap *rsm;
1902 * Walk the time-ordered transmitted list looking for an rsm that is
1903 * not acked. This will be the one that was sent the longest time
1904 * ago that is still outstanding.
1906 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1907 if (rsm->r_flags & RACK_ACKED) {
1916 static struct rack_sendmap *
1917 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1919 struct rack_sendmap *prsm;
1922 * Walk the sequence-ordered list backward until we arrive at
1923 * the highest seq not acked. In theory when this is called it
1924 * should be the last segment (which it was not).
1926 counter_u64_add(rack_find_high, 1);
1928 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
1929 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1939 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1945 * lro is the flag we use to determine if we have seen reordering.
1946 * If it gets set we have seen reordering. The reorder logic
1947 * works in one of two ways:
1949 * If reorder-fade is configured, then we track the last time we saw
1950 * re-ordering occur. If we reach the point where enough time has
1951 * passed we no longer consider reordering to be occurring.
1953 * Or if reorder-fade is 0, then once we see reordering we consider
1954 * the connection to always be subject to reordering and just set lro
1957 * In the end if lro is non-zero we add the extra time for
1962 if (rack->r_ctl.rc_reorder_ts) {
1963 if (rack->r_ctl.rc_reorder_fade) {
1964 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1965 lro = cts - rack->r_ctl.rc_reorder_ts;
1968 * No time has passed since the last
1969 * reorder; mark it as reordering.
1974 /* Negative time? */
1977 if (lro > rack->r_ctl.rc_reorder_fade) {
1978 /* Turn off reordering seen too */
1979 rack->r_ctl.rc_reorder_ts = 0;
1983 /* Reordering does not fade */
1989 thresh = srtt + rack->r_ctl.rc_pkt_delay;
1991 /* It must be set, if not you get 1/4 rtt */
1992 if (rack->r_ctl.rc_reorder_shift)
1993 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1995 thresh += (srtt >> 2);
1999 /* We don't let the rack timeout be above an RTO */
2000 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
2001 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
2003 /* And we don't want it above the RTO max either */
2004 if (thresh > rack_rto_max) {
2005 thresh = rack_rto_max;
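/*
 * Worked example for the threshold above (assumed values): with
 * srtt = 100 ms, rc_pkt_delay = 1 ms and rc_reorder_shift = 2 while
 * reordering has been observed, thresh = 100 + 1 + (100 >> 2) = 126 ms,
 * which is then clamped to no more than the current RTO and rack_rto_max.
 */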
2011 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
2012 struct rack_sendmap *rsm, uint32_t srtt)
2014 struct rack_sendmap *prsm;
2015 uint32_t thresh, len;
2020 if (rack->r_ctl.rc_tlp_threshold)
2021 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
2023 thresh = (srtt * 2);
2025 /* Get the previous sent packet, if any */
2026 maxseg = ctf_fixed_maxseg(tp);
2027 counter_u64_add(rack_enter_tlp_calc, 1);
2028 len = rsm->r_end - rsm->r_start;
2029 if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
2030 /* Exactly like the ID */
2031 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
2032 uint32_t alt_thresh;
2034 * Compensate for delayed-ack with the d-ack time.
2036 counter_u64_add(rack_used_tlpmethod, 1);
2037 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2038 if (alt_thresh > thresh)
2039 thresh = alt_thresh;
2041 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
2043 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
2044 if (prsm && (len <= maxseg)) {
2046 * Two packets outstanding, thresh should be (2*srtt) +
2047 * possible inter-packet delay (if any).
2049 uint32_t inter_gap = 0;
2052 counter_u64_add(rack_used_tlpmethod, 1);
2053 idx = rsm->r_rtr_cnt - 1;
2054 nidx = prsm->r_rtr_cnt - 1;
2055 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
2056 /* Yes it was sent later (or at the same time) */
2057 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
2059 thresh += inter_gap;
2060 } else if (len <= maxseg) {
2062 * Possibly compensate for delayed-ack.
2064 uint32_t alt_thresh;
2066 counter_u64_add(rack_used_tlpmethod2, 1);
2067 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2068 if (alt_thresh > thresh)
2069 thresh = alt_thresh;
2071 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2073 if (len <= maxseg) {
2074 uint32_t alt_thresh;
2076 * Compensate for delayed-ack with the d-ack time.
2078 counter_u64_add(rack_used_tlpmethod, 1);
2079 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2080 if (alt_thresh > thresh)
2081 thresh = alt_thresh;
2084 /* Not above an RTO */
2085 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2086 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2088 /* Not above the RTO max */
2089 if (thresh > rack_rto_max) {
2090 thresh = rack_rto_max;
2092 /* Apply user supplied min TLP */
2093 if (thresh < rack_tlp_min) {
2094 thresh = rack_tlp_min;
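/*
 * Worked example for the TLP threshold above (assumed values): with
 * srtt = 100 ms and rc_tlp_threshold = 2 the base thresh is
 * 100 + 100/2 = 150 ms. With only one segment outstanding and a
 * delayed-ack compensation of, say, rack_delayed_ack_time = 200 ms,
 * alt_thresh = 100 + 50 + 200 = 350 ms wins. The result is then kept
 * within [rack_tlp_min, min(RTO, rack_rto_max)].
 */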
2100 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
2103 * We want the rack_rtt which is the
2104 * last rtt we measured. However if that
2105 * does not exist we fall back to the srtt (which
2106 * we probably will never do) and then as a last
2107 * resort we use RACK_INITIAL_RTO if no srtt is
2110 if (rack->rc_rack_rtt)
2111 return(rack->rc_rack_rtt);
2112 else if (tp->t_srtt == 0)
2113 return(RACK_INITIAL_RTO);
2114 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
2117 static struct rack_sendmap *
2118 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2121 * Check to see that we don't need to fall into recovery. We will
2122 * need to do so if our oldest transmit is past the time we should
2125 struct tcp_rack *rack;
2126 struct rack_sendmap *rsm;
2128 uint32_t srtt, thresh;
2130 rack = (struct tcp_rack *)tp->t_fb_ptr;
2131 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
2134 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2138 if (rsm->r_flags & RACK_ACKED) {
2139 rsm = rack_find_lowest_rsm(rack);
2143 idx = rsm->r_rtr_cnt - 1;
2144 srtt = rack_grab_rtt(tp, rack);
2145 thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2146 if (tsused < rsm->r_tim_lastsent[idx]) {
2149 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2152 /* Ok if we reach here we are over-due */
2153 rack->r_ctl.rc_rsm_start = rsm->r_start;
2154 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2155 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2156 rack_cong_signal(tp, NULL, CC_NDUPACK);
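/*
 * Example of the check above (assumed values): if thresh works out to
 * 126 ms and the oldest unacked rsm was last sent 150 ms before tsused,
 * we are over-due; we record the start/cwnd/ssthresh, signal CC_NDUPACK
 * and hand the rsm back so the caller can retransmit it.
 */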
2161 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2167 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2168 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2169 rack_persist_min, rack_persist_max);
2170 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2172 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2173 ret_val = (uint32_t)tt;
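/*
 * Worked example for the persist timeout above (assumed values): with
 * srtt + 4 * rttvar of roughly 250 ms and t_rxtshift = 3,
 * tt = 250 * tcp_backoff[3] (2000 ms with the usual doubling backoff
 * table), clamped into [rack_persist_min, rack_persist_max].
 */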
2178 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
2181 * Start the FR timer; we do this based on getting the first one in
2182 * the rc_tmap. Note that if it's NULL we must stop the timer. In all
2183 * cases we need to stop the running timer (if it's running) before
2184 * starting the new one.
2186 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
2189 int32_t is_tlp_timer = 0;
2190 struct rack_sendmap *rsm;
2192 if (rack->t_timers_stopped) {
2193 /* All timers have been stopped; none are to run */
2196 if (rack->rc_in_persist) {
2197 /* We can't start any timer in persists */
2198 return (rack_get_persists_timer_val(tp, rack));
2200 if ((tp->t_state < TCPS_ESTABLISHED) ||
2201 ((tp->t_flags & TF_SACK_PERMIT) == 0))
2203 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2204 if ((rsm == NULL) || sup_rack) {
2205 /* Nothing on the send map */
2207 time_since_sent = 0;
2208 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2210 idx = rsm->r_rtr_cnt - 1;
2211 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
2212 tstmp_touse = rsm->r_tim_lastsent[idx];
2214 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2215 if (TSTMP_GT(tstmp_touse, cts))
2216 time_since_sent = cts - tstmp_touse;
2218 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2219 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2220 to = TICKS_2_MSEC(tp->t_rxtcur);
2221 if (to > time_since_sent)
2222 to -= time_since_sent;
2224 to = rack->r_ctl.rc_min_to;
2231 if (rsm->r_flags & RACK_ACKED) {
2232 rsm = rack_find_lowest_rsm(rack);
2238 if (rack->sack_attack_disable) {
2240 * We don't want to do
2241 * any TLP's if you are an attacker.
2242 * Though if you are doing what
2243 * is expected you may still have
2244 * SACK-PASSED marks.
2248 /* Convert from ms to usecs */
2249 if (rsm->r_flags & RACK_SACK_PASSED) {
2250 if ((tp->t_flags & TF_SENTFIN) &&
2251 ((tp->snd_max - tp->snd_una) == 1) &&
2252 (rsm->r_flags & RACK_HAS_FIN)) {
2254 * We don't start a rack timer if all we have is a
2259 if ((rack->use_rack_cheat == 0) &&
2260 (IN_RECOVERY(tp->t_flags)) &&
2261 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
2263 * We are not cheating, we are in recovery and
2264 * have not yet received enough ACKs to get our next
2265 * retransmission out.
2267 * Note that classified attackers do not
2268 * get to use the rack-cheat.
2272 srtt = rack_grab_rtt(tp, rack);
2273 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2274 idx = rsm->r_rtr_cnt - 1;
2275 exp = rsm->r_tim_lastsent[idx] + thresh;
2276 if (SEQ_GEQ(exp, cts)) {
2278 if (to < rack->r_ctl.rc_min_to) {
2279 to = rack->r_ctl.rc_min_to;
2282 to = rack->r_ctl.rc_min_to;
2285 /* Ok we need to do a TLP not RACK */
2287 if ((rack->rc_tlp_in_progress != 0) ||
2288 (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2290 * The previous send was a TLP or a tlp_rtx is in
2295 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2297 /* We found no rsm to TLP with. */
2300 if (rsm->r_flags & RACK_HAS_FIN) {
2301 /* If it's a FIN we don't do TLP */
2305 idx = rsm->r_rtr_cnt - 1;
2306 time_since_sent = 0;
2307 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
2308 tstmp_touse = rsm->r_tim_lastsent[idx];
2310 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2311 if (TSTMP_GT(tstmp_touse, cts))
2312 time_since_sent = cts - tstmp_touse;
2315 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2316 srtt = TICKS_2_MSEC(srtt_cur);
2318 srtt = RACK_INITIAL_RTO;
2319 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2320 if (thresh > time_since_sent)
2321 to = thresh - time_since_sent;
2323 to = rack->r_ctl.rc_min_to;
2324 if (to > TCPTV_REXMTMAX) {
2326 * If the TLP time works out to larger than the max
2327 * RTO, let's not do TLP; just RTO.
2331 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2333 * The tail is no longer the last one I did a probe
2336 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2337 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2340 if (is_tlp_timer == 0) {
2341 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2343 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2344 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2346 * We have exceeded how many times we can retransmit the
2347 * current TLP; switch to the RTO timer.
2351 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2360 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2362 if (rack->rc_in_persist == 0) {
2363 rack->r_ctl.rc_went_idle_time = cts;
2364 rack_timer_cancel(tp, rack, cts, __LINE__);
2366 rack->rc_in_persist = 1;
2371 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2373 if (rack->rc_inp->inp_in_hpts) {
2374 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2375 rack->r_ctl.rc_hpts_flags = 0;
2377 rack->rc_in_persist = 0;
2378 rack->r_ctl.rc_went_idle_time = 0;
2379 tp->t_flags &= ~TF_FORCEDATA;
2384 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
2385 int32_t slot, uint32_t tot_len_this_send, int sup_rack)
2388 uint32_t delayed_ack = 0;
2389 uint32_t hpts_timeout;
2394 if (inp->inp_in_hpts) {
2395 /* A previous call is already set up */
2398 if ((tp->t_state == TCPS_CLOSED) ||
2399 (tp->t_state == TCPS_LISTEN)) {
2402 stopped = rack->rc_tmr_stopped;
2403 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2404 left = rack->r_ctl.rc_timer_exp - cts;
2406 rack->tlp_timer_up = 0;
2407 rack->r_ctl.rc_timer_exp = 0;
2408 if (rack->rc_inp->inp_in_hpts == 0) {
2409 rack->r_ctl.rc_hpts_flags = 0;
2412 /* We are hptsi too */
2413 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2414 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2416 * We are still left on the hpts; when the timeout goes off
2417 * it will be for output.
2419 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
2420 slot = rack->r_ctl.rc_last_output_to - cts;
2424 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
2425 #ifdef NETFLIX_EXP_DETECTION
2426 if (rack->sack_attack_disable &&
2427 (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) {
2429 * We have a potential attacker on
2430 * the line. We have possibly some
2431 * (or now) pacing time set. We want to
2432 * slow down the processing of sacks by some
2433 * amount (if it is an attacker). Set the default
2434 * slot for attackers in place (unless the original
2435 * interval is longer). It's stored in
2436 * microseconds, so let's convert to msecs.
2438 slot = USEC_TO_MSEC(tcp_sad_pacing_interval);
2441 if (tp->t_flags & TF_DELACK) {
2442 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2443 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2445 if (delayed_ack && ((hpts_timeout == 0) ||
2446 (delayed_ack < hpts_timeout)))
2447 hpts_timeout = delayed_ack;
2449 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2451 * If no timers are going to run and we will fall off the hptsi
2452 * wheel, we resort to a keep-alive timer if it's configured.
2454 if ((hpts_timeout == 0) &&
2456 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2457 (tp->t_state <= TCPS_CLOSING)) {
2459 * Ok we have no timer (persists, rack, tlp, rxt or
2460 * del-ack), we don't have segments being paced. So
2461 * all that is left is the keepalive timer.
2463 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2464 /* Get the established keep-alive time */
2465 hpts_timeout = TP_KEEPIDLE(tp);
2467 /* Get the initial setup keep-alive time */
2468 hpts_timeout = TP_KEEPINIT(tp);
2470 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2473 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2474 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2476 * RACK, TLP, persists and RXT timers all are restartable
2477 * based on input actions, i.e., we received a packet (ack
2478 * or sack) and that changes things (rwnd, snd_una, etc.).
2479 * Thus we can restart them with a new value. For
2480 * keep-alive, delayed_ack we keep track of what was left
2481 * and restart the timer with a smaller value.
2483 if (left < hpts_timeout)
2484 hpts_timeout = left;
2488 * Hack alert: for now we can't time-out over 2,147,483
2489 * seconds (a bit more than 596 hours), which is probably ok
2492 if (hpts_timeout > 0x7ffffffe)
2493 hpts_timeout = 0x7ffffffe;
2494 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2497 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2498 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)
2499 inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2501 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2502 rack->r_ctl.rc_last_output_to = cts + slot;
2503 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2504 if (rack->rc_inp->inp_in_hpts == 0)
2505 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2506 rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2509 * Arrange for the hpts to kick back in after the
2510 * t-o if the t-o does not cause a send.
2512 if (rack->rc_inp->inp_in_hpts == 0)
2513 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2514 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2516 } else if (hpts_timeout) {
2517 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) {
2518 /* For a rack timer, don't wake us */
2519 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2520 inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2522 /* All other timers wake us up */
2523 rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
2524 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2526 if (rack->rc_inp->inp_in_hpts == 0)
2527 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2528 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2530 /* No timer starting */
2532 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2533 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2534 tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2538 rack->rc_tmr_stopped = 0;
2540 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2544 * RACK timer: here we simply do logging and housekeeping.
2545 * The normal rack_output() function will call the
2546 * appropriate thing to check if we need to do a RACK retransmit.
2547 * We return 1, saying don't proceed with rack_output only
2548 * when all timers have been stopped (destroyed PCB?).
2551 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2554 * This timer simply provides an internal trigger to send out data.
2555 * The check_recovery_mode call will see if there are needed
2556 * retransmissions; if so, we will enter fast-recovery. The output
2557 * call may or may not do the same thing depending on sysctl
2560 struct rack_sendmap *rsm;
2561 int32_t recovery, ll;
2563 if (tp->t_timers->tt_flags & TT_STOPPED) {
2566 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2567 /* Its not time yet */
2570 recovery = IN_RECOVERY(tp->t_flags);
2571 counter_u64_add(rack_to_tot, 1);
2572 if (rack->r_state && (rack->r_state != tp->t_state))
2573 rack_set_state(tp, rack);
2574 rsm = rack_check_recovery_mode(tp, cts);
2576 ll = rsm->r_end - rsm->r_start;
2579 rack_log_to_event(rack, RACK_TO_FRM_RACK, ll);
2583 rtt = rack->rc_rack_rtt;
2586 if ((recovery == 0) &&
2587 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
2589 * The rack-timeout that enters us into recovery
2590 * will force out one MSS and set us up so that we
2591 * can do one more send in 2*rtt (transitioning the
2592 * rack timeout into a rack-tlp).
2594 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2595 rack_log_to_prr(rack, 3);
2596 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
2597 rack->use_rack_cheat) {
2599 * When a rack timer goes, if the rack cheat is
2600 * on, arrange it so we can send a full segment.
2602 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2603 rack_log_to_prr(rack, 4);
2606 /* This is a case that should happen rarely if ever */
2607 counter_u64_add(rack_tlp_does_nada, 1);
2609 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2611 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2613 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
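/*
 * Split helper: rack_clone_rsm() cuts rsm at 'start'. The new entry
 * nrsm takes over [start, r_end) (inheriting the retransmit count,
 * flags, dup-ack count and per-retransmit timestamps) while rsm itself
 * is trimmed so that it now ends at 'start'.
 */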
2617 static __inline void
2618 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
2619 struct rack_sendmap *rsm, uint32_t start)
2623 nrsm->r_start = start;
2624 nrsm->r_end = rsm->r_end;
2625 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2626 nrsm->r_flags = rsm->r_flags;
2627 nrsm->r_dupack = rsm->r_dupack;
2628 nrsm->r_rtr_bytes = 0;
2629 rsm->r_end = nrsm->r_start;
2630 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2631 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2635 static struct rack_sendmap *
2636 rack_merge_rsm(struct tcp_rack *rack,
2637 struct rack_sendmap *l_rsm,
2638 struct rack_sendmap *r_rsm)
2641 * We are merging two ack'd RSM's,
2642 * the l_rsm is on the left (lower seq
2643 * values) and the r_rsm is on the right
2644 * (higher seq value). The simplest way
2645 * to merge these is to move the right
2646 * one into the left. I don't think there
2647 * is any reason we need to try to find
2648 * the oldest (or last oldest retransmitted).
2650 struct rack_sendmap *rm;
2652 l_rsm->r_end = r_rsm->r_end;
2653 if (l_rsm->r_dupack < r_rsm->r_dupack)
2654 l_rsm->r_dupack = r_rsm->r_dupack;
2655 if (r_rsm->r_rtr_bytes)
2656 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
2657 if (r_rsm->r_in_tmap) {
2658 /* This really should not happen */
2659 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
2660 r_rsm->r_in_tmap = 0;
2663 if (r_rsm->r_flags & RACK_HAS_FIN)
2664 l_rsm->r_flags |= RACK_HAS_FIN;
2665 if (r_rsm->r_flags & RACK_TLP)
2666 l_rsm->r_flags |= RACK_TLP;
2667 if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
2668 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
2669 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
2672 panic("removing head in rack:%p rsm:%p rm:%p",
2676 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
2677 /* Transfer the split limit to the map we free */
2678 r_rsm->r_limit_type = l_rsm->r_limit_type;
2679 l_rsm->r_limit_type = 0;
2681 rack_free(rack, r_rsm);
2686 * TLP Timer, here we simply setup what segment we want to
2687 * have the TLP expire on, the normal rack_output() will then
2690 * We return 1, saying don't proceed with rack_output only
2691 * when all timers have been stopped (destroyed PCB?).
2694 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2699 struct rack_sendmap *rsm = NULL;
2700 struct rack_sendmap *insret;
2702 uint32_t amm, old_prr_snd = 0;
2703 uint32_t out, avail;
2704 int collapsed_win = 0;
2706 if (tp->t_timers->tt_flags & TT_STOPPED) {
2709 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2710 /* Its not time yet */
2713 if (rack_progress_timeout_check(tp)) {
2714 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2718 * A TLP timer has expired. We have been idle for 2 rtts. So we now
2719 * need to figure out how to force a full MSS segment out.
2721 rack_log_to_event(rack, RACK_TO_FRM_TLP, 0);
2722 counter_u64_add(rack_tlp_tot, 1);
2723 if (rack->r_state && (rack->r_state != tp->t_state))
2724 rack_set_state(tp, rack);
2725 so = tp->t_inpcb->inp_socket;
2727 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
2729 * For hardware TLS we do *not* want to send
2730 * new data; let's instead just do a retransmission.
2735 avail = sbavail(&so->so_snd);
2736 out = tp->snd_max - tp->snd_una;
2737 rack->tlp_timer_up = 1;
2738 if (out > tp->snd_wnd) {
2739 /* special case, we need a retransmission */
2744 * If we are in recovery we can jazz out a segment if new data is
2745 * present simply by setting rc_prr_sndcnt to a segment.
2747 if ((avail > out) &&
2748 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2749 /* New data is available */
2751 if (amm > ctf_fixed_maxseg(tp)) {
2752 amm = ctf_fixed_maxseg(tp);
2753 } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) {
2754 /* not enough to fill an MTU and no-delay is off */
2757 if (IN_RECOVERY(tp->t_flags)) {
2759 old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2760 if (out + amm <= tp->snd_wnd) {
2761 rack->r_ctl.rc_prr_sndcnt = amm;
2762 rack_log_to_prr(rack, 4);
2766 /* Set the send-new override */
2767 if (out + amm <= tp->snd_wnd)
2768 rack->r_ctl.rc_tlp_new_data = amm;
2772 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2773 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2774 rack->r_ctl.rc_tlpsend = NULL;
2775 counter_u64_add(rack_tlp_newdata, 1);
2780 * Ok we need to arrange the last un-acked segment to be re-sent, or
2781 * optionally the first un-acked segment.
2783 if (collapsed_win == 0) {
2784 if (rack_always_send_oldest)
2785 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2787 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2788 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2789 rsm = rack_find_high_nonack(rack, rsm);
2793 counter_u64_add(rack_tlp_does_nada, 1);
2795 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2801 * We must find the last segment
2802 * that was acceptable to the client.
2804 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
2805 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
2811 /* None? if so send the first */
2812 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2814 counter_u64_add(rack_tlp_does_nada, 1);
2816 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2822 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
2824 * We need to split this, the last segment, in two.
2826 struct rack_sendmap *nrsm;
2829 nrsm = rack_alloc_full_limit(rack);
2832 * No memory to split, we will just exit and punt
2833 * off to the RXT timer.
2835 counter_u64_add(rack_tlp_does_nada, 1);
2838 rack_clone_rsm(rack, nrsm, rsm,
2839 (rsm->r_end - ctf_fixed_maxseg(tp)));
2840 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
2842 if (insret != NULL) {
2843 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
2844 nrsm, insret, rack, rsm);
2847 if (rsm->r_in_tmap) {
2848 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2849 nrsm->r_in_tmap = 1;
2851 rsm->r_flags &= (~RACK_HAS_FIN);
2854 rack->r_ctl.rc_tlpsend = rsm;
2855 rack->r_ctl.rc_tlp_rtx_out = 1;
2856 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2857 rack->r_ctl.rc_tlp_seg_send_cnt++;
2860 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2861 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2864 rack->r_ctl.rc_tlp_send_cnt++;
2865 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2867 * Can't [re]transmit a segment that we have gotten no response
2868 * to from the peer after the max number of tries. We need the retransmit timer to take
2872 rack->r_ctl.rc_tlpsend = NULL;
2874 rsm->r_flags &= ~RACK_TLP;
2875 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2876 rack_log_to_prr(rack, 5);
2877 counter_u64_add(rack_tlp_retran_fail, 1);
2880 rsm->r_flags |= RACK_TLP;
2882 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2883 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2885 * We don't want to send a single segment more than the max
2890 rack->r_timer_override = 1;
2891 rack->r_tlp_running = 1;
2892 rack->rc_tlp_in_progress = 1;
2893 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2896 rack->tlp_timer_up = 0;
2897 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2902 * Delayed ack timer: here we simply need to set up the
2903 * ACK_NOW flag and remove the DELACK flag. From there
2904 * the output routine will send the ack out.
2906 * We only return 1, saying don't proceed, if all timers
2907 * are stopped (destroyed PCB?).
2910 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2912 if (tp->t_timers->tt_flags & TT_STOPPED) {
2915 rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0);
2916 tp->t_flags &= ~TF_DELACK;
2917 tp->t_flags |= TF_ACKNOW;
2918 TCPSTAT_INC(tcps_delack);
2919 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2924 * Persists timer: here we simply need to set up the
2925 * FORCE-DATA flag; the output routine will then do
2926 * the one byte send.
2928 * We only return 1, saying don't proceed, if all timers
2929 * are stopped (destroyed PCB?).
2932 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2934 struct tcptemp *t_template;
2940 if (tp->t_timers->tt_flags & TT_STOPPED) {
2943 if (rack->rc_in_persist == 0)
2945 if (rack_progress_timeout_check(tp)) {
2946 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2949 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2951 * Persistence timer into zero window. Force a byte to be output, if
2954 TCPSTAT_INC(tcps_persisttimeo);
2956 * Hack: if the peer is dead/unreachable, we do not time out if the
2957 * window is closed. After a full backoff, drop the connection if
2958 * the idle time (no responses to probes) reaches the maximum
2959 * backoff that we would use if retransmitting.
2961 if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2962 (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2963 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2964 TCPSTAT_INC(tcps_persistdrop);
2966 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2969 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2970 tp->snd_una == tp->snd_max)
2971 rack_exit_persist(tp, rack);
2972 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2974 * If the user has closed the socket then drop a persisting
2975 * connection after a much reduced timeout.
2977 if (tp->t_state > TCPS_CLOSE_WAIT &&
2978 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2980 TCPSTAT_INC(tcps_persistdrop);
2981 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2984 t_template = tcpip_maketemplate(rack->rc_inp);
2986 tcp_respond(tp, t_template->tt_ipgen,
2987 &t_template->tt_t, (struct mbuf *)NULL,
2988 tp->rcv_nxt, tp->snd_una - 1, 0);
2989 /* This sends an ack */
2990 if (tp->t_flags & TF_DELACK)
2991 tp->t_flags &= ~TF_DELACK;
2992 free(t_template, M_TEMP);
2994 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2997 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0);
2998 rack_start_hpts_timer(rack, tp, cts,
3004 * If a keepalive goes off, we had no other timers
3005 * happening. We always return 1 here since this
3006 * routine either drops the connection or sends
3007 * out a segment via tcp_respond().
3010 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3012 struct tcptemp *t_template;
3015 if (tp->t_timers->tt_flags & TT_STOPPED) {
3018 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
3020 rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0);
3022 * Keep-alive timer went off; send something or drop connection if
3023 * idle for too long.
3025 TCPSTAT_INC(tcps_keeptimeo);
3026 if (tp->t_state < TCPS_ESTABLISHED)
3028 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
3029 tp->t_state <= TCPS_CLOSING) {
3030 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
3033 * Send a packet designed to force a response if the peer is
3034 * up and reachable: either an ACK if the connection is
3035 * still alive, or an RST if the peer has closed the
3036 * connection due to timeout or reboot. Using sequence
3037 * number tp->snd_una-1 causes the transmitted zero-length
3038 * segment to lie outside the receive window; by the
3039 * protocol spec, this requires the correspondent TCP to
3042 TCPSTAT_INC(tcps_keepprobe);
3043 t_template = tcpip_maketemplate(inp);
3045 tcp_respond(tp, t_template->tt_ipgen,
3046 &t_template->tt_t, (struct mbuf *)NULL,
3047 tp->rcv_nxt, tp->snd_una - 1, 0);
3048 free(t_template, M_TEMP);
3051 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
3054 TCPSTAT_INC(tcps_keepdrops);
3055 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
3060 * Retransmit helper function: clear up all the ack
3061 * flags and take care of important bookkeeping.
3064 rack_remxt_tmr(struct tcpcb *tp)
3067 * The retransmit timer went off, all sack'd blocks must be
3070 struct rack_sendmap *rsm, *trsm = NULL;
3071 struct tcp_rack *rack;
3074 rack = (struct tcp_rack *)tp->t_fb_ptr;
3075 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
3076 rack_log_to_event(rack, RACK_TO_FRM_TMR, 0);
3077 if (rack->r_state && (rack->r_state != tp->t_state))
3078 rack_set_state(tp, rack);
3080 * Ideally we would like to be able to
3081 * mark SACK-PASS on anything not acked here.
3082 * However, if we do that we would burst out
3083 * all that data 1ms apart. This would be unwise,
3084 * so for now we will just let the normal rxt timer
3085 * and tlp timer take care of it.
3087 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3088 if (rsm->r_flags & RACK_ACKED) {
3091 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3092 if (rsm->r_in_tmap == 0) {
3093 /* We must re-add it back to the tlist */
3095 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3097 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
3103 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
3105 /* Clear the count (we just un-acked them) */
3106 rack->r_ctl.rc_sacked = 0;
3107 /* Clear the tlp rtx mark */
3108 rack->r_ctl.rc_tlp_rtx_out = 0;
3109 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
3110 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3111 rack->r_ctl.rc_prr_sndcnt = 0;
3112 rack_log_to_prr(rack, 6);
3113 rack->r_timer_override = 1;
3117 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
3118 * we will setup to retransmit the lowest seq number outstanding.
3121 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3128 if (tp->t_timers->tt_flags & TT_STOPPED) {
3131 if (rack_progress_timeout_check(tp)) {
3132 tcp_set_inp_to_drop(inp, ETIMEDOUT);
3135 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
3136 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
3137 (tp->snd_una == tp->snd_max)) {
3138 /* Nothing outstanding .. nothing to do */
3142 * Retransmission timer went off. Message has not been acked within
3143 * retransmit interval. Back off to a longer retransmit interval
3144 * and retransmit one segment.
3147 if ((rack->r_ctl.rc_resend == NULL) ||
3148 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
3150 * If the rwnd collapsed on
3151 * the one we are retransmitting
3152 * it does not count against the
3157 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
3158 tp->t_rxtshift = TCP_MAXRXTSHIFT;
3159 TCPSTAT_INC(tcps_timeoutdrop);
3161 tcp_set_inp_to_drop(rack->rc_inp,
3162 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
3165 if (tp->t_state == TCPS_SYN_SENT) {
3167 * If the SYN was retransmitted, indicate CWND to be limited
3168 * to 1 segment in cc_conn_init().
3171 } else if (tp->t_rxtshift == 1) {
3173 * first retransmit; record ssthresh and cwnd so they can be
3174 * recovered if this turns out to be a "bad" retransmit. A
3175 * retransmit is considered "bad" if an ACK for this segment
3176 * is received within RTT/2 interval; the assumption here is
3177 * that the ACK was already in flight. See "On Estimating
3178 * End-to-End Network Path Properties" by Allman and Paxson
3181 tp->snd_cwnd_prev = tp->snd_cwnd;
3182 tp->snd_ssthresh_prev = tp->snd_ssthresh;
3183 tp->snd_recover_prev = tp->snd_recover;
3184 if (IN_FASTRECOVERY(tp->t_flags))
3185 tp->t_flags |= TF_WASFRECOVERY;
3187 tp->t_flags &= ~TF_WASFRECOVERY;
3188 if (IN_CONGRECOVERY(tp->t_flags))
3189 tp->t_flags |= TF_WASCRECOVERY;
3191 tp->t_flags &= ~TF_WASCRECOVERY;
3192 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
3193 tp->t_flags |= TF_PREVVALID;
3195 tp->t_flags &= ~TF_PREVVALID;
3196 TCPSTAT_INC(tcps_rexmttimeo);
3197 if ((tp->t_state == TCPS_SYN_SENT) ||
3198 (tp->t_state == TCPS_SYN_RECEIVED))
3199 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
3201 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
3202 TCPT_RANGESET(tp->t_rxtcur, rexmt,
3203 max(MSEC_2_TICKS(rack_rto_min), rexmt),
3204 MSEC_2_TICKS(rack_rto_max));
3206 * We enter the path for PLMTUD if connection is established or, if
3207 * connection is in FIN_WAIT_1 state; the reason for the latter is that if the
3208 * amount of data we send is very small, we could send it in a couple
3209 * of packets and proceed straight to FIN. In that case we won't
3210 * catch ESTABLISHED state.
3212 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
3213 || (tp->t_state == TCPS_FIN_WAIT_1))) {
3219 * The idea here is that each stage of the mtu probe (usually
3220 * 1448 -> 1188 -> 524) should be given 2 chances to recover
3221 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
3222 * should take care of that.
3224 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
3225 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
3226 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
3227 tp->t_rxtshift % 2 == 0)) {
3229 * Enter Path MTU Black-hole Detection mechanism: -
3230 * Disable Path MTU Discovery (IP "DF" bit). -
3231 * Reduce MTU to lower value than what we negotiated
3234 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
3235 /* Record that we may have found a black hole. */
3236 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
3237 /* Keep track of previous MSS. */
3238 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
3242 * Reduce the MSS to blackhole value or to the
3243 * default in an attempt to retransmit.
3246 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
3248 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3249 /* Use the sysctl tuneable blackhole MSS. */
3250 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3251 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3252 } else if (isipv6) {
3253 /* Use the default MSS. */
3254 tp->t_maxseg = V_tcp_v6mssdflt;
3256 * Disable Path MTU Discovery when we switch
3259 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3260 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3263 #if defined(INET6) && defined(INET)
3267 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3268 /* Use the sysctl tuneable blackhole MSS. */
3269 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3270 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3272 /* Use the default MSS. */
3273 tp->t_maxseg = V_tcp_mssdflt;
3275 * Disable Path MTU Discovery when we switch
3278 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3279 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3284 * If further retransmissions are still unsuccessful
3285 * with a lowered MTU, maybe this isn't a blackhole
3286 * and we restore the previous MSS and blackhole
3287 * detection flags. The limit '6' is determined by
3288 * giving each probe stage (1448, 1188, 524) 2
3289 * chances to recover.
3291 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3292 (tp->t_rxtshift >= 6)) {
3293 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3294 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3295 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3296 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3301 * If we backed off this far, our srtt estimate is probably bogus.
3302 * Clobber it so we'll take the next rtt measurement as our srtt;
3303 * move the current srtt into rttvar to keep the current retransmit
3306 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3308 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3309 in6_losing(tp->t_inpcb);
3312 in_losing(tp->t_inpcb);
3313 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3316 if (rack_use_sack_filter)
3317 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3318 tp->snd_recover = tp->snd_max;
3319 tp->t_flags |= TF_ACKNOW;
3321 rack_cong_signal(tp, NULL, CC_RTO);
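/*
 * Timer demultiplexer: when an hpts timeout fires we look at which
 * PACE_TMR_* flag is armed and dispatch to the matching rack_timeout_*()
 * handler (delack, rack, tlp, rxt, persist or keepalive). A non-zero
 * return tells the caller not to proceed with rack_output() (all timers
 * stopped / connection being dropped).
 */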
3327 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3330 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3335 if (tp->t_state == TCPS_LISTEN) {
3336 /* no timers on listen sockets */
3337 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3341 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3344 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3346 rack_log_to_processing(rack, cts, ret, 0);
3349 if (hpts_calling == 0) {
3351 rack_log_to_processing(rack, cts, ret, 0);
3355 * Ok our timer went off early and we are not paced false
3356 * alarm, go back to sleep.
3359 left = rack->r_ctl.rc_timer_exp - cts;
3360 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3361 rack_log_to_processing(rack, cts, ret, left);
3362 rack->rc_last_pto_set = 0;
3365 rack->rc_tmr_stopped = 0;
3366 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3367 if (timers & PACE_TMR_DELACK) {
3368 ret = rack_timeout_delack(tp, rack, cts);
3369 } else if (timers & PACE_TMR_RACK) {
3370 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3371 ret = rack_timeout_rack(tp, rack, cts);
3372 } else if (timers & PACE_TMR_TLP) {
3373 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3374 ret = rack_timeout_tlp(tp, rack, cts);
3375 } else if (timers & PACE_TMR_RXT) {
3376 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3377 ret = rack_timeout_rxt(tp, rack, cts);
3378 } else if (timers & PACE_TMR_PERSIT) {
3379 ret = rack_timeout_persist(tp, rack, cts);
3380 } else if (timers & PACE_TMR_KEEP) {
3381 ret = rack_timeout_keepalive(tp, rack, cts);
3383 rack_log_to_processing(rack, cts, ret, timers);
3388 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3390 uint8_t hpts_removed = 0;
3392 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3393 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3394 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3397 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3398 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3399 if (rack->rc_inp->inp_in_hpts &&
3400 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3402 * Canceling timers when we have no output being
3403 * paced. We also must remove ourselves from the
3406 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3409 rack_log_to_cancel(rack, hpts_removed, line);
3410 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3415 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3421 rack_stopall(struct tcpcb *tp)
3423 struct tcp_rack *rack;
3424 rack = (struct tcp_rack *)tp->t_fb_ptr;
3425 rack->t_timers_stopped = 1;
3430 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3436 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3442 rack_stop_all_timers(struct tcpcb *tp)
3444 struct tcp_rack *rack;
3447 * Assure no timers are running.
3449 if (tcp_timer_active(tp, TT_PERSIST)) {
3450 /* We enter in persists, set the flag appropriately */
3451 rack = (struct tcp_rack *)tp->t_fb_ptr;
3452 rack->rc_in_persist = 1;
3454 tcp_timer_suspend(tp, TT_PERSIST);
3455 tcp_timer_suspend(tp, TT_REXMT);
3456 tcp_timer_suspend(tp, TT_KEEP);
3457 tcp_timer_suspend(tp, TT_DELACK);
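/*
 * Record a (re)transmission of 'rsm' at timestamp ts: the retransmit
 * count is capped at RACK_NUM_OF_RETRANS (RACK_OVERMAX is set past that),
 * retransmitted bytes are accounted in rc_holes_rxt/r_rtr_bytes, the send
 * time is stored in r_tim_lastsent[], a stale RACK_ACKED mark is cleared,
 * and the entry is moved to the tail of the time-ordered tmap.
 */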
3461 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3462 struct rack_sendmap *rsm, uint32_t ts)
3467 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3469 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3470 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3471 rsm->r_flags |= RACK_OVERMAX;
3473 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3474 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3475 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3477 idx = rsm->r_rtr_cnt - 1;
3478 rsm->r_tim_lastsent[idx] = ts;
3479 if (rsm->r_flags & RACK_ACKED) {
3480 /* Probably MTU discovery messing with us */
3481 rsm->r_flags &= ~RACK_ACKED;
3482 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3484 if (rsm->r_in_tmap) {
3485 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3488 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3490 if (rsm->r_flags & RACK_SACK_PASSED) {
3491 /* We have retransmitted due to the SACK pass */
3492 rsm->r_flags &= ~RACK_SACK_PASSED;
3493 rsm->r_flags |= RACK_WAS_SACKPASS;
3499 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3500 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
3503 * We (re-)transmitted starting at rsm->r_start for some length
3504 * (possibly less than r_end).
3506 struct rack_sendmap *nrsm, *insret;
3511 c_end = rsm->r_start + len;
3512 if (SEQ_GEQ(c_end, rsm->r_end)) {
3514 * We retransmitted the whole piece, or more than the whole
3515 * piece, slopping over into the next rsm.
3517 rack_update_rsm(tp, rack, rsm, ts);
3518 if (c_end == rsm->r_end) {
3524 /* Hangs over the end; return what's left */
3525 act_len = rsm->r_end - rsm->r_start;
3526 *lenp = (len - act_len);
3527 return (rsm->r_end);
3529 /* We don't get out of this block. */
3532 * Here we retransmitted less than the whole thing which means we
3533 * have to split this into what was transmitted and what was not.
3535 nrsm = rack_alloc_full_limit(rack);
3538 * We can't get memory, so lets not proceed.
3544 * So here we are going to take the original rsm and make it what we
3545 * retransmitted. nrsm will be the tail portion we did not
3546 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3547 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3548 * 1, 6 and the new piece will be 6, 11.
3550 rack_clone_rsm(rack, nrsm, rsm, c_end);
3552 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
3553 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3555 if (insret != NULL) {
3556 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3557 nrsm, insret, rack, rsm);
3560 if (rsm->r_in_tmap) {
3561 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3562 nrsm->r_in_tmap = 1;
3564 rsm->r_flags &= (~RACK_HAS_FIN);
3565 rack_update_rsm(tp, rack, rsm, ts);
3572 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3573 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3574 uint8_t pass, struct rack_sendmap *hintrsm)
3576 struct tcp_rack *rack;
3577 struct rack_sendmap *rsm, *nrsm, *insret, fe;
3578 register uint32_t snd_max, snd_una;
3581 * Add to the RACK log of packets in flight or retransmitted. If
3582 * there is a TS option we will use the TS echoed, if not we will
3585 * Retransmissions will increment the count and move the ts to its
3586 * proper place. Note that if options do not include TS's then we
3587 * won't be able to effectively use the ACK for an RTT on a retran.
3589 * Notes about r_start and r_end. Lets consider a send starting at
3590 * sequence 1 for 10 bytes. In such an example the r_start would be
3591 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3592 * This means that r_end is actually the first sequence for the next
3597 * If err is set what do we do XXXrrs? should we not add the thing?
3598 * -- i.e. return if err != 0 or should we pretend we sent it? --
3599 * i.e. proceed with add ** do this for now.
3601 INP_WLOCK_ASSERT(tp->t_inpcb);
3604 * We don't log errors -- we could but snd_max does not
3605 * advance in this case either.
3609 if (th_flags & TH_RST) {
3611 * We don't log resets and we return immediately from
3616 rack = (struct tcp_rack *)tp->t_fb_ptr;
3617 snd_una = tp->snd_una;
3618 if (SEQ_LEQ((seq_out + len), snd_una)) {
3619 /* Are we sending an old segment to induce an ack (keep-alive)? */
3622 if (SEQ_LT(seq_out, snd_una)) {
3623 /* huh? should we panic? */
3626 end = seq_out + len;
3628 if (SEQ_GEQ(end, seq_out))
3629 len = end - seq_out;
3633 snd_max = tp->snd_max;
3634 if (th_flags & (TH_SYN | TH_FIN)) {
3636 * The call to rack_log_output is made before bumping
3637 * snd_max. This means we can record one extra byte on a SYN
3638 * or FIN if seq_out is adding more on and a FIN is present
3639 * (and we are not resending).
3641 if (th_flags & TH_SYN)
3643 if (th_flags & TH_FIN)
3645 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3647 * The add/update has not been done for the FIN/SYN
3650 snd_max = tp->snd_nxt;
3654 /* We don't log zero window probes */
3657 rack->r_ctl.rc_time_last_sent = ts;
3658 if (IN_RECOVERY(tp->t_flags)) {
3659 rack->r_ctl.rc_prr_out += len;
3661 /* First question is it a retransmission or new? */
3662 if (seq_out == snd_max) {
3665 rsm = rack_alloc(rack);
3668 * Hmm out of memory and the tcb got destroyed while
3673 if (th_flags & TH_FIN) {
3674 rsm->r_flags = RACK_HAS_FIN;
3678 rsm->r_tim_lastsent[0] = ts;
3680 rsm->r_rtr_bytes = 0;
3681 if (th_flags & TH_SYN) {
3682 /* The data space is one beyond snd_una */
3683 rsm->r_start = seq_out + 1;
3684 rsm->r_end = rsm->r_start + (len - 1);
3687 rsm->r_start = seq_out;
3688 rsm->r_end = rsm->r_start + len;
3691 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3692 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
3694 if (insret != NULL) {
3695 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3696 nrsm, insret, rack, rsm);
3699 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3704 * If we reach here its a retransmission and we need to find it.
3706 memset(&fe, 0, sizeof(fe));
3708 if (hintrsm && (hintrsm->r_start == seq_out)) {
3712 /* No hints sorry */
3715 if ((rsm) && (rsm->r_start == seq_out)) {
3716 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3723 /* Ok, it was not the last pointer; go through it the hard way. */
3725 fe.r_start = seq_out;
3726 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3728 if (rsm->r_start == seq_out) {
3729 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3736 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3737 /* Transmitted within this piece */
3739 * Ok we must split off the front and then let the
3740 * update do the rest
3742 nrsm = rack_alloc_full_limit(rack);
3744 rack_update_rsm(tp, rack, rsm, ts);
3748 * copy rsm to nrsm and then trim the front of rsm
3749 * to not include this part.
3751 rack_clone_rsm(rack, nrsm, rsm, seq_out);
3752 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3754 if (insret != NULL) {
3755 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3756 nrsm, insret, rack, rsm);
3759 if (rsm->r_in_tmap) {
3760 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3761 nrsm->r_in_tmap = 1;
3763 rsm->r_flags &= (~RACK_HAS_FIN);
3764 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3772 * Hmm not found in map did they retransmit both old and on into the
3775 if (seq_out == tp->snd_max) {
3777 } else if (SEQ_LT(seq_out, tp->snd_max)) {
3779 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3780 seq_out, len, tp->snd_una, tp->snd_max);
3781 printf("Starting Dump of all rack entries\n");
3782 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3783 printf("rsm:%p start:%u end:%u\n",
3784 rsm, rsm->r_start, rsm->r_end);
3786 printf("Dump complete\n");
3787 panic("seq_out not found rack:%p tp:%p",
3793 * Hmm beyond sndmax? (only if we are using the new rtt-pack
3796 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3797 seq_out, len, tp->snd_max, tp);
3803 * Record one of the RTT updates from an ack into
3804 * our sample structure.
3807 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3809 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3810 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3811 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3813 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3814 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3815 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3817 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3818 rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3819 rack->r_ctl.rack_rs.rs_rtt_cnt++;
3823 * Collect new round-trip time estimate
3824 * and update averages and current timeout.
3827 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3830 uint32_t o_srtt, o_var;
3833 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3834 /* No valid sample */
3836 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3837 /* We are to use the lowest RTT seen in a single ack */
3838 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3839 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3840 /* We are to use the highest RTT seen in a single ack */
3841 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3842 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3843 /* We are to use the average RTT seen in a single ack */
3844 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3845 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3848 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3854 rack_log_rtt_sample(rack, rtt);
3855 o_srtt = tp->t_srtt;
3856 o_var = tp->t_rttvar;
3857 rack = (struct tcp_rack *)tp->t_fb_ptr;
3858 if (tp->t_srtt != 0) {
3860 * srtt is stored as fixed point with 5 bits after the
3861 * binary point (i.e., scaled by 32). The following magic is
3862 * equivalent to the smoothing algorithm in rfc793 with an
3863 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3864 * Adjust rtt to origin 0.
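* For illustration (numbers are hypothetical): with t_srtt
* currently encoding 100 ticks (100 << TCP_RTT_SHIFT = 3200) and
* a new sample rtt = 120, delta = (119 << 2) - (3200 >> 3) = 76,
* so t_srtt becomes 3276, about 102.4 ticks, which matches the
* expected 7/8 * 100 + 1/8 * 120 (less the origin adjustment).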
3866 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3867 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3869 tp->t_srtt += delta;
3870 if (tp->t_srtt <= 0)
3874 * We accumulate a smoothed rtt variance (actually, a
3875 * smoothed mean difference), then set the retransmit timer
3876 * to smoothed rtt + 4 times the smoothed variance. rttvar
3877 * is stored as fixed point with 4 bits after the binary
3878 * point (scaled by 16). The following is equivalent to
3879 * rfc793 smoothing with an alpha of .75 (rttvar =
3880 * rttvar*3/4 + |delta| / 4). This replaces rfc793's wired-in beta.
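* Continuing the illustrative numbers from above: the smoothed
* error |delta| is about 19 ticks (76 when scaled by 4); with
* t_rttvar encoding 20 ticks (20 << TCP_RTTVAR_SHIFT = 320) we
* get delta = 76 - (320 >> 2) = -4, so t_rttvar becomes 316,
* i.e. 19.75 ticks, matching 3/4 * 20 + 1/4 * 19.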
3885 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3886 tp->t_rttvar += delta;
3887 if (tp->t_rttvar <= 0)
3889 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3890 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3893 * No rtt measurement yet - use the unsmoothed rtt. Set the
3894 * variance to half the rtt (so our first retransmit happens
3897 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3898 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3899 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3901 TCPSTAT_INC(tcps_rttupdated);
3902 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3905 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3910 * the retransmit should happen at rtt + 4 * rttvar. Because of the
3911 * way we do the smoothing, srtt and rttvar will each average +1/2
3912 * tick of bias. When we compute the retransmit timer, we want 1/2
3913 * tick of rounding and 1 extra tick because of +-1/2 tick
3914 * uncertainty in the firing of the timer. The bias will give us
3915 * exactly the 1.5 tick we need. But, because the bias is
3916 * statistical, we have to test that we don't drop below the minimum
3917 * feasible timer (which is 2 ticks).
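* For example (illustrative numbers only): an srtt of roughly 102
* ticks and an rttvar of roughly 20 ticks give TCP_REXMTVAL() of
* about 102 + 4 * 20 = 182 ticks, which is then clamped between
* max(MSEC_2_TICKS(rack_rto_min), rtt + 2) and
* MSEC_2_TICKS(rack_rto_max) below.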
3919 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3920 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3921 tp->t_softerror = 0;
3925 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3926 uint32_t t, uint32_t cts)
3929 * For this RSM, we acknowledged the data from a previous
3930 * transmission, not the last one we made. This means we did a false
3933 struct tcp_rack *rack;
3935 if (rsm->r_flags & RACK_HAS_FIN) {
3937 * The FIN is often sent multiple times when we
3938 * have everything outstanding ack'd. We ignore this case
3939 * since it's over now.
3943 if (rsm->r_flags & RACK_TLP) {
3945 * We expect TLP's to have this occur.
3949 rack = (struct tcp_rack *)tp->t_fb_ptr;
3950 /* should we undo cc changes and exit recovery? */
3951 if (IN_RECOVERY(tp->t_flags)) {
3952 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3954 * Undo what we ratcheted down and exit recovery if
3957 EXIT_RECOVERY(tp->t_flags);
3958 tp->snd_recover = tp->snd_una;
3959 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3960 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3961 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3962 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3965 if (rsm->r_flags & RACK_WAS_SACKPASS) {
3967 * We retransmitted based on a sack and the earlier
3968 * retransmission ack'd it - re-ordering is occurring.
3970 counter_u64_add(rack_reorder_seen, 1);
3971 rack->r_ctl.rc_reorder_ts = cts;
3973 counter_u64_add(rack_badfr, 1);
3974 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3979 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3980 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3985 if (rsm->r_flags & RACK_ACKED)
3990 if ((rsm->r_rtr_cnt == 1) ||
3991 ((ack_type == CUM_ACKED) &&
3992 (to->to_flags & TOF_TS) &&
3994 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3997 * We will only find a matching timestamp if it's cum-acked.
3998 * But if it's only one retransmission it's a for-sure match
4001 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4004 if (!tp->t_rttlow || tp->t_rttlow > t)
4006 if (!rack->r_ctl.rc_rack_min_rtt ||
4007 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4008 rack->r_ctl.rc_rack_min_rtt = t;
4009 if (rack->r_ctl.rc_rack_min_rtt == 0) {
4010 rack->r_ctl.rc_rack_min_rtt = 1;
4013 tcp_rack_xmit_timer(rack, t + 1);
4014 if ((rsm->r_flags & RACK_TLP) &&
4015 (!IN_RECOVERY(tp->t_flags))) {
4016 /* Segment was a TLP and our retrans matched */
4017 if (rack->r_ctl.rc_tlp_cwnd_reduce) {
4018 rack->r_ctl.rc_rsm_start = tp->snd_max;
4019 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4020 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4021 rack_cong_signal(tp, NULL, CC_NDUPACK);
4023 * When we enter recovery we need to assure
4024 * we send one packet.
4026 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4027 rack_log_to_prr(rack, 7);
4030 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4031 /* New more recent rack_tmit_time */
4032 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4033 rack->rc_rack_rtt = t;
4038 * We clear the soft/rxtshift since we got an ack.
4039 * There is no assurance we will call the commit() function
4040 * so we need to clear these to avoid incorrect handling.
4043 tp->t_softerror = 0;
4044 if ((to->to_flags & TOF_TS) &&
4045 (ack_type == CUM_ACKED) &&
4047 ((rsm->r_flags & RACK_OVERMAX) == 0)) {
4049 * Now which timestamp does it match? In this block the ACK
4050 * must be coming from a previous transmission.
4052 for (i = 0; i < rsm->r_rtr_cnt; i++) {
4053 if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
4054 t = cts - rsm->r_tim_lastsent[i];
4057 if ((i + 1) < rsm->r_rtr_cnt) {
4059 rack_earlier_retran(tp, rsm, t, cts);
4061 if (!tp->t_rttlow || tp->t_rttlow > t)
4063 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4064 rack->r_ctl.rc_rack_min_rtt = t;
4065 if (rack->r_ctl.rc_rack_min_rtt == 0) {
4066 rack->r_ctl.rc_rack_min_rtt = 1;
4070 * Note the following calls to
4071 * tcp_rack_xmit_timer() are being commented
4072 * out for now. They give us no more accuracy
4073 * and often lead to a wrong choice. We have
4074 * enough samples that have not been
4075 * retransmitted. I leave the commented out
4076 * code in here in case in the future we
4077 * decide to add it back (though I can't foresee
4078 * doing that). That way we will easily see
4079 * where they need to be placed.
4081 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
4082 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4083 /* New more recent rack_tmit_time */
4084 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4085 rack->rc_rack_rtt = t;
4093 * Ok, it's a SACK block that we retransmitted, or a Windows
4094 * machine without timestamps. We can tell nothing from the
4095 * timestamp, since it's either not there or it is the time the
4096 * peer last received a segment that moved forward its cum-ack point.
4099 i = rsm->r_rtr_cnt - 1;
4100 t = cts - rsm->r_tim_lastsent[i];
4103 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4105 * We retransmitted and the ack came back in less
4106 * than the smallest rtt we have observed. We most
4107 * likely did an improper retransmit as outlined in
4108 * 4.2 Step 3 point 2 in the rack-draft.
4110 i = rsm->r_rtr_cnt - 2;
4111 t = cts - rsm->r_tim_lastsent[i];
4112 rack_earlier_retran(tp, rsm, t, cts);
4113 } else if (rack->r_ctl.rc_rack_min_rtt) {
4115 * We retransmitted it and the retransmit did the
4118 if (!rack->r_ctl.rc_rack_min_rtt ||
4119 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4120 rack->r_ctl.rc_rack_min_rtt = t;
4121 if (rack->r_ctl.rc_rack_min_rtt == 0) {
4122 rack->r_ctl.rc_rack_min_rtt = 1;
4125 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
4126 /* New more recent rack_tmit_time */
4127 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
4128 rack->rc_rack_rtt = t;
4137 * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
4140 rack_log_sack_passed(struct tcpcb *tp,
4141 struct tcp_rack *rack, struct rack_sendmap *rsm)
4143 struct rack_sendmap *nrsm;
4146 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
4147 rack_head, r_tnext) {
4149 /* Skip the original segment; it is acked */
4152 if (nrsm->r_flags & RACK_ACKED) {
4154 * Skip ack'd segments, though we
4155 * should not see these, since tmap
4156 * should not have ack'd segments.
4160 if (nrsm->r_flags & RACK_SACK_PASSED) {
4162 * We found one that is already marked
4163 * passed, we have been here before and
4164 * so all others below this are marked.
4168 nrsm->r_flags |= RACK_SACK_PASSED;
4169 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
4174 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
4175 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
4177 uint32_t start, end, changed = 0;
4178 struct rack_sendmap stack_map;
4179 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
4180 int32_t used_ref = 1;
4183 start = sack->start;
4186 memset(&fe, 0, sizeof(fe));
4188 if ((rsm == NULL) ||
4189 (SEQ_LT(end, rsm->r_start)) ||
4190 (SEQ_GEQ(start, rsm->r_end)) ||
4191 (SEQ_LT(start, rsm->r_start))) {
4193 * We are not in the right spot,
4194 * find the correct spot in the tree.
4198 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4205 /* Ok we have an ACK for some piece of this rsm */
4206 if (rsm->r_start != start) {
4207 if ((rsm->r_flags & RACK_ACKED) == 0) {
4209 * Need to split this in two pieces the before and after,
4210 * the before remains in the map, the after must be
4211 * added. In other words we have:
4212 * rsm |--------------|
4216 * and nrsm will be the sacked piece
4219 * But before we start down that path lets
4220 * see if the sack spans over on top of
4221 * the next guy and it is already sacked.
4223 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4224 if (next && (next->r_flags & RACK_ACKED) &&
4225 SEQ_GEQ(end, next->r_start)) {
4227 * So the next one is already acked, and
4228 * we can thus by hookery use our stack_map
4229 * to reflect the piece being sacked and
4230 * then adjust the two tree entries moving
4231 * the start and ends around. So we start like:
4232 * rsm |------------| (not-acked)
4233 * next |-----------| (acked)
4234 * sackblk |-------->
4235 * We want to end like so:
4236 * rsm |------| (not-acked)
4237 * next |-----------------| (acked)
4239 * Where nrsm is a temporary stack piece we
4240 * use to update all the gizmos.
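* For example (sequence numbers are illustrative only): if rsm
* covers 1000-2000 (not acked), next covers 2000-3000 (acked) and
* the sack block starts at 1500, then rsm shrinks to 1000-1500,
* next grows to 1500-3000, and the on-stack nrsm copy (1500-2000)
* is used only to update the rtt/sacked accounting for the newly
* sacked bytes.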
4242 /* Copy up our fudge block */
4244 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4245 /* Now adjust our tree blocks */
4247 next->r_start = start;
4248 /* Clear out the dup ack count of the remainder */
4250 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4251 /* Now lets make sure our fudge block is right */
4252 nrsm->r_start = start;
4253 /* Now lets update all the stats and such */
4254 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4255 changed += (nrsm->r_end - nrsm->r_start);
4256 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4257 if (nrsm->r_flags & RACK_SACK_PASSED) {
4258 counter_u64_add(rack_reorder_seen, 1);
4259 rack->r_ctl.rc_reorder_ts = cts;
4262 * Now we want to go up from rsm (the
4263 * one left un-acked) to the next one
4264 * in the tmap. We do this so when
4265 * we walk backwards we include marking
4266 * sack-passed on rsm (The one passed in
4267 * is skipped since it is generally called
4268 * on something sacked before removing it
4271 if (rsm->r_in_tmap) {
4272 nrsm = TAILQ_NEXT(rsm, r_tnext);
4274 * Now that we have the next
4275 * one walk backwards from there.
4277 if (nrsm && nrsm->r_in_tmap)
4278 rack_log_sack_passed(tp, rack, nrsm);
4280 /* Now are we done? */
4281 if (SEQ_LT(end, next->r_end) ||
4282 (end == next->r_end)) {
4283 /* Done with block */
4286 counter_u64_add(rack_sack_used_next_merge, 1);
4287 /* Position for the next block */
4288 start = next->r_end;
4289 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
4294 * We can't use any hookery here, so we
4295 * need to split the map. We enter like
4299 * We will add the new block nrsm and
4300 * that will be the new portion, and then
4301 * fall through after resetting rsm. So we
4302 * split and look like this:
4306 * We then fall through resetting
4307 * rsm to nrsm, so the next block
4310 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4313 * failed XXXrrs what can we do but lose the sack
4318 counter_u64_add(rack_sack_splits, 1);
4319 rack_clone_rsm(rack, nrsm, rsm, start);
4320 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4322 if (insret != NULL) {
4323 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4324 nrsm, insret, rack, rsm);
4327 if (rsm->r_in_tmap) {
4328 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4329 nrsm->r_in_tmap = 1;
4331 rsm->r_flags &= (~RACK_HAS_FIN);
4332 /* Position us to point to the new nrsm that starts the sack blk */
4336 /* Already sacked this piece */
4337 counter_u64_add(rack_sack_skipped_acked, 1);
4339 if (end == rsm->r_end) {
4340 /* Done with block */
4341 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4343 } else if (SEQ_LT(end, rsm->r_end)) {
4344 /* A partial sack to a already sacked block */
4346 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4350 * The end goes beyond this guy
4351 * reposition the start to the
4355 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4361 if (SEQ_GEQ(end, rsm->r_end)) {
4363 * The end of this block is either beyond this guy or right
4364 * at this guy. I.e.:
4370 if (rsm->r_flags & RACK_TLP)
4371 rack->r_ctl.rc_tlp_rtx_out = 0;
4372 if ((rsm->r_flags & RACK_ACKED) == 0) {
4373 rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4374 changed += (rsm->r_end - rsm->r_start);
4375 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4376 if (rsm->r_in_tmap) /* should be true */
4377 rack_log_sack_passed(tp, rack, rsm);
4378 /* Is reordering occurring? */
4379 if (rsm->r_flags & RACK_SACK_PASSED) {
4380 rsm->r_flags &= ~RACK_SACK_PASSED;
4381 counter_u64_add(rack_reorder_seen, 1);
4382 rack->r_ctl.rc_reorder_ts = cts;
4384 rsm->r_flags |= RACK_ACKED;
4385 rsm->r_flags &= ~RACK_TLP;
4386 if (rsm->r_in_tmap) {
4387 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4391 counter_u64_add(rack_sack_skipped_acked, 1);
4394 if (end == rsm->r_end) {
4395 /* This block only - done, setup for next */
4399 * There is more not covered by this rsm, move on
4400 * to the next block in the RB tree.
4402 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4410 * The end of this sack block is smaller than
4415 if ((rsm->r_flags & RACK_ACKED) == 0) {
4416 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4417 if (prev && (prev->r_flags & RACK_ACKED)) {
4419 * Goal, we want the right remainder of rsm to shrink
4420 * in place and span from (rsm->r_start = end) to rsm->r_end.
4421 * We want to expand prev to go all the way
4422 * to prev->r_end <- end.
4423 * so in the tree we have before:
4424 * prev |--------| (acked)
4425 * rsm |-------| (non-acked)
4427 * We churn it so we end up with
4428 * prev |----------| (acked)
4429 * rsm |-----| (non-acked)
4430 * nrsm |-| (temporary)
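* For example (sequence numbers are illustrative only): with prev
* covering 1000-2000 (acked), rsm covering 2000-3000 (not acked)
* and the sack block ending at 2500, prev is extended to
* 1000-2500, rsm is trimmed to 2500-3000, and the on-stack nrsm
* copy (2000-2500) carries the newly sacked piece for the
* rtt/sacked bookkeeping below.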
4433 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4436 /* Now adjust nrsm (stack copy) to be
4437 * the one that is the small
4438 * piece that was "sacked".
4442 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4444 * Now nrsm is our new little piece
4445 * that is acked (which was merged
4446 * to prev). Update the rtt and changed
4447 * based on that. Also check for reordering.
4449 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4450 changed += (nrsm->r_end - nrsm->r_start);
4451 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4452 if (nrsm->r_flags & RACK_SACK_PASSED) {
4453 counter_u64_add(rack_reorder_seen, 1);
4454 rack->r_ctl.rc_reorder_ts = cts;
4457 counter_u64_add(rack_sack_used_prev_merge, 1);
4460 * This is the case where our previous
4461 * block is not acked either, so we must
4462 * split the block in two.
4464 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4466 /* failed rrs what can we do but lose the sack info? */
4470 * In this case nrsm becomes
4471 * nrsm->r_start = end;
4472 * nrsm->r_end = rsm->r_end;
4473 * which is un-acked.
4475 * rsm->r_end = nrsm->r_start;
4476 * i.e. the remaining un-acked
4477 * piece is left on the left
4480 * So we start like this
4481 * rsm |----------| (not acked)
4483 * build it so we have
4485 * nrsm |------| (not acked)
4487 counter_u64_add(rack_sack_splits, 1);
4488 rack_clone_rsm(rack, nrsm, rsm, end);
4489 rsm->r_flags &= (~RACK_HAS_FIN);
4490 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4492 if (insret != NULL) {
4493 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4494 nrsm, insret, rack, rsm);
4497 if (rsm->r_in_tmap) {
4498 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4499 nrsm->r_in_tmap = 1;
4502 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
4503 if (rsm->r_flags & RACK_TLP)
4504 rack->r_ctl.rc_tlp_rtx_out = 0;
4505 rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4506 changed += (rsm->r_end - rsm->r_start);
4507 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4508 if (rsm->r_in_tmap) /* should be true */
4509 rack_log_sack_passed(tp, rack, rsm);
4510 /* Is reordering occurring? */
4511 if (rsm->r_flags & RACK_SACK_PASSED) {
4512 rsm->r_flags &= ~RACK_SACK_PASSED;
4513 counter_u64_add(rack_reorder_seen, 1);
4514 rack->r_ctl.rc_reorder_ts = cts;
4516 rsm->r_flags |= RACK_ACKED;
4517 rsm->r_flags &= ~RACK_TLP;
4518 if (rsm->r_in_tmap) {
4519 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4523 } else if (start != end) {
4525 * The block was already acked.
4527 counter_u64_add(rack_sack_skipped_acked, 1);
4531 if (rsm && (rsm->r_flags & RACK_ACKED)) {
4533 * Now can we merge where we worked
4534 * with either the previous or
4537 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4539 if (next->r_flags & RACK_ACKED) {
4540 /* yep this and next can be merged */
4541 rsm = rack_merge_rsm(rack, rsm, next);
4542 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4546 /* Now what about the previous? */
4547 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4549 if (prev->r_flags & RACK_ACKED) {
4550 /* yep the previous and this can be merged */
4551 rsm = rack_merge_rsm(rack, prev, rsm);
4552 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4557 if (used_ref == 0) {
4558 counter_u64_add(rack_sack_proc_all, 1);
4560 counter_u64_add(rack_sack_proc_short, 1);
4562 /* Save off the next one for quick reference. */
4564 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4567 *prsm = rack->r_ctl.rc_sacklast = nrsm;
4568 /* Pass back the moved. */
4574 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4576 struct rack_sendmap *tmap;
4579 while (rsm && (rsm->r_flags & RACK_ACKED)) {
4580 /* Its no longer sacked, mark it so */
4581 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4583 if (rsm->r_in_tmap) {
4584 panic("rack:%p rsm:%p flags:0x%x in tmap?",
4585 rack, rsm, rsm->r_flags);
4588 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4589 /* Rebuild it into our tmap */
4591 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4594 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4597 tmap->r_in_tmap = 1;
4598 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4601 * Now let's possibly clear the sack filter so we start
4602 * recognizing sacks that cover this area.
4604 if (rack_use_sack_filter)
4605 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4610 rack_do_decay(struct tcp_rack *rack)
4612 #ifdef NETFLIX_EXP_DETECTION
4615 #define timersub(tvp, uvp, vvp) \
4617 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
4618 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
4619 if ((vvp)->tv_usec < 0) { \
4621 (vvp)->tv_usec += 1000000; \
4625 timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res);
4628 rack->r_ctl.input_pkt++;
4629 if ((rack->rc_in_persist) ||
4630 (res.tv_sec >= 1) ||
4631 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
4633 * Check for decay of non-SAD;
4634 * we want all SAD detection metrics to
4635 * decay by 1/4 for each second (or more) that has passed.
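* For illustration only (the real decay factors come from the
* arguments passed to ctf_decay_count() below): with a 1/4 decay,
* an ack_count of 1000 and a sack_count of 400 would drop to
* roughly 750 and 300, keeping their ratio meaningful while
* preventing the counters from growing without bound.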
4639 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
4640 /* Update our saved tracking values */
4641 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
4642 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
4643 /* Now do we escape without decay? */
4644 if (rack->rc_in_persist ||
4645 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
4646 (pkt_delta < tcp_sad_low_pps)){
4648 * We don't decay idle connections
4649 * or ones that have a low input pps.
4653 /* Decay the counters */
4654 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
4656 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
4658 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
4660 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
4667 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4669 uint32_t changed, entered_recovery = 0;
4670 struct tcp_rack *rack;
4671 struct rack_sendmap *rsm, *rm;
4672 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4673 register uint32_t th_ack;
4674 int32_t i, j, k, num_sack_blks = 0;
4675 uint32_t cts, acked, ack_point, sack_changed = 0;
4676 int loop_start = 0, moved_two = 0;
4678 INP_WLOCK_ASSERT(tp->t_inpcb);
4679 if (th->th_flags & TH_RST) {
4680 /* We don't log resets */
4683 rack = (struct tcp_rack *)tp->t_fb_ptr;
4684 cts = tcp_ts_getticks();
4685 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4687 th_ack = th->th_ack;
4688 if (rack->sack_attack_disable == 0)
4689 rack_do_decay(rack);
4690 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
4692 * You only get credit for
4693 * MSS and greater (and you get extra
4694 * credit for larger cum-ack moves).
4698 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
4699 rack->r_ctl.ack_count += ac;
4700 counter_u64_add(rack_ack_total, ac);
4702 if (rack->r_ctl.ack_count > 0xfff00000) {
4704 * reduce the number to keep us under
4707 rack->r_ctl.ack_count /= 2;
4708 rack->r_ctl.sack_count /= 2;
4710 if (SEQ_GT(th_ack, tp->snd_una)) {
4711 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4712 tp->t_acktime = ticks;
4714 if (rsm && SEQ_GT(th_ack, rsm->r_start))
4715 changed = th_ack - rsm->r_start;
4718 * The ACK point is advancing to th_ack, we must drop off
4719 * the packets in the rack log and calculate any eligible
4722 rack->r_wanted_output++;
4724 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4726 if ((th_ack - 1) == tp->iss) {
4728 * For the SYN incoming case we will not
4729 * have called tcp_output for the sending of
4730 * the SYN, so there will be no map. All
4731 * other cases should probably be a panic.
4735 if (tp->t_flags & TF_SENTFIN) {
4736 /* if we send a FIN we will not have a map */
4740 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4742 th, tp->t_state, rack,
4743 tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4747 if (SEQ_LT(th_ack, rsm->r_start)) {
4748 /* Huh map is missing this */
4750 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4752 th_ack, tp->t_state, rack->r_state);
4756 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4757 /* Now do we consume the whole thing? */
4758 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4759 /* Its all consumed. */
4762 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4763 rsm->r_rtr_bytes = 0;
4764 if (rsm->r_flags & RACK_TLP)
4765 rack->r_ctl.rc_tlp_rtx_out = 0;
4766 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4769 panic("removing head in rack:%p rsm:%p rm:%p",
4773 if (rsm->r_in_tmap) {
4774 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4777 if (rsm->r_flags & RACK_ACKED) {
4779 * It was acked on the scoreboard -- remove
4782 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4783 } else if (rsm->r_flags & RACK_SACK_PASSED) {
4785 * There are segments ACKED on the
4786 * scoreboard further up. We are seeing
4789 rsm->r_flags &= ~RACK_SACK_PASSED;
4790 counter_u64_add(rack_reorder_seen, 1);
4791 rsm->r_flags |= RACK_ACKED;
4792 rack->r_ctl.rc_reorder_ts = cts;
4794 left = th_ack - rsm->r_end;
4795 if (rsm->r_rtr_cnt > 1) {
4797 * Technically we should make r_rtr_cnt be
4798 * monotonically increasing and just mod it to
4799 * the timestamp it is replacing.. that way
4800 * we would have the last 3 retransmits. Now
4801 * rc_loss_count will be wrong if we
4802 * retransmit something more than 2 times in
4805 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4807 /* Free back to zone */
4808 rack_free(rack, rsm);
4814 if (rsm->r_flags & RACK_ACKED) {
4816 * It was acked on the scoreboard -- remove it from
4817 * total for the part being cum-acked.
4819 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4822 * Clear the dup ack count for
4823 * the piece that remains.
4826 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4827 if (rsm->r_rtr_bytes) {
4829 * It was retransmitted adjust the
4830 * sack holes for what was acked.
4834 ack_am = (th_ack - rsm->r_start);
4835 if (ack_am >= rsm->r_rtr_bytes) {
4836 rack->r_ctl.rc_holes_rxt -= ack_am;
4837 rsm->r_rtr_bytes -= ack_am;
4840 /* Update where the piece starts */
4841 rsm->r_start = th_ack;
4844 /* Check for reneging */
4845 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4846 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4848 * The peer has moved snd_una up to
4849 * the edge of this send, i.e. one
4850 * that it had previously acked. The only
4851 * way that can be true is if the peer threw
4852 * away data (space issues) that it had
4853 * previously sacked (else it would have
4854 * given us snd_una up to rsm->r_end).
4855 * We need to undo the acked markings here.
4857 * Note we have to look to make sure th_ack is
4858 * our rsm->r_start in case we get an old ack
4859 * where th_ack is behind snd_una.
4861 rack_peer_reneges(rack, rsm, th->th_ack);
4863 if ((to->to_flags & TOF_SACK) == 0) {
4864 /* We are done, nothing left */
4867 /* Sack block processing */
4868 if (SEQ_GT(th_ack, tp->snd_una))
4871 ack_point = tp->snd_una;
4872 for (i = 0; i < to->to_nsacks; i++) {
4873 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4874 &sack, sizeof(sack));
4875 sack.start = ntohl(sack.start);
4876 sack.end = ntohl(sack.end);
4877 if (SEQ_GT(sack.end, sack.start) &&
4878 SEQ_GT(sack.start, ack_point) &&
4879 SEQ_LT(sack.start, tp->snd_max) &&
4880 SEQ_GT(sack.end, ack_point) &&
4881 SEQ_LEQ(sack.end, tp->snd_max)) {
4882 sack_blocks[num_sack_blks] = sack;
4884 #ifdef NETFLIX_STATS
4885 } else if (SEQ_LEQ(sack.start, th_ack) &&
4886 SEQ_LEQ(sack.end, th_ack)) {
4888 * Its a D-SACK block.
4890 tcp_record_dsack(sack.start, sack.end);
4896 * Sort the SACK blocks so we can update the rack scoreboard with
4899 if (rack_use_sack_filter) {
4900 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
4901 num_sack_blks, th->th_ack);
4902 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
4904 if (num_sack_blks == 0) {
4905 /* Nothing to sack (DSACKs?) */
4906 goto out_with_totals;
4908 if (num_sack_blks < 2) {
4909 /* Only one, we don't need to sort */
4912 /* Sort the sacks */
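/*
 * This is a simple exchange sort on the block end sequence; with
 * at most TCP_MAX_SACK + 1 blocks in hand, the O(n^2) cost is
 * negligible.
 */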
4913 for (i = 0; i < num_sack_blks; i++) {
4914 for (j = i + 1; j < num_sack_blks; j++) {
4915 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4916 sack = sack_blocks[i];
4917 sack_blocks[i] = sack_blocks[j];
4918 sack_blocks[j] = sack;
4923 * Now are any of the sack block ends the same (yes some
4924 * implementations send these)?
4927 if (num_sack_blks == 0)
4928 goto out_with_totals;
4929 if (num_sack_blks > 1) {
4930 for (i = 0; i < num_sack_blks; i++) {
4931 for (j = i + 1; j < num_sack_blks; j++) {
4932 if (sack_blocks[i].end == sack_blocks[j].end) {
4934 * Ok these two have the same end we
4935 * want the smallest end and then
4936 * throw away the larger and start
4939 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4941 * The second block covers
4942 * more area use that
4944 sack_blocks[i].start = sack_blocks[j].start;
4947 * Now collapse out the dup-sack and
4950 for (k = (j + 1); k < num_sack_blks; k++) {
4951 sack_blocks[j].start = sack_blocks[k].start;
4952 sack_blocks[j].end = sack_blocks[k].end;
4963 * First lets look to see if
4964 * we have retransmitted and
4965 * can use the transmit next?
4967 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4969 SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
4970 SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
4972 * We probably did the FR and the next
4973 * SACK coming in continues as we would expect.
4975 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
4977 rack->r_wanted_output++;
4979 sack_changed += acked;
4981 if (num_sack_blks == 1) {
4983 * This is what we would expect from
4984 * a normal implementation to happen
4985 * after we have retransmitted the FR,
4986 * i.e the sack-filter pushes down
4987 * to 1 block and the next to be retransmitted
4988 * is the sequence in the sack block (as more
4989 * are acked). Count this as ACK'd data to boost
4990 * up the chances of recovering any false positives.
4992 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
4993 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
4994 counter_u64_add(rack_express_sack, 1);
4995 if (rack->r_ctl.ack_count > 0xfff00000) {
4997 * reduce the number to keep us under
5000 rack->r_ctl.ack_count /= 2;
5001 rack->r_ctl.sack_count /= 2;
5003 goto out_with_totals;
5006 * Start the loop through the
5007 * rest of blocks, past the first block.
5013 /* Its a sack of some sort */
5014 rack->r_ctl.sack_count++;
5015 if (rack->r_ctl.sack_count > 0xfff00000) {
5017 * reduce the number to keep us under
5020 rack->r_ctl.ack_count /= 2;
5021 rack->r_ctl.sack_count /= 2;
5023 counter_u64_add(rack_sack_total, 1);
5024 if (rack->sack_attack_disable) {
5025 /* An attacker disablement is in place */
5026 if (num_sack_blks > 1) {
5027 rack->r_ctl.sack_count += (num_sack_blks - 1);
5028 rack->r_ctl.sack_moved_extra++;
5029 counter_u64_add(rack_move_some, 1);
5030 if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
5031 rack->r_ctl.sack_moved_extra /= 2;
5032 rack->r_ctl.sack_noextra_move /= 2;
5037 rsm = rack->r_ctl.rc_sacklast;
5038 for (i = loop_start; i < num_sack_blks; i++) {
5039 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
5041 rack->r_wanted_output++;
5043 sack_changed += acked;
5047 * If we did not get a SACK for at least a MSS and
5048 * had to move at all, or if we moved more than our
5049 * threshold, it counts against the "extra" move.
5051 rack->r_ctl.sack_moved_extra += moved_two;
5052 counter_u64_add(rack_move_some, 1);
5055 * else we did not have to move
5056 * any more than we would expect.
5058 rack->r_ctl.sack_noextra_move++;
5059 counter_u64_add(rack_move_none, 1);
5061 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
5063 * If the SACK was not a full MSS then
5064 * we add to sack_count the number of
5065 * MSS's (or possibly more than
5066 * an MSS if it's a TSO send) we had to skip by.
5068 rack->r_ctl.sack_count += moved_two;
5069 counter_u64_add(rack_sack_total, moved_two);
5072 * Now we need to setup for the next
5073 * round. First we make sure we won't
5074 * exceed the size of our uint32_t on
5075 * the various counts, and then clear out
5078 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
5079 (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
5080 rack->r_ctl.sack_moved_extra /= 2;
5081 rack->r_ctl.sack_noextra_move /= 2;
5083 if (rack->r_ctl.sack_count > 0xfff00000) {
5084 rack->r_ctl.ack_count /= 2;
5085 rack->r_ctl.sack_count /= 2;
5090 if (num_sack_blks > 1) {
5092 * You get an extra stroke if
5093 * you have more than one sack-blk, this
5094 * could be where we are skipping forward
5095 * and the sack-filter is still working, or
5096 * it could be an attacker constantly
5099 rack->r_ctl.sack_moved_extra++;
5100 counter_u64_add(rack_move_some, 1);
5103 #ifdef NETFLIX_EXP_DETECTION
5104 if ((rack->do_detection || tcp_force_detection) &&
5105 tcp_sack_to_ack_thresh &&
5106 tcp_sack_to_move_thresh &&
5107 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
5109 * We have thresholds set to find
5110 * possible attackers and disable sack.
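* Both ratios below are scaled by 1000 to keep integer math: a
* purely illustrative example would be 6000 sacks against 1000
* acks giving an ackratio of 6000, and 400 "extra" moves out of
* 1000 total moves giving a moveratio of 400; only when both
* exceed their sysctl thresholds is SACK processing disabled.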
5113 uint64_t ackratio, moveratio, movetotal;
5116 rack_log_sad(rack, 1);
5117 ackratio = (uint64_t)(rack->r_ctl.sack_count);
5118 ackratio *= (uint64_t)(1000);
5119 if (rack->r_ctl.ack_count)
5120 ackratio /= (uint64_t)(rack->r_ctl.ack_count);
5122 /* We really should not hit here */
5125 if ((rack->sack_attack_disable == 0) &&
5126 (ackratio > rack_highest_sack_thresh_seen))
5127 rack_highest_sack_thresh_seen = (uint32_t)ackratio;
5128 movetotal = rack->r_ctl.sack_moved_extra;
5129 movetotal += rack->r_ctl.sack_noextra_move;
5130 moveratio = rack->r_ctl.sack_moved_extra;
5131 moveratio *= (uint64_t)1000;
5133 moveratio /= movetotal;
5135 /* No moves, that's pretty good */
5138 if ((rack->sack_attack_disable == 0) &&
5139 (moveratio > rack_highest_move_thresh_seen))
5140 rack_highest_move_thresh_seen = (uint32_t)moveratio;
5141 if (rack->sack_attack_disable == 0) {
5142 if ((ackratio > tcp_sack_to_ack_thresh) &&
5143 (moveratio > tcp_sack_to_move_thresh)) {
5144 /* Disable sack processing */
5145 rack->sack_attack_disable = 1;
5146 if (rack->r_rep_attack == 0) {
5147 rack->r_rep_attack = 1;
5148 counter_u64_add(rack_sack_attacks_detected, 1);
5150 if (tcp_attack_on_turns_on_logging) {
5152 * Turn on logging, used for debugging
5155 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
5157 /* Clamp the cwnd at flight size */
5158 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
5159 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
5160 rack_log_sad(rack, 2);
5163 /* We are sack-disabled check for false positives */
5164 if ((ackratio <= tcp_restoral_thresh) ||
5165 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) {
5166 rack->sack_attack_disable = 0;
5167 rack_log_sad(rack, 3);
5168 /* Restart counting */
5169 rack->r_ctl.sack_count = 0;
5170 rack->r_ctl.sack_moved_extra = 0;
5171 rack->r_ctl.sack_noextra_move = 1;
5172 rack->r_ctl.ack_count = max(1,
5173 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
5175 if (rack->r_rep_reverse == 0) {
5176 rack->r_rep_reverse = 1;
5177 counter_u64_add(rack_sack_attacks_reversed, 1);
5179 /* Restore the cwnd */
5180 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
5181 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
5187 /* Something changed cancel the rack timer */
5188 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5190 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
5192 * Ok we have a high probability that we need to go into
5193 * recovery since we have data sack'd
5195 struct rack_sendmap *rsm;
5198 tsused = tcp_ts_getticks();
5199 rsm = tcp_rack_output(tp, rack, tsused);
5201 /* Enter recovery */
5202 rack->r_ctl.rc_rsm_start = rsm->r_start;
5203 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
5204 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
5205 entered_recovery = 1;
5206 rack_cong_signal(tp, NULL, CC_NDUPACK);
5208 * When we enter recovery we need to assure we send
5211 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5212 rack_log_to_prr(rack, 8);
5213 rack->r_timer_override = 1;
5216 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
5217 /* Deal with changed and PRR here (in recovery only) */
5218 uint32_t pipe, snd_una;
5220 rack->r_ctl.rc_prr_delivered += changed;
5221 /* Compute prr_sndcnt */
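/*
 * Sketch of the proportional rate reduction below (illustrative
 * numbers only): while pipe > ssthresh we target
 * sndcnt = (prr_delivered * ssthresh) / rc_prr_recovery_fs - prr_out;
 * e.g. ssthresh 14600, recovery flight size 29200, prr_delivered
 * 4380 and prr_out 1460 give 4380 * 14600 / 29200 - 1460 = 730
 * bytes, i.e. roughly one byte sent for every two delivered.
 * Once pipe falls to or below ssthresh the code grows toward
 * ssthresh, limited by newly delivered data plus one MSS.
 */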
5222 if (SEQ_GT(tp->snd_una, th_ack)) {
5223 snd_una = tp->snd_una;
5227 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
5228 if (pipe > tp->snd_ssthresh) {
5231 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
5232 if (rack->r_ctl.rc_prr_recovery_fs > 0)
5233 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
5235 rack->r_ctl.rc_prr_sndcnt = 0;
5236 rack_log_to_prr(rack, 9);
5240 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
5241 sndcnt -= rack->r_ctl.rc_prr_out;
5244 rack->r_ctl.rc_prr_sndcnt = sndcnt;
5245 rack_log_to_prr(rack, 10);
5249 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
5250 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
5253 if (changed > limit)
5255 limit += ctf_fixed_maxseg(tp);
5256 if (tp->snd_ssthresh > pipe) {
5257 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
5258 rack_log_to_prr(rack, 11);
5260 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
5261 rack_log_to_prr(rack, 12);
5264 if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) {
5265 rack->r_timer_override = 1;
5271 rack_strike_dupack(struct tcp_rack *rack)
5273 struct rack_sendmap *rsm;
5275 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5276 if (rsm && (rsm->r_dupack < 0xff)) {
5278 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
5279 rack->r_wanted_output = 1;
5280 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
5282 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
5288 * Return value of 1, we do not need to call rack_process_data().
5289 * return value of 0, rack_process_data can be called.
5290 * For ret_val if its 0 the TCP is locked, if its non-zero
5291 * its unlocked and probably unsafe to touch the TCB.
5294 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5295 struct tcpcb *tp, struct tcpopt *to,
5296 uint32_t tiwin, int32_t tlen,
5297 int32_t * ofia, int32_t thflags, int32_t * ret_val)
5299 int32_t ourfinisacked = 0;
5300 int32_t nsegs, acked_amount;
5303 struct tcp_rack *rack;
5304 int32_t recovery = 0;
5306 rack = (struct tcp_rack *)tp->t_fb_ptr;
5307 if (SEQ_GT(th->th_ack, tp->snd_max)) {
5308 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
5309 rack->r_wanted_output++;
5312 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
5313 if (rack->rc_in_persist)
5315 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
5316 rack_strike_dupack(rack);
5317 rack_log_ack(tp, to, th);
5319 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5321 * Old ack, behind (or duplicate to) the last one rcv'd
5322 * Note: Should mark that reordering is occurring! We should also
5323 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
5324 * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
5330 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
5331 * something we sent.
5333 if (tp->t_flags & TF_NEEDSYN) {
5335 * T/TCP: Connection was half-synchronized, and our SYN has
5336 * been ACK'd (so connection is now fully synchronized). Go
5337 * to non-starred state, increment snd_una for ACK of SYN,
5338 * and check if we can do window scaling.
5340 tp->t_flags &= ~TF_NEEDSYN;
5342 /* Do window scaling? */
5343 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5344 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5345 tp->rcv_scale = tp->request_r_scale;
5346 /* Send window already scaled. */
5349 nsegs = max(1, m->m_pkthdr.lro_nsegs);
5350 INP_WLOCK_ASSERT(tp->t_inpcb);
5352 acked = BYTES_THIS_ACK(tp, th);
5353 TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5354 TCPSTAT_ADD(tcps_rcvackbyte, acked);
5357 * If we just performed our first retransmit, and the ACK arrives
5358 * within our recovery window, then it was a mistake to do the
5359 * retransmit in the first place. Recover our original cwnd and
5360 * ssthresh, and proceed to transmit where we left off.
5362 if (tp->t_flags & TF_PREVVALID) {
5363 tp->t_flags &= ~TF_PREVVALID;
5364 if (tp->t_rxtshift == 1 &&
5365 (int)(ticks - tp->t_badrxtwin) < 0)
5366 rack_cong_signal(tp, th, CC_RTO_ERR);
5369 * If we have a timestamp reply, update smoothed round trip time. If
5370 * no timestamp is present but transmit timer is running and timed
5371 * sequence number was acked, update smoothed round trip time. Since
5372 * we now have an rtt measurement, cancel the timer backoff (cf.,
5373 * Phil Karn's retransmit alg.). Recompute the initial retransmit
5376 * Some boxes send broken timestamp replies during the SYN+ACK
5377 * phase, ignore timestamps of 0 or we could calculate a huge RTT
5378 * and blow up the retransmit timer.
5381 * If all outstanding data is acked, stop retransmit timer and
5382 * remember to restart (more output or persist). If there is more
5383 * data to be acked, restart retransmit timer, using current
5384 * (possibly backed-off) value.
5386 if (th->th_ack == tp->snd_max) {
5387 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5388 rack->r_wanted_output++;
5392 *ofia = ourfinisacked;
5395 if (rack->r_ctl.rc_early_recovery) {
5396 if (IN_RECOVERY(tp->t_flags)) {
5397 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5398 (SEQ_LT(th->th_ack, tp->snd_max))) {
5399 tcp_rack_partialack(tp, th);
5401 rack_post_recovery(tp, th);
5407 * Let the congestion control algorithm update congestion control
5408 * related information. This typically means increasing the
5409 * congestion window.
5411 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
5412 SOCKBUF_LOCK(&so->so_snd);
5413 acked_amount = min(acked, (int)sbavail(&so->so_snd));
5414 tp->snd_wnd -= acked_amount;
5415 mfree = sbcut_locked(&so->so_snd, acked_amount);
5416 if ((sbused(&so->so_snd) == 0) &&
5417 (acked > acked_amount) &&
5418 (tp->t_state >= TCPS_FIN_WAIT_1)) {
5421 /* NB: sowwakeup_locked() does an implicit unlock. */
5422 sowwakeup_locked(so);
5424 if (rack->r_ctl.rc_early_recovery == 0) {
5425 if (IN_RECOVERY(tp->t_flags)) {
5426 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5427 (SEQ_LT(th->th_ack, tp->snd_max))) {
5428 tcp_rack_partialack(tp, th);
5430 rack_post_recovery(tp, th);
5434 tp->snd_una = th->th_ack;
5435 if (SEQ_GT(tp->snd_una, tp->snd_recover))
5436 tp->snd_recover = tp->snd_una;
5438 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
5439 tp->snd_nxt = tp->snd_una;
5441 if (tp->snd_una == tp->snd_max) {
5442 /* Nothing left outstanding */
5443 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5444 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
5446 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5447 /* Set need output so persist might get set */
5448 rack->r_wanted_output++;
5449 if (rack_use_sack_filter)
5450 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5451 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
5452 (sbavail(&so->so_snd) == 0) &&
5453 (tp->t_flags2 & TF2_DROP_AF_DATA)) {
5455 * The socket was gone and the
5456 * peer sent data, time to
5461 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
5466 *ofia = ourfinisacked;
5471 rack_collapsed_window(struct tcp_rack *rack)
5474 * Now we must walk the
5475 * send map and divide the
5476 * ones left stranded. These
5477 * guys can't cause us to abort
5478 * the connection and are really
5479 * "unsent". However if a buggy
5480 * client actually did keep some
5481 * of the data i.e. collapsed the win
5482 * and refused to ack and then opened
5483 * the win and acked that data, we would
5484 * get into an ack war; the simpler
5485 * method then of just pretending we
5486 * did not send those segments something
5489 struct rack_sendmap *rsm, *nrsm, fe, *insret;
5493 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
5494 maxseg = ctf_fixed_maxseg(rack->rc_tp);
5495 memset(&fe, 0, sizeof(fe));
5496 fe.r_start = max_seq;
5497 /* Find the first seq past or at maxseq */
5498 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
5500 /* Nothing to do strange */
5501 rack->rc_has_collapsed = 0;
5505 * Now do we need to split at
5506 * the collapse point?
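* For example (illustrative numbers): with snd_una = 1000 and
* snd_wnd = 500 the collapse point max_seq is 1500; an rsm
* spanning 1200-1800 is cloned at 1500 so that 1200-1500 is left
* alone while 1500-1800, and every entry after it, gets
* RACK_RWND_COLLAPSED set.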
5508 if (SEQ_GT(max_seq, rsm->r_start)) {
5509 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
5511 /* We can't get a rsm, mark all? */
5516 rack_clone_rsm(rack, nrsm, rsm, max_seq);
5517 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
5519 if (insret != NULL) {
5520 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
5521 nrsm, insret, rack, rsm);
5524 if (rsm->r_in_tmap) {
5525 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5526 nrsm->r_in_tmap = 1;
5529 * Set in the new RSM as the
5530 * collapsed starting point
5535 counter_u64_add(rack_collapsed_win, 1);
5536 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
5537 nrsm->r_flags |= RACK_RWND_COLLAPSED;
5538 rack->rc_has_collapsed = 1;
5543 rack_un_collapse_window(struct tcp_rack *rack)
5545 struct rack_sendmap *rsm;
5547 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
5548 if (rsm->r_flags & RACK_RWND_COLLAPSED)
5549 rsm->r_flags &= ~RACK_RWND_COLLAPSED;
5553 rack->rc_has_collapsed = 0;
5557 * Return value of 1, the TCB is unlocked and most
5558 * likely gone, return value of 0, the TCP is still
5562 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
5563 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
5564 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5567 * Update window information. Don't look at window if no ACK: TAC's
5568 * send garbage on first SYN.
5572 struct tcp_rack *rack;
5574 rack = (struct tcp_rack *)tp->t_fb_ptr;
5575 INP_WLOCK_ASSERT(tp->t_inpcb);
5576 nsegs = max(1, m->m_pkthdr.lro_nsegs);
5577 if ((thflags & TH_ACK) &&
5578 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
5579 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
5580 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
5581 /* keep track of pure window updates */
5583 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
5584 TCPSTAT_INC(tcps_rcvwinupd);
5585 tp->snd_wnd = tiwin;
5586 tp->snd_wl1 = th->th_seq;
5587 tp->snd_wl2 = th->th_ack;
5588 if (tp->snd_wnd > tp->max_sndwnd)
5589 tp->max_sndwnd = tp->snd_wnd;
5590 rack->r_wanted_output++;
5591 } else if (thflags & TH_ACK) {
5592 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
5593 tp->snd_wnd = tiwin;
5594 tp->snd_wl1 = th->th_seq;
5595 tp->snd_wl2 = th->th_ack;
5598 if (tp->snd_wnd < ctf_outstanding(tp))
5599 /* The peer collapsed the window */
5600 rack_collapsed_window(rack);
5601 else if (rack->rc_has_collapsed)
5602 rack_un_collapse_window(rack);
5603 /* Was persist timer active and now we have window space? */
5604 if ((rack->rc_in_persist != 0) &&
5605 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
5606 rack->r_ctl.rc_pace_min_segs))) {
5607 rack_exit_persist(tp, rack);
5608 tp->snd_nxt = tp->snd_max;
5609 /* Make sure we output to start the timer */
5610 rack->r_wanted_output++;
5612 /* Do we enter persists? */
5613 if ((rack->rc_in_persist == 0) &&
5614 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
5615 TCPS_HAVEESTABLISHED(tp->t_state) &&
5616 (tp->snd_max == tp->snd_una) &&
5617 sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
5618 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
5620 * Here the rwnd is less than
5621 * the pacing size, we are established,
5622 * nothing is outstanding, and there is
5623 * data to send. Enter persists.
5625 tp->snd_nxt = tp->snd_una;
5626 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
5628 if (tp->t_flags2 & TF2_DROP_AF_DATA) {
5633 * Process segments with URG.
5635 if ((thflags & TH_URG) && th->th_urp &&
5636 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5638 * This is a kludge, but if we receive and accept random
5639 * urgent pointers, we'll crash in soreceive. It's hard to
5640 * imagine someone actually wanting to send this much urgent
5643 SOCKBUF_LOCK(&so->so_rcv);
5644 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
5645 th->th_urp = 0; /* XXX */
5646 thflags &= ~TH_URG; /* XXX */
5647 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
5648 goto dodata; /* XXX */
5651 * If this segment advances the known urgent pointer, then
5652 * mark the data stream. This should not happen in
5653 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
5654 * FIN has been received from the remote side. In these
5655 * states we ignore the URG.
5657 * According to RFC961 (Assigned Protocols), the urgent
5658 * pointer points to the last octet of urgent data. We
5659 * continue, however, to consider it to indicate the first
5660 * octet of data past the urgent section as the original
5661 * spec states (in one of two places).
5663 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
5664 tp->rcv_up = th->th_seq + th->th_urp;
5665 so->so_oobmark = sbavail(&so->so_rcv) +
5666 (tp->rcv_up - tp->rcv_nxt) - 1;
5667 if (so->so_oobmark == 0)
5668 so->so_rcv.sb_state |= SBS_RCVATMARK;
5670 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
5672 SOCKBUF_UNLOCK(&so->so_rcv);
5674 * Remove out-of-band data so it doesn't get presented to the user.
5675 * This can happen independent of advancing the URG pointer,
5676 * but if two URG's are pending at once, some out-of-band
5677 * data may creep in... ick.
5679 if (th->th_urp <= (uint32_t) tlen &&
5680 !(so->so_options & SO_OOBINLINE)) {
5681 /* hdr drop is delayed */
5682 tcp_pulloutofband(so, th, m, drop_hdrlen);
5686 * If no out of band data is expected, pull receive urgent
5687 * pointer along with the receive window.
5689 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
5690 tp->rcv_up = tp->rcv_nxt;
5693 INP_WLOCK_ASSERT(tp->t_inpcb);
5696 * Process the segment text, merging it into the TCP sequencing
5697 * queue, and arranging for acknowledgment of receipt if necessary.
5698 * This process logically involves adjusting tp->rcv_wnd as data is
5699 * presented to the user (this happens in tcp_usrreq.c, case
5700 * PRU_RCVD). If a FIN has already been received on this connection
5701 * then we just ignore the text.
5703 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
5704 IS_FASTOPEN(tp->t_flags));
5705 if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
5706 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5707 tcp_seq save_start = th->th_seq;
5708 tcp_seq save_rnxt = tp->rcv_nxt;
5709 int save_tlen = tlen;
5711 m_adj(m, drop_hdrlen); /* delayed header drop */
5713 * Insert segment which includes th into TCP reassembly
5714 * queue with control block tp. Set thflags to whether
5715 * reassembly now includes a segment with FIN. This handles
5716 * the common case inline (segment is the next to be
5717 * received on an established connection, and the queue is
5718 * empty), avoiding linkage into and removal from the queue
5719 * and repetition of various conversions. Set DELACK for
5720 * segments received in order, but ack immediately when
5721 * segments are out of order (so fast retransmit can work).
5723 if (th->th_seq == tp->rcv_nxt &&
5725 (TCPS_HAVEESTABLISHED(tp->t_state) ||
5727 #ifdef NETFLIX_SB_LIMITS
5728 u_int mcnt, appended;
5730 if (so->so_rcv.sb_shlim) {
5733 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5734 CFO_NOSLEEP, NULL) == false) {
5735 counter_u64_add(tcp_sb_shlim_fails, 1);
5741 if (DELAY_ACK(tp, tlen) || tfo_syn) {
5742 rack_timer_cancel(tp, rack,
5743 rack->r_ctl.rc_rcvtime, __LINE__);
5744 tp->t_flags |= TF_DELACK;
5746 rack->r_wanted_output++;
5747 tp->t_flags |= TF_ACKNOW;
5749 tp->rcv_nxt += tlen;
5750 thflags = th->th_flags & TH_FIN;
5751 TCPSTAT_ADD(tcps_rcvpack, nsegs);
5752 TCPSTAT_ADD(tcps_rcvbyte, tlen);
5753 SOCKBUF_LOCK(&so->so_rcv);
5754 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5757 #ifdef NETFLIX_SB_LIMITS
5760 sbappendstream_locked(&so->so_rcv, m, 0);
5761 /* NB: sorwakeup_locked() does an implicit unlock. */
5762 sorwakeup_locked(so);
5763 #ifdef NETFLIX_SB_LIMITS
5764 if (so->so_rcv.sb_shlim && appended != mcnt)
5765 counter_fo_release(so->so_rcv.sb_shlim,
5770 * XXX: Due to the header drop above "th" is
5771 * theoretically invalid by now. Fortunately
5772 * m_adj() doesn't actually free any mbufs when
5773 * trimming from the head.
5775 tcp_seq temp = save_start;
5776 thflags = tcp_reass(tp, th, &temp, &tlen, m);
5777 tp->t_flags |= TF_ACKNOW;
5779 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
5780 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
5782 * DSACK actually handled in the fastpath
5785 tcp_update_sack_list(tp, save_start,
5786 save_start + save_tlen);
5787 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
5788 if ((tp->rcv_numsacks >= 1) &&
5789 (tp->sackblks[0].end == save_start)) {
5791 * Partial overlap, recorded at todrop
5794 tcp_update_sack_list(tp,
5795 tp->sackblks[0].start,
5796 tp->sackblks[0].end);
5798 tcp_update_dsack_list(tp, save_start,
5799 save_start + save_tlen);
5801 } else if (tlen >= save_tlen) {
5802 /* Update of sackblks. */
5803 tcp_update_dsack_list(tp, save_start,
5804 save_start + save_tlen);
5805 } else if (tlen > 0) {
5806 tcp_update_dsack_list(tp, save_start,
5816 * If FIN is received ACK the FIN and let the user know that the
5817 * connection is closing.
5819 if (thflags & TH_FIN) {
5820 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5823 * If connection is half-synchronized (i.e. NEEDSYN
5824 * flag on) then delay ACK, so it may be piggybacked
5825 * when SYN is sent. Otherwise, since we received a
5826 * FIN then no more input can be expected, send ACK
5829 if (tp->t_flags & TF_NEEDSYN) {
5830 rack_timer_cancel(tp, rack,
5831 rack->r_ctl.rc_rcvtime, __LINE__);
5832 tp->t_flags |= TF_DELACK;
5834 tp->t_flags |= TF_ACKNOW;
5838 switch (tp->t_state) {
5841 * In SYN_RECEIVED and ESTABLISHED STATES enter the
5844 case TCPS_SYN_RECEIVED:
5845 tp->t_starttime = ticks;
5847 case TCPS_ESTABLISHED:
5848 rack_timer_cancel(tp, rack,
5849 rack->r_ctl.rc_rcvtime, __LINE__);
5850 tcp_state_change(tp, TCPS_CLOSE_WAIT);
5854 * If still in FIN_WAIT_1 STATE FIN has not been
5855 * acked so enter the CLOSING state.
5857 case TCPS_FIN_WAIT_1:
5858 rack_timer_cancel(tp, rack,
5859 rack->r_ctl.rc_rcvtime, __LINE__);
5860 tcp_state_change(tp, TCPS_CLOSING);
5864 * In FIN_WAIT_2 state enter the TIME_WAIT state,
5865 * starting the time-wait timer, turning off the
5866 * other standard timers.
5868 case TCPS_FIN_WAIT_2:
5869 rack_timer_cancel(tp, rack,
5870 rack->r_ctl.rc_rcvtime, __LINE__);
5876 * Return any desired output.
5878 if ((tp->t_flags & TF_ACKNOW) ||
5879 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
5880 rack->r_wanted_output++;
5882 INP_WLOCK_ASSERT(tp->t_inpcb);
5887 * Here nothing is really faster, it's just that we
5888 * have broken out the fast-data path also just like
5892 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
5893 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5894 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
5897 int32_t newsize = 0; /* automatic sockbuf scaling */
5898 struct tcp_rack *rack;
5899 #ifdef NETFLIX_SB_LIMITS
5900 u_int mcnt, appended;
5904 * The size of tcp_saveipgen must be the size of the max ip header,
5907 u_char tcp_saveipgen[IP6_HDR_LEN];
5908 struct tcphdr tcp_savetcp;
5913 * If last ACK falls within this segment's sequence numbers, record
5914 * the timestamp. NOTE that the test is modified according to the
5915 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5917 if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5920 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5923 if (tiwin && tiwin != tp->snd_wnd) {
5926 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5929 if (__predict_false((to->to_flags & TOF_TS) &&
5930 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5933 if (__predict_false((th->th_ack != tp->snd_una))) {
5936 if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5939 if ((to->to_flags & TOF_TS) != 0 &&
5940 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5941 tp->ts_recent_age = tcp_ts_getticks();
5942 tp->ts_recent = to->to_tsval;
5944 rack = (struct tcp_rack *)tp->t_fb_ptr;
5946 * This is a pure, in-sequence data packet with nothing on the
5947 * reassembly queue and we have enough buffer space to take it.
5949 nsegs = max(1, m->m_pkthdr.lro_nsegs);
5951 #ifdef NETFLIX_SB_LIMITS
5952 if (so->so_rcv.sb_shlim) {
5955 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5956 CFO_NOSLEEP, NULL) == false) {
5957 counter_u64_add(tcp_sb_shlim_fails, 1);
5963 /* Clean receiver SACK report if present */
5964 if (tp->rcv_numsacks)
5965 tcp_clean_sackreport(tp);
5966 TCPSTAT_INC(tcps_preddat);
5967 tp->rcv_nxt += tlen;
5969 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5971 tp->snd_wl1 = th->th_seq;
5973 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5975 tp->rcv_up = tp->rcv_nxt;
5976 TCPSTAT_ADD(tcps_rcvpack, nsegs);
5977 TCPSTAT_ADD(tcps_rcvbyte, tlen);
5979 if (so->so_options & SO_DEBUG)
5980 tcp_trace(TA_INPUT, ostate, tp,
5981 (void *)tcp_saveipgen, &tcp_savetcp, 0);
5983 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5985 /* Add data to socket buffer. */
5986 SOCKBUF_LOCK(&so->so_rcv);
5987 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5991 * Set new socket buffer size. Give up when limit is reached.
5995 if (!sbreserve_locked(&so->so_rcv,
5997 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5998 m_adj(m, drop_hdrlen); /* delayed header drop */
5999 #ifdef NETFLIX_SB_LIMITS
6002 sbappendstream_locked(&so->so_rcv, m, 0);
6003 ctf_calc_rwin(so, tp);
6005 /* NB: sorwakeup_locked() does an implicit unlock. */
6006 sorwakeup_locked(so);
6007 #ifdef NETFLIX_SB_LIMITS
6008 if (so->so_rcv.sb_shlim && mcnt != appended)
6009 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
6011 if (DELAY_ACK(tp, tlen)) {
6012 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6013 tp->t_flags |= TF_DELACK;
6015 tp->t_flags |= TF_ACKNOW;
6016 rack->r_wanted_output++;
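/*
 * With nothing left outstanding, the SACK filter (when enabled) can
 * be reset so stale blocks are not matched against future sends.
 */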
6018 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
6019 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
6024 * This subfunction is used to try to highly optimize the
6025 * fast path. We again allow window updates that are
6026 * in sequence to remain in the fast-path. We also add
6027 * in the __predict's to attempt to help the compiler.
6028 * Note that if we return a 0, then we can *not* process
6029 * it and the caller should push the packet into the slow path.
6033 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6034 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6035 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts, uint8_t iptos)
6042 * The size of tcp_saveipgen must be the size of the max ip header, now IPv6.
6045 u_char tcp_saveipgen[IP6_HDR_LEN];
6046 struct tcphdr tcp_savetcp;
6050 struct tcp_rack *rack;
6052 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
6053 /* Old ack, behind (or duplicate to) the last one rcv'd */
6056 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
6057 /* Above what we have sent? */
6060 if (__predict_false(tp->snd_nxt != tp->snd_max)) {
6061 /* We are retransmitting */
6064 if (__predict_false(tiwin == 0)) {
6068 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
6069 /* We need a SYN or a FIN, unlikely.. */
6072 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
6073 /* Timestamp is behind .. old ack with seq wrap? */
6076 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
6077 /* Still recovering */
6080 rack = (struct tcp_rack *)tp->t_fb_ptr;
6081 if (rack->r_ctl.rc_sacked) {
6082 /* We have sack holes on our scoreboard */
6085 /* Ok if we reach here, we can process a fast-ack */
6086 nsegs = max(1, m->m_pkthdr.lro_nsegs);
6087 rack_log_ack(tp, to, th);
6089 * We made progress, clear the tlp
6090 * out flag so we could start a TLP again.
6093 rack->r_ctl.rc_tlp_rtx_out = 0;
6094 /* Did the window get updated? */
6095 if (tiwin != tp->snd_wnd) {
6096 tp->snd_wnd = tiwin;
6097 tp->snd_wl1 = th->th_seq;
6098 if (tp->snd_wnd > tp->max_sndwnd)
6099 tp->max_sndwnd = tp->snd_wnd;
6101 /* Do we exit persists? */
6102 if ((rack->rc_in_persist != 0) &&
6103 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
6104 rack->r_ctl.rc_pace_min_segs))) {
6105 rack_exit_persist(tp, rack);
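/*
 * The advertised window has re-opened to at least half the largest
 * rwnd seen (or one pacing segment), so persist mode ends here.
 */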
6107 /* Do we enter persists? */
6108 if ((rack->rc_in_persist == 0) &&
6109 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
6110 TCPS_HAVEESTABLISHED(tp->t_state) &&
6111 (tp->snd_max == tp->snd_una) &&
6112 sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
6113 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
6115 * Here the rwnd is less than
6116 * the pacing size, we are established,
6117 * nothing is outstanding, and there is
6118 * data to send. Enter persists.
6120 tp->snd_nxt = tp->snd_una;
6121 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
6124 * If last ACK falls within this segment's sequence numbers, record
6125 * the timestamp. NOTE that the test is modified according to the
6126 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
6128 if ((to->to_flags & TOF_TS) != 0 &&
6129 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
6130 tp->ts_recent_age = tcp_ts_getticks();
6131 tp->ts_recent = to->to_tsval;
6134 * This is a pure ack for outstanding data.
6136 TCPSTAT_INC(tcps_predack);
6139 * "bad retransmit" recovery.
6141 if (tp->t_flags & TF_PREVVALID) {
6142 tp->t_flags &= ~TF_PREVVALID;
6143 if (tp->t_rxtshift == 1 &&
6144 (int)(ticks - tp->t_badrxtwin) < 0)
6145 rack_cong_signal(tp, th, CC_RTO_ERR);
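/*
 * The lone RTO retransmission was acked inside the bad-retransmit
 * window, so it was likely spurious; CC_RTO_ERR lets the congestion
 * code back out the RTO reaction.
 */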
6148 * Recalculate the transmit timer / rtt.
6150 * Some boxes send broken timestamp replies during the SYN+ACK
6151 * phase, ignore timestamps of 0 or we could calculate a huge RTT
6152 * and blow up the retransmit timer.
6154 acked = BYTES_THIS_ACK(tp, th);
6157 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
6158 hhook_run_tcp_est_in(tp, th, to);
6161 TCPSTAT_ADD(tcps_rcvackpack, nsegs);
6162 TCPSTAT_ADD(tcps_rcvackbyte, acked);
6163 sbdrop(&so->so_snd, acked);
6165 * Let the congestion control algorithm update congestion control
6166 * related information. This typically means increasing the
6167 * congestion window.
6169 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
6171 tp->snd_una = th->th_ack;
6172 if (tp->snd_wnd < ctf_outstanding(tp)) {
6173 /* The peer collapsed the window */
6174 rack_collapsed_window(rack);
6175 } else if (rack->rc_has_collapsed)
6176 rack_un_collapse_window(rack);
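/*
 * A window smaller than the amount outstanding means the peer
 * collapsed its window; track that, and clear the mark once it
 * re-opens.
 */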
6179 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
6181 tp->snd_wl2 = th->th_ack;
6184 /* ND6_HINT(tp); *//* Some progress has been made. */
6187 * If all outstanding data are acked, stop retransmit timer,
6188 * otherwise restart timer using current (possibly backed-off)
6189 * value. If process is waiting for space, wakeup/selwakeup/signal.
6190 * If data are ready to send, let tcp_output decide between more
6191 * output or persist.
6194 if (so->so_options & SO_DEBUG)
6195 tcp_trace(TA_INPUT, ostate, tp,
6196 (void *)tcp_saveipgen,
6199 if (tp->snd_una == tp->snd_max) {
6200 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
6201 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
6203 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
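/*
 * Everything sent has been acked: the progress deadline is cleared
 * and the running timer cancelled.
 */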
6205 /* Wake up the socket if we have room to write more */
6207 if (sbavail(&so->so_snd)) {
6208 rack->r_wanted_output++;
6214 * Return value of 1, the TCB is unlocked and most
6215 * likely gone, return value of 0, the TCP is still locked.
6219 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
6220 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6221 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t tos)
6223 int32_t ret_val = 0;
6225 int32_t ourfinisacked = 0;
6226 struct tcp_rack *rack;
6228 ctf_calc_rwin(so, tp);
6230 * If the state is SYN_SENT: if seg contains an ACK, but not for our
6231 * SYN, drop the input. if seg contains a RST, then drop the
6232 * connection. if seg does not contain SYN, then drop it. Otherwise
6233 * this is an acceptable SYN segment initialize tp->rcv_nxt and
6234 * tp->irs if seg contains ack then advance tp->snd_una if seg
6235 * contains an ECE and ECN support is enabled, the stream is ECN
6236 * capable. if SYN has been acked change to ESTABLISHED else
6237 * SYN_RCVD state arrange for segment to be acked (eventually)
6238 * continue processing rest of data/controls, beginning with URG
6240 if ((thflags & TH_ACK) &&
6241 (SEQ_LEQ(th->th_ack, tp->iss) ||
6242 SEQ_GT(th->th_ack, tp->snd_max))) {
6243 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6246 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
6247 TCP_PROBE5(connect__refused, NULL, tp,
6248 mtod(m, const char *), tp, th);
6249 tp = tcp_drop(tp, ECONNREFUSED);
6253 if (thflags & TH_RST) {
6257 if (!(thflags & TH_SYN)) {
6261 tp->irs = th->th_seq;
6263 rack = (struct tcp_rack *)tp->t_fb_ptr;
6264 if (thflags & TH_ACK) {
6265 int tfo_partial = 0;
6267 TCPSTAT_INC(tcps_connects);
6270 mac_socketpeer_set_from_mbuf(m, so);
6272 /* Do window scaling on this connection? */
6273 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6274 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6275 tp->rcv_scale = tp->request_r_scale;
6277 tp->rcv_adv += min(tp->rcv_wnd,
6278 TCP_MAXWIN << tp->rcv_scale);
6280 * If not all the data that was sent in the TFO SYN
6281 * has been acked, resend the remainder right away.
6283 if (IS_FASTOPEN(tp->t_flags) &&
6284 (tp->snd_una != tp->snd_max)) {
6285 tp->snd_nxt = th->th_ack;
6289 * If there's data, delay ACK; if there's also a FIN ACKNOW
6290 * will be turned on later.
6292 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
6293 rack_timer_cancel(tp, rack,
6294 rack->r_ctl.rc_rcvtime, __LINE__);
6295 tp->t_flags |= TF_DELACK;
6297 rack->r_wanted_output++;
6298 tp->t_flags |= TF_ACKNOW;
6301 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
6302 (V_tcp_do_ecn == 1)) {
6303 tp->t_flags2 |= TF2_ECN_PERMIT;
6304 TCPSTAT_INC(tcps_ecn_shs);
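/*
 * The SYN|ACK carried ECE without CWR, so the peer negotiated ECN;
 * the connection is marked ECN-capable.
 */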
6306 if (SEQ_GT(th->th_ack, tp->snd_una)) {
6308 * We advance snd_una for the
6309 * fast open case. If th_ack is
6310 * acknowledging data beyond
6311 * snd_una we can't just call
6312 * ack-processing since the
6313 * data stream in our send-map
6314 * will start at snd_una + 1 (one
6315 * beyond the SYN). If it's just
6316 * equal we don't need to do that
6317 * and there is no send_map.
6322 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
6323 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
6325 tp->t_starttime = ticks;
6326 if (tp->t_flags & TF_NEEDFIN) {
6327 tcp_state_change(tp, TCPS_FIN_WAIT_1);
6328 tp->t_flags &= ~TF_NEEDFIN;
6331 tcp_state_change(tp, TCPS_ESTABLISHED);
6332 TCP_PROBE5(connect__established, NULL, tp,
6333 mtod(m, const char *), tp, th);
6338 * Received initial SYN in SYN-SENT[*] state => simultaneous
6339 * open. If segment contains CC option and there is a
6340 * cached CC, apply TAO test. If it succeeds, connection is *
6341 * half-synchronized. Otherwise, do 3-way handshake:
6342 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
6343 * there was no CC option, clear cached CC value.
6345 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
6346 tcp_state_change(tp, TCPS_SYN_RECEIVED);
6348 INP_WLOCK_ASSERT(tp->t_inpcb);
6350 * Advance th->th_seq to correspond to first data byte. If data,
6351 * trim to stay within window, dropping FIN if necessary.
6354 if (tlen > tp->rcv_wnd) {
6355 todrop = tlen - tp->rcv_wnd;
6359 TCPSTAT_INC(tcps_rcvpackafterwin);
6360 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
6362 tp->snd_wl1 = th->th_seq - 1;
6363 tp->rcv_up = th->th_seq;
6365 * Client side of transaction: already sent SYN and data. If the
6366 * remote host used T/TCP to validate the SYN, our data will be
6367 * ACK'd; if so, enter normal data segment processing in the middle
6368 * of step 5, ack processing. Otherwise, goto step 6.
6370 if (thflags & TH_ACK) {
6371 /* For syn-sent we need to possibly update the rtt */
6372 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6375 t = tcp_ts_getticks() - to->to_tsecr;
6376 if (!tp->t_rttlow || tp->t_rttlow > t)
6378 tcp_rack_xmit_timer(rack, t + 1);
6379 tcp_rack_xmit_timer_commit(rack, tp);
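/* The echoed timestamp provides an initial RTT sample for this connection. */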
6381 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
6383 /* We may have changed to FIN_WAIT_1 above */
6384 if (tp->t_state == TCPS_FIN_WAIT_1) {
6386 * In FIN_WAIT_1 STATE in addition to the processing
6387 * for the ESTABLISHED state if our FIN is now
6388 * acknowledged then enter FIN_WAIT_2.
6390 if (ourfinisacked) {
6392 * If we can't receive any more data, then
6393 * closing user can proceed. Starting the
6394 * timer is contrary to the specification,
6395 * but if we don't get a FIN we'll hang forever.
6398 * XXXjl: we should release the tp also, and
6399 * use a compressed state.
6401 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6402 soisdisconnected(so);
6403 tcp_timer_activate(tp, TT_2MSL,
6404 (tcp_fast_finwait2_recycle ?
6405 tcp_finwait2_timeout :
6408 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6412 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6413 tiwin, thflags, nxt_pkt));
6417 * Return value of 1, the TCB is unlocked and most
6418 * likely gone, return value of 0, the TCP is still locked.
6422 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
6423 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6424 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6426 struct tcp_rack *rack;
6427 int32_t ret_val = 0;
6428 int32_t ourfinisacked = 0;
6430 ctf_calc_rwin(so, tp);
6431 if ((thflags & TH_ACK) &&
6432 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
6433 SEQ_GT(th->th_ack, tp->snd_max))) {
6434 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6437 rack = (struct tcp_rack *)tp->t_fb_ptr;
6438 if (IS_FASTOPEN(tp->t_flags)) {
6440 * When a TFO connection is in SYN_RECEIVED, the
6441 * only valid packets are the initial SYN, a
6442 * retransmit/copy of the initial SYN (possibly with
6443 * a subset of the original data), a valid ACK, a FIN, or a RST.
6446 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
6447 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6449 } else if (thflags & TH_SYN) {
6450 /* non-initial SYN is ignored */
6451 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
6452 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
6453 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
6454 ctf_do_drop(m, NULL);
6457 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
6458 ctf_do_drop(m, NULL);
6462 if ((thflags & TH_RST) ||
6463 (tp->t_fin_is_rst && (thflags & TH_FIN)))
6464 return (ctf_process_rst(m, th, so, tp));
6466 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6467 * it's less than ts_recent, drop it.
6469 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6470 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6471 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6475 * In the SYN-RECEIVED state, validate that the packet belongs to
6476 * this connection before trimming the data to fit the receive
6477 * window. Check the sequence number versus IRS since we know the
6478 * sequence numbers haven't wrapped. This is a partial fix for the
6479 * "LAND" DoS attack.
6481 if (SEQ_LT(th->th_seq, tp->irs)) {
6482 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6485 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6489 * If last ACK falls within this segment's sequence numbers, record
6490 * its timestamp. NOTE: 1) That the test incorporates suggestions
6491 * from the latest proposal of the tcplw@cray.com list (Braden
6492 * 1993/04/26). 2) That updating only on newer timestamps interferes
6493 * with our earlier PAWS tests, so this check should be solely
6494 * predicated on the sequence space of this segment. 3) That we
6495 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6496 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6497 * SEG.Len, This modified check allows us to overcome RFC1323's
6498 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6499 * p.869. In such cases, we can still calculate the RTT correctly
6500 * when RCV.NXT == Last.ACK.Sent.
6502 if ((to->to_flags & TOF_TS) != 0 &&
6503 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6504 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6505 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6506 tp->ts_recent_age = tcp_ts_getticks();
6507 tp->ts_recent = to->to_tsval;
6509 tp->snd_wnd = tiwin;
6511 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
6512 * is on (half-synchronized state), then queue data for later
6513 * processing; else drop segment and return.
6515 if ((thflags & TH_ACK) == 0) {
6516 if (IS_FASTOPEN(tp->t_flags)) {
6519 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6520 tiwin, thflags, nxt_pkt));
6522 TCPSTAT_INC(tcps_connects);
6524 /* Do window scaling? */
6525 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6526 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6527 tp->rcv_scale = tp->request_r_scale;
6530 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> FIN-WAIT-1
6533 tp->t_starttime = ticks;
6534 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
6535 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
6536 tp->t_tfo_pending = NULL;
6539 * Account for the ACK of our SYN prior to
6540 * regular ACK processing below.
6544 if (tp->t_flags & TF_NEEDFIN) {
6545 tcp_state_change(tp, TCPS_FIN_WAIT_1);
6546 tp->t_flags &= ~TF_NEEDFIN;
6548 tcp_state_change(tp, TCPS_ESTABLISHED);
6549 TCP_PROBE5(accept__established, NULL, tp,
6550 mtod(m, const char *), tp, th);
6552 * TFO connections call cc_conn_init() during SYN
6553 * processing. Calling it again here for such connections
6554 * is not harmless as it would undo the snd_cwnd reduction
6555 * that occurs when a TFO SYN|ACK is retransmitted.
6557 if (!IS_FASTOPEN(tp->t_flags))
6561 * If segment contains data or ACK, will call tcp_reass() later; if
6562 * not, do so now to pass queued data to user.
6564 if (tlen == 0 && (thflags & TH_FIN) == 0)
6565 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
6567 tp->snd_wl1 = th->th_seq - 1;
6568 /* For syn-recv we need to possibly update the rtt */
6569 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6572 t = tcp_ts_getticks() - to->to_tsecr;
6573 if (!tp->t_rttlow || tp->t_rttlow > t)
6575 tcp_rack_xmit_timer(rack, t + 1);
6576 tcp_rack_xmit_timer_commit(rack, tp);
6578 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6581 if (tp->t_state == TCPS_FIN_WAIT_1) {
6582 /* We could have gone to FIN_WAIT_1 (or EST) above */
6584 * In FIN_WAIT_1 STATE in addition to the processing for the
6585 * ESTABLISHED state if our FIN is now acknowledged then
6588 if (ourfinisacked) {
6590 * If we can't receive any more data, then closing
6591 * user can proceed. Starting the timer is contrary
6592 * to the specification, but if we don't get a FIN
6593 * we'll hang forever.
6595 * XXXjl: we should release the tp also, and use a compressed state.
6598 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6599 soisdisconnected(so);
6600 tcp_timer_activate(tp, TT_2MSL,
6601 (tcp_fast_finwait2_recycle ?
6602 tcp_finwait2_timeout :
6605 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6608 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6609 tiwin, thflags, nxt_pkt));
6613 * Return value of 1, the TCB is unlocked and most
6614 * likely gone, return value of 0, the TCP is still locked.
6618 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
6619 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6620 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6622 int32_t ret_val = 0;
6625 * Header prediction: check for the two common cases of a
6626 * uni-directional data xfer. If the packet has no control flags,
6627 * is in-sequence, the window didn't change and we're not
6628 * retransmitting, it's a candidate. If the length is zero and the
6629 * ack moved forward, we're the sender side of the xfer. Just free
6630 * the data acked & wake any higher level process that was blocked
6631 * waiting for space. If the length is non-zero and the ack didn't
6632 * move, we're the receiver side. If we're getting packets in-order
6633 * (the reassembly queue is empty), add the data to the socket
6634 * buffer and note that we need a delayed ack. Make sure that the
6635 * hidden state-flags are also off. Since we check for
6636 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
6638 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
6639 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
6640 __predict_true(SEGQ_EMPTY(tp)) &&
6641 __predict_true(th->th_seq == tp->rcv_nxt)) {
6642 struct tcp_rack *rack;
6644 rack = (struct tcp_rack *)tp->t_fb_ptr;
6646 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
6647 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime, iptos)) {
6651 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
6652 tiwin, nxt_pkt, iptos)) {
6657 ctf_calc_rwin(so, tp);
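/*
 * Neither fast path consumed the segment (or they did not apply);
 * run the full validation below: RST/SYN handling, PAWS, trimming,
 * then ACK and data processing.
 */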
6659 if ((thflags & TH_RST) ||
6660 (tp->t_fin_is_rst && (thflags & TH_FIN)))
6661 return (ctf_process_rst(m, th, so, tp));
6664 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6665 * synchronized state.
6667 if (thflags & TH_SYN) {
6668 ctf_challenge_ack(m, th, tp, &ret_val);
6672 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6673 * it's less than ts_recent, drop it.
6675 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6676 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6677 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6680 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6684 * If last ACK falls within this segment's sequence numbers, record
6685 * its timestamp. NOTE: 1) That the test incorporates suggestions
6686 * from the latest proposal of the tcplw@cray.com list (Braden
6687 * 1993/04/26). 2) That updating only on newer timestamps interferes
6688 * with our earlier PAWS tests, so this check should be solely
6689 * predicated on the sequence space of this segment. 3) That we
6690 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6691 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6692 * SEG.Len, This modified check allows us to overcome RFC1323's
6693 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6694 * p.869. In such cases, we can still calculate the RTT correctly
6695 * when RCV.NXT == Last.ACK.Sent.
6697 if ((to->to_flags & TOF_TS) != 0 &&
6698 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6699 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6700 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6701 tp->ts_recent_age = tcp_ts_getticks();
6702 tp->ts_recent = to->to_tsval;
6705 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
6706 * is on (half-synchronized state), then queue data for later
6707 * processing; else drop segment and return.
6709 if ((thflags & TH_ACK) == 0) {
6710 if (tp->t_flags & TF_NEEDSYN) {
6712 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6713 tiwin, thflags, nxt_pkt));
6715 } else if (tp->t_flags & TF_ACKNOW) {
6716 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6717 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6720 ctf_do_drop(m, NULL);
6727 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6730 if (sbavail(&so->so_snd)) {
6731 if (rack_progress_timeout_check(tp)) {
6732 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6733 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6737 /* State changes only happen in rack_process_data() */
6738 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6739 tiwin, thflags, nxt_pkt));
6743 * Return value of 1, the TCB is unlocked and most
6744 * likely gone, return value of 0, the TCP is still locked.
6748 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
6749 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6750 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6752 int32_t ret_val = 0;
6754 ctf_calc_rwin(so, tp);
6755 if ((thflags & TH_RST) ||
6756 (tp->t_fin_is_rst && (thflags & TH_FIN)))
6757 return (ctf_process_rst(m, th, so, tp));
6759 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6760 * synchronized state.
6762 if (thflags & TH_SYN) {
6763 ctf_challenge_ack(m, th, tp, &ret_val);
6767 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6768 * it's less than ts_recent, drop it.
6770 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6771 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6772 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6775 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6779 * If last ACK falls within this segment's sequence numbers, record
6780 * its timestamp. NOTE: 1) That the test incorporates suggestions
6781 * from the latest proposal of the tcplw@cray.com list (Braden
6782 * 1993/04/26). 2) That updating only on newer timestamps interferes
6783 * with our earlier PAWS tests, so this check should be solely
6784 * predicated on the sequence space of this segment. 3) That we
6785 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6786 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6787 * SEG.Len, This modified check allows us to overcome RFC1323's
6788 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6789 * p.869. In such cases, we can still calculate the RTT correctly
6790 * when RCV.NXT == Last.ACK.Sent.
6792 if ((to->to_flags & TOF_TS) != 0 &&
6793 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6794 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6795 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6796 tp->ts_recent_age = tcp_ts_getticks();
6797 tp->ts_recent = to->to_tsval;
6800 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
6801 * is on (half-synchronized state), then queue data for later
6802 * processing; else drop segment and return.
6804 if ((thflags & TH_ACK) == 0) {
6805 if (tp->t_flags & TF_NEEDSYN) {
6806 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6807 tiwin, thflags, nxt_pkt));
6809 } else if (tp->t_flags & TF_ACKNOW) {
6810 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6811 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6814 ctf_do_drop(m, NULL);
6821 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6824 if (sbavail(&so->so_snd)) {
6825 if (rack_progress_timeout_check(tp)) {
6826 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6827 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6831 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6832 tiwin, thflags, nxt_pkt));
6836 rack_check_data_after_close(struct mbuf *m,
6837 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
6839 struct tcp_rack *rack;
6841 rack = (struct tcp_rack *)tp->t_fb_ptr;
6842 if (rack->rc_allow_data_af_clo == 0) {
6845 TCPSTAT_INC(tcps_rcvafterclose);
6846 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
6849 if (sbavail(&so->so_snd) == 0)
6851 /* Ok we allow data that is ignored and a followup reset */
6852 tp->rcv_nxt = th->th_seq + *tlen;
6853 tp->t_flags2 |= TF2_DROP_AF_DATA;
6854 rack->r_wanted_output = 1;
6860 * Return value of 1, the TCB is unlocked and most
6861 * likely gone, return value of 0, the TCP is still locked.
6865 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
6866 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6867 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6869 int32_t ret_val = 0;
6870 int32_t ourfinisacked = 0;
6872 ctf_calc_rwin(so, tp);
6874 if ((thflags & TH_RST) ||
6875 (tp->t_fin_is_rst && (thflags & TH_FIN)))
6876 return (ctf_process_rst(m, th, so, tp));
6878 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6879 * synchronized state.
6881 if (thflags & TH_SYN) {
6882 ctf_challenge_ack(m, th, tp, &ret_val);
6886 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6887 * it's less than ts_recent, drop it.
6889 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6890 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6891 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6894 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6898 * If new data are received on a connection after the user processes
6899 * are gone, then RST the other end.
6901 if ((so->so_state & SS_NOFDREF) && tlen) {
6902 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6906 * If last ACK falls within this segment's sequence numbers, record
6907 * its timestamp. NOTE: 1) That the test incorporates suggestions
6908 * from the latest proposal of the tcplw@cray.com list (Braden
6909 * 1993/04/26). 2) That updating only on newer timestamps interferes
6910 * with our earlier PAWS tests, so this check should be solely
6911 * predicated on the sequence space of this segment. 3) That we
6912 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6913 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6914 * SEG.Len, This modified check allows us to overcome RFC1323's
6915 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6916 * p.869. In such cases, we can still calculate the RTT correctly
6917 * when RCV.NXT == Last.ACK.Sent.
6919 if ((to->to_flags & TOF_TS) != 0 &&
6920 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6921 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6922 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6923 tp->ts_recent_age = tcp_ts_getticks();
6924 tp->ts_recent = to->to_tsval;
6927 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
6928 * is on (half-synchronized state), then queue data for later
6929 * processing; else drop segment and return.
6931 if ((thflags & TH_ACK) == 0) {
6932 if (tp->t_flags & TF_NEEDSYN) {
6933 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6934 tiwin, thflags, nxt_pkt));
6935 } else if (tp->t_flags & TF_ACKNOW) {
6936 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6937 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6940 ctf_do_drop(m, NULL);
6947 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6950 if (ourfinisacked) {
6952 * If we can't receive any more data, then closing user can
6953 * proceed. Starting the timer is contrary to the
6954 * specification, but if we don't get a FIN we'll hang forever.
6957 * XXXjl: we should release the tp also, and use a compressed state.
6960 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6961 soisdisconnected(so);
6962 tcp_timer_activate(tp, TT_2MSL,
6963 (tcp_fast_finwait2_recycle ?
6964 tcp_finwait2_timeout :
6967 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6969 if (sbavail(&so->so_snd)) {
6970 if (rack_progress_timeout_check(tp)) {
6971 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6972 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6976 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6977 tiwin, thflags, nxt_pkt));
6981 * Return value of 1, the TCB is unlocked and most
6982 * likely gone, return value of 0, the TCP is still locked.
6986 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
6987 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6988 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6990 int32_t ret_val = 0;
6991 int32_t ourfinisacked = 0;
6993 ctf_calc_rwin(so, tp);
6995 if ((thflags & TH_RST) ||
6996 (tp->t_fin_is_rst && (thflags & TH_FIN)))
6997 return (ctf_process_rst(m, th, so, tp));
6999 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7000 * synchronized state.
7002 if (thflags & TH_SYN) {
7003 ctf_challenge_ack(m, th, tp, &ret_val);
7007 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7008 * it's less than ts_recent, drop it.
7010 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7011 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7012 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7015 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7019 * If new data are received on a connection after the user processes
7020 * are gone, then RST the other end.
7022 if ((so->so_state & SS_NOFDREF) && tlen) {
7023 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7027 * If last ACK falls within this segment's sequence numbers, record
7028 * its timestamp. NOTE: 1) That the test incorporates suggestions
7029 * from the latest proposal of the tcplw@cray.com list (Braden
7030 * 1993/04/26). 2) That updating only on newer timestamps interferes
7031 * with our earlier PAWS tests, so this check should be solely
7032 * predicated on the sequence space of this segment. 3) That we
7033 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7034 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7035 * SEG.Len, This modified check allows us to overcome RFC1323's
7036 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7037 * p.869. In such cases, we can still calculate the RTT correctly
7038 * when RCV.NXT == Last.ACK.Sent.
7040 if ((to->to_flags & TOF_TS) != 0 &&
7041 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7042 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7043 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7044 tp->ts_recent_age = tcp_ts_getticks();
7045 tp->ts_recent = to->to_tsval;
7048 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
7049 * is on (half-synchronized state), then queue data for later
7050 * processing; else drop segment and return.
7052 if ((thflags & TH_ACK) == 0) {
7053 if (tp->t_flags & TF_NEEDSYN) {
7054 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7055 tiwin, thflags, nxt_pkt));
7056 } else if (tp->t_flags & TF_ACKNOW) {
7057 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7058 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7061 ctf_do_drop(m, NULL);
7068 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7071 if (ourfinisacked) {
7076 if (sbavail(&so->so_snd)) {
7077 if (rack_progress_timeout_check(tp)) {
7078 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7079 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7083 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7084 tiwin, thflags, nxt_pkt));
7088 * Return value of 1, the TCB is unlocked and most
7089 * likely gone, return value of 0, the TCP is still locked.
7093 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
7094 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7095 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7097 int32_t ret_val = 0;
7098 int32_t ourfinisacked = 0;
7100 ctf_calc_rwin(so, tp);
7102 if ((thflags & TH_RST) ||
7103 (tp->t_fin_is_rst && (thflags & TH_FIN)))
7104 return (ctf_process_rst(m, th, so, tp));
7106 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7107 * synchronized state.
7109 if (thflags & TH_SYN) {
7110 ctf_challenge_ack(m, th, tp, &ret_val);
7114 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7115 * it's less than ts_recent, drop it.
7117 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7118 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7119 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7122 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7126 * If new data are received on a connection after the user processes
7127 * are gone, then RST the other end.
7129 if ((so->so_state & SS_NOFDREF) && tlen) {
7130 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7134 * If last ACK falls within this segment's sequence numbers, record
7135 * its timestamp. NOTE: 1) That the test incorporates suggestions
7136 * from the latest proposal of the tcplw@cray.com list (Braden
7137 * 1993/04/26). 2) That updating only on newer timestamps interferes
7138 * with our earlier PAWS tests, so this check should be solely
7139 * predicated on the sequence space of this segment. 3) That we
7140 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7141 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7142 * SEG.Len, This modified check allows us to overcome RFC1323's
7143 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7144 * p.869. In such cases, we can still calculate the RTT correctly
7145 * when RCV.NXT == Last.ACK.Sent.
7147 if ((to->to_flags & TOF_TS) != 0 &&
7148 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7149 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7150 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7151 tp->ts_recent_age = tcp_ts_getticks();
7152 tp->ts_recent = to->to_tsval;
7155 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
7156 * is on (half-synchronized state), then queue data for later
7157 * processing; else drop segment and return.
7159 if ((thflags & TH_ACK) == 0) {
7160 if (tp->t_flags & TF_NEEDSYN) {
7161 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7162 tiwin, thflags, nxt_pkt));
7163 } else if (tp->t_flags & TF_ACKNOW) {
7164 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7165 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7168 ctf_do_drop(m, NULL);
7173 * case TCPS_LAST_ACK: Ack processing.
7175 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7178 if (ourfinisacked) {
7183 if (sbavail(&so->so_snd)) {
7184 if (rack_progress_timeout_check(tp)) {
7185 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7186 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7190 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7191 tiwin, thflags, nxt_pkt));
7196 * Return value of 1, the TCB is unlocked and most
7197 * likely gone, return value of 0, the TCP is still locked.
7201 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
7202 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7203 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7205 int32_t ret_val = 0;
7206 int32_t ourfinisacked = 0;
7208 ctf_calc_rwin(so, tp);
7210 /* Reset receive buffer auto scaling when not in bulk receive mode. */
7211 if ((thflags & TH_RST) ||
7212 (tp->t_fin_is_rst && (thflags & TH_FIN)))
7213 return (ctf_process_rst(m, th, so, tp));
7215 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7216 * synchronized state.
7218 if (thflags & TH_SYN) {
7219 ctf_challenge_ack(m, th, tp, &ret_val);
7223 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7224 * it's less than ts_recent, drop it.
7226 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7227 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7228 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7231 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7235 * If new data are received on a connection after the user processes
7236 * are gone, then RST the other end.
7238 if ((so->so_state & SS_NOFDREF) &&
7240 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7244 * If last ACK falls within this segment's sequence numbers, record
7245 * its timestamp. NOTE: 1) That the test incorporates suggestions
7246 * from the latest proposal of the tcplw@cray.com list (Braden
7247 * 1993/04/26). 2) That updating only on newer timestamps interferes
7248 * with our earlier PAWS tests, so this check should be solely
7249 * predicated on the sequence space of this segment. 3) That we
7250 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7251 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7252 * SEG.Len, This modified check allows us to overcome RFC1323's
7253 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7254 * p.869. In such cases, we can still calculate the RTT correctly
7255 * when RCV.NXT == Last.ACK.Sent.
7257 if ((to->to_flags & TOF_TS) != 0 &&
7258 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7259 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7260 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7261 tp->ts_recent_age = tcp_ts_getticks();
7262 tp->ts_recent = to->to_tsval;
7265 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
7266 * is on (half-synchronized state), then queue data for later
7267 * processing; else drop segment and return.
7269 if ((thflags & TH_ACK) == 0) {
7270 if (tp->t_flags & TF_NEEDSYN) {
7271 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7272 tiwin, thflags, nxt_pkt));
7273 } else if (tp->t_flags & TF_ACKNOW) {
7274 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7275 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7278 ctf_do_drop(m, NULL);
7285 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7288 if (sbavail(&so->so_snd)) {
7289 if (rack_progress_timeout_check(tp)) {
7290 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7291 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7295 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7296 tiwin, thflags, nxt_pkt));
7301 rack_clear_rate_sample(struct tcp_rack *rack)
7303 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
7304 rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
7305 rack->r_ctl.rack_rs.rs_rtt_tot = 0;
7309 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack)
7311 uint32_t tls_seg = 0;
7314 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
7315 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd);
7316 rack->r_ctl.rc_pace_min_segs = tls_seg;
7319 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
7320 rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs;
7321 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES)
7322 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
7325 if (rack_hw_tls_max_seg > 1) {
7326 rack->r_ctl.rc_pace_max_segs /= tls_seg;
7327 if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs)
7328 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
7330 rack->r_ctl.rc_pace_max_segs = 1;
7332 if (rack->r_ctl.rc_pace_max_segs == 0)
7333 rack->r_ctl.rc_pace_max_segs = 1;
7334 rack->r_ctl.rc_pace_max_segs *= tls_seg;
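/*
 * With ifnet TLS the pacing limits are measured in whole TLS records:
 * convert the byte limit to records, cap it at rack_hw_tls_max_seg,
 * and convert back to bytes.
 */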
7337 rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2);
7341 rack_init(struct tcpcb *tp)
7343 struct tcp_rack *rack = NULL;
7344 struct rack_sendmap *insret;
7346 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
7347 if (tp->t_fb_ptr == NULL) {
7349 * We need to allocate memory but can't. The INP and INP_INFO
7350 * locks are held and they are recursive (this happens during setup),
7351 * so a scheme to drop the locks fails :(
7356 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
7358 rack = (struct tcp_rack *)tp->t_fb_ptr;
7359 RB_INIT(&rack->r_ctl.rc_mtree);
7360 TAILQ_INIT(&rack->r_ctl.rc_free);
7361 TAILQ_INIT(&rack->r_ctl.rc_tmap);
7364 rack->rc_inp = tp->t_inpcb;
7366 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
7367 /* Probably not needed but let's be sure */
7368 rack_clear_rate_sample(rack);
7370 rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
7371 rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
7372 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
7373 rack->rc_pace_reduce = rack_slot_reduction;
7375 rack->use_rack_cheat = 1;
7376 if (V_tcp_delack_enabled)
7377 tp->t_delayed_ack = 1;
7379 tp->t_delayed_ack = 0;
7380 rack->rc_pace_max_segs = rack_hptsi_segments;
7381 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
7382 rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
7383 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
7384 rack->r_enforce_min_pace = rack_min_pace_time;
7385 rack->r_ctl.rc_prop_rate = rack_proportional_rate;
7386 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
7387 rack->r_ctl.rc_early_recovery = rack_early_recovery;
7388 rack->rc_always_pace = rack_pace_every_seg;
7389 rack_set_pace_segments(tp, rack);
7390 rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
7391 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
7392 rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
7393 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
7394 rack->r_ctl.rc_min_to = rack_min_to;
7395 rack->rack_per_of_gp = rack_per_of_gp;
7396 microuptime(&rack->r_ctl.rc_last_ack);
7397 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
7398 rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks();
7399 /* Do we force on detection? */
7400 #ifdef NETFLIX_EXP_DETECTION
7401 if (tcp_force_detection)
7402 rack->do_detection = 1;
7405 rack->do_detection = 0;
7406 if (tp->snd_una != tp->snd_max) {
7407 /* Create a send map for the current outstanding data */
7408 struct rack_sendmap *rsm;
7410 rsm = rack_alloc(rack);
7412 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7413 tp->t_fb_ptr = NULL;
7416 rsm->r_flags = RACK_OVERMAX;
7417 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
7419 rsm->r_rtr_bytes = 0;
7420 rsm->r_start = tp->snd_una;
7421 rsm->r_end = tp->snd_max;
7423 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7425 if (insret != NULL) {
7426 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
7430 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
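/*
 * The connection was handed to RACK with data already in flight; this
 * single sendmap entry spanning snd_una..snd_max lets the loss
 * detection and timer code account for it.
 */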
7433 rack_stop_all_timers(tp);
7434 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7439 rack_handoff_ok(struct tcpcb *tp)
7441 if ((tp->t_state == TCPS_CLOSED) ||
7442 (tp->t_state == TCPS_LISTEN)) {
7443 /* Sure no problem though it may not stick */
7446 if ((tp->t_state == TCPS_SYN_SENT) ||
7447 (tp->t_state == TCPS_SYN_RECEIVED)) {
7449 * We really don't know; you have to get to ESTAB or beyond to tell.
7454 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
7458 * If we reach here we don't do SACK on this connection so we can
7465 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
7468 struct tcp_rack *rack;
7469 struct rack_sendmap *rsm, *nrsm, *rm;
7471 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
7472 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
7474 rack = (struct tcp_rack *)tp->t_fb_ptr;
7476 tcp_log_flowend(tp);
7478 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
7479 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7482 panic("At fini, rack:%p rsm:%p rm:%p",
7486 uma_zfree(rack_zone, rsm);
7488 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7490 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
7491 uma_zfree(rack_zone, rsm);
7492 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7494 rack->rc_free_cnt = 0;
7495 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7496 tp->t_fb_ptr = NULL;
7498 /* Make sure snd_nxt is correctly set */
7499 tp->snd_nxt = tp->snd_max;
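/*
 * Select the per-state input handler; rack_do_segment_nounlock()
 * dispatches each incoming segment through rack->r_substate.
 */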
7504 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
7506 switch (tp->t_state) {
7508 rack->r_state = TCPS_SYN_SENT;
7509 rack->r_substate = rack_do_syn_sent;
7511 case TCPS_SYN_RECEIVED:
7512 rack->r_state = TCPS_SYN_RECEIVED;
7513 rack->r_substate = rack_do_syn_recv;
7515 case TCPS_ESTABLISHED:
7516 rack_set_pace_segments(tp, rack);
7517 rack->r_state = TCPS_ESTABLISHED;
7518 rack->r_substate = rack_do_established;
7520 case TCPS_CLOSE_WAIT:
7521 rack->r_state = TCPS_CLOSE_WAIT;
7522 rack->r_substate = rack_do_close_wait;
7524 case TCPS_FIN_WAIT_1:
7525 rack->r_state = TCPS_FIN_WAIT_1;
7526 rack->r_substate = rack_do_fin_wait_1;
7529 rack->r_state = TCPS_CLOSING;
7530 rack->r_substate = rack_do_closing;
7533 rack->r_state = TCPS_LAST_ACK;
7534 rack->r_substate = rack_do_lastack;
7536 case TCPS_FIN_WAIT_2:
7537 rack->r_state = TCPS_FIN_WAIT_2;
7538 rack->r_substate = rack_do_fin_wait_2;
7542 case TCPS_TIME_WAIT:
7550 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
7553 * We received an ack, and then did not
7554 * call send or were bounced out due to the
7555 * hpts was running. Now a timer is up as well, is
7556 * it the right timer?
7558 struct rack_sendmap *rsm;
7561 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7562 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
7564 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7565 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
7566 (tmr_up == PACE_TMR_RXT)) {
7567 /* Should be an RXT */
7571 /* Nothing outstanding? */
7572 if (tp->t_flags & TF_DELACK) {
7573 if (tmr_up == PACE_TMR_DELACK)
7574 /* We are supposed to have delayed ack up and we do */
7576 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
7578 * if we hit enobufs then we would expect the possibility
7579 * of nothing outstanding and the RXT up (and the hptsi timer).
7582 } else if (((V_tcp_always_keepalive ||
7583 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7584 (tp->t_state <= TCPS_CLOSING)) &&
7585 (tmr_up == PACE_TMR_KEEP) &&
7586 (tp->snd_max == tp->snd_una)) {
7587 /* We should have keep alive up and we do */
7591 if (SEQ_GT(tp->snd_max, tp->snd_una) &&
7592 ((tmr_up == PACE_TMR_TLP) ||
7593 (tmr_up == PACE_TMR_RACK) ||
7594 (tmr_up == PACE_TMR_RXT))) {
7596 * Either a Rack, TLP or RXT is fine if we
7597 * have outstanding data.
7600 } else if (tmr_up == PACE_TMR_DELACK) {
7602 * If the delayed ack was going to go off
7603 * before the rtx/tlp/rack timer were going to
7604 * expire, then that would be the timer in control.
7605 * Note we don't check the time here trusting the
7611 * Ok the timer originally started is not what we want now.
7612 * We will force the hpts to be stopped if any, and restart
7613 * with the slot set to what was in the saved slot.
7615 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
7616 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7620 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
7621 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
7622 int32_t nxt_pkt, struct timeval *tv)
7624 int32_t thflags, retval, did_out = 0;
7625 int32_t way_out = 0;
7629 struct tcp_rack *rack;
7630 struct rack_sendmap *rsm;
7631 int32_t prev_state = 0;
7633 if (m->m_flags & M_TSTMP_LRO) {
7634 tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7635 tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7637 cts = tcp_tv_to_mssectick(tv);
7638 rack = (struct tcp_rack *)tp->t_fb_ptr;
7640 kern_prefetch(rack, &prev_state);
7642 thflags = th->th_flags;
7645 INP_WLOCK_ASSERT(tp->t_inpcb);
7646 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
7648 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
7650 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
7651 union tcp_log_stackspecific log;
7654 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
7655 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
7656 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
7657 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
7658 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
7659 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
7660 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
7661 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
7662 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
7663 tlen, &log, true, &tv);
7665 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
7668 goto done_with_input;
7671 * If a segment with the ACK-bit set arrives in the SYN-SENT state
7672 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
7674 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
7675 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
7676 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7680 * Segment received on connection. Reset idle time and keep-alive
7681 * timer. XXX: This should be done after segment validation to
7682 * ignore broken/spoofed segs.
7684 if (tp->t_idle_reduce &&
7685 (tp->snd_max == tp->snd_una) &&
7686 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
7687 counter_u64_add(rack_input_idle_reduces, 1);
7688 rack_cc_after_idle(tp);
7690 tp->t_rcvtime = ticks;
7693 * Unscale the window into a 32-bit value. For the SYN_SENT state
7694 * the scale is zero.
7696 tiwin = th->th_win << tp->snd_scale;
7698 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
7700 if (tiwin > rack->r_ctl.rc_high_rwnd)
7701 rack->r_ctl.rc_high_rwnd = tiwin;
7703 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
7704 * this to occur after we've validated the segment.
7706 if (tp->t_flags2 & TF2_ECN_PERMIT) {
7707 if (thflags & TH_CWR) {
7708 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
7709 tp->t_flags |= TF_ACKNOW;
7711 switch (iptos & IPTOS_ECN_MASK) {
7713 tp->t_flags2 |= TF2_ECN_SND_ECE;
7714 TCPSTAT_INC(tcps_ecn_ce);
7716 case IPTOS_ECN_ECT0:
7717 TCPSTAT_INC(tcps_ecn_ect0);
7719 case IPTOS_ECN_ECT1:
7720 TCPSTAT_INC(tcps_ecn_ect1);
7724 /* Process a packet differently from RFC3168. */
7725 cc_ecnpkt_handler(tp, th, iptos);
7727 /* Congestion experienced. */
7728 if (thflags & TH_ECE) {
7729 rack_cong_signal(tp, th, CC_ECN);
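/*
 * ECE from the peer reports congestion on the path; hand it to the
 * congestion-control module as a congestion signal.
 */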
7733 * Parse options on any incoming segment.
7735 tcp_dooptions(&to, (u_char *)(th + 1),
7736 (th->th_off << 2) - sizeof(struct tcphdr),
7737 (thflags & TH_SYN) ? TO_SYN : 0);
7740 * If echoed timestamp is later than the current time, fall back to
7741 * non RFC1323 RTT calculation. Normalize timestamp if syncookies
7742 * were used when this connection was established.
7744 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
7745 to.to_tsecr -= tp->ts_offset;
7746 if (TSTMP_GT(to.to_tsecr, cts))
7750 * If its the first time in we need to take care of options and
7751 * verify we can do SACK for rack!
7753 if (rack->r_state == 0) {
7754 /* Should be init'd by rack_init() */
7755 KASSERT(rack->rc_inp != NULL,
7756 ("%s: rack->rc_inp unexpectedly NULL", __func__));
7757 if (rack->rc_inp == NULL) {
7758 rack->rc_inp = tp->t_inpcb;
7762 * Process options only when we get SYN/ACK back. The SYN
7763 * case for incoming connections is handled in tcp_syncache.
7764 * According to RFC1323 the window field in a SYN (i.e., a
7765 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
7766 * this is traditional behavior, may need to be cleaned up.
7768 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
7769 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
7770 if ((to.to_flags & TOF_SCALE) &&
7771 (tp->t_flags & TF_REQ_SCALE)) {
7772 tp->t_flags |= TF_RCVD_SCALE;
7773 tp->snd_scale = to.to_wscale;
7776 * Initial send window. It will be updated with the
7777 * next incoming segment to the scaled value.
7779 tp->snd_wnd = th->th_win;
7780 if (to.to_flags & TOF_TS) {
7781 tp->t_flags |= TF_RCVD_TSTMP;
7782 tp->ts_recent = to.to_tsval;
7783 tp->ts_recent_age = cts;
7785 if (to.to_flags & TOF_MSS)
7786 tcp_mss(tp, to.to_mss);
7787 if ((tp->t_flags & TF_SACK_PERMIT) &&
7788 (to.to_flags & TOF_SACKPERM) == 0)
7789 tp->t_flags &= ~TF_SACK_PERMIT;
7790 if (IS_FASTOPEN(tp->t_flags)) {
7791 if (to.to_flags & TOF_FASTOPEN) {
7794 if (to.to_flags & TOF_MSS)
7797 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
7801 tcp_fastopen_update_cache(tp, mss,
7802 to.to_tfo_len, to.to_tfo_cookie);
7804 tcp_fastopen_disable_path(tp);
7808 * At this point we are at the initial call. Here we decide
7809 * if we are doing RACK or not. We do this by seeing if
7810 * TF_SACK_PERMIT is set, if not rack is *not* possible and
7811 * we switch to the default code.
7813 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
7814 tcp_switch_back_to_default(tp);
7815 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
7820 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
7821 tcp_set_hpts(tp->t_inpcb);
7822 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
7825 * This is the one exception case where we set the rack state
7826 * always. All other times (timers etc) we must have a rack-state
7827 * set (so we assure we have done the checks above for SACK).
7829 memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval));
7830 rack->r_ctl.rc_rcvtime = cts;
7831 if (rack->r_state != tp->t_state)
7832 rack_set_state(tp, rack);
7833 if (SEQ_GT(th->th_ack, tp->snd_una) &&
7834 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
7835 kern_prefetch(rsm, &prev_state);
7836 prev_state = rack->r_state;
7837 rack->r_ctl.rc_tlp_send_cnt = 0;
7838 rack_clear_rate_sample(rack);
7839 retval = (*rack->r_substate) (m, th, so,
7840 tp, &to, drop_hdrlen,
7841 tlen, tiwin, thflags, nxt_pkt, iptos);
7843 if ((retval == 0) &&
7844 (tp->t_inpcb == NULL)) {
7845 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
7846 retval, tp, prev_state);
7851 * If retval is 1 the tcb is unlocked and most likely the tp
7854 INP_WLOCK_ASSERT(tp->t_inpcb);
7855 if (rack->set_pacing_done_a_iw == 0) {
7856 /* How much has been acked? */
7857 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
7858 /* We have enough to set in the pacing segment size */
7859 rack->set_pacing_done_a_iw = 1;
7860 rack_set_pace_segments(tp, rack);
7863 tcp_rack_xmit_timer_commit(rack, tp);
7864 if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) {
7865 if (rack->r_wanted_output != 0) {
7867 (void)tp->t_fb->tfb_tcp_output(tp);
7869 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
7871 if ((nxt_pkt == 0) &&
7872 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
7873 (SEQ_GT(tp->snd_max, tp->snd_una) ||
7874 (tp->t_flags & TF_DELACK) ||
7875 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7876 (tp->t_state <= TCPS_CLOSING)))) {
7877 /* We could not send (probably in the hpts but stopped the timer earlier)? */
7878 if ((tp->snd_max == tp->snd_una) &&
7879 ((tp->t_flags & TF_DELACK) == 0) &&
7880 (rack->rc_inp->inp_in_hpts) &&
7881 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
7882 /* keep-alive not needed if we are still queued for hptsi output */
7885 if (rack->rc_inp->inp_in_hpts) {
7886 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7887 counter_u64_add(rack_per_timer_hole, 1);
7889 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7892 } else if (nxt_pkt == 0) {
7893 /* Do we have the correct timer running? */
7894 rack_timer_audit(tp, rack, &so->so_snd);
7898 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
7900 rack->r_wanted_output = 0;
7902 if (tp->t_inpcb == NULL) {
7903 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
7905 retval, tp, prev_state);
7913 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
7914 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
7918 /* First let's see if we have old packets */
7920 if (ctf_do_queued_segments(so, tp, 1)) {
7925 if (m->m_flags & M_TSTMP_LRO) {
7926 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7927 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7929 /* Should not happen -- should we KASSERT instead? */
7932 if(rack_do_segment_nounlock(m, th, so, tp,
7933 drop_hdrlen, tlen, iptos, 0, &tv) == 0)
7934 INP_WUNLOCK(tp->t_inpcb);
7937 struct rack_sendmap *
7938 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
7940 struct rack_sendmap *rsm = NULL;
7942 uint32_t srtt = 0, thresh = 0, ts_low = 0;
7944 /* Return the next guy to be re-transmitted */
7945 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
7948 if (tp->t_flags & TF_SENTFIN) {
7949 /* retransmit the trailing FIN? */
7952 /* ok, let's look at this one */
7953 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7954 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
7957 rsm = rack_find_lowest_rsm(rack);
7962 if (rsm->r_flags & RACK_ACKED) {
7965 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
7966 /* It's not yet ready */
7969 srtt = rack_grab_rtt(tp, rack);
7970 idx = rsm->r_rtr_cnt - 1;
7971 ts_low = rsm->r_tim_lastsent[idx];
7972 thresh = rack_calc_thresh_rack(rack, srtt, tsused);
7973 if ((tsused == ts_low) ||
7974 (TSTMP_LT(tsused, ts_low))) {
7975 /* No time since sending */
7978 if ((tsused - ts_low) < thresh) {
7979 /* It has not been long enough yet */
7982 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
7983 ((rsm->r_flags & RACK_SACK_PASSED) &&
7984 (rack->sack_attack_disable == 0))) {
7986 * We have passed the dup-ack threshold <or>
7987 * a SACK has indicated this is missing.
7988 * Note that if you are a declared attacker
7989 * it is only the dup-ack threshold that
7990 * will cause retransmits.
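/*
 * For illustration (numbers invented, not from this code): with
 * srtt = 40 ms, rack_calc_thresh_rack() might yield a thresh near
 * 45 ms. A segment whose last (re)transmission was 50 ms ago passes
 * the time gates above and is then returned only if it has reached
 * DUP_ACK_THRESHOLD dup-acks, or has been passed by a SACK (the SACK
 * path being ignored while sack_attack_disable is set).
 */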
7992 /* log retransmit reason */
7993 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
8000 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len)
8004 if ((rack->rack_per_of_gp == 0) ||
8005 (rack->rc_always_pace == 0)) {
8007 * We use the most optimistic possible cwnd/srtt for
8008 * sending calculations. This will make our
8009 * calculation anticipate getting more through
8010 * more quickly than is really possible. That's OK; we don't want
8011 * the peer to have a gap in data sending.
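/*
 * For illustration (invented numbers): with rc_rack_min_rtt = 40 ms
 * and a largest cwnd of 65536 bytes, the computation below gives
 *
 *	tr_perms = 65536 / 40 = 1638 bytes per ms
 *
 * so a 16384-byte send is assigned a slot of 16384 / 1638 = 10 ms
 * before any rc_pace_reduce adjustment.
 */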
8013 uint32_t srtt, cwnd, tr_perms = 0;
8016 if (rack->r_ctl.rc_rack_min_rtt)
8017 srtt = rack->r_ctl.rc_rack_min_rtt;
8019 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8020 if (rack->r_ctl.rc_rack_largest_cwnd)
8021 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8023 cwnd = tp->snd_cwnd;
8024 tr_perms = cwnd / srtt;
8025 if (tr_perms == 0) {
8026 tr_perms = ctf_fixed_maxseg(tp);
8029 * Calculate how long this will take to drain. If
8030 * the calculation comes out to zero, that's OK; we
8031 * will use send_a_lot to possibly spin around for
8032 * more, increasing tot_len_this_send to the point
8033 * that it is going to require a pace, or we hit the
8034 * cwnd. In that case we are just waiting for
8037 slot = len / tr_perms;
8038 /* Now do we reduce the time so we don't run dry? */
8039 if (slot && rack->rc_pace_reduce) {
8042 reduce = (slot / rack->rc_pace_reduce);
8043 if (reduce < slot) {
8050 uint64_t bw_est, bw_raise, res, lentim;
8053 for (cnt=0; cnt<RACK_GP_HIST; cnt++) {
8054 if ((rack->r_ctl.rc_gp_hist_filled == 0) &&
8055 (rack->r_ctl.rc_gp_history[cnt] == 0))
8057 bw_est += rack->r_ctl.rc_gp_history[cnt];
8061 * No way yet to make a b/w estimate
8062 * (no goodput est yet).
8066 /* Convert to bytes per second */
8067 bw_est *= MSEC_IN_SECOND;
8069 * Now ratchet it up by our percentage. Note
8070 * that the minimum you can do is 1 which would
8071 * get you 101% of the average last N goodput estimates.
8072 * The max you can do is 256 which would yield you
8073 * 356% of the last N goodput estimates.
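/*
 * For illustration (invented numbers): per the note above,
 * rack_per_of_gp = 25 targets 125% of the averaged goodput. If the
 * averaged estimate works out to 1,250,000 bytes/sec, the raised rate
 * is 1,562,500 bytes/sec and a 14480-byte send gets
 *
 *	lentim = 14480 * MSEC_IN_SECOND = 14,480,000
 *	slot   = 14,480,000 / 1,562,500 ~= 9 ms
 */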
8075 bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp;
8077 /* average by the number we added */
8079 /* Now calculate a rate based on this b/w */
8080 lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND;
8081 res = lentim / bw_est;
8082 slot = (uint32_t)res;
8084 if (rack->r_enforce_min_pace &&
8086 /* We are enforcing a minimum pace time of 1ms */
8087 slot = rack->r_enforce_min_pace;
8090 counter_u64_add(rack_calc_nonzero, 1);
8092 counter_u64_add(rack_calc_zero, 1);
8097 rack_output(struct tcpcb *tp)
8100 uint32_t recwin, sendwin;
8102 int32_t len, flags, error = 0;
8105 uint32_t if_hw_tsomaxsegcount = 0;
8106 uint32_t if_hw_tsomaxsegsize = 0;
8108 long tot_len_this_send = 0;
8109 struct ip *ip = NULL;
8111 struct ipovly *ipov = NULL;
8113 struct udphdr *udp = NULL;
8114 struct tcp_rack *rack;
8117 uint8_t wanted_cookie = 0;
8118 u_char opt[TCP_MAXOLEN];
8119 unsigned ipoptlen, optlen, hdrlen, ulen=0;
8122 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8123 unsigned ipsec_optlen = 0;
8126 int32_t idle, sendalot;
8127 int32_t sub_from_prr = 0;
8128 volatile int32_t sack_rxmit;
8129 struct rack_sendmap *rsm = NULL;
8133 int32_t sup_rack = 0;
8135 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
8136 int32_t do_a_prefetch;
8137 int32_t prefetch_rsm = 0;
8140 int32_t prefetch_so_done = 0;
8141 struct tcp_log_buffer *lgb = NULL;
8145 struct ip6_hdr *ip6 = NULL;
8148 uint8_t filled_all = 0;
8149 bool hw_tls = false;
8151 /* setup and take the cache hits here */
8152 rack = (struct tcp_rack *)tp->t_fb_ptr;
8154 so = inp->inp_socket;
8156 kern_prefetch(sb, &do_a_prefetch);
8160 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
8164 INP_WLOCK_ASSERT(inp);
8167 if (tp->t_flags & TF_TOE)
8168 return (tcp_offload_output(tp));
8170 maxseg = ctf_fixed_maxseg(tp);
8172 * For TFO connections in SYN_RECEIVED, only allow the initial
8173 * SYN|ACK and those sent by the retransmit timer.
8175 if (IS_FASTOPEN(tp->t_flags) &&
8176 (tp->t_state == TCPS_SYN_RECEIVED) &&
8177 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
8178 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */
8181 if (rack->r_state) {
8182 /* Use the cache line loaded if possible */
8183 isipv6 = rack->r_is_v6;
8185 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
8188 cts = tcp_ts_getticks();
8189 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
8192 * We are on the hpts for some timer but not hptsi output.
8193 * Remove from the hpts unconditionally.
8195 rack_timer_cancel(tp, rack, cts, __LINE__);
8197 /* Mark that we have called rack_output(). */
8198 if ((rack->r_timer_override) ||
8199 (tp->t_flags & TF_FORCEDATA) ||
8200 (tp->t_state < TCPS_ESTABLISHED)) {
8201 if (tp->t_inpcb->inp_in_hpts)
8202 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
8203 } else if (tp->t_inpcb->inp_in_hpts) {
8205 * While on the hpts we can't send now even if ACKNOW is on; we will
8206 * send when the hpts fires.
8208 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
8211 hpts_calling = inp->inp_hpts_calls;
8212 inp->inp_hpts_calls = 0;
8213 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8214 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
8215 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
8219 rack->r_wanted_output = 0;
8220 rack->r_timer_override = 0;
8222 * For TFO connections in SYN_SENT or SYN_RECEIVED,
8223 * only allow the initial SYN or SYN|ACK and those sent
8224 * by the retransmit timer.
8226 if (IS_FASTOPEN(tp->t_flags) &&
8227 ((tp->t_state == TCPS_SYN_RECEIVED) ||
8228 (tp->t_state == TCPS_SYN_SENT)) &&
8229 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
8230 (tp->t_rxtshift == 0)) /* not a retransmit */
8233 * Determine length of data that should be transmitted, and flags
8234 * that will be used. If there is some data or critical controls
8235 * (SYN, RST) to send, then transmit; otherwise, investigate
8238 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
8239 if (tp->t_idle_reduce) {
8240 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
8241 rack_cc_after_idle(tp);
8243 tp->t_flags &= ~TF_LASTIDLE;
8245 if (tp->t_flags & TF_MORETOCOME) {
8246 tp->t_flags |= TF_LASTIDLE;
8252 * If we've recently taken a timeout, snd_max will be greater than
8253 * snd_nxt. There may be SACK information that allows us to avoid
8254 * resending already delivered data. Adjust snd_nxt accordingly.
8257 cts = tcp_ts_getticks();
8260 sb_offset = tp->snd_max - tp->snd_una;
8261 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
8263 flags = tcp_outflags[tp->t_state];
8264 while (rack->rc_free_cnt < rack_free_cache) {
8265 rsm = rack_alloc(rack);
8267 if (inp->inp_hpts_calls)
8270 goto just_return_nolock;
8272 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
8273 rack->rc_free_cnt++;
8276 if (inp->inp_hpts_calls)
8277 inp->inp_hpts_calls = 0;
8281 if (flags & TH_RST) {
8285 if (rack->r_ctl.rc_tlpsend) {
8286 /* Tail loss probe */
8292 * Check if we can do a TLP with a RACK'd packet:
8293 * this can happen if we are not doing the rack
8294 * cheat and we skipped to a TLP and it
8297 rsm = tcp_rack_output(tp, rack, cts);
8299 rsm = rack->r_ctl.rc_tlpsend;
8300 rack->r_ctl.rc_tlpsend = NULL;
8302 tlen = rsm->r_end - rsm->r_start;
8303 if (tlen > ctf_fixed_maxseg(tp))
8304 tlen = ctf_fixed_maxseg(tp);
8305 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8306 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8308 rsm->r_start, tp->snd_una, tp, rack, rsm));
8309 sb_offset = rsm->r_start - tp->snd_una;
8310 cwin = min(tp->snd_wnd, tlen);
8312 } else if (rack->r_ctl.rc_resend) {
8313 /* Retransmit timer */
8314 rsm = rack->r_ctl.rc_resend;
8315 rack->r_ctl.rc_resend = NULL;
8316 len = rsm->r_end - rsm->r_start;
8319 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8320 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8322 rsm->r_start, tp->snd_una, tp, rack, rsm));
8323 sb_offset = rsm->r_start - tp->snd_una;
8324 if (len >= ctf_fixed_maxseg(tp)) {
8325 len = ctf_fixed_maxseg(tp);
8327 } else if ((rack->rc_in_persist == 0) &&
8328 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
8331 maxseg = ctf_fixed_maxseg(tp);
8332 if ((!IN_RECOVERY(tp->t_flags)) &&
8333 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
8334 /* Enter recovery if not induced by a time-out */
8335 rack->r_ctl.rc_rsm_start = rsm->r_start;
8336 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8337 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8338 rack_cong_signal(tp, NULL, CC_NDUPACK);
8340 * When we enter recovery we need to assure we send
8343 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
8344 rack_log_to_prr(rack, 13);
8347 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
8348 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
8349 tp, rack, rsm, rsm->r_start, tp->snd_una);
8352 len = rsm->r_end - rsm->r_start;
8353 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8354 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8356 rsm->r_start, tp->snd_una, tp, rack, rsm));
8357 sb_offset = rsm->r_start - tp->snd_una;
8358 /* Can we send it within the PRR boundary? */
8359 if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
8360 /* It does not fit */
8361 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
8362 (rack->r_ctl.rc_prr_sndcnt < maxseg)) {
8364 * prr is less than a segment; we
8365 * have more acks due in besides
8366 * what we need to resend. Let's not send,
8367 * to avoid sending small pieces of
8368 * what we need to retransmit.
8371 goto just_return_nolock;
8373 len = rack->r_ctl.rc_prr_sndcnt;
8376 if (len >= maxseg) {
8382 TCPSTAT_INC(tcps_sack_rexmits);
8383 TCPSTAT_ADD(tcps_sack_rexmit_bytes,
8384 min(len, ctf_fixed_maxseg(tp)));
8385 counter_u64_add(rack_rtm_prr_retran, 1);
8389 * Enforce a connection sendmap count limit, if set,
8390 * as long as we are not retransmitting.
8392 if ((rsm == NULL) &&
8393 (rack->do_detection == 0) &&
8394 (V_tcp_map_entries_limit > 0) &&
8395 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
8396 counter_u64_add(rack_to_alloc_limited, 1);
8397 if (!rack->alloc_limit_reported) {
8398 rack->alloc_limit_reported = 1;
8399 counter_u64_add(rack_alloc_limited_conns, 1);
8401 goto just_return_nolock;
8403 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
8404 /* we are retransmitting the fin */
8408 * When retransmitting data do *not* include the
8409 * FIN. This could happen from a TLP probe.
8416 rack->r_ctl.rc_rsm_at_retran = rsm;
8419 * Get standard flags, and add SYN or FIN if requested by 'hidden'
8422 if (tp->t_flags & TF_NEEDFIN)
8424 if (tp->t_flags & TF_NEEDSYN)
8426 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
8428 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
8430 kern_prefetch(end_rsm, &prefetch_rsm);
8435 * If in persist timeout with window of 0, send 1 byte. Otherwise,
8436 * if the window is small but nonzero and the timer has expired, we
8437 * will send what we can and go to transmit state.
8439 if (tp->t_flags & TF_FORCEDATA) {
8442 * If we still have some data to send, then clear
8443 * the FIN bit. Usually this would happen below
8444 * when it realizes that we aren't sending all the
8445 * data. However, if we have exactly 1 byte of
8446 * unsent data, then it won't clear the FIN bit
8447 * below, and if we are in persist state, we wind up
8448 * sending the packet without recording that we sent
8451 * We can't just blindly clear the FIN bit, because
8452 * if we don't have any more data to send then the
8453 * probe will be the FIN itself.
8455 if (sb_offset < sbused(sb))
8459 if ((rack->rc_in_persist != 0) &&
8460 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
8461 rack->r_ctl.rc_pace_min_segs)))
8462 rack_exit_persist(tp, rack);
8464 * If we are dropping persist mode then we need to
8465 * correct snd_nxt/snd_max and off.
8467 tp->snd_nxt = tp->snd_max;
8468 sb_offset = tp->snd_nxt - tp->snd_una;
8472 * If snd_nxt == snd_max and we have transmitted a FIN, the
8473 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
8474 * negative length. This can also occur when TCP opens up its
8475 * congestion window while receiving additional duplicate acks after
8476 * fast-retransmit because TCP will reset snd_nxt to snd_max after
8477 * the fast-retransmit.
8479 * In the normal retransmit-FIN-only case, however, snd_nxt will be
8480 * set to snd_una, the sb_offset will be 0, and the length may wind
8483 * If sack_rxmit is true we are retransmitting from the scoreboard
8484 * in which case len is already set.
8486 if (sack_rxmit == 0) {
8489 avail = sbavail(sb);
8490 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
8491 sb_offset = tp->snd_nxt - tp->snd_una;
8494 if (IN_RECOVERY(tp->t_flags) == 0) {
8495 if (rack->r_ctl.rc_tlp_new_data) {
8496 /* TLP is forcing out new data */
8497 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
8498 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
8500 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
8503 len = rack->r_ctl.rc_tlp_new_data;
8504 rack->r_ctl.rc_tlp_new_data = 0;
8505 new_data_tlp = doing_tlp = 1;
8507 if (sendwin > avail) {
8508 /* use the available */
8509 if (avail > sb_offset) {
8510 len = (int32_t)(avail - sb_offset);
8515 if (sendwin > sb_offset) {
8516 len = (int32_t)(sendwin - sb_offset);
8523 uint32_t outstanding;
8526 * We are inside of a SACK recovery episode and are
8527 * sending new data, having retransmitted all the
8528 * data possible so far in the scoreboard.
8530 outstanding = tp->snd_max - tp->snd_una;
8531 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
8532 if (tp->snd_wnd > outstanding) {
8533 len = tp->snd_wnd - outstanding;
8534 /* Check to see if we have the data */
8535 if (((sb_offset + len) > avail) &&
8536 (avail > sb_offset))
8537 len = avail - sb_offset;
8542 } else if (avail > sb_offset)
8543 len = avail - sb_offset;
8547 if (len > rack->r_ctl.rc_prr_sndcnt)
8548 len = rack->r_ctl.rc_prr_sndcnt;
8551 counter_u64_add(rack_rtm_prr_newdata, 1);
8554 if (len > ctf_fixed_maxseg(tp)) {
8556 * We should never send more than an MSS when
8557 * retransmitting or sending new data in prr
8558 * mode unless the override flag is on. Most
8559 * likely the PRR algorithm is not going to
8560 * let us send a lot as well :-)
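/*
 * For illustration (invented numbers): with rc_prr_sndcnt = 4500
 * bytes, a 1448-byte MSS and rc_prr_sendalot left at 0, a 4500-byte
 * len is clamped to a single MSS just below; with sendalot enabled
 * the full 4500 bytes could go out in this pass.
 */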
8562 if (rack->r_ctl.rc_prr_sendalot == 0)
8563 len = ctf_fixed_maxseg(tp);
8564 } else if (len < ctf_fixed_maxseg(tp)) {
8566 * Do we send any? The idea here is that if the
8567 * send empties the socket buffer we want to
8568 * do it. However, if not, then let's just wait
8569 * for our prr_sndcnt to get bigger.
8573 leftinsb = sbavail(sb) - sb_offset;
8574 if (leftinsb > len) {
8575 /* This send does not empty the sb */
8581 if (prefetch_so_done == 0) {
8582 kern_prefetch(so, &prefetch_so_done);
8583 prefetch_so_done = 1;
8586 * Lop off SYN bit if it has already been sent. However, if this is
8587 * SYN-SENT state and if segment contains data and if we don't know
8588 * that foreign host supports TAO, suppress sending segment.
8590 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
8591 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
8592 if (tp->t_state != TCPS_SYN_RECEIVED)
8595 * When sending additional segments following a TFO SYN|ACK,
8596 * do not include the SYN bit.
8598 if (IS_FASTOPEN(tp->t_flags) &&
8599 (tp->t_state == TCPS_SYN_RECEIVED))
8604 * Be careful not to send data and/or FIN on SYN segments. This
8605 * measure is needed to prevent interoperability problems with not
8606 * fully conformant TCP implementations.
8608 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
8613 * On TFO sockets, ensure no data is sent in the following cases:
8615 * - When retransmitting SYN|ACK on a passively-created socket
8617 * - When retransmitting SYN on an actively created socket
8619 * - When sending a zero-length cookie (cookie request) on an
8620 * actively created socket
8622 * - When the socket is in the CLOSED state (RST is being sent)
8624 if (IS_FASTOPEN(tp->t_flags) &&
8625 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
8626 ((tp->t_state == TCPS_SYN_SENT) &&
8627 (tp->t_tfo_client_cookie_len == 0)) ||
8628 (flags & TH_RST))) {
8632 /* Without fast-open there should never be data sent on a SYN */
8633 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
8638 * If FIN has been sent but not acked, but we haven't been
8639 * called to retransmit, len will be < 0. Otherwise, window
8640 * shrank after we sent into it. If window shrank to 0,
8641 * cancel pending retransmit, pull snd_nxt back to (closed)
8642 * window, and set the persist timer if it isn't already
8643 * going. If the window didn't close completely, just wait
8646 * We also do a general check here to ensure that we will
8647 * set the persist timer when we have data to send, but a
8648 * 0-byte window. This makes sure the persist timer is set
8649 * even if the packet hits one of the "goto send" lines
8653 if ((tp->snd_wnd == 0) &&
8654 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8655 (tp->snd_una == tp->snd_max) &&
8656 (sb_offset < (int)sbavail(sb))) {
8657 tp->snd_nxt = tp->snd_una;
8658 rack_enter_persist(tp, rack, cts);
8660 } else if ((rsm == NULL) &&
8661 ((doing_tlp == 0) || (new_data_tlp == 1)) &&
8662 (len < rack->r_ctl.rc_pace_max_segs)) {
8664 * We are not sending a full segment for
8665 * some reason. Should we not send anything (think
8668 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8669 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8670 (len < (int)(sbavail(sb) - sb_offset))) {
8672 * Here the rwnd is less than
8673 * the pacing size, this is not a retransmit,
8674 * we are established, and
8675 * the send is not the last in the socket buffer:
8676 * we send nothing, and may enter persist.
8679 if (tp->snd_max == tp->snd_una) {
8681 * Nothing out we can
8684 rack_enter_persist(tp, rack, cts);
8685 tp->snd_nxt = tp->snd_una;
8687 } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) &&
8688 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8689 (len < (int)(sbavail(sb) - sb_offset)) &&
8690 (len < rack->r_ctl.rc_pace_min_segs)) {
8692 * Here we are not retransmitting, and
8693 * the cwnd is not so small that we could
8694 * not send at least a min size (the rxt timer
8695 * has not gone off), we have 2 segments or
8696 * more already in flight, it's not the tail end
8697 * of the socket buffer, and the cwnd is blocking
8698 * us from sending out a minimum pacing segment size.
8699 * Let's not send anything.
8702 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
8703 min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8704 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8705 (len < (int)(sbavail(sb) - sb_offset)) &&
8706 (TCPS_HAVEESTABLISHED(tp->t_state))) {
8708 * Here we have a send window but we have
8709 * filled it up and we can't send another pacing segment.
8710 * We also have in flight more than 2 segments
8711 * and we are not completing the sb, i.e. we allow
8712 * the last bytes of the sb to go out even if
8713 * it's not a full pacing segment.
8718 /* len will be >= 0 after this point. */
8719 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
8720 tcp_sndbuf_autoscale(tp, so, sendwin);
8722 * Decide if we can use TCP Segmentation Offloading (if supported by
8725 * TSO may only be used if we are in a pure bulk sending state. The
8726 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
8727 * options prevent using TSO. With TSO the TCP header is the same
8728 * (except for the sequence number) for all generated packets. This
8729 * makes it impossible to transmit any options which vary per
8730 * generated segment or packet.
8732 * IPv4 handling has a clear separation of ip options and ip header
8733 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
8734 * the right thing below to provide length of just ip options and thus
8735 * checking for ipoptlen is enough to decide if ip options are present.
8740 ipoptlen = ip6_optlen(tp->t_inpcb);
8743 if (tp->t_inpcb->inp_options)
8744 ipoptlen = tp->t_inpcb->inp_options->m_len -
8745 offsetof(struct ipoption, ipopt_list);
8748 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8750 * Pre-calculate here as we save another lookup into the darknesses
8751 * of IPsec that way and can actually decide if TSO is ok.
8754 if (isipv6 && IPSEC_ENABLED(ipv6))
8755 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
8761 if (IPSEC_ENABLED(ipv4))
8762 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
8766 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8767 ipoptlen += ipsec_optlen;
8769 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) &&
8770 (tp->t_port == 0) &&
8771 ((tp->t_flags & TF_SIGNATURE) == 0) &&
8772 tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
8776 uint32_t outstanding;
8778 outstanding = tp->snd_max - tp->snd_una;
8779 if (tp->t_flags & TF_SENTFIN) {
8781 * If we sent a fin, snd_max is 1 higher than
8787 if ((rsm->r_flags & RACK_HAS_FIN) == 0)
8790 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
8795 recwin = sbspace(&so->so_rcv);
8798 * Sender silly window avoidance. We transmit under the following
8799 * conditions when len is non-zero:
8801 * - We have a full segment (or more with TSO) - This is the last
8802 * buffer in a write()/send() and we are either idle or running
8803 * NODELAY - we've timed out (e.g. persist timer) - we have more
8804 * than 1/2 the maximum send window's worth of data (the receiver may
8805 * have limited the window size) - we need to retransmit
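/*
 * For illustration (invented numbers): with tp->max_sndwnd = 65535,
 * a 33000-byte len satisfies the "1/2 the maximum send window" test
 * below even though it is not a whole number of segments.
 */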
8808 if (len >= ctf_fixed_maxseg(tp)) {
8813 * NOTE! on localhost connections an 'ack' from the remote
8814 * end may occur synchronously with the output and cause us
8815 * to flush a buffer queued with moretocome. XXX
8818 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
8819 (idle || (tp->t_flags & TF_NODELAY)) &&
8820 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
8821 (tp->t_flags & TF_NOPUSH) == 0) {
8825 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */
8829 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
8832 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
8836 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
8846 * Sending of standalone window updates.
8848 * Window updates are important when we close our window due to a
8849 * full socket buffer and are opening it again after the application
8850 * reads data from it. Once the window has opened again and the
8851 * remote end starts to send again the ACK clock takes over and
8852 * provides the most current window information.
8854 * We must avoid the silly window syndrome, where every read from
8855 * the receive buffer, no matter how small, causes a window update
8856 * to be sent. We also should avoid sending a flurry of window
8857 * updates when the socket buffer had queued a lot of data and the
8858 * application is doing small reads.
8860 * Prevent a flurry of pointless window updates by only sending an
8861 * update when we can increase the advertised window by more than
8862 * 1/4th of the socket buffer capacity. When the buffer is getting
8863 * full or is very small be more aggressive and send an update
8864 * whenever we can increase by two mss sized segments. In all other
8865 * situations the ACK's to new incoming data will carry further
8868 * Don't send an independent window update if a delayed ACK is
8869 * pending (it will get piggy-backed on it) or the remote side
8870 * already has done a half-close and won't send more data. Skip
8871 * this if the connection is in T/TCP half-open state.
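/*
 * For illustration (invented numbers): with so_rcv.sb_hiwat = 65536
 * and a 1448-byte MSS, a standalone update is sent once adv exceeds
 * 16384 bytes (1/4 of the buffer), or once it reaches 2896 bytes
 * (2 * MSS) when the buffer is nearly full or smaller than 8 MSS.
 */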
8873 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
8874 !(tp->t_flags & TF_DELACK) &&
8875 !TCPS_HAVERCVDFIN(tp->t_state)) {
8877 * "adv" is the amount we could increase the window, taking
8878 * into account that we are limited by TCP_MAXWIN <<
8884 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
8885 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
8886 oldwin = (tp->rcv_adv - tp->rcv_nxt);
8892 * If the new window size ends up being the same as the old
8893 * size when it is scaled, then don't force a window update.
8895 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
8898 if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) &&
8899 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
8900 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
8901 so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) {
8905 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
8911 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
8912 * is also a catch-all for the retransmit timer timeout case.
8914 if (tp->t_flags & TF_ACKNOW) {
8918 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
8922 if (SEQ_GT(tp->snd_up, tp->snd_una)) {
8927 * If our state indicates that FIN should be sent and we have not
8928 * yet done so, then we need to send.
8930 if ((flags & TH_FIN) &&
8931 (tp->snd_nxt == tp->snd_una)) {
8936 * No reason to send a segment, just return.
8941 if (tot_len_this_send == 0)
8942 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
8944 /* set the rack tcb into the slot N */
8945 counter_u64_add(rack_paced_segments, 1);
8946 } else if (tot_len_this_send) {
8947 counter_u64_add(rack_unpaced_segments, 1);
8949 /* Check if we need to go into persists or not */
8950 if ((rack->rc_in_persist == 0) &&
8951 (tp->snd_max == tp->snd_una) &&
8952 TCPS_HAVEESTABLISHED(tp->t_state) &&
8953 sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8954 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) &&
8955 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) {
8956 /* Yes, let's make sure to move to persist before timer-start */
8957 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
8959 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
8960 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
8961 tp->t_flags &= ~TF_FORCEDATA;
8965 if ((flags & TH_FIN) &&
8966 sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8968 * We do not transmit a FIN
8969 * with data outstanding. We
8970 * need to make it so all data
8975 if (doing_tlp == 0) {
8977 * Data not a TLP, and it's not the rxt firing. If it is the
8978 * rxt firing, we want to leave the tlp_in_progress flag on
8979 * so we don't send another TLP. It has to be a rack timer
8980 * or normal send (response to acked data) to clear the tlp
8983 rack->rc_tlp_in_progress = 0;
8985 SOCKBUF_LOCK_ASSERT(sb);
8987 if (len >= ctf_fixed_maxseg(tp))
8988 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
8990 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
8993 * Before ESTABLISHED, force sending of initial options unless TCP
8994 * set not to do any options. NOTE: we assume that the IP/TCP header
8995 * plus TCP options always fit in a single mbuf, leaving room for a
8996 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
8997 * + optlen <= MCLBYTES
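/*
 * For illustration: for IPv6, hdrlen starts at sizeof(struct ip6_hdr)
 * + sizeof(struct tcphdr) = 40 + 20 = 60 bytes (40 bytes for the IPv4
 * tcpiphdr case); a timestamp option later adds 12 bytes via
 * tcp_addoptions(), leaving ample headroom against MCLBYTES even
 * after max_linkhdr is accounted for.
 */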
9002 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
9005 hdrlen = sizeof(struct tcpiphdr);
9008 * Compute options for segment. We only have to care about SYN and
9009 * established connection segments. Options for SYN-ACK segments
9010 * are handled in TCP syncache.
9013 if ((tp->t_flags & TF_NOOPT) == 0) {
9014 /* Maximum segment size. */
9015 if (flags & TH_SYN) {
9016 tp->snd_nxt = tp->iss;
9017 to.to_mss = tcp_mssopt(&inp->inp_inc);
9018 #ifdef NETFLIX_TCPOUDP
9020 to.to_mss -= V_tcp_udp_tunneling_overhead;
9022 to.to_flags |= TOF_MSS;
9025 * On SYN or SYN|ACK transmits on TFO connections,
9026 * only include the TFO option if it is not a
9027 * retransmit, as the presence of the TFO option may
9028 * have caused the original SYN or SYN|ACK to have
9029 * been dropped by a middlebox.
9031 if (IS_FASTOPEN(tp->t_flags) &&
9032 (tp->t_rxtshift == 0)) {
9033 if (tp->t_state == TCPS_SYN_RECEIVED) {
9034 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
9036 (u_int8_t *)&tp->t_tfo_cookie.server;
9037 to.to_flags |= TOF_FASTOPEN;
9039 } else if (tp->t_state == TCPS_SYN_SENT) {
9041 tp->t_tfo_client_cookie_len;
9043 tp->t_tfo_cookie.client;
9044 to.to_flags |= TOF_FASTOPEN;
9047 * If we wind up having more data to
9048 * send with the SYN than can fit in
9049 * one segment, don't send any more
9050 * until the SYN|ACK comes back from
9057 /* Window scaling. */
9058 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
9059 to.to_wscale = tp->request_r_scale;
9060 to.to_flags |= TOF_SCALE;
9063 if ((tp->t_flags & TF_RCVD_TSTMP) ||
9064 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
9065 to.to_tsval = cts + tp->ts_offset;
9066 to.to_tsecr = tp->ts_recent;
9067 to.to_flags |= TOF_TS;
9069 /* Set receive buffer autosizing timestamp. */
9070 if (tp->rfbuf_ts == 0 &&
9071 (so->so_rcv.sb_flags & SB_AUTOSIZE))
9072 tp->rfbuf_ts = tcp_ts_getticks();
9073 /* Selective ACK's. */
9075 to.to_flags |= TOF_SACKPERM;
9076 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9077 tp->rcv_numsacks > 0) {
9078 to.to_flags |= TOF_SACK;
9079 to.to_nsacks = tp->rcv_numsacks;
9080 to.to_sacks = (u_char *)tp->sackblks;
9082 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9083 /* TCP-MD5 (RFC2385). */
9084 if (tp->t_flags & TF_SIGNATURE)
9085 to.to_flags |= TOF_SIGNATURE;
9086 #endif /* TCP_SIGNATURE */
9088 /* Processing the options. */
9089 hdrlen += optlen = tcp_addoptions(&to, opt);
9091 * If we wanted a TFO option to be added, but it was unable
9092 * to fit, ensure no data is sent.
9094 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
9095 !(to.to_flags & TOF_FASTOPEN))
9098 #ifdef NETFLIX_TCPOUDP
9100 if (V_tcp_udp_tunneling_port == 0) {
9101 /* The port was removed?? */
9102 SOCKBUF_UNLOCK(&so->so_snd);
9103 return (EHOSTUNREACH);
9105 hdrlen += sizeof(struct udphdr);
9110 ipoptlen = ip6_optlen(tp->t_inpcb);
9113 if (tp->t_inpcb->inp_options)
9114 ipoptlen = tp->t_inpcb->inp_options->m_len -
9115 offsetof(struct ipoption, ipopt_list);
9118 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
9119 ipoptlen += ipsec_optlen;
9123 /* force TSO so that TLS offload can get the mss */
9124 if (sb->sb_flags & SB_TLS_IFNET) {
9129 * Adjust data length if insertion of options will bump the packet
9130 * length beyond the t_maxseg length. Clear the FIN bit because we
9131 * cut off the tail of the segment.
9133 if (len + optlen + ipoptlen > tp->t_maxseg) {
9135 uint32_t if_hw_tsomax;
9139 /* extract TSO information */
9140 if_hw_tsomax = tp->t_tsomax;
9141 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
9142 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
9143 KASSERT(ipoptlen == 0,
9144 ("%s: TSO can't do IP options", __func__));
9147 * Check if we should limit by maximum payload
9150 if (if_hw_tsomax != 0) {
9151 /* compute maximum TSO length */
9152 max_len = (if_hw_tsomax - hdrlen -
9156 } else if (len > max_len) {
9162 * Prevent the last segment from being fractional
9163 * unless the send sockbuf can be emptied:
9165 max_len = (tp->t_maxseg - optlen);
9166 if (((sb_offset + len) < sbavail(sb)) &&
9168 moff = len % (u_int)max_len;
9175 * In case there are too many small fragments don't
9178 if (len <= maxseg) {
9184 * Send the FIN in a separate segment after the bulk
9185 * sending is done. We don't trust the TSO
9186 * implementations to clear the FIN flag on all but
9189 if (tp->t_flags & TF_NEEDFIN)
9193 if (optlen + ipoptlen >= tp->t_maxseg) {
9195 * Since we don't have enough space to put
9196 * the IP header chain and the TCP header in
9197 * one packet as required by RFC 7112, don't
9198 * send it. Also ensure that at least one
9199 * byte of the payload can be put into the
9202 SOCKBUF_UNLOCK(&so->so_snd);
9207 len = tp->t_maxseg - optlen - ipoptlen;
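/*
 * For illustration: t_maxseg = 1460 with a 12-byte timestamp option
 * and no IP options leaves len = 1460 - 12 - 0 = 1448 payload bytes
 * for this segment.
 */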
9212 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
9213 ("%s: len > IP_MAXPACKET", __func__));
9216 if (max_linkhdr + hdrlen > MCLBYTES)
9218 if (max_linkhdr + hdrlen > MHLEN)
9220 panic("tcphdr too big");
9224 * This KASSERT is here to catch edge cases at a well defined place.
9225 * Before, those had triggered (random) panic conditions further
9228 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
9233 * We have outstanding data; don't send a FIN by itself!
9238 * Grab a header mbuf, attaching a copy of data to be transmitted,
9239 * and initialize the header from the template for sends on this
9246 if (rack->rc_pace_max_segs)
9247 max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp);
9250 if (rack->r_ctl.rc_pace_max_segs < max_val)
9251 max_val = rack->r_ctl.rc_pace_max_segs;
9253 * We allow a limit on sending with hptsi.
9255 if (len > max_val) {
9259 if (MHLEN < hdrlen + max_linkhdr)
9260 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
9263 m = m_gethdr(M_NOWAIT, MT_DATA);
9271 m->m_data += max_linkhdr;
9275 * Start the m_copy functions from the closest mbuf to the
9276 * sb_offset in the socket buffer chain.
9278 mb = sbsndptr_noadv(sb, sb_offset, &moff);
9279 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
9280 m_copydata(mb, moff, (int)len,
9281 mtod(m, caddr_t)+hdrlen);
9282 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9283 sbsndptr_adv(sb, mb, len);
9286 struct sockbuf *msb;
9288 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9292 m->m_next = tcp_m_copym(
9293 #ifdef NETFLIX_COPY_ARGS
9297 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
9298 ((rsm == NULL) ? hw_tls : 0)
9299 #ifdef NETFLIX_COPY_ARGS
9303 if (len <= (tp->t_maxseg - optlen)) {
9305 * Must have run out of mbufs for the copy;
9306 * shorten it to no longer need tso. Let's
9307 * not put on sendalot since we are low on
9312 if (m->m_next == NULL) {
9320 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
9321 TCPSTAT_INC(tcps_sndprobe);
9323 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9324 stats_voi_update_abs_u32(tp->t_stats,
9325 VOI_TCP_RETXPB, len);
9327 stats_voi_update_abs_u64(tp->t_stats,
9330 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
9331 if (rsm && (rsm->r_flags & RACK_TLP)) {
9333 * TLP should not count in retran count, but
9336 counter_u64_add(rack_tlp_retran, 1);
9337 counter_u64_add(rack_tlp_retran_bytes, len);
9339 tp->t_sndrexmitpack++;
9340 TCPSTAT_INC(tcps_sndrexmitpack);
9341 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
9344 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
9348 TCPSTAT_INC(tcps_sndpack);
9349 TCPSTAT_ADD(tcps_sndbyte, len);
9351 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
9356 * If we're sending everything we've got, set PUSH. (This
9357 * will keep happy those implementations which only give
9358 * data to the user when a buffer fills or a PUSH comes in.)
9360 if (sb_offset + len == sbused(sb) &&
9366 * Are we doing pacing? If so we must calculate the slot. We
9367 * only do hptsi in ESTABLISHED and with no RESET being
9368 * sent where we have data to send.
9370 if (((tp->t_state == TCPS_ESTABLISHED) ||
9371 (tp->t_state == TCPS_CLOSE_WAIT) ||
9372 ((tp->t_state == TCPS_FIN_WAIT_1) &&
9373 ((tp->t_flags & TF_SENTFIN) == 0) &&
9374 ((flags & TH_FIN) == 0))) &&
9375 ((flags & TH_RST) == 0)) {
9376 /* Get our pacing rate */
9377 tot_len_this_send += len;
9378 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send);
9383 if (tp->t_flags & TF_ACKNOW)
9384 TCPSTAT_INC(tcps_sndacks);
9385 else if (flags & (TH_SYN | TH_FIN | TH_RST))
9386 TCPSTAT_INC(tcps_sndctrl);
9387 else if (SEQ_GT(tp->snd_up, tp->snd_una))
9388 TCPSTAT_INC(tcps_sndurg);
9390 TCPSTAT_INC(tcps_sndwinup);
9392 m = m_gethdr(M_NOWAIT, MT_DATA);
9399 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
9404 m->m_data += max_linkhdr;
9407 SOCKBUF_UNLOCK_ASSERT(sb);
9408 m->m_pkthdr.rcvif = (struct ifnet *)0;
9410 mac_inpcb_create_mbuf(inp, m);
9414 ip6 = mtod(m, struct ip6_hdr *);
9415 #ifdef NETFLIX_TCPOUDP
9417 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
9418 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9419 udp->uh_dport = tp->t_port;
9420 ulen = hdrlen + len - sizeof(struct ip6_hdr);
9421 udp->uh_ulen = htons(ulen);
9422 th = (struct tcphdr *)(udp + 1);
9425 th = (struct tcphdr *)(ip6 + 1);
9426 tcpip_fillheaders(inp,
9427 #ifdef NETFLIX_TCPOUDP
9434 ip = mtod(m, struct ip *);
9436 ipov = (struct ipovly *)ip;
9438 #ifdef NETFLIX_TCPOUDP
9440 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
9441 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9442 udp->uh_dport = tp->t_port;
9443 ulen = hdrlen + len - sizeof(struct ip);
9444 udp->uh_ulen = htons(ulen);
9445 th = (struct tcphdr *)(udp + 1);
9448 th = (struct tcphdr *)(ip + 1);
9449 tcpip_fillheaders(inp,
9450 #ifdef NETFLIX_TCPOUDP
9456 * Fill in fields, remembering maximum advertised window for use in
9457 * delaying messages about window sizes. If resending a FIN, be sure
9458 * not to use a new sequence number.
9460 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
9461 tp->snd_nxt == tp->snd_max)
9464 * If we are starting a connection, send ECN setup SYN packet. If we
9465 * are on a retransmit, we may resend those bits a number of times
9468 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
9469 if (tp->t_rxtshift >= 1) {
9470 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
9471 flags |= TH_ECE | TH_CWR;
9473 flags |= TH_ECE | TH_CWR;
9475 if (tp->t_state == TCPS_ESTABLISHED &&
9476 (tp->t_flags2 & TF2_ECN_PERMIT)) {
9478 * If the peer has ECN, mark data packets with ECN capable
9479 * transmission (ECT). Ignore pure ack packets,
9480 * retransmissions and window probes.
9482 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
9483 (sack_rxmit == 0) &&
9484 !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
9487 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
9490 ip->ip_tos |= IPTOS_ECN_ECT0;
9491 TCPSTAT_INC(tcps_ecn_ect0);
9494 * Reply with proper ECN notifications.
9496 if (tp->t_flags2 & TF2_ECN_SND_CWR) {
9498 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
9500 if (tp->t_flags2 & TF2_ECN_SND_ECE)
9504 * If we are doing retransmissions, then snd_nxt will not reflect
9505 * the first unsent octet. For ACK only packets, we do not want the
9506 * sequence number of the retransmitted packet, we want the sequence
9507 * number of the next unsent octet. So, if there is no data (and no
9508 * SYN or FIN), use snd_max instead of snd_nxt when filling in
9509 * ti_seq. But if we are in persist state, snd_max might reflect
9510 * one byte beyond the right edge of the window, so use snd_nxt in
9511 * that case, since we know we aren't doing a retransmission.
9512 * (retransmit and persist are mutually exclusive...)
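/*
 * Summary of the sequence number choices made below:
 * data, SYN/FIN or persist       -> snd_nxt
 * RST                            -> snd_una (last cumulative ack)
 * pure ACK                       -> snd_max
 * scoreboard (SACK) retransmit   -> rsm->r_start
 */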
9514 if (sack_rxmit == 0) {
9515 if (len || (flags & (TH_SYN | TH_FIN)) ||
9516 rack->rc_in_persist) {
9517 th->th_seq = htonl(tp->snd_nxt);
9518 rack_seq = tp->snd_nxt;
9519 } else if (flags & TH_RST) {
9521 * For a Reset send the last cum ack in sequence
9522 * (this like any other choice may still generate a
9523 * challenge ack, if an ack-update packet is in
9526 th->th_seq = htonl(tp->snd_una);
9527 rack_seq = tp->snd_una;
9529 th->th_seq = htonl(tp->snd_max);
9530 rack_seq = tp->snd_max;
9533 th->th_seq = htonl(rsm->r_start);
9534 rack_seq = rsm->r_start;
9536 th->th_ack = htonl(tp->rcv_nxt);
9538 bcopy(opt, th + 1, optlen);
9539 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
9541 th->th_flags = flags;
9543 * Calculate receive window. Don't shrink window, but avoid silly
9545 * If a RST segment is sent, advertise a window of zero.
9547 if (flags & TH_RST) {
9550 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
9551 recwin < (long)ctf_fixed_maxseg(tp))
9553 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
9554 recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
9555 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
9556 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
9557 recwin = (long)TCP_MAXWIN << tp->rcv_scale;
9561 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
9562 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is
9563 * handled in syncache.
9566 th->th_win = htons((u_short)
9567 (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
9569 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
9571 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
9572 * window. This may cause the remote transmitter to stall. This
9573 * flag tells soreceive() to disable delayed acknowledgements when
9574 * draining the buffer. This can occur if the receiver is
9575 * attempting to read more data than can be buffered prior to
9576 * transmitting on the connection.
9578 if (th->th_win == 0) {
9580 tp->t_flags |= TF_RXWIN0SENT;
9582 tp->t_flags &= ~TF_RXWIN0SENT;
9583 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
9584 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
9585 th->th_flags |= TH_URG;
9588 * If no urgent pointer to send, then we pull the urgent
9589 * pointer to the left edge of the send window so that it
9590 * doesn't drift into the send window on sequence number
9593 tp->snd_up = tp->snd_una; /* drag it along */
9595 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9596 if (to.to_flags & TOF_SIGNATURE) {
9598 * Calculate MD5 signature and put it into the place
9599 * determined before.
9600 * NOTE: since TCP options buffer doesn't point into
9601 * mbuf's data, calculate offset and use it.
9603 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
9604 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
9606 * Do not send segment if the calculation of MD5
9607 * digest has failed.
9615 * Put TCP length in extended header, and then checksum extended
9618 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
9622 * ip6_plen need not be filled now, and will be filled
9626 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
9627 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9628 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
9629 th->th_sum = htons(0);
9630 UDPSTAT_INC(udps_opackets);
9632 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
9633 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9634 th->th_sum = in6_cksum_pseudo(ip6,
9635 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
9640 #if defined(INET6) && defined(INET)
9646 m->m_pkthdr.csum_flags = CSUM_UDP;
9647 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9648 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
9649 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
9650 th->th_sum = htons(0);
9651 UDPSTAT_INC(udps_opackets);
9653 m->m_pkthdr.csum_flags = CSUM_TCP;
9654 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9655 th->th_sum = in_pseudo(ip->ip_src.s_addr,
9656 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
9657 IPPROTO_TCP + len + optlen));
9659 /* IP version must be set here for ipv4/ipv6 checking later */
9660 KASSERT(ip->ip_v == IPVERSION,
9661 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
9665 * Enable TSO and specify the size of the segments. The TCP pseudo
9666 * header checksum is always provided. XXX: Fixme: This is currently
9667 * not the case for IPv6.
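/*
 * For illustration: with t_maxseg = 1460 and 12 bytes of TCP options,
 * tso_segsz below becomes 1448 and the NIC slices the large mbuf
 * chain into frames carrying 1448 payload bytes each.
 */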
9669 if (tso || force_tso) {
9670 KASSERT(force_tso || len > tp->t_maxseg - optlen,
9671 ("%s: len <= tso_segsz", __func__));
9672 m->m_pkthdr.csum_flags |= CSUM_TSO;
9673 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
9675 KASSERT(len + hdrlen == m_length(m, NULL),
9676 ("%s: mbuf chain different than expected: %d + %u != %u",
9677 __func__, len, hdrlen, m_length(m, NULL)));
9680 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
9681 hhook_run_tcp_est_out(tp, th, &to, len, tso);
9687 if (so->so_options & SO_DEBUG) {
9694 save = ipov->ih_len;
9695 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen +
9696 * (th->th_off << 2) */ );
9698 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
9702 ipov->ih_len = save;
9704 #endif /* TCPDEBUG */
9706 /* We're getting ready to send; log now. */
9707 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
9708 union tcp_log_stackspecific log;
9711 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
9712 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
9713 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
9714 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
9715 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
9716 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
9717 log.u_bbr.flex4 = orig_len;
9719 log.u_bbr.flex5 = 0x80000000;
9721 log.u_bbr.flex5 = 0;
9722 if (rsm || sack_rxmit) {
9723 log.u_bbr.flex8 = 1;
9725 log.u_bbr.flex8 = 0;
9727 log.u_bbr.pkts_out = tp->t_maxseg;
9728 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
9729 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9730 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
9731 len, &log, false, NULL, NULL, 0, &tv);
9736 * Fill in IP length and desired time to live and send to IP level.
9737 * There should be a better way to handle ttl and tos; we could keep
9738 * them in the template, but need a way to checksum without them.
9741 * m->m_pkthdr.len should have been set before cksum calculation,
9742 * because in6_cksum() needs it.
9747 * we separately set hoplimit for every segment, since the
9748 * user might want to change the value via setsockopt. Also,
9749 * desired default hop limit might be changed via Neighbor
9752 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
9755 * Set the packet size here for the benefit of DTrace
9756 * probes. ip6_output() will set it properly; it's supposed
9757 * to include the option header lengths as well.
9759 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
9761 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
9762 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9764 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9766 if (tp->t_state == TCPS_SYN_SENT)
9767 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
9769 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
9770 /* TODO: IPv6 IP6TOS_ECT bit on */
9771 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
9773 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
9776 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
9777 mtu = inp->inp_route6.ro_rt->rt_mtu;
9780 #if defined(INET) && defined(INET6)
9785 ip->ip_len = htons(m->m_pkthdr.len);
9787 if (inp->inp_vflag & INP_IPV6PROTO)
9788 ip->ip_ttl = in6_selecthlim(inp, NULL);
9791 * If we do path MTU discovery, then we set DF on every
9792 * packet. This might not be the best thing to do according
9793 * to RFC3390 Section 2. However the tcp hostcache mitigates
9794 * the problem so it affects only the first tcp connection
9797 * NB: Don't set DF on small MTU/MSS to have a safe
9800 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
9801 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9802 if (tp->t_port == 0 || len < V_tcp_minmss) {
9803 ip->ip_off |= htons(IP_DF);
9806 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9809 if (tp->t_state == TCPS_SYN_SENT)
9810 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
9812 TCP_PROBE5(send, NULL, tp, ip, tp, th);
9814 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
9815 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
9817 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
9818 mtu = inp->inp_route.ro_rt->rt_mtu;
9824 lgb->tlb_errno = error;
9828 * In transmit state, time the transmission and arrange for the
9829 * retransmit. In persist state, just set snd_max.
9832 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9833 (tp->t_flags & TF_SACK_PERMIT) &&
9834 tp->rcv_numsacks > 0)
9835 tcp_clean_dsack_blocks(tp);
9837 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
9838 else if (len == 1) {
9839 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
9840 } else if (len > 1) {
9843 idx = (len / ctf_fixed_maxseg(tp)) + 3;
9844 if (idx >= TCP_MSS_ACCT_ATIMER)
9845 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
9847 counter_u64_add(rack_out_size[idx], 1);
9849 if (hw_tls && len > 0) {
9851 counter_u64_add(rack_tls_filled, 1);
9852 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1);
9855 counter_u64_add(rack_tls_rxt, 1);
9856 rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1);
9857 } else if (doing_tlp) {
9858 counter_u64_add(rack_tls_tlp, 1);
9859 rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1);
9860 } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) {
9861 counter_u64_add(rack_tls_app, 1);
9862 rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1);
9863 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) {
9864 counter_u64_add(rack_tls_cwnd, 1);
9865 rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1);
9866 } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) {
9867 counter_u64_add(rack_tls_rwnd, 1);
9868 rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1);
9870 rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1);
9871 counter_u64_add(rack_tls_other, 1);
9876 if (sub_from_prr && (error == 0)) {
9877 if (rack->r_ctl.rc_prr_sndcnt >= len)
9878 rack->r_ctl.rc_prr_sndcnt -= len;
9880 rack->r_ctl.rc_prr_sndcnt = 0;
9883 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
9887 (tp->snd_una == tp->snd_max))
9888 rack->r_ctl.rc_tlp_rxt_last_time = cts;
9889 if ((tp->t_flags & TF_FORCEDATA) == 0 ||
9890 (rack->rc_in_persist == 0)) {
9891 tcp_seq startseq = tp->snd_nxt;
9894 * Advance snd_nxt over sequence space of this segment.
9897 /* We don't log or do anything with errors */
9900 if (flags & (TH_SYN | TH_FIN)) {
9903 if (flags & TH_FIN) {
9905 tp->t_flags |= TF_SENTFIN;
9908 /* In the ENOBUFS case we do *not* update snd_max */
9913 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
9914 if (tp->snd_una == tp->snd_max) {
9916 * Update the time we just added data since
9917 * none was outstanding.
9919 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9920 tp->t_acktime = ticks;
9922 tp->snd_max = tp->snd_nxt;
9924 * Time this transmission if not a retransmission and
9925 * not currently timing anything.
9926 * This is only relevant in case of switching back to
9929 if (tp->t_rtttime == 0) {
9930 tp->t_rtttime = ticks;
9931 tp->t_rtseq = startseq;
9932 TCPSTAT_INC(tcps_segstimed);
9935 if (!(tp->t_flags & TF_GPUTINPROG) && len) {
9936 tp->t_flags |= TF_GPUTINPROG;
9937 tp->gput_seq = startseq;
9938 tp->gput_ack = startseq +
9939 ulmin(sbavail(sb) - sb_offset, sendwin);
9940 tp->gput_ts = tcp_ts_getticks();
9946 * Persist case, update snd_max but since we are in persist
9947 * mode (no window) we do not update snd_nxt.
9956 if (flags & TH_FIN) {
9958 tp->t_flags |= TF_SENTFIN;
9960 /* In the ENOBUFS case we do *not* update snd_max */
9961 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
9962 if (tp->snd_una == tp->snd_max) {
9964 * Update the time we just added data since
9965 * none was outstanding.
9967 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9968 tp->t_acktime = ticks;
9970 tp->snd_max = tp->snd_nxt + len;
9975 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */
9977 * Failures do not advance the seq counter above. For the
9978 * case of ENOBUFS we will fall out and retry in 1ms with
9979 * the hpts. Everything else will just have to retransmit
9982 * In any case, we do not want to loop around for another
9983 * send without a good reason.
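/*
 * For illustration of the ENOBUFS backoff below: after three
 * consecutive ENOBUFS returns rc_enobuf has grown to 3, so
 * slot = 1 + 3 = 4, and with rc_rack_rtt = 6 the retry is clamped
 * to 6 / 2 = 3 (same units as the hpts slot).
 */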
9988 tp->t_flags &= ~TF_FORCEDATA;
9989 tp->t_softerror = error;
9994 * Pace us right away to retry in some
9997 slot = 1 + rack->rc_enobuf;
9998 if (rack->rc_enobuf < 255)
10000 if (slot > (rack->rc_rack_rtt / 2)) {
10001 slot = rack->rc_rack_rtt / 2;
10006 counter_u64_add(rack_saw_enobuf, 1);
10011 * For some reason the interface we used initially
10012 * to send segments changed to another or lowered
10013 * its MTU. If TSO was active we either got an
10014 * interface without TSO capabilities or TSO was
10015 * turned off. If we obtained mtu from ip_output()
10016 * then update it and try again.
10019 tp->t_flags &= ~TF_TSO;
10021 tcp_mss_update(tp, -1, mtu, NULL, NULL);
10025 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10026 tp->t_flags &= ~TF_FORCEDATA;
10029 counter_u64_add(rack_saw_enetunreach, 1);
10033 if (TCPS_HAVERCVDSYN(tp->t_state)) {
10034 tp->t_softerror = error;
10039 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10040 tp->t_flags &= ~TF_FORCEDATA;
10044 rack->rc_enobuf = 0;
10046 TCPSTAT_INC(tcps_sndtotal);
10049 * Data sent (as far as we can tell). If this advertises a larger
10050 * window than any other segment, then remember the size of the
10051 * advertised window. Any pending ACK has now been sent.
10053 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
10054 tp->rcv_adv = tp->rcv_nxt + recwin;
10055 tp->last_ack_sent = tp->rcv_nxt;
10056 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
10058 rack->r_tlp_running = 0;
10059 if (flags & TH_RST) {
10061 * We don't send again after sending a RST.
10066 if (rsm && (slot == 0)) {
10068 * Dup ack retransmission possibly, so
10069 * let's assure we have at least the min rack
10070 * time; if it's a rack resend then the rack
10071 * t-o will also be set to this.
10073 slot = rack->r_ctl.rc_min_to;
10076 /* set the rack tcb into the slot N */
10077 counter_u64_add(rack_paced_segments, 1);
10078 } else if (sendalot) {
10080 counter_u64_add(rack_unpaced_segments, 1);
10082 tp->t_flags &= ~TF_FORCEDATA;
10085 counter_u64_add(rack_unpaced_segments, 1);
10087 tp->t_flags &= ~TF_FORCEDATA;
10088 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
10093 * rack_ctloutput() must drop the inpcb lock before performing copyin on
10094 * socket option arguments. When it re-acquires the lock after the copy, it
10095 * has to revalidate that the connection is still valid for the socket
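/*
 * For illustration (userland sketch, not part of this file): the
 * options handled here are set with setsockopt() at the IPPROTO_TCP
 * level once the connection is using the rack stack (selected e.g.
 * via the TCP_FUNCTION_BLK socket option):
 *
 *	int one = 1;
 *
 *	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *	    &one, sizeof(one)) == -1)
 *		warn("TCP_RACK_PACE_ALWAYS");
 */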
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	struct epoch_tracker et;
	int32_t error = 0, optval;

	switch (sopt->sopt_name) {
	case TCP_RACK_PROP_RATE:
	case TCP_RACK_PROP:
	case TCP_RACK_TLP_REDUCE:
	case TCP_RACK_EARLY_RECOV:
	case TCP_RACK_PACE_ALWAYS:
	case TCP_DELACK:
	case TCP_RACK_PACE_REDUCE:
	case TCP_RACK_PACE_MAX_SEG:
	case TCP_RACK_PRR_SENDALOT:
	case TCP_RACK_MIN_TO:
	case TCP_RACK_EARLY_SEG:
	case TCP_RACK_REORD_THRESH:
	case TCP_RACK_REORD_FADE:
	case TCP_RACK_TLP_THRESH:
	case TCP_RACK_PKT_DELAY:
	case TCP_RACK_TLP_USE:
	case TCP_RACK_TLP_INC_VAR:
	case TCP_RACK_IDLE_REDUCE_HIGH:
	case TCP_RACK_MIN_PACE:
	case TCP_RACK_GP_INCREASE:
	case TCP_BBR_RACK_RTT_USE:
	case TCP_BBR_USE_RACK_CHEAT:
	case TCP_RACK_DO_DETECTION:
	case TCP_DATA_AFTER_CLOSE:
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
	}
	INP_WUNLOCK(inp);
	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (sopt->sopt_name) {
	case TCP_RACK_DO_DETECTION:
		RACK_OPTS_INC(tcp_rack_do_detection);
		if (optval == 0)
			rack->do_detection = 0;
		else
			rack->do_detection = 1;
		break;
	case TCP_RACK_PROP_RATE:
		if ((optval <= 0) || (optval >= 100)) {
			error = EINVAL;
			break;
		}
		RACK_OPTS_INC(tcp_rack_prop_rate);
		rack->r_ctl.rc_prop_rate = optval;
		break;
	case TCP_RACK_TLP_USE:
		if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
			error = EINVAL;
			break;
		}
		RACK_OPTS_INC(tcp_tlp_use);
		rack->rack_tlp_threshold_use = optval;
		break;
	case TCP_RACK_PROP:
		/* RACK proportional rate reduction (bool) */
		RACK_OPTS_INC(tcp_rack_prop);
		rack->r_ctl.rc_prop_reduce = optval;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		RACK_OPTS_INC(tcp_rack_tlp_reduce);
		rack->r_ctl.rc_tlp_cwnd_reduce = optval;
		break;
	case TCP_RACK_EARLY_RECOV:
		/* Should recovery happen early (bool) */
		RACK_OPTS_INC(tcp_rack_early_recov);
		rack->r_ctl.rc_early_recovery = optval;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method (bool) */
		RACK_OPTS_INC(tcp_rack_pace_always);
		if (optval > 0)
			rack->rc_always_pace = 1;
		else
			rack->rc_always_pace = 0;
		break;
	case TCP_RACK_PACE_REDUCE:
		/* RACK Hptsi reduction factor (divisor) */
		RACK_OPTS_INC(tcp_rack_pace_reduce);
		if (optval)
			/* Must be non-zero */
			rack->rc_pace_reduce = optval;
		else
			error = EINVAL;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		RACK_OPTS_INC(tcp_rack_max_seg);
		rack->rc_pace_max_segs = optval;
		rack_set_pace_segments(tp, rack);
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		RACK_OPTS_INC(tcp_rack_prr_sendalot);
		rack->r_ctl.rc_prr_sendalot = optval;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		RACK_OPTS_INC(tcp_rack_min_to);
		rack->r_ctl.rc_min_to = optval;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		RACK_OPTS_INC(tcp_rack_early_seg);
		rack->r_ctl.rc_early_recovery_segs = optval;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		RACK_OPTS_INC(tcp_rack_reord_thresh);
		if ((optval > 0) && (optval < 31))
			rack->r_ctl.rc_reorder_shift = optval;
		else
			error = EINVAL;
		break;
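	/*
	 * The value stored above is a shift count applied to the RTT when
	 * sizing the reordering window, which is why only shifts in the
	 * 1..30 range are accepted rather than an absolute time.
	 */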
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		RACK_OPTS_INC(tcp_rack_reord_fade);
		rack->r_ctl.rc_reorder_fade = optval;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		RACK_OPTS_INC(tcp_rack_tlp_thresh);
		if (optval)
			rack->r_ctl.rc_tlp_threshold = optval;
		else
			error = EINVAL;
		break;
	case TCP_BBR_USE_RACK_CHEAT:
		RACK_OPTS_INC(tcp_rack_cheat);
		if (optval)
			rack->use_rack_cheat = 1;
		else
			rack->use_rack_cheat = 0;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		RACK_OPTS_INC(tcp_rack_pkt_delay);
		rack->r_ctl.rc_pkt_delay = optval;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		rack->r_ctl.rc_prr_inc_var = optval;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		error = EINVAL;
		break;
	case TCP_DELACK:
		if (optval == 0)
			tp->t_delayed_ack = 0;
		else
			tp->t_delayed_ack = 1;
		if (tp->t_flags & TF_DELACK) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_flags |= TF_ACKNOW;
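			/*
			 * Pushing the now-forced ACK out immediately means
			 * calling back into the output path, which must run
			 * inside the network epoch when entered from the
			 * socket-option path.
			 */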
			NET_EPOCH_ENTER(et);
			rack_output(tp);
			NET_EPOCH_EXIT(et);
		}
		break;
	case TCP_RACK_MIN_PACE:
		RACK_OPTS_INC(tcp_rack_min_pace);
		if (optval > 3)
			rack->r_enforce_min_pace = 3;
		else
			rack->r_enforce_min_pace = optval;
		break;
	case TCP_RACK_GP_INCREASE:
		if ((optval >= 0) &&
		    (optval <= 256))
			rack->rack_per_of_gp = optval;
		else
			error = EINVAL;
		break;
	case TCP_BBR_RACK_RTT_USE:
		if ((optval != USE_RTT_HIGH) &&
		    (optval != USE_RTT_LOW) &&
		    (optval != USE_RTT_AVG))
			error = EINVAL;
		else
			rack->r_ctl.rc_rate_sample_method = optval;
		break;
	case TCP_DATA_AFTER_CLOSE:
		if (optval)
			rack->rc_allow_data_af_clo = 1;
		else
			rack->rc_allow_data_af_clo = 0;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
	}
#ifdef NETFLIX_STATS
	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
#endif
	INP_WUNLOCK(inp);
	return (error);
}

static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;

	error = 0;
	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact to this routine.
	 */
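	/*
	 * In other words, no user memory is touched while the inpcb lock is
	 * held: the switch below only fills optval, and the copyout happens
	 * after the lock has been dropped at the bottom of the routine.
	 */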
	switch (sopt->sopt_name) {
	case TCP_RACK_DO_DETECTION:
		optval = rack->do_detection;
		break;
	case TCP_RACK_PROP_RATE:
		optval = rack->r_ctl.rc_prop_rate;
		break;
	case TCP_RACK_PROP:
		/* RACK proportional rate reduction (bool) */
		optval = rack->r_ctl.rc_prop_reduce;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_RACK_EARLY_RECOV:
		/* Should recovery happen early (bool) */
		optval = rack->r_ctl.rc_early_recovery;
		break;
	case TCP_RACK_PACE_REDUCE:
		/* RACK Hptsi reduction factor (divisor) */
		optval = rack->rc_pace_reduce;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_pace_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_CHEAT:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_cheat;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		optval = rack->r_ctl.rc_prr_inc_var;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		error = EINVAL;
		break;
	case TCP_RACK_MIN_PACE:
		optval = rack->r_enforce_min_pace;
		break;
	case TCP_RACK_GP_INCREASE:
		optval = rack->rack_per_of_gp;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (error);
	}
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(so, sopt, inp, tp, rack));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(so, sopt, inp, tp, rack));
	}
	INP_WUNLOCK(inp);
	return (error);
}
static struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_do_queued_segments = ctf_do_queued_segments,
	.tfb_do_segment_nounlock = rack_do_segment_nounlock,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_timer_activate = rack_timer_activate,
	.tfb_tcp_timer_active = rack_timer_active,
	.tfb_tcp_timer_stop = rack_timer_stop,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok
};

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};
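/*
 * Registering the function block under these names makes the stack
 * selectable per connection with the TCP_FUNCTION_BLK socket option, or
 * system-wide via the net.inet.tcp.functions_default sysctl.
 */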
static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
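		/*
		 * Two UMA zones back the stack: the "_map" zone holds one
		 * rack_sendmap entry per tracked block of sent data, while
		 * the "_pcb" zone holds the per-connection tcp_rack state
		 * that rack_init() hangs off tp->t_fb_ptr.
		 */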
		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
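		/*
		 * tcp_lro_reg_mbufq() tells the LRO code that a stack using
		 * the mbuf queueing interface is now loaded; rack_mod_inited
		 * gates the teardown in MOD_UNLOAD below so a failed load
		 * never frees resources that were not set up.
		 */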
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
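/*
 * RACK schedules all of its pacing and timer work through the high-precision
 * timer system, so the MODULE_DEPEND on tcphpts above makes the kernel
 * linker load that module first and keep it around while this stack is
 * loaded.
 */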