1 /*-
2  * Copyright (c) 2016-9 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 #include "opt_ratelimit.h"
35 #include "opt_kern_tls.h"
36 #include <sys/param.h>
37 #include <sys/arb.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
40 #ifdef TCP_HHOOK
41 #include <sys/hhook.h>
42 #endif
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/mbuf.h>
48 #include <sys/proc.h>           /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #ifdef KERN_TLS
52 #include <sys/ktls.h>
53 #endif
54 #include <sys/sysctl.h>
55 #include <sys/systm.h>
56 #ifdef STATS
57 #include <sys/qmath.h>
58 #include <sys/tree.h>
59 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
60 #endif
61 #include <sys/refcount.h>
62 #include <sys/tree.h>
63 #include <sys/queue.h>
64 #include <sys/smp.h>
65 #include <sys/kthread.h>
66 #include <sys/kern_prefetch.h>
67
68 #include <vm/uma.h>
69
70 #include <net/route.h>
71 #include <net/vnet.h>
72
73 #define TCPSTATES               /* for logging */
74
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
80 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
81 #include <netinet/ip_var.h>
82 #include <netinet/ip6.h>
83 #include <netinet6/in6_pcb.h>
84 #include <netinet6/ip6_var.h>
85 #include <netinet/tcp.h>
86 #define TCPOUTFLAGS
87 #include <netinet/tcp_fsm.h>
88 #include <netinet/tcp_log_buf.h>
89 #include <netinet/tcp_seq.h>
90 #include <netinet/tcp_timer.h>
91 #include <netinet/tcp_var.h>
92 #include <netinet/tcp_hpts.h>
93 #include <netinet/tcpip.h>
94 #include <netinet/cc/cc.h>
95 #include <netinet/tcp_fastopen.h>
96 #include <netinet/tcp_lro.h>
97 #ifdef TCPDEBUG
98 #include <netinet/tcp_debug.h>
99 #endif                          /* TCPDEBUG */
100 #ifdef TCP_OFFLOAD
101 #include <netinet/tcp_offload.h>
102 #endif
103 #ifdef INET6
104 #include <netinet6/tcp6_var.h>
105 #endif
106
107 #include <netipsec/ipsec_support.h>
108
109 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
110 #include <netipsec/ipsec.h>
111 #include <netipsec/ipsec6.h>
112 #endif                          /* IPSEC */
113
114 #include <netinet/udp.h>
115 #include <netinet/udp_var.h>
116 #include <machine/in_cksum.h>
117
118 #ifdef MAC
119 #include <security/mac/mac_framework.h>
120 #endif
121 #include "sack_filter.h"
122 #include "tcp_rack.h"
123 #include "rack_bbr_common.h"
124
125 uma_zone_t rack_zone;
126 uma_zone_t rack_pcb_zone;
127
128 #ifndef TICKS2SBT
129 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
130 #endif
131
132 struct sysctl_ctx_list rack_sysctl_ctx;
133 struct sysctl_oid *rack_sysctl_root;
134
135 #define CUM_ACKED 1
136 #define SACKED 2
137
138 /*
139  * The RACK module incorporates a number of
140  * TCP ideas that have been put out into the IETF
141  * over the last few years:
142  * - Matt Mathis's Rate Halving which slowly drops
143  *    the congestion window so that the ack clock can
144  *    be maintained during a recovery.
145  * - Yuchung Cheng's RACK TCP (for which it's named) that
146  *    will stop us using the number of dup acks and instead
147  *    use time as the gauge of when we retransmit.
148  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
149  *    of Dukkipati et al.
150  * RACK depends on SACK, so if an endpoint arrives that
151  * cannot do SACK the state machine below will shuttle the
152  * connection back to using the "default" TCP stack that is
153  * in FreeBSD.
154  *
155  * To implement RACK the original TCP stack was first decomposed
156  * into a functional state machine with individual states
157  * for each of the possible TCP connection states. The do_segment
158  * function's role in life is to mandate the connection supports SACK
159  * initially and then assure that the RACK state matches the connection
160  * state before calling the states do_segment function. Each
161  * state is simplified due to the fact that the original do_segment
162  * has been decomposed and we *know* what state we are in (no
163  * switches on the state) and all tests for SACK are gone. This
164  * greatly simplifies what each state does.
165  *
166  * TCP output is also replaced with a new version since it
167  * must maintain the new rack scoreboard.
168  *
169  */
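/*
 * A minimal illustrative sketch of the dispatch described above (not
 * the exact code found later in this file; the r_substate member name
 * is an assumption about the RACK control block): rack_set_state()
 * records a per-state handler and rack_do_segment() then calls that
 * handler directly instead of switching on tp->t_state:
 *
 *      rack->r_substate = rack_do_established;
 *      ...
 *      retval = (*rack->r_substate)(m, th, so, tp, &to, drop_hdrlen,
 *          tlen, tiwin, thflags, nxt_pkt, iptos);
 */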
170 static int32_t rack_tlp_thresh = 1;
171 static int32_t rack_reorder_thresh = 2;
172 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
173                                                  * - 60 seconds */
174 /* Attack threshold detections */
175 static uint32_t rack_highest_sack_thresh_seen = 0;
176 static uint32_t rack_highest_move_thresh_seen = 0;
177
178 static int32_t rack_pkt_delay = 1;
179 static int32_t rack_min_pace_time = 0;
180 static int32_t rack_early_recovery = 1;
181 static int32_t rack_send_a_lot_in_prr = 1;
182 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
183 static int32_t rack_verbose_logging = 0;
184 static int32_t rack_ignore_data_after_close = 1;
185 static int32_t use_rack_cheat = 1;
186 static int32_t rack_persist_min = 250;  /* 250ms */
187 static int32_t rack_persist_max = 1000; /* 1 Second */
188 static int32_t rack_sack_not_required = 0;      /* set to one to allow non-sack to use rack */
189 static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */
190
191 /*
192  * Currently regular TCP has a rto_min of 30ms;
193  * the backoff goes 12 times, so that ends up
194  * being a total of 122.850 seconds before a
195  * connection is killed.
196  */
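/*
 * Worked out (a doubling series, assuming the base stack's 64 second
 * RTO ceiling is never reached along the way):
 *
 *      30ms * (1 + 2 + 4 + ... + 2^11) = 30ms * 4095 = 122,850ms
 *
 * which is the 122.850 seconds quoted above.
 */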
197 static int32_t rack_tlp_min = 10;
198 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
199 static int32_t rack_rto_max = 4000;     /* 4 seconds */
200 static const int32_t rack_free_cache = 2;
201 static int32_t rack_hptsi_segments = 40;
202 static int32_t rack_rate_sample_method = USE_RTT_LOW;
203 static int32_t rack_pace_every_seg = 0;
204 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
205 static int32_t rack_slot_reduction = 4;
206 static int32_t rack_lower_cwnd_at_tlp = 0;
207 static int32_t rack_use_proportional_reduce = 0;
208 static int32_t rack_proportional_rate = 10;
209 static int32_t rack_tlp_max_resend = 2;
210 static int32_t rack_limited_retran = 0;
211 static int32_t rack_always_send_oldest = 0;
212 static int32_t rack_use_sack_filter = 1;
213 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
214 static int32_t rack_per_of_gp = 50;
215
216 /* Rack specific counters */
217 counter_u64_t rack_badfr;
218 counter_u64_t rack_badfr_bytes;
219 counter_u64_t rack_rtm_prr_retran;
220 counter_u64_t rack_rtm_prr_newdata;
221 counter_u64_t rack_timestamp_mismatch;
222 counter_u64_t rack_reorder_seen;
223 counter_u64_t rack_paced_segments;
224 counter_u64_t rack_unpaced_segments;
225 counter_u64_t rack_calc_zero;
226 counter_u64_t rack_calc_nonzero;
227 counter_u64_t rack_saw_enobuf;
228 counter_u64_t rack_saw_enetunreach;
229 counter_u64_t rack_per_timer_hole;
230
231 /* Tail loss probe counters */
232 counter_u64_t rack_tlp_tot;
233 counter_u64_t rack_tlp_newdata;
234 counter_u64_t rack_tlp_retran;
235 counter_u64_t rack_tlp_retran_bytes;
236 counter_u64_t rack_tlp_retran_fail;
237 counter_u64_t rack_to_tot;
238 counter_u64_t rack_to_arm_rack;
239 counter_u64_t rack_to_arm_tlp;
240 counter_u64_t rack_to_alloc;
241 counter_u64_t rack_to_alloc_hard;
242 counter_u64_t rack_to_alloc_emerg;
243 counter_u64_t rack_to_alloc_limited;
244 counter_u64_t rack_alloc_limited_conns;
245 counter_u64_t rack_split_limited;
246
247 counter_u64_t rack_sack_proc_all;
248 counter_u64_t rack_sack_proc_short;
249 counter_u64_t rack_sack_proc_restart;
250 counter_u64_t rack_sack_attacks_detected;
251 counter_u64_t rack_sack_attacks_reversed;
252 counter_u64_t rack_sack_used_next_merge;
253 counter_u64_t rack_sack_splits;
254 counter_u64_t rack_sack_used_prev_merge;
255 counter_u64_t rack_sack_skipped_acked;
256 counter_u64_t rack_ack_total;
257 counter_u64_t rack_express_sack;
258 counter_u64_t rack_sack_total;
259 counter_u64_t rack_move_none;
260 counter_u64_t rack_move_some;
261
262 counter_u64_t rack_used_tlpmethod;
263 counter_u64_t rack_used_tlpmethod2;
264 counter_u64_t rack_enter_tlp_calc;
265 counter_u64_t rack_input_idle_reduces;
266 counter_u64_t rack_collapsed_win;
267 counter_u64_t rack_tlp_does_nada;
268
269 /* Counters for HW TLS */
270 counter_u64_t rack_tls_rwnd;
271 counter_u64_t rack_tls_cwnd;
272 counter_u64_t rack_tls_app;
273 counter_u64_t rack_tls_other;
274 counter_u64_t rack_tls_filled;
275 counter_u64_t rack_tls_rxt;
276 counter_u64_t rack_tls_tlp;
277
278 /* Temp CPU counters */
279 counter_u64_t rack_find_high;
280
281 counter_u64_t rack_progress_drops;
282 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
283 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
284
285 static void
286 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
287
288 static int
289 rack_process_ack(struct mbuf *m, struct tcphdr *th,
290     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
291     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
292 static int
293 rack_process_data(struct mbuf *m, struct tcphdr *th,
294     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
295     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
296 static void
297 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
298     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
299 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
300 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
301     uint8_t limit_type);
302 static struct rack_sendmap *
303 rack_check_recovery_mode(struct tcpcb *tp,
304     uint32_t tsused);
305 static void
306 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
307     uint32_t type);
308 static void rack_counter_destroy(void);
309 static int
310 rack_ctloutput(struct socket *so, struct sockopt *sopt,
311     struct inpcb *inp, struct tcpcb *tp);
312 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
313 static void
314 rack_do_segment(struct mbuf *m, struct tcphdr *th,
315     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
316     uint8_t iptos);
317 static void rack_dtor(void *mem, int32_t size, void *arg);
318 static void
319 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
320     uint32_t t, uint32_t cts);
321 static struct rack_sendmap *
322 rack_find_high_nonack(struct tcp_rack *rack,
323     struct rack_sendmap *rsm);
324 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
325 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
326 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
327 static int
328 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
329     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
330 static int32_t rack_handoff_ok(struct tcpcb *tp);
331 static int32_t rack_init(struct tcpcb *tp);
332 static void rack_init_sysctls(void);
333 static void
334 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
335     struct tcphdr *th);
336 static void
337 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
338     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
339     uint8_t pass, struct rack_sendmap *hintrsm);
340 static void
341 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
342     struct rack_sendmap *rsm);
343 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
344 static int32_t rack_output(struct tcpcb *tp);
345
346 static uint32_t
347 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
348     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
349     uint32_t cts, int *moved_two);
350 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
351 static void rack_remxt_tmr(struct tcpcb *tp);
352 static int
353 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
354     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
355 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
356 static int32_t rack_stopall(struct tcpcb *tp);
357 static void
358 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
359     uint32_t delta);
360 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
361 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
362 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
363 static uint32_t
364 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
365     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
366 static void
367 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
368     struct rack_sendmap *rsm, uint32_t ts);
369 static int
370 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
371     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
372 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
373 static int
374 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
375     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
376     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
377 static int
378 rack_do_closing(struct mbuf *m, struct tcphdr *th,
379     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
380     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
381 static int
382 rack_do_established(struct mbuf *m, struct tcphdr *th,
383     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
384     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
385 static int
386 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
387     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
388     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
389 static int
390 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
391     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
392     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
393 static int
394 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
395     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
396     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
397 static int
398 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
399     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
400     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
401 static int
402 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
403     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
404     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
405 static int
406 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
407     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
408     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
409 struct rack_sendmap *
410 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
411     uint32_t tsused);
412 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
413 static void
414      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
415
416 int32_t rack_clear_counter=0;
417
418
419 static int
420 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
421 {
422         uint32_t stat;
423         int32_t error;
424
425         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
426         if (error || req->newptr == NULL)
427                 return error;
428
429         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
430         if (error)
431                 return (error);
432         if (stat == 1) {
433 #ifdef INVARIANTS
434                 printf("Clearing RACK counters\n");
435 #endif
436                 counter_u64_zero(rack_badfr);
437                 counter_u64_zero(rack_badfr_bytes);
438                 counter_u64_zero(rack_rtm_prr_retran);
439                 counter_u64_zero(rack_rtm_prr_newdata);
440                 counter_u64_zero(rack_timestamp_mismatch);
441                 counter_u64_zero(rack_reorder_seen);
442                 counter_u64_zero(rack_tlp_tot);
443                 counter_u64_zero(rack_tlp_newdata);
444                 counter_u64_zero(rack_tlp_retran);
445                 counter_u64_zero(rack_tlp_retran_bytes);
446                 counter_u64_zero(rack_tlp_retran_fail);
447                 counter_u64_zero(rack_to_tot);
448                 counter_u64_zero(rack_to_arm_rack);
449                 counter_u64_zero(rack_to_arm_tlp);
450                 counter_u64_zero(rack_paced_segments);
451                 counter_u64_zero(rack_calc_zero);
452                 counter_u64_zero(rack_calc_nonzero);
453                 counter_u64_zero(rack_unpaced_segments);
454                 counter_u64_zero(rack_saw_enobuf);
455                 counter_u64_zero(rack_saw_enetunreach);
456                 counter_u64_zero(rack_per_timer_hole);
457                 counter_u64_zero(rack_to_alloc_hard);
458                 counter_u64_zero(rack_to_alloc_emerg);
459                 counter_u64_zero(rack_sack_proc_all);
460                 counter_u64_zero(rack_sack_proc_short);
461                 counter_u64_zero(rack_sack_proc_restart);
462                 counter_u64_zero(rack_to_alloc);
463                 counter_u64_zero(rack_to_alloc_limited);
464                 counter_u64_zero(rack_alloc_limited_conns);
465                 counter_u64_zero(rack_split_limited);
466                 counter_u64_zero(rack_find_high);
467                 counter_u64_zero(rack_tls_rwnd);
468                 counter_u64_zero(rack_tls_cwnd);
469                 counter_u64_zero(rack_tls_app);
470                 counter_u64_zero(rack_tls_other);
471                 counter_u64_zero(rack_tls_filled);
472                 counter_u64_zero(rack_tls_rxt);
473                 counter_u64_zero(rack_tls_tlp);
474                 counter_u64_zero(rack_sack_attacks_detected);
475                 counter_u64_zero(rack_sack_attacks_reversed);
476                 counter_u64_zero(rack_sack_used_next_merge);
477                 counter_u64_zero(rack_sack_used_prev_merge);
478                 counter_u64_zero(rack_sack_splits);
479                 counter_u64_zero(rack_sack_skipped_acked);
480                 counter_u64_zero(rack_ack_total);
481                 counter_u64_zero(rack_express_sack);
482                 counter_u64_zero(rack_sack_total);
483                 counter_u64_zero(rack_move_none);
484                 counter_u64_zero(rack_move_some);
485                 counter_u64_zero(rack_used_tlpmethod);
486                 counter_u64_zero(rack_used_tlpmethod2);
487                 counter_u64_zero(rack_enter_tlp_calc);
488                 counter_u64_zero(rack_progress_drops);
489                 counter_u64_zero(rack_tlp_does_nada);
490                 counter_u64_zero(rack_collapsed_win);
491
492         }
493         rack_clear_counter = 0;
494         return (0);
495 }
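
/*
 * Usage sketch (assuming the RACK root OID is attached as
 * net.inet.tcp.rack, which is done outside this function): writing a 1
 * through the "clear" proc node registered in rack_init_sysctls()
 * below zeroes all of the counters handled above, e.g.:
 *
 *      sysctl net.inet.tcp.rack.clear=1
 */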
496
497
498
499 static void
500 rack_init_sysctls(void)
501 {
502         struct sysctl_oid *rack_counters;
503         struct sysctl_oid *rack_attack;
504
505         SYSCTL_ADD_S32(&rack_sysctl_ctx,
506             SYSCTL_CHILDREN(rack_sysctl_root),
507             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
508             &rack_rate_sample_method , USE_RTT_LOW,
509             "What method should we use for rate sampling 0=high, 1=low ");
510         SYSCTL_ADD_S32(&rack_sysctl_ctx,
511             SYSCTL_CHILDREN(rack_sysctl_root),
512             OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
513             &rack_hw_tls_max_seg , 0,
514             "Do we have a multiplier of TLS records we can send as a max (0=1 TLS record)? ");
515         SYSCTL_ADD_S32(&rack_sysctl_ctx,
516             SYSCTL_CHILDREN(rack_sysctl_root),
517             OID_AUTO, "data_after_close", CTLFLAG_RW,
518             &rack_ignore_data_after_close, 0,
519             "Do we hold off sending a RST until all pending data is ack'd");
520         SYSCTL_ADD_S32(&rack_sysctl_ctx,
521             SYSCTL_CHILDREN(rack_sysctl_root),
522             OID_AUTO, "cheat_rxt", CTLFLAG_RW,
523             &use_rack_cheat, 1,
524             "Do we use the rxt cheat for rack?");
525
526         SYSCTL_ADD_U32(&rack_sysctl_ctx,
527             SYSCTL_CHILDREN(rack_sysctl_root),
528             OID_AUTO, "persmin", CTLFLAG_RW,
529             &rack_persist_min, 250,
530             "What is the minimum time in milliseconds between persists");
531         SYSCTL_ADD_U32(&rack_sysctl_ctx,
532             SYSCTL_CHILDREN(rack_sysctl_root),
533             OID_AUTO, "persmax", CTLFLAG_RW,
534             &rack_persist_max, 1000,
535             "What is the largest delay in milliseconds between persists");
536         SYSCTL_ADD_S32(&rack_sysctl_ctx,
537             SYSCTL_CHILDREN(rack_sysctl_root),
538             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
539             &rack_sack_not_required, 0,
540             "Do we allow rack to run on connections not supporting SACK?");
541         SYSCTL_ADD_S32(&rack_sysctl_ctx,
542             SYSCTL_CHILDREN(rack_sysctl_root),
543             OID_AUTO, "tlpmethod", CTLFLAG_RW,
544             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
545             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
546         SYSCTL_ADD_S32(&rack_sysctl_ctx,
547             SYSCTL_CHILDREN(rack_sysctl_root),
548             OID_AUTO, "gp_percentage", CTLFLAG_RW,
549             &rack_per_of_gp, 50,
550             "Do we pace to percentage of goodput (0=old method)?");
551         SYSCTL_ADD_S32(&rack_sysctl_ctx,
552             SYSCTL_CHILDREN(rack_sysctl_root),
553             OID_AUTO, "min_pace_time", CTLFLAG_RW,
554             &rack_min_pace_time, 0,
555             "Should we enforce a minimum pace time of 1ms");
556         SYSCTL_ADD_S32(&rack_sysctl_ctx,
557             SYSCTL_CHILDREN(rack_sysctl_root),
558             OID_AUTO, "bb_verbose", CTLFLAG_RW,
559             &rack_verbose_logging, 0,
560             "Should RACK black box logging be verbose");
561         SYSCTL_ADD_S32(&rack_sysctl_ctx,
562             SYSCTL_CHILDREN(rack_sysctl_root),
563             OID_AUTO, "sackfiltering", CTLFLAG_RW,
564             &rack_use_sack_filter, 1,
565             "Do we use sack filtering?");
566         SYSCTL_ADD_S32(&rack_sysctl_ctx,
567             SYSCTL_CHILDREN(rack_sysctl_root),
568             OID_AUTO, "delayed_ack", CTLFLAG_RW,
569             &rack_delayed_ack_time, 200,
570             "Delayed ack time (200ms)");
571         SYSCTL_ADD_S32(&rack_sysctl_ctx,
572             SYSCTL_CHILDREN(rack_sysctl_root),
573             OID_AUTO, "tlpminto", CTLFLAG_RW,
574             &rack_tlp_min, 10,
575             "TLP minimum timeout per the specification (10ms)");
576         SYSCTL_ADD_S32(&rack_sysctl_ctx,
577             SYSCTL_CHILDREN(rack_sysctl_root),
578             OID_AUTO, "send_oldest", CTLFLAG_RW,
579             &rack_always_send_oldest, 1,
580             "Should we always send the oldest TLP and RACK-TLP");
581         SYSCTL_ADD_S32(&rack_sysctl_ctx,
582             SYSCTL_CHILDREN(rack_sysctl_root),
583             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
584             &rack_limited_retran, 0,
585             "How many times can a rack timeout drive out sends");
586         SYSCTL_ADD_S32(&rack_sysctl_ctx,
587             SYSCTL_CHILDREN(rack_sysctl_root),
588             OID_AUTO, "minrto", CTLFLAG_RW,
589             &rack_rto_min, 0,
590             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
591         SYSCTL_ADD_S32(&rack_sysctl_ctx,
592             SYSCTL_CHILDREN(rack_sysctl_root),
593             OID_AUTO, "maxrto", CTLFLAG_RW,
594             &rack_rto_max, 0,
595             "Maximum RTO in ms -- should be at least as large as min_rto");
596         SYSCTL_ADD_S32(&rack_sysctl_ctx,
597             SYSCTL_CHILDREN(rack_sysctl_root),
598             OID_AUTO, "tlp_retry", CTLFLAG_RW,
599             &rack_tlp_max_resend, 2,
600             "How many times does TLP retry a single segment or multiple with no ACK");
601         SYSCTL_ADD_S32(&rack_sysctl_ctx,
602             SYSCTL_CHILDREN(rack_sysctl_root),
603             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
604             &rack_use_proportional_reduce, 0,
605             "Should we proportionally reduce cwnd based on the number of losses ");
606         SYSCTL_ADD_S32(&rack_sysctl_ctx,
607             SYSCTL_CHILDREN(rack_sysctl_root),
608             OID_AUTO, "recovery_prop", CTLFLAG_RW,
609             &rack_proportional_rate, 10,
610             "What percent reduction per loss");
611         SYSCTL_ADD_S32(&rack_sysctl_ctx,
612             SYSCTL_CHILDREN(rack_sysctl_root),
613             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
614             &rack_lower_cwnd_at_tlp, 0,
615             "When a TLP completes a retran should we enter recovery?");
616         SYSCTL_ADD_S32(&rack_sysctl_ctx,
617             SYSCTL_CHILDREN(rack_sysctl_root),
618             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
619             &rack_slot_reduction, 4,
620             "When setting a slot should we reduce by divisor");
621         SYSCTL_ADD_S32(&rack_sysctl_ctx,
622             SYSCTL_CHILDREN(rack_sysctl_root),
623             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
624             &rack_pace_every_seg, 0,
625             "Should we use the original pacing mechanism that did not pace much?");
626         SYSCTL_ADD_S32(&rack_sysctl_ctx,
627             SYSCTL_CHILDREN(rack_sysctl_root),
628             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
629             &rack_hptsi_segments, 40,
630             "Should we pace out only a limited size of segments");
631         SYSCTL_ADD_S32(&rack_sysctl_ctx,
632             SYSCTL_CHILDREN(rack_sysctl_root),
633             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
634             &rack_send_a_lot_in_prr, 1,
635             "Send a lot in prr");
636         SYSCTL_ADD_S32(&rack_sysctl_ctx,
637             SYSCTL_CHILDREN(rack_sysctl_root),
638             OID_AUTO, "minto", CTLFLAG_RW,
639             &rack_min_to, 1,
640             "Minimum rack timeout in milliseconds");
641         SYSCTL_ADD_S32(&rack_sysctl_ctx,
642             SYSCTL_CHILDREN(rack_sysctl_root),
643             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
644             &rack_early_recovery, 1,
645             "Do we do early recovery with rack");
646         SYSCTL_ADD_S32(&rack_sysctl_ctx,
647             SYSCTL_CHILDREN(rack_sysctl_root),
648             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
649             &rack_reorder_thresh, 2,
650             "What factor for rack will be added when seeing reordering (shift right)");
651         SYSCTL_ADD_S32(&rack_sysctl_ctx,
652             SYSCTL_CHILDREN(rack_sysctl_root),
653             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
654             &rack_tlp_thresh, 1,
655             "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
656         SYSCTL_ADD_S32(&rack_sysctl_ctx,
657             SYSCTL_CHILDREN(rack_sysctl_root),
658             OID_AUTO, "reorder_fade", CTLFLAG_RW,
659             &rack_reorder_fade, 0,
660             "Does reorder detection fade, if so how many ms (0 means never)");
661         SYSCTL_ADD_S32(&rack_sysctl_ctx,
662             SYSCTL_CHILDREN(rack_sysctl_root),
663             OID_AUTO, "pktdelay", CTLFLAG_RW,
664             &rack_pkt_delay, 1,
665             "Extra RACK time (in ms) besides reordering thresh");
666
667         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
668             SYSCTL_CHILDREN(rack_sysctl_root),
669             OID_AUTO,
670             "stats",
671             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
672             "Rack Counters");
673         rack_badfr = counter_u64_alloc(M_WAITOK);
674         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
675             SYSCTL_CHILDREN(rack_counters),
676             OID_AUTO, "badfr", CTLFLAG_RD,
677             &rack_badfr, "Total number of bad FRs");
678         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
679         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
680             SYSCTL_CHILDREN(rack_counters),
681             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
682             &rack_badfr_bytes, "Total bytes of bad FRs");
683         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
684         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
685             SYSCTL_CHILDREN(rack_counters),
686             OID_AUTO, "prrsndret", CTLFLAG_RD,
687             &rack_rtm_prr_retran,
688             "Total number of prr based retransmits");
689         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
690         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
691             SYSCTL_CHILDREN(rack_counters),
692             OID_AUTO, "prrsndnew", CTLFLAG_RD,
693             &rack_rtm_prr_newdata,
694             "Total number of prr based new transmits");
695         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
696         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
697             SYSCTL_CHILDREN(rack_counters),
698             OID_AUTO, "tsnf", CTLFLAG_RD,
699             &rack_timestamp_mismatch,
700             "Total number of times we could not find the reported timestamp");
701         rack_find_high = counter_u64_alloc(M_WAITOK);
702         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
703             SYSCTL_CHILDREN(rack_counters),
704             OID_AUTO, "findhigh", CTLFLAG_RD,
705             &rack_find_high,
706             "Total number of FIN causing find-high");
707         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
708         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
709             SYSCTL_CHILDREN(rack_counters),
710             OID_AUTO, "reordering", CTLFLAG_RD,
711             &rack_reorder_seen,
712             "Total number of times we added delay due to reordering");
713         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
714         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
715             SYSCTL_CHILDREN(rack_counters),
716             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
717             &rack_tlp_tot,
718             "Total number of tail loss probe expirations");
719         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
720         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
721             SYSCTL_CHILDREN(rack_counters),
722             OID_AUTO, "tlp_new", CTLFLAG_RD,
723             &rack_tlp_newdata,
724             "Total number of tail loss probe sending new data");
725
726         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
727         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
728             SYSCTL_CHILDREN(rack_counters),
729             OID_AUTO, "tlp_retran", CTLFLAG_RD,
730             &rack_tlp_retran,
731             "Total number of tail loss probe sending retransmitted data");
732         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
733         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
734             SYSCTL_CHILDREN(rack_counters),
735             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
736             &rack_tlp_retran_bytes,
737             "Total bytes of tail loss probe sending retransmitted data");
738         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
739         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
740             SYSCTL_CHILDREN(rack_counters),
741             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
742             &rack_tlp_retran_fail,
743             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
744         rack_to_tot = counter_u64_alloc(M_WAITOK);
745         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
746             SYSCTL_CHILDREN(rack_counters),
747             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
748             &rack_to_tot,
749             "Total number of times the rack timeout expired");
750         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
751         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
752             SYSCTL_CHILDREN(rack_counters),
753             OID_AUTO, "arm_rack", CTLFLAG_RD,
754             &rack_to_arm_rack,
755             "Total number of times the rack timer armed?");
756         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
757         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
758             SYSCTL_CHILDREN(rack_counters),
759             OID_AUTO, "arm_tlp", CTLFLAG_RD,
760             &rack_to_arm_tlp,
761             "Total number of times the tlp timer armed?");
762
763         rack_calc_zero = counter_u64_alloc(M_WAITOK);
764         rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
765         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
766             SYSCTL_CHILDREN(rack_counters),
767             OID_AUTO, "calc_zero", CTLFLAG_RD,
768             &rack_calc_zero,
769             "Total number of times pacing time worked out to zero?");
770         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
771             SYSCTL_CHILDREN(rack_counters),
772             OID_AUTO, "calc_nonzero", CTLFLAG_RD,
773             &rack_calc_nonzero,
774             "Total number of times pacing time worked out to non-zero?");
775         rack_paced_segments = counter_u64_alloc(M_WAITOK);
776         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
777             SYSCTL_CHILDREN(rack_counters),
778             OID_AUTO, "paced", CTLFLAG_RD,
779             &rack_paced_segments,
780             "Total number of times a segment send caused hptsi");
781         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
782         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
783             SYSCTL_CHILDREN(rack_counters),
784             OID_AUTO, "unpaced", CTLFLAG_RD,
785             &rack_unpaced_segments,
786             "Total number of times a segment did not cause hptsi");
787         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
788         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
789             SYSCTL_CHILDREN(rack_counters),
790             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
791             &rack_saw_enobuf,
792             "Total number of times a send returned ENOBUFS");
793         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
794         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
795             SYSCTL_CHILDREN(rack_counters),
796             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
797             &rack_saw_enetunreach,
798             "Total number of times a send saw ENETUNREACH");
799         rack_to_alloc = counter_u64_alloc(M_WAITOK);
800         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
801             SYSCTL_CHILDREN(rack_counters),
802             OID_AUTO, "allocs", CTLFLAG_RD,
803             &rack_to_alloc,
804             "Total allocations of tracking structures");
805         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
806         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
807             SYSCTL_CHILDREN(rack_counters),
808             OID_AUTO, "allochard", CTLFLAG_RD,
809             &rack_to_alloc_hard,
810             "Total allocations done with sleeping the hard way");
811         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
812         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
813             SYSCTL_CHILDREN(rack_counters),
814             OID_AUTO, "allocemerg", CTLFLAG_RD,
815             &rack_to_alloc_emerg,
816             "Total allocations done from emergency cache");
817         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
818         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
819             SYSCTL_CHILDREN(rack_counters),
820             OID_AUTO, "alloc_limited", CTLFLAG_RD,
821             &rack_to_alloc_limited,
822             "Total allocations dropped due to limit");
823         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
824         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
825             SYSCTL_CHILDREN(rack_counters),
826             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
827             &rack_alloc_limited_conns,
828             "Connections with allocations dropped due to limit");
829         rack_split_limited = counter_u64_alloc(M_WAITOK);
830         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
831             SYSCTL_CHILDREN(rack_counters),
832             OID_AUTO, "split_limited", CTLFLAG_RD,
833             &rack_split_limited,
834             "Split allocations dropped due to limit");
835         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
836         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
837             SYSCTL_CHILDREN(rack_counters),
838             OID_AUTO, "sack_long", CTLFLAG_RD,
839             &rack_sack_proc_all,
840             "Total times we had to walk whole list for sack processing");
841
842         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
843         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
844             SYSCTL_CHILDREN(rack_counters),
845             OID_AUTO, "sack_restart", CTLFLAG_RD,
846             &rack_sack_proc_restart,
847             "Total times we had to walk whole list due to a restart");
848         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
849         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
850             SYSCTL_CHILDREN(rack_counters),
851             OID_AUTO, "sack_short", CTLFLAG_RD,
852             &rack_sack_proc_short,
853             "Total times we took shortcut for sack processing");
854         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
855         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
856             SYSCTL_CHILDREN(rack_counters),
857             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
858             &rack_enter_tlp_calc,
859             "Total times we called calc-tlp");
860         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
861         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
862             SYSCTL_CHILDREN(rack_counters),
863             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
864             &rack_used_tlpmethod,
865             "Total number of times we hit TLP method 1");
866         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
867         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
868             SYSCTL_CHILDREN(rack_counters),
869             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
870             &rack_used_tlpmethod2,
871             "Total number of times we hit TLP method 2");
872         /* Sack Attacker detection stuff */
873         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
874             SYSCTL_CHILDREN(rack_sysctl_root),
875             OID_AUTO,
876             "sack_attack",
877             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
878             "Rack Sack Attack Counters and Controls");
879         SYSCTL_ADD_U32(&rack_sysctl_ctx,
880             SYSCTL_CHILDREN(rack_attack),
881             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
882             &rack_highest_sack_thresh_seen, 0,
883             "Highest sack to ack ratio seen");
884         SYSCTL_ADD_U32(&rack_sysctl_ctx,
885             SYSCTL_CHILDREN(rack_attack),
886             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
887             &rack_highest_move_thresh_seen, 0,
888             "Highest move to non-move ratio seen");
889         rack_ack_total = counter_u64_alloc(M_WAITOK);
890         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
891             SYSCTL_CHILDREN(rack_attack),
892             OID_AUTO, "acktotal", CTLFLAG_RD,
893             &rack_ack_total,
894             "Total number of ACKs");
895
896         rack_express_sack = counter_u64_alloc(M_WAITOK);
897         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
898             SYSCTL_CHILDREN(rack_attack),
899             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
900             &rack_express_sack,
901             "Total number of express SACKs");
902         rack_sack_total = counter_u64_alloc(M_WAITOK);
903         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
904             SYSCTL_CHILDREN(rack_attack),
905             OID_AUTO, "sacktotal", CTLFLAG_RD,
906             &rack_sack_total,
907             "Total number of SACKs");
908         rack_move_none = counter_u64_alloc(M_WAITOK);
909         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
910             SYSCTL_CHILDREN(rack_attack),
911             OID_AUTO, "move_none", CTLFLAG_RD,
912             &rack_move_none,
913             "Total number of SACK index reuse of positions under threshold");
914         rack_move_some = counter_u64_alloc(M_WAITOK);
915         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
916             SYSCTL_CHILDREN(rack_attack),
917             OID_AUTO, "move_some", CTLFLAG_RD,
918             &rack_move_some,
919             "Total number of SACK index reuse of positions over threshold");
920         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
921         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
922             SYSCTL_CHILDREN(rack_attack),
923             OID_AUTO, "attacks", CTLFLAG_RD,
924             &rack_sack_attacks_detected,
925             "Total number of SACK attackers that had sack disabled");
926         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
927         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
928             SYSCTL_CHILDREN(rack_attack),
929             OID_AUTO, "reversed", CTLFLAG_RD,
930             &rack_sack_attacks_reversed,
931             "Total number of SACK attackers that were later determined false positive");
932         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
933         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
934             SYSCTL_CHILDREN(rack_attack),
935             OID_AUTO, "nextmerge", CTLFLAG_RD,
936             &rack_sack_used_next_merge,
937             "Total number of times we used the next merge");
938         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
939         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
940             SYSCTL_CHILDREN(rack_attack),
941             OID_AUTO, "prevmerge", CTLFLAG_RD,
942             &rack_sack_used_prev_merge,
943             "Total number of times we used the prev merge");
944         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
945         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
946             SYSCTL_CHILDREN(rack_attack),
947             OID_AUTO, "skipacked", CTLFLAG_RD,
948             &rack_sack_skipped_acked,
949             "Total number of times we skipped previously sacked");
950         rack_sack_splits = counter_u64_alloc(M_WAITOK);
951         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
952             SYSCTL_CHILDREN(rack_attack),
953             OID_AUTO, "ofsplit", CTLFLAG_RD,
954             &rack_sack_splits,
955             "Total number of times we did the old-fashioned tree split");
956         rack_progress_drops = counter_u64_alloc(M_WAITOK);
957         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
958             SYSCTL_CHILDREN(rack_counters),
959             OID_AUTO, "prog_drops", CTLFLAG_RD,
960             &rack_progress_drops,
961             "Total number of progress drops");
962         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
963         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
964             SYSCTL_CHILDREN(rack_counters),
965             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
966             &rack_input_idle_reduces,
967             "Total number of idle reductions on input");
968         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
969         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
970             SYSCTL_CHILDREN(rack_counters),
971             OID_AUTO, "collapsed_win", CTLFLAG_RD,
972             &rack_collapsed_win,
973             "Total number of collapsed windows");
974         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
975         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
976             SYSCTL_CHILDREN(rack_counters),
977             OID_AUTO, "tlp_nada", CTLFLAG_RD,
978             &rack_tlp_does_nada,
979             "Total number of nada tlp calls");
980
981         rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
982         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
983             SYSCTL_CHILDREN(rack_counters),
984             OID_AUTO, "tls_rwnd", CTLFLAG_RD,
985             &rack_tls_rwnd,
986             "Total hdwr tls rwnd limited");
987
988         rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
989         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
990             SYSCTL_CHILDREN(rack_counters),
991             OID_AUTO, "tls_cwnd", CTLFLAG_RD,
992             &rack_tls_cwnd,
993             "Total hdwr tls cwnd limited");
994
995         rack_tls_app = counter_u64_alloc(M_WAITOK);
996         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
997             SYSCTL_CHILDREN(rack_counters),
998             OID_AUTO, "tls_app", CTLFLAG_RD,
999             &rack_tls_app,
1000             "Total hdwr tls app limited");
1001
1002         rack_tls_other = counter_u64_alloc(M_WAITOK);
1003         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1004             SYSCTL_CHILDREN(rack_counters),
1005             OID_AUTO, "tls_other", CTLFLAG_RD,
1006             &rack_tls_other,
1007             "Total hdwr tls other limited");
1008
1009         rack_tls_filled = counter_u64_alloc(M_WAITOK);
1010         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1011             SYSCTL_CHILDREN(rack_counters),
1012             OID_AUTO, "tls_filled", CTLFLAG_RD,
1013             &rack_tls_filled,
1014             "Total hdwr tls filled");
1015
1016         rack_tls_rxt = counter_u64_alloc(M_WAITOK);
1017         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1018             SYSCTL_CHILDREN(rack_counters),
1019             OID_AUTO, "tls_rxt", CTLFLAG_RD,
1020             &rack_tls_rxt,
1021             "Total hdwr rxt");
1022
1023         rack_tls_tlp = counter_u64_alloc(M_WAITOK);
1024         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1025             SYSCTL_CHILDREN(rack_counters),
1026             OID_AUTO, "tls_tlp", CTLFLAG_RD,
1027             &rack_tls_tlp,
1028             "Total hdwr tls tlp");
1029         rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1030         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1031             SYSCTL_CHILDREN(rack_counters),
1032             OID_AUTO, "timer_hole", CTLFLAG_RD,
1033             &rack_per_timer_hole,
1034             "Total persists start in timer hole");
1035
1036         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1037         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1038             OID_AUTO, "outsize", CTLFLAG_RD,
1039             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1040         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1041         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1042             OID_AUTO, "opts", CTLFLAG_RD,
1043             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1044         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1045             SYSCTL_CHILDREN(rack_sysctl_root),
1046             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1047             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1048 }
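
/*
 * Reading any of these counters back from userland is a plain sysctl
 * read; for example (again assuming the root OID is net.inet.tcp.rack):
 *
 *      sysctl net.inet.tcp.rack.stats.tlp_to_total
 *      sysctl net.inet.tcp.rack.sack_attack.attacks
 */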
1049
1050 static __inline int
1051 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1052 {
1053         if (SEQ_GEQ(b->r_start, a->r_start) &&
1054             SEQ_LT(b->r_start, a->r_end)) {
1055                 /*
1056                  * The entry b is within the
1057                  * block a. i.e.:
1058                  * a --   |-------------|
1059                  * b --   |----|
1060                  * <or>
1061                  * b --       |------|
1062                  * <or>
1063                  * b --       |-----------|
1064                  */
1065                 return (0);
1066         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1067                 /*
1068                  * b falls either as the next
1069                  * sequence block after a, or later,
1070                  * so a is said to be smaller than b.
1071                  * i.e.:
1072                  * a --   |------|
1073                  * b --          |--------|
1074                  * or
1075                  * b --              |-----|
1076                  */
1077                 return (1);
1078         }
1079         /*
1080          * What's left is where a is
1081          * larger than b. i.e.:
1082          * a --         |-------|
1083          * b --  |---|
1084          * or even possibly
1085          * b --   |--------------|
1086          */
1087         return (-1);
1088 }
1089
1090 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1091 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
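
/*
 * Illustrative lookup against the tree generated above (a sketch of
 * the pattern used later in this file; the r_ctl.rc_mtree member name
 * is assumed from the RACK control block): only r_start of the stack
 * key needs to be set, since rb_map_cmp() compares the key's r_start
 * against each node's [r_start, r_end) range:
 *
 *      struct rack_sendmap fe, *rsm;
 *
 *      fe.r_start = th_ack;
 *      rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 */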
1092
1093 static inline int32_t
1094 rack_progress_timeout_check(struct tcpcb *tp)
1095 {
1096         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
1097                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
1098                         /*
1099                          * There is an assumption that the caller
1100                          * will drop the connection so we will
1101                          * increment the counters here.
1102                          */
1103                         struct tcp_rack *rack;
1104                         rack = (struct tcp_rack *)tp->t_fb_ptr;
1105                         counter_u64_add(rack_progress_drops, 1);
1106 #ifdef NETFLIX_STATS
1107                         KMOD_TCPSTAT_INC(tcps_progdrops);
1108 #endif
1109                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
1110                         return (1);
1111                 }
1112         }
1113         return (0);
1114 }
1115
1116
1117
1118 static void
1119 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
1120 {
1121         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1122                 union tcp_log_stackspecific log;
1123                 struct timeval tv;
1124                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1125                 log.u_bbr.flex1 = tsused;
1126                 log.u_bbr.flex2 = thresh;
1127                 log.u_bbr.flex3 = rsm->r_flags;
1128                 log.u_bbr.flex4 = rsm->r_dupack;
1129                 log.u_bbr.flex5 = rsm->r_start;
1130                 log.u_bbr.flex6 = rsm->r_end;
1131                 log.u_bbr.flex8 = mod;
1132                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1133                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1134                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1135                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1136                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1137                     &rack->rc_inp->inp_socket->so_rcv,
1138                     &rack->rc_inp->inp_socket->so_snd,
1139                     BBR_LOG_SETTINGS_CHG, 0,
1140                     0, &log, false, &tv);
1141         }
1142 }
1143
1144
1145
1146 static void
1147 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
1148 {
1149         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1150                 union tcp_log_stackspecific log;
1151                 struct timeval tv;
1152
1153                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1154                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
1155                 log.u_bbr.flex2 = to;
1156                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1157                 log.u_bbr.flex4 = slot;
1158                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
1159                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1160                 log.u_bbr.flex7 = rack->rc_in_persist;
1161                 log.u_bbr.flex8 = which;
1162                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1163                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1164                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1165                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1166                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1167                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1168                     &rack->rc_inp->inp_socket->so_rcv,
1169                     &rack->rc_inp->inp_socket->so_snd,
1170                     BBR_LOG_TIMERSTAR, 0,
1171                     0, &log, false, &tv);
1172         }
1173 }
1174
1175 static void
1176 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no)
1177 {
1178         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1179                 union tcp_log_stackspecific log;
1180                 struct timeval tv;
1181
1182                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1183                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1184                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1185                 log.u_bbr.flex8 = to_num;
1186                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
1187                 log.u_bbr.flex2 = rack->rc_rack_rtt;
1188                 log.u_bbr.flex3 = no;
1189                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1190                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1191                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1192                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1193                     &rack->rc_inp->inp_socket->so_rcv,
1194                     &rack->rc_inp->inp_socket->so_snd,
1195                     BBR_LOG_RTO, 0,
1196                     0, &log, false, &tv);
1197         }
1198 }
1199
1200 static void
1201 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
1202     uint32_t o_srtt, uint32_t o_var)
1203 {
1204         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1205                 union tcp_log_stackspecific log;
1206                 struct timeval tv;
1207
1208                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1209                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1210                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1211                 log.u_bbr.flex1 = t;
1212                 log.u_bbr.flex2 = o_srtt;
1213                 log.u_bbr.flex3 = o_var;
1214                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
1215                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
1216                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
1217                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
1218                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
1219                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1220                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1221                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1222                 TCP_LOG_EVENTP(tp, NULL,
1223                     &rack->rc_inp->inp_socket->so_rcv,
1224                     &rack->rc_inp->inp_socket->so_snd,
1225                     BBR_LOG_BBRRTT, 0,
1226                     0, &log, false, &tv);
1227         }
1228 }
1229
1230 static void
1231 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
1232 {
1233         /*
1234          * Log the rtt sample we are
1235          * applying to the srtt algorithm in
1236          * useconds.
1237          */
1238         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1239                 union tcp_log_stackspecific log;
1240                 struct timeval tv;
1241
1242                 /* Convert our ms to microseconds */
1243                 memset(&log, 0, sizeof(log));
1244                 log.u_bbr.flex1 = rtt * 1000;
1245                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1246                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
1247                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1248                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
1249                 log.u_bbr.flex8 = rack->sack_attack_disable;
1250                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1251                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1252                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1253                     &rack->rc_inp->inp_socket->so_rcv,
1254                     &rack->rc_inp->inp_socket->so_snd,
1255                     TCP_LOG_RTT, 0,
1256                     0, &log, false, &tv);
1257         }
1258 }
1259
1260
1261 static inline void
1262 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
1263 {
1264         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
1265                 union tcp_log_stackspecific log;
1266                 struct timeval tv;
1267
1268                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1269                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1270                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1271                 log.u_bbr.flex1 = line;
1272                 log.u_bbr.flex2 = tick;
1273                 log.u_bbr.flex3 = tp->t_maxunacktime;
1274                 log.u_bbr.flex4 = tp->t_acktime;
1275                 log.u_bbr.flex8 = event;
1276                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1277                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1278                 TCP_LOG_EVENTP(tp, NULL,
1279                     &rack->rc_inp->inp_socket->so_rcv,
1280                     &rack->rc_inp->inp_socket->so_snd,
1281                     BBR_LOG_PROGRESS, 0,
1282                     0, &log, false, &tv);
1283         }
1284 }
1285
1286 static void
1287 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1288 {
1289         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1290                 union tcp_log_stackspecific log;
1291                 struct timeval tv;
1292
1293                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1294                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1295                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1296                 log.u_bbr.flex1 = slot;
1297                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
1298                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1299                 log.u_bbr.flex8 = rack->rc_in_persist;
1300                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1301                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1302                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1303                     &rack->rc_inp->inp_socket->so_rcv,
1304                     &rack->rc_inp->inp_socket->so_snd,
1305                     BBR_LOG_BBRSND, 0,
1306                     0, &log, false, &tv);
1307         }
1308 }
1309
1310 static void
1311 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1312 {
1313         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1314                 union tcp_log_stackspecific log;
1315                 struct timeval tv;
1316
1317                 memset(&log, 0, sizeof(log));
1318                 log.u_bbr.flex1 = did_out;
1319                 log.u_bbr.flex2 = nxt_pkt;
1320                 log.u_bbr.flex3 = way_out;
1321                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1322                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1323                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
1324                 log.u_bbr.flex7 = rack->r_wanted_output;
1325                 log.u_bbr.flex8 = rack->rc_in_persist;
1326                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1327                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1328                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1329                     &rack->rc_inp->inp_socket->so_rcv,
1330                     &rack->rc_inp->inp_socket->so_snd,
1331                     BBR_LOG_DOSEG_DONE, 0,
1332                     0, &log, false, &tv);
1333         }
1334 }
1335
1336 static void
1337 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
1338 {
1339         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1340                 union tcp_log_stackspecific log;
1341                 struct timeval tv;
1342                 uint32_t cts;
1343
1344                 memset(&log, 0, sizeof(log));
1345                 cts = tcp_get_usecs(&tv);
1346                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
1347                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
1348                 log.u_bbr.flex4 = len;
1349                 log.u_bbr.flex5 = orig_len;
1350                 log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
1351                 log.u_bbr.flex7 = mod;
1352                 log.u_bbr.flex8 = frm;
1353                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1354                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1355                 TCP_LOG_EVENTP(tp, NULL,
1356                     &tp->t_inpcb->inp_socket->so_rcv,
1357                     &tp->t_inpcb->inp_socket->so_snd,
1358                     TCP_HDWR_TLS, 0,
1359                     0, &log, false, &tv);
1360         }
1361 }
1362
1363 static void
1364 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1365 {
1366         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1367                 union tcp_log_stackspecific log;
1368                 struct timeval tv;
1369
1370                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1371                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1372                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1373                 log.u_bbr.flex1 = slot;
1374                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1375                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1376                 log.u_bbr.flex7 = hpts_calling;
1377                 log.u_bbr.flex8 = rack->rc_in_persist;
1378                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1379                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1380                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1381                     &rack->rc_inp->inp_socket->so_rcv,
1382                     &rack->rc_inp->inp_socket->so_snd,
1383                     BBR_LOG_JUSTRET, 0,
1384                     tlen, &log, false, &tv);
1385         }
1386 }
1387
1388 static void
1389 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1390 {
1391         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1392                 union tcp_log_stackspecific log;
1393                 struct timeval tv;
1394
1395                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1396                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1397                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1398                 log.u_bbr.flex1 = line;
1399                 log.u_bbr.flex2 = 0;
1400                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1401                 log.u_bbr.flex4 = 0;
1402                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1403                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1404                 log.u_bbr.flex8 = hpts_removed;
1405                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1406                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1407                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1408                     &rack->rc_inp->inp_socket->so_rcv,
1409                     &rack->rc_inp->inp_socket->so_snd,
1410                     BBR_LOG_TIMERCANC, 0,
1411                     0, &log, false, &tv);
1412         }
1413 }
1414
1415 static void
1416 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1417 {
1418         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1419                 union tcp_log_stackspecific log;
1420                 struct timeval tv;
1421
1422                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1423                 log.u_bbr.flex1 = timers;
1424                 log.u_bbr.flex2 = ret;
1425                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1426                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1427                 log.u_bbr.flex5 = cts;
1428                 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
1429                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1430                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1431                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1432                     &rack->rc_inp->inp_socket->so_rcv,
1433                     &rack->rc_inp->inp_socket->so_snd,
1434                     BBR_LOG_TO_PROCESS, 0,
1435                     0, &log, false, &tv);
1436         }
1437 }
1438
1439 static void
1440 rack_log_to_prr(struct tcp_rack *rack, int frm)
1441 {
1442         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1443                 union tcp_log_stackspecific log;
1444                 struct timeval tv;
1445
1446                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1447                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
1448                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
1449                 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
1450                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
1451                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
1452                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
1453                 log.u_bbr.flex8 = frm;
1454                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1455                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1456                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1457                     &rack->rc_inp->inp_socket->so_rcv,
1458                     &rack->rc_inp->inp_socket->so_snd,
1459                     BBR_LOG_BBRUPD, 0,
1460                     0, &log, false, &tv);
1461         }
1462 }
1463
1464 #ifdef NETFLIX_EXP_DETECTION
1465 static void
1466 rack_log_sad(struct tcp_rack *rack, int event)
1467 {
1468         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1469                 union tcp_log_stackspecific log;
1470                 struct timeval tv;
1471
1472                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1473                 log.u_bbr.flex1 = rack->r_ctl.sack_count;
1474                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1475                 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
1476                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1477                 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
1478                 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
1479                 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
1480                 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
1481                 log.u_bbr.lt_epoch |= rack->do_detection;
1482                 log.u_bbr.applimited = tcp_map_minimum;
1483                 log.u_bbr.flex7 = rack->sack_attack_disable;
1484                 log.u_bbr.flex8 = event;
1485                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1486                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1487                 log.u_bbr.delivered = tcp_sad_decay_val;
1488                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1489                     &rack->rc_inp->inp_socket->so_rcv,
1490                     &rack->rc_inp->inp_socket->so_snd,
1491                     TCP_SAD_DETECTION, 0,
1492                     0, &log, false, &tv);
1493         }
1494 }
1495 #endif
1496
1497 static void
1498 rack_counter_destroy(void)
1499 {
1500         counter_u64_free(rack_badfr);
1501         counter_u64_free(rack_badfr_bytes);
1502         counter_u64_free(rack_rtm_prr_retran);
1503         counter_u64_free(rack_rtm_prr_newdata);
1504         counter_u64_free(rack_timestamp_mismatch);
1505         counter_u64_free(rack_reorder_seen);
1506         counter_u64_free(rack_tlp_tot);
1507         counter_u64_free(rack_tlp_newdata);
1508         counter_u64_free(rack_tlp_retran);
1509         counter_u64_free(rack_tlp_retran_bytes);
1510         counter_u64_free(rack_tlp_retran_fail);
1511         counter_u64_free(rack_to_tot);
1512         counter_u64_free(rack_to_arm_rack);
1513         counter_u64_free(rack_to_arm_tlp);
1514         counter_u64_free(rack_paced_segments);
1515         counter_u64_free(rack_unpaced_segments);
1516         counter_u64_free(rack_saw_enobuf);
1517         counter_u64_free(rack_saw_enetunreach);
1518         counter_u64_free(rack_to_alloc_hard);
1519         counter_u64_free(rack_to_alloc_emerg);
1520         counter_u64_free(rack_sack_proc_all);
1521         counter_u64_free(rack_sack_proc_short);
1522         counter_u64_free(rack_sack_proc_restart);
1523         counter_u64_free(rack_to_alloc);
1524         counter_u64_free(rack_to_alloc_limited);
1525         counter_u64_free(rack_alloc_limited_conns);
1526         counter_u64_free(rack_split_limited);
1527         counter_u64_free(rack_find_high);
1528         counter_u64_free(rack_enter_tlp_calc);
1529         counter_u64_free(rack_used_tlpmethod);
1530         counter_u64_free(rack_used_tlpmethod2);
1531         counter_u64_free(rack_progress_drops);
1532         counter_u64_free(rack_input_idle_reduces);
1533         counter_u64_free(rack_collapsed_win);
1534         counter_u64_free(rack_tlp_does_nada);
1535         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1536         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1537 }
1538
1539 static struct rack_sendmap *
1540 rack_alloc(struct tcp_rack *rack)
1541 {
1542         struct rack_sendmap *rsm;
1543
1544         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1545         if (rsm) {
1546                 rack->r_ctl.rc_num_maps_alloced++;
1547                 counter_u64_add(rack_to_alloc, 1);
1548                 return (rsm);
1549         }
1550         if (rack->rc_free_cnt) {
1551                 counter_u64_add(rack_to_alloc_emerg, 1);
1552                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1553                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
1554                 rack->rc_free_cnt--;
1555                 return (rsm);
1556         }
1557         return (NULL);
1558 }
1559
1560 static struct rack_sendmap *
1561 rack_alloc_full_limit(struct tcp_rack *rack)
1562 {
1563         if ((V_tcp_map_entries_limit > 0) &&
1564             (rack->do_detection == 0) &&
1565             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
1566                 counter_u64_add(rack_to_alloc_limited, 1);
1567                 if (!rack->alloc_limit_reported) {
1568                         rack->alloc_limit_reported = 1;
1569                         counter_u64_add(rack_alloc_limited_conns, 1);
1570                 }
1571                 return (NULL);
1572         }
1573         return (rack_alloc(rack));
1574 }
1575
1576 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1577 static struct rack_sendmap *
1578 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1579 {
1580         struct rack_sendmap *rsm;
1581
1582         if (limit_type) {
1583                 /* currently there is only one limit type */
1584                 if (V_tcp_map_split_limit > 0 &&
1585                     (rack->do_detection == 0) &&
1586                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
1587                         counter_u64_add(rack_split_limited, 1);
1588                         if (!rack->alloc_limit_reported) {
1589                                 rack->alloc_limit_reported = 1;
1590                                 counter_u64_add(rack_alloc_limited_conns, 1);
1591                         }
1592                         return (NULL);
1593                 }
1594         }
1595
1596         /* allocate and mark in the limit type, if set */
1597         rsm = rack_alloc(rack);
1598         if (rsm != NULL && limit_type) {
1599                 rsm->r_limit_type = limit_type;
1600                 rack->r_ctl.rc_num_split_allocs++;
1601         }
1602         return (rsm);
1603 }
1604
1605 static void
1606 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1607 {
1608         if (rsm->r_limit_type) {
1609                 /* currently there is only one limit type */
1610                 rack->r_ctl.rc_num_split_allocs--;
1611         }
1612         if (rack->r_ctl.rc_tlpsend == rsm)
1613                 rack->r_ctl.rc_tlpsend = NULL;
1614         if (rack->r_ctl.rc_sacklast == rsm)
1615                 rack->r_ctl.rc_sacklast = NULL;
1616         if (rack->rc_free_cnt < rack_free_cache) {
1617                 memset(rsm, 0, sizeof(struct rack_sendmap));
1618                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
1619                 rsm->r_limit_type = 0;
1620                 rack->rc_free_cnt++;
1621                 return;
1622         }
1623         rack->r_ctl.rc_num_maps_alloced--;
1624         uma_zfree(rack_zone, rsm);
1625 }
1626
1627 /*
1628  * CC wrapper hook functions
1629  */
1630 static void
1631 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1632     uint16_t type, int32_t recovery)
1633 {
1634 #ifdef STATS
1635         int32_t gput;
1636 #endif
1637
1638         INP_WLOCK_ASSERT(tp->t_inpcb);
1639         tp->ccv->nsegs = nsegs;
1640         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1641         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1642                 uint32_t max;
1643
1644                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
1645                 if (tp->ccv->bytes_this_ack > max) {
1646                         tp->ccv->bytes_this_ack = max;
1647                 }
1648         }
1649         if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
1650             (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
1651              (tp->snd_cwnd < (ctf_flight_size(tp, rack->r_ctl.rc_sacked) * 2))))
1652                 tp->ccv->flags |= CCF_CWND_LIMITED;
1653         else
1654                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1655
1656         if (type == CC_ACK) {
1657 #ifdef STATS
1658                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1659                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1660                 if ((tp->t_flags & TF_GPUTINPROG) &&
1661                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1662                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1663                             max(1, tcp_ts_getticks() - tp->gput_ts);
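                        /*
                         * Editorial note: gput above is
                         * (acked bytes << 3) / elapsed ms, i.e. roughly
                         * kbits/sec; dividing by 8 below stores it as
                         * bytes per ms in the goodput history.
                         */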
1664                         /* We store it in bytes per ms (or kbytes per sec) */
1665                         rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8;
1666                         rack->r_ctl.rc_gp_hist_idx++;
1667                         if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST)
1668                                 rack->r_ctl.rc_gp_hist_filled = 1;
1669                         rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST;
1670                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1671                             gput);
1672                         /*
1673                          * XXXLAS: This is a temporary hack, and should be
1674                          * chained off VOI_TCP_GPUT when stats(9) grows an
1675                          * API to deal with chained VOIs.
1676                          */
1677                         if (tp->t_stats_gput_prev > 0)
1678                                 stats_voi_update_abs_s32(tp->t_stats,
1679                                     VOI_TCP_GPUT_ND,
1680                                     ((gput - tp->t_stats_gput_prev) * 100) /
1681                                     tp->t_stats_gput_prev);
1682                         tp->t_flags &= ~TF_GPUTINPROG;
1683                         tp->t_stats_gput_prev = gput;
1684 #ifdef NETFLIX_PEAKRATE
1685                         if (tp->t_maxpeakrate) {
1686                                 /*
1687                                  * We update t_peakrate_thr. This gives us roughly
1688                                  * one update per round trip time.
1689                                  */
1690                                 tcp_update_peakrate_thr(tp);
1691                         }
1692 #endif
1693                 }
1694 #endif
1695                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1696                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1697                             nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
1698                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1699                                 tp->t_bytes_acked -= tp->snd_cwnd;
1700                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1701                         }
1702                 } else {
1703                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1704                         tp->t_bytes_acked = 0;
1705                 }
1706         }
1707         if (CC_ALGO(tp)->ack_received != NULL) {
1708                 /* XXXLAS: Find a way to live without this */
1709                 tp->ccv->curack = th->th_ack;
1710                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1711         }
1712 #ifdef STATS
1713         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1714 #endif
1715         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1716                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1717         }
1718         /* we enforce max peak rate if it is set. */
1719         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1720                 tp->snd_cwnd = tp->t_peakrate_thr;
1721         }
1722 }
1723
1724 static void
1725 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1726 {
1727         struct tcp_rack *rack;
1728
1729         rack = (struct tcp_rack *)tp->t_fb_ptr;
1730         INP_WLOCK_ASSERT(tp->t_inpcb);
1731         if (rack->r_ctl.rc_prr_sndcnt > 0)
1732                 rack->r_wanted_output++;
1733 }
1734
1735 static void
1736 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1737 {
1738         struct tcp_rack *rack;
1739
1740         INP_WLOCK_ASSERT(tp->t_inpcb);
1741         rack = (struct tcp_rack *)tp->t_fb_ptr;
1742         if (CC_ALGO(tp)->post_recovery != NULL) {
1743                 tp->ccv->curack = th->th_ack;
1744                 CC_ALGO(tp)->post_recovery(tp->ccv);
1745         }
1746         /*
1747          * Here we can in theory adjust cwnd to be based on the number of
1748          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1749          * based on the rack_use_proportional flag.
1750          */
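        /*
         * Worked example (editorial, hypothetical values): with
         * rc_loss_count = 3 and rc_prop_rate = 10, reduce = 30, so
         * cwnd below is cut by 30%; the reduction is capped at 50%.
         */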
1751         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1752                 int32_t reduce;
1753
1754                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1755                 if (reduce > 50) {
1756                         reduce = 50;
1757                 }
1758                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1759         } else {
1760                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1761                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1762                         tp->snd_cwnd = tp->snd_ssthresh;
1763                 }
1764         }
1765         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1766                 /* Suck the next prr cnt back into cwnd */
1767                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1768                 rack->r_ctl.rc_prr_sndcnt = 0;
1769                 rack_log_to_prr(rack, 1);
1770         }
1771         tp->snd_recover = tp->snd_una;
1772         EXIT_RECOVERY(tp->t_flags);
1773
1774
1775 }
1776
1777 static void
1778 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1779 {
1780         struct tcp_rack *rack;
1781
1782         INP_WLOCK_ASSERT(tp->t_inpcb);
1783
1784         rack = (struct tcp_rack *)tp->t_fb_ptr;
1785         switch (type) {
1786         case CC_NDUPACK:
1787                 tp->t_flags &= ~TF_WASFRECOVERY;
1788                 tp->t_flags &= ~TF_WASCRECOVERY;
1789                 if (!IN_FASTRECOVERY(tp->t_flags)) {
1790                         rack->r_ctl.rc_tlp_rtx_out = 0;
1791                         rack->r_ctl.rc_prr_delivered = 0;
1792                         rack->r_ctl.rc_prr_out = 0;
1793                         rack->r_ctl.rc_loss_count = 0;
1794                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
1795                         rack_log_to_prr(rack, 2);
1796                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1797                         tp->snd_recover = tp->snd_max;
1798                         if (tp->t_flags2 & TF2_ECN_PERMIT)
1799                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
1800                 }
1801                 break;
1802         case CC_ECN:
1803                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1804                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
1805                         tp->snd_recover = tp->snd_max;
1806                         if (tp->t_flags2 & TF2_ECN_PERMIT)
1807                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
1808                 }
1809                 break;
1810         case CC_RTO:
1811                 tp->t_dupacks = 0;
1812                 tp->t_bytes_acked = 0;
1813                 EXIT_RECOVERY(tp->t_flags);
1814                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1815                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
1816                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
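                /*
                 * Worked example (editorial, hypothetical values): if
                 * cwnd was 20 * maxseg and snd_wnd was at least that
                 * large, ssthresh becomes 10 * maxseg and cwnd restarts
                 * at a single maxseg after the RTO.
                 */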
1817                 if (tp->t_flags2 & TF2_ECN_PERMIT)
1818                         tp->t_flags2 |= TF2_ECN_SND_CWR;
1819                 break;
1820         case CC_RTO_ERR:
1821                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
1822                 /* RTO was unnecessary, so reset everything. */
1823                 tp->snd_cwnd = tp->snd_cwnd_prev;
1824                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1825                 tp->snd_recover = tp->snd_recover_prev;
1826                 if (tp->t_flags & TF_WASFRECOVERY) {
1827                         ENTER_FASTRECOVERY(tp->t_flags);
1828                         tp->t_flags &= ~TF_WASFRECOVERY;
1829                 }
1830                 if (tp->t_flags & TF_WASCRECOVERY) {
1831                         ENTER_CONGRECOVERY(tp->t_flags);
1832                         tp->t_flags &= ~TF_WASCRECOVERY;
1833                 }
1834                 tp->snd_nxt = tp->snd_max;
1835                 tp->t_badrxtwin = 0;
1836                 break;
1837         }
1838
1839         if (CC_ALGO(tp)->cong_signal != NULL) {
1840                 if (th != NULL)
1841                         tp->ccv->curack = th->th_ack;
1842                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1843         }
1844 }
1845
1846
1847
1848 static inline void
1849 rack_cc_after_idle(struct tcpcb *tp)
1850 {
1851         uint32_t i_cwnd;
1852
1853         INP_WLOCK_ASSERT(tp->t_inpcb);
1854
1855 #ifdef NETFLIX_STATS
1856         KMOD_TCPSTAT_INC(tcps_idle_restarts);
1857         if (tp->t_state == TCPS_ESTABLISHED)
1858                 KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
1859 #endif
1860         if (CC_ALGO(tp)->after_idle != NULL)
1861                 CC_ALGO(tp)->after_idle(tp->ccv);
1862
1863         if (tp->snd_cwnd == 1)
1864                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
1865         else
1866                 i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
1867
1868         /*
1869                  * Being idle is no different than the initial window. If the cc
1870                  * clamps it down below the initial window, raise it to the
1871                  * initial window.
1872          */
1873         if (tp->snd_cwnd < i_cwnd) {
1874                 tp->snd_cwnd = i_cwnd;
1875         }
1876 }
1877
1878
1879 /*
1880  * Indicate whether this ack should be delayed.  We can delay the ack if
1881  * following conditions are met:
1882  *      - There is no delayed ack timer in progress.
1883  *      - Our last ack wasn't a 0-sized window. We never want to delay
1884  *        the ack that opens up a 0-sized window.
1885  *      - LRO wasn't used for this segment. We make sure by checking that the
1886  *        segment size is not larger than the MSS.
1887  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1888  *        connection.
1889  */
1890 #define DELAY_ACK(tp, tlen)                      \
1891         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1892         ((tp->t_flags & TF_DELACK) == 0) &&      \
1893         (tlen <= tp->t_maxseg) &&                \
1894         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
1895
1896 static struct rack_sendmap *
1897 rack_find_lowest_rsm(struct tcp_rack *rack)
1898 {
1899         struct rack_sendmap *rsm;
1900
1901         /*
1902          * Walk the time-order transmitted list looking for an rsm that is
1903          * not acked. This will be the one that was sent the longest time
1904          * ago that is still outstanding.
1905          */
1906         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1907                 if (rsm->r_flags & RACK_ACKED) {
1908                         continue;
1909                 }
1910                 goto finish;
1911         }
1912 finish:
1913         return (rsm);
1914 }
1915
1916 static struct rack_sendmap *
1917 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1918 {
1919         struct rack_sendmap *prsm;
1920
1921         /*
1922          * Walk the sequence order list backward until we arrive at the
1923          * highest seq not acked. In theory, when this is called it should
1924          * be the last segment (which it was not).
1925          */
1926         counter_u64_add(rack_find_high, 1);
1927         prsm = rsm;
1928         RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
1929                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1930                         continue;
1931                 }
1932                 return (prsm);
1933         }
1934         return (NULL);
1935 }
1936
1937
1938 static uint32_t
1939 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1940 {
1941         int32_t lro;
1942         uint32_t thresh;
1943
1944         /*
1945          * lro is the flag we use to determine if we have seen reordering.
1946          * If it gets set we have seen reordering. The reorder logic works
1947          * in one of two ways:
1948          *
1949          * If reorder-fade is configured, then we track the last time we saw
1950          * re-ordering occur. If we reach the point where enough time has
1951          * passed we no longer consider reordering as occurring.
1952          *
1953          * Or if reorder-fade is 0, then once we see reordering we consider
1954          * the connection to always be subject to reordering and just set lro
1955          * to 1.
1955          * to 1.
1956          *
1957          * In the end if lro is non-zero we add the extra time for
1958          * reordering in.
1959          */
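        /*
         * Worked example (editorial, hypothetical values): with
         * srtt = 40 ms, rc_pkt_delay = 1 ms and reordering currently
         * seen with rc_reorder_shift = 3, thresh = 40 + 1 + (40 >> 3)
         * = 46 ms, before being capped at t_rxtcur and rack_rto_max
         * below.
         */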
1960         if (srtt == 0)
1961                 srtt = 1;
1962         if (rack->r_ctl.rc_reorder_ts) {
1963                 if (rack->r_ctl.rc_reorder_fade) {
1964                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1965                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1966                                 if (lro == 0) {
1967                                          * No time has passed since the last
1968                                          * reorder; mark it as reordering.
1969                                          * reorder, mark it as reordering.
1970                                          */
1971                                         lro = 1;
1972                                 }
1973                         } else {
1974                                 /* Negative time? */
1975                                 lro = 0;
1976                         }
1977                         if (lro > rack->r_ctl.rc_reorder_fade) {
1978                                 /* Turn off reordering seen too */
1979                                 rack->r_ctl.rc_reorder_ts = 0;
1980                                 lro = 0;
1981                         }
1982                 } else {
1983                         /* Reordering does not fade */
1984                         lro = 1;
1985                 }
1986         } else {
1987                 lro = 0;
1988         }
1989         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1990         if (lro) {
1991                 /* It must be set, if not you get 1/4 rtt */
1992                 if (rack->r_ctl.rc_reorder_shift)
1993                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1994                 else
1995                         thresh += (srtt >> 2);
1996         } else {
1997                 thresh += 1;
1998         }
1999         /* We don't let the rack timeout be above a RTO */
2000         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
2001                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
2002         }
2003         /* And we don't want it above the RTO max either */
2004         if (thresh > rack_rto_max) {
2005                 thresh = rack_rto_max;
2006         }
2007         return (thresh);
2008 }
2009
2010 static uint32_t
2011 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
2012                      struct rack_sendmap *rsm, uint32_t srtt)
2013 {
2014         struct rack_sendmap *prsm;
2015         uint32_t thresh, len;
2016         int maxseg;
2017
2018         if (srtt == 0)
2019                 srtt = 1;
2020         if (rack->r_ctl.rc_tlp_threshold)
2021                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
2022         else
2023                 thresh = (srtt * 2);
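        /*
         * Worked example (editorial, hypothetical values): srtt = 40 ms
         * with rc_tlp_threshold = 2 gives thresh = 40 + 40 / 2 = 60 ms;
         * with no threshold configured, thresh = 2 * srtt = 80 ms.
         */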
2024
2025         /* Get the previous sent packet, if any  */
2026         maxseg = ctf_fixed_maxseg(tp);
2027         counter_u64_add(rack_enter_tlp_calc, 1);
2028         len = rsm->r_end - rsm->r_start;
2029         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
2030                 /* Exactly like the ID */
2031                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
2032                         uint32_t alt_thresh;
2033                         /*
2034                          * Compensate for delayed-ack with the d-ack time.
2035                          */
2036                         counter_u64_add(rack_used_tlpmethod, 1);
2037                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2038                         if (alt_thresh > thresh)
2039                                 thresh = alt_thresh;
2040                 }
2041         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
2042                 /* 2.1 behavior */
2043                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
2044                 if (prsm && (len <= maxseg)) {
2045                         /*
2046                          * Two packets outstanding, thresh should be (2*srtt) +
2047                          * possible inter-packet delay (if any).
2048                          */
2049                         uint32_t inter_gap = 0;
2050                         int idx, nidx;
2051
2052                         counter_u64_add(rack_used_tlpmethod, 1);
2053                         idx = rsm->r_rtr_cnt - 1;
2054                         nidx = prsm->r_rtr_cnt - 1;
2055                         if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
2056                                 /* Yes it was sent later (or at the same time) */
2057                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
2058                         }
2059                         thresh += inter_gap;
2060                 } else  if (len <= maxseg) {
2061                         /*
2062                          * Possibly compensate for delayed-ack.
2063                          */
2064                         uint32_t alt_thresh;
2065
2066                         counter_u64_add(rack_used_tlpmethod2, 1);
2067                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2068                         if (alt_thresh > thresh)
2069                                 thresh = alt_thresh;
2070                 }
2071         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2072                 /* 2.2 behavior */
2073                 if (len <= maxseg) {
2074                         uint32_t alt_thresh;
2075                         /*
2076                          * Compensate for delayed-ack with the d-ack time.
2077                          */
2078                         counter_u64_add(rack_used_tlpmethod, 1);
2079                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2080                         if (alt_thresh > thresh)
2081                                 thresh = alt_thresh;
2082                 }
2083         }
2084         /* Not above an RTO */
2085         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2086                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2087         }
2088         /* Not above a RTO max */
2089         if (thresh > rack_rto_max) {
2090                 thresh = rack_rto_max;
2091         }
2092         /* Apply user supplied min TLP */
2093         if (thresh < rack_tlp_min) {
2094                 thresh = rack_tlp_min;
2095         }
2096         return (thresh);
2097 }
2098
2099 static uint32_t
2100 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
2101 {
2102         /*
2103          * last rtt we measured. However, if that
2104          * does not exist we fall back to the srtt (which
2105          * does not exist we fallback to the srtt (which
2106          * we probably will never do) and then as a last
2107          * resort we use RACK_INITIAL_RTO if no srtt is
2108          * yet set.
2109          */
2110         if (rack->rc_rack_rtt)
2111                 return(rack->rc_rack_rtt);
2112         else if (tp->t_srtt == 0)
2113                 return(RACK_INITIAL_RTO);
2114         return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
2115 }
2116
2117 static struct rack_sendmap *
2118 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2119 {
2120         /*
2121          * Check to see whether we need to fall into recovery. We will
2122          * need to do so if our oldest transmit is past the time we should
2123          * have had an ack.
2124          */
2125         struct tcp_rack *rack;
2126         struct rack_sendmap *rsm;
2127         int32_t idx;
2128         uint32_t srtt, thresh;
2129
2130         rack = (struct tcp_rack *)tp->t_fb_ptr;
2131         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
2132                 return (NULL);
2133         }
2134         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2135         if (rsm == NULL)
2136                 return (NULL);
2137
2138         if (rsm->r_flags & RACK_ACKED) {
2139                 rsm = rack_find_lowest_rsm(rack);
2140                 if (rsm == NULL)
2141                         return (NULL);
2142         }
2143         idx = rsm->r_rtr_cnt - 1;
2144         srtt = rack_grab_rtt(tp, rack);
2145         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2146         if (tsused < rsm->r_tim_lastsent[idx]) {
2147                 return (NULL);
2148         }
2149         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2150                 return (NULL);
2151         }
2152         /* Ok if we reach here we are over-due */
2153         rack->r_ctl.rc_rsm_start = rsm->r_start;
2154         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2155         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2156         rack_cong_signal(tp, NULL, CC_NDUPACK);
2157         return (rsm);
2158 }
2159
2160 static uint32_t
2161 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2162 {
2163         int32_t t;
2164         int32_t tt;
2165         uint32_t ret_val;
2166
2167         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2168         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2169             rack_persist_min, rack_persist_max);
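        /*
         * Worked example (editorial, hypothetical values): with a base
         * t of 100 ms and tcp_backoff[tp->t_rxtshift] == 4, tt becomes
         * 400 ms before being clamped to [rack_persist_min,
         * rack_persist_max].
         */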
2170         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2171                 tp->t_rxtshift++;
2172         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2173         ret_val = (uint32_t)tt;
2174         return (ret_val);
2175 }
2176
2177 static uint32_t
2178 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
2179 {
2180         /*
2181          * Start the FR timer; we do this based on getting the first one in
2182          * the rc_tmap. Note that if it is NULL we must stop the timer. In all
2183          * events we need to stop the running timer (if it is running) before
2184          * starting the new one.
2185          */
2186         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
2187         uint32_t srtt_cur;
2188         int32_t idx;
2189         int32_t is_tlp_timer = 0;
2190         struct rack_sendmap *rsm;
2191
2192         if (rack->t_timers_stopped) {
2193                 /* All timers have been stopped none are to run */
2194                 return (0);
2195         }
2196         if (rack->rc_in_persist) {
2197                 /* We can't start any timer in persists */
2198                 return (rack_get_persists_timer_val(tp, rack));
2199         }
2200         if ((tp->t_state < TCPS_ESTABLISHED) ||
2201             ((tp->t_flags & TF_SACK_PERMIT) == 0))
2202                 goto activate_rxt;
2203         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2204         if ((rsm == NULL) || sup_rack) {
2205                 /* Nothing on the send map */
2206 activate_rxt:
2207                 time_since_sent = 0;
2208                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2209                 if (rsm) {
2210                         idx = rsm->r_rtr_cnt - 1;
2211                         if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
2212                                 tstmp_touse = rsm->r_tim_lastsent[idx];
2213                         else
2214                                 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2215                         if (TSTMP_GT(cts, tstmp_touse))
2216                             time_since_sent = cts - tstmp_touse;
2217                 }
2218                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2219                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2220                         to = TICKS_2_MSEC(tp->t_rxtcur);
2221                         if (to > time_since_sent)
2222                                 to -= time_since_sent;
2223                         else
2224                                 to = rack->r_ctl.rc_min_to;
2225                         if (to == 0)
2226                                 to = 1;
2227                         return (to);
2228                 }
2229                 return (0);
2230         }
2231         if (rsm->r_flags & RACK_ACKED) {
2232                 rsm = rack_find_lowest_rsm(rack);
2233                 if (rsm == NULL) {
2234                         /* No lowest? */
2235                         goto activate_rxt;
2236                 }
2237         }
2238         if (rack->sack_attack_disable) {
2239                 /*
2240                  * We don't want to do
2241                  * any TLP's if you are an attacker.
2242                  * Though if you are doing what
2243                  * is expected you may still have
2244                  * SACK-PASSED marks.
2245                  */
2246                 goto activate_rxt;
2247         }
2248         /* Convert from ms to usecs */
2249         if (rsm->r_flags & RACK_SACK_PASSED) {
2250                 if ((tp->t_flags & TF_SENTFIN) &&
2251                     ((tp->snd_max - tp->snd_una) == 1) &&
2252                     (rsm->r_flags & RACK_HAS_FIN)) {
2253                         /*
2254                          * We don't start a rack timer if all we have is a
2255                          * FIN outstanding.
2256                          */
2257                         goto activate_rxt;
2258                 }
2259                 if ((rack->use_rack_cheat == 0) &&
2260                     (IN_RECOVERY(tp->t_flags)) &&
2261                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
2262                         /*
2263                          * We are not cheating, we are in recovery and
2264                          * do not yet have enough acks to get our next
2265                          * retransmission out.
2266                          *
2267                          * Note that classified attackers do not
2268                          * get to use the rack-cheat.
2269                          */
2270                         goto activate_tlp;
2271                 }
2272                 srtt = rack_grab_rtt(tp, rack);
2273                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2274                 idx = rsm->r_rtr_cnt - 1;
2275                 exp = rsm->r_tim_lastsent[idx] + thresh;
2276                 if (SEQ_GEQ(exp, cts)) {
2277                         to = exp - cts;
2278                         if (to < rack->r_ctl.rc_min_to) {
2279                                 to = rack->r_ctl.rc_min_to;
2280                         }
2281                 } else {
2282                         to = rack->r_ctl.rc_min_to;
2283                 }
2284         } else {
2285                 /* Ok we need to do a TLP not RACK */
2286 activate_tlp:
2287                 if ((rack->rc_tlp_in_progress != 0) ||
2288                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2289                         /*
2290                          * The previous send was a TLP or a tlp_rtx is in
2291                          * process.
2292                          */
2293                         goto activate_rxt;
2294                 }
2295                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2296                 if (rsm == NULL) {
2297                         /* We found no rsm to TLP with. */
2298                         goto activate_rxt;
2299                 }
2300                 if (rsm->r_flags & RACK_HAS_FIN) {
2301                         /* If it's a FIN we don't do TLP */
2302                         rsm = NULL;
2303                         goto activate_rxt;
2304                 }
2305                 idx = rsm->r_rtr_cnt - 1;
2306                 time_since_sent = 0;
2307                 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time))
2308                         tstmp_touse = rsm->r_tim_lastsent[idx];
2309                 else
2310                         tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2311                 if (TSTMP_GT(cts, tstmp_touse))
2312                     time_since_sent = cts - tstmp_touse;
2313                 is_tlp_timer = 1;
2314                 if (tp->t_srtt) {
2315                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2316                         srtt = TICKS_2_MSEC(srtt_cur);
2317                 } else
2318                         srtt = RACK_INITIAL_RTO;
2319                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2320                 if (thresh > time_since_sent)
2321                         to = thresh - time_since_sent;
2322                 else
2323                         to = rack->r_ctl.rc_min_to;
2324                 if (to > TCPTV_REXMTMAX) {
2325                         /*
2326                          * If the TLP time works out to be larger than the max
2327                          * RTO, let's not do TLP; just do an RTO.
2328                          */
2329                         goto activate_rxt;
2330                 }
2331                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2332                         /*
2333                          * The tail is no longer the last one I did a probe
2334                          * on
2335                          */
2336                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2337                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2338                 }
2339         }
2340         if (is_tlp_timer == 0) {
2341                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2342         } else {
2343                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2344                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2345                         /*
2346                          * We have exceeded how many times we can retransmit the
2347                          * current TLP, so switch to the RTO timer.
2348                          */
2349                         goto activate_rxt;
2350                 } else {
2351                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2352                 }
2353         }
2354         if (to == 0)
2355                 to = 1;
2356         return (to);
2357 }
2358
2359 static void
2360 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2361 {
2362         if (rack->rc_in_persist == 0) {
2363                 rack->r_ctl.rc_went_idle_time = cts;
2364                 rack_timer_cancel(tp, rack, cts, __LINE__);
2365                 tp->t_rxtshift = 0;
2366                 rack->rc_in_persist = 1;
2367         }
2368 }
2369
2370 static void
2371 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2372 {
2373         if (rack->rc_inp->inp_in_hpts)  {
2374                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2375                 rack->r_ctl.rc_hpts_flags  = 0;
2376         }
2377         rack->rc_in_persist = 0;
2378         rack->r_ctl.rc_went_idle_time = 0;
2379         tp->t_flags &= ~TF_FORCEDATA;
2380         tp->t_rxtshift = 0;
2381 }
2382
2383 static void
2384 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
2385       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
2386 {
2387         struct inpcb *inp;
2388         uint32_t delayed_ack = 0;
2389         uint32_t hpts_timeout;
2390         uint8_t stopped;
2391         uint32_t left = 0;
2392
2393         inp = tp->t_inpcb;
2394         if (inp->inp_in_hpts) {
2395                 /* A previous call is already set up */
2396                 return;
2397         }
2398         if ((tp->t_state == TCPS_CLOSED) ||
2399             (tp->t_state == TCPS_LISTEN)) {
2400                 return;
2401         }
2402         stopped = rack->rc_tmr_stopped;
2403         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2404                 left = rack->r_ctl.rc_timer_exp - cts;
2405         }
2406         rack->tlp_timer_up = 0;
2407         rack->r_ctl.rc_timer_exp = 0;
2408         if (rack->rc_inp->inp_in_hpts == 0) {
2409                 rack->r_ctl.rc_hpts_flags = 0;
2410         }
2411         if (slot) {
2412                 /* We are hptsi too */
2413                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2414         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2415                 /*
2416                  * We are still left on the hpts; when the t-o goes off
2417                  * it will be for output.
2418                  */
2419                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
2420                         slot = rack->r_ctl.rc_last_output_to - cts;
2421                 else
2422                         slot = 1;
2423         }
2424         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
2425 #ifdef NETFLIX_EXP_DETECTION
2426         if (rack->sack_attack_disable &&
2427             (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) {
2428                 /*
2429                  * We have a potential attacker on
2430                  * the line. We have possibly some
2431                  * (or no) pacing time set. We want to
2432                  * slow down the processing of sacks by some
2433                  * amount (if it is an attacker). Set the default
2434                  * slot for attackers in place (unless the original
2435                  * interval is longer). It is stored in
2436                  * micro-seconds, so let's convert to msecs.
2437                  */
2438                 slot = USEC_TO_MSEC(tcp_sad_pacing_interval);
2439         }
2440 #endif
2441         if (tp->t_flags & TF_DELACK) {
2442                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2443                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2444         }
2445         if (delayed_ack && ((hpts_timeout == 0) ||
2446                             (delayed_ack < hpts_timeout)))
2447                 hpts_timeout = delayed_ack;
2448         else
2449                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
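        /*
         * Editorial note: at this point hpts_timeout holds whichever
         * expires first of the timer value computed by
         * rack_timer_start() and any pending delayed-ack time.
         */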
2450         /*
2451          * If no timers are going to run and we will fall off the hptsi
2452          * wheel, we resort to a keep-alive timer if it is configured.
2453          */
2454         if ((hpts_timeout == 0) &&
2455             (slot == 0)) {
2456                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2457                     (tp->t_state <= TCPS_CLOSING)) {
2458                         /*
2459                          * Ok we have no timer (persists, rack, tlp, rxt  or
2460                          * del-ack), we don't have segments being paced. So
2461                          * all that is left is the keepalive timer.
2462                          */
2463                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2464                                 /* Get the established keep-alive time */
2465                                 hpts_timeout = TP_KEEPIDLE(tp);
2466                         } else {
2467                                 /* Get the initial setup keep-alive time */
2468                                 hpts_timeout = TP_KEEPINIT(tp);
2469                         }
2470                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2471                 }
2472         }
2473         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2474             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2475                 /*
2476                  * RACK, TLP, persists and RXT timers all are restartable
2477                  * based on input actions, i.e. we received a packet (ack
2478                  * or sack) and that changes things (rw, or snd_una etc.).
2479                  * Thus we can restart them with a new value. For
2480                  * keep-alive, delayed_ack we keep track of what was left
2481                  * and restart the timer with a smaller value.
2482                  */
2483                 if (left < hpts_timeout)
2484                         hpts_timeout = left;
2485         }
2486         if (hpts_timeout) {
2487                 /*
2488                  * Hack alert, for now we can't time-out over 2,147,483
2489                  * seconds (a bit more than 596 hours), which is probably ok
2490                  * :).
2491                  */
2492                 if (hpts_timeout > 0x7ffffffe)
2493                         hpts_timeout = 0x7ffffffe;
2494                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2495         }
2496         if (slot) {
2497                 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2498                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)
2499                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2500                 else
2501                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2502                 rack->r_ctl.rc_last_output_to = cts + slot;
2503                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2504                         if (rack->rc_inp->inp_in_hpts == 0)
2505                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2506                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2507                 } else {
2508                         /*
2509                          * Arrange for the hpts to kick back in after the
2510                          * t-o if the t-o does not cause a send.
2511                          */
2512                         if (rack->rc_inp->inp_in_hpts == 0)
2513                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2514                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2515                 }
2516         } else if (hpts_timeout) {
2517                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)  {
2518                         /* For a rack timer, don't wake us */
2519                         rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2520                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2521                 } else {
2522                         /* All other timers wake us up */
2523                         rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
2524                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2525                 }
2526                 if (rack->rc_inp->inp_in_hpts == 0)
2527                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2528                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2529         } else {
2530                 /* No timer starting */
2531 #ifdef INVARIANTS
2532                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2533                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2534                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2535                 }
2536 #endif
2537         }
2538         rack->rc_tmr_stopped = 0;
2539         if (slot)
2540                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2541 }
2542
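/*
 * Illustrative sketch of the slot/timeout interplay in
 * rack_start_hpts_timer() above (made-up numbers, not compiled): with
 * a pacing delay of slot = 10 ms and a pending delayed ack giving
 * hpts_timeout = 40 ms, the inp is inserted into the hpts wheel for
 * the earlier of the two wake-ups:
 *
 *      rack->r_ctl.rc_timer_exp = cts + 40;
 *      tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(10));
 *
 * If the numbers were reversed (slot = 40, hpts_timeout = 10), the
 * insert would use HPTS_MS_TO_SLOTS(10) for the timeout instead, and
 * the hpts is re-armed for output afterwards if the timeout does not
 * itself cause a send.
 */
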
2543 /*
2544  * RACK Timer, here we simply do logging and housekeeping.
2545  * The normal rack_output() function will call the
2546  * appropriate thing to check if we need to do a RACK retransmit.
2547  * We return 1, saying don't proceed with rack_output, only
2548  * when all timers have been stopped (destroyed PCB?).
2549  */
2550 static int
2551 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2552 {
2553         /*
2554          * This timer simply provides an internal trigger to send out data.
2555          * The check_recovery_mode call will see if there are needed
2556          * retransmissions, if so we will enter fast-recovery. The output
2557          * call may or may not do the same thing depending on sysctl
2558          * settings.
2559          */
2560         struct rack_sendmap *rsm;
2561         int32_t recovery, ll;
2562
2563         if (tp->t_timers->tt_flags & TT_STOPPED) {
2564                 return (1);
2565         }
2566         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2567                 /* It's not time yet */
2568                 return (0);
2569         }
2570         recovery = IN_RECOVERY(tp->t_flags);
2571         counter_u64_add(rack_to_tot, 1);
2572         if (rack->r_state && (rack->r_state != tp->t_state))
2573                 rack_set_state(tp, rack);
2574         rsm = rack_check_recovery_mode(tp, cts);
2575         if (rsm)
2576                 ll = rsm->r_end - rsm->r_start;
2577         else
2578                 ll = 0;
2579         rack_log_to_event(rack, RACK_TO_FRM_RACK, ll);
2580         if (rsm) {
2581                 uint32_t rtt;
2582
2583                 rtt = rack->rc_rack_rtt;
2584                 if (rtt == 0)
2585                         rtt = 1;
2586                 if ((recovery == 0) &&
2587                     (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
2588                         /*
2589                  * The rack-timeout that enters us into recovery
2590                          * will force out one MSS and set us up so that we
2591                          * can do one more send in 2*rtt (transitioning the
2592                          * rack timeout into a rack-tlp).
2593                          */
2594                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2595                         rack_log_to_prr(rack, 3);
2596                 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
2597                            rack->use_rack_cheat) {
2598                         /*
2599                          * When a rack timer goes, if the rack cheat is
2600                          * on, arrange it so we can send a full segment.
2601                          */
2602                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2603                         rack_log_to_prr(rack, 4);
2604                 }
2605         } else {
2606                 /* This is a case that should happen rarely if ever */
2607                 counter_u64_add(rack_tlp_does_nada, 1);
2608 #ifdef TCP_BLACKBOX
2609                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2610 #endif
2611                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2612         }
2613         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2614         return (0);
2615 }
2616
2617 static __inline void
2618 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
2619                struct rack_sendmap *rsm, uint32_t start)
2620 {
2621         int idx;
2622
2623         nrsm->r_start = start;
2624         nrsm->r_end = rsm->r_end;
2625         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2626         nrsm->r_flags = rsm->r_flags;
2627         nrsm->r_dupack = rsm->r_dupack;
2628         nrsm->r_rtr_bytes = 0;
2629         rsm->r_end = nrsm->r_start;
2630         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2631                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2632         }
2633 }
2634
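/*
 * A small worked example of the clone/split above (hypothetical
 * sequence numbers): say rsm tracks [1000, 2448) and we split at
 * start = 1724.
 *
 *      rack_clone_rsm(rack, nrsm, rsm, 1724);
 *      // rsm now covers [1000, 1724) and nrsm covers [1724, 2448);
 *      // nrsm inherits r_flags, r_dupack, r_rtr_cnt and the
 *      // per-retransmit send times, while its r_rtr_bytes starts
 *      // over at 0.
 */
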
2635 static struct rack_sendmap *
2636 rack_merge_rsm(struct tcp_rack *rack,
2637                struct rack_sendmap *l_rsm,
2638                struct rack_sendmap *r_rsm)
2639 {
2640         /*
2641          * We are merging two ack'd RSM's,
2642          * the l_rsm is on the left (lower seq
2643          * values) and the r_rsm is on the right
2644          * (higher seq value). The simplest way
2645          * to merge these is to move the right
2646          * one into the left. I don't think there
2647          * is any reason we need to try to find
2648          * the oldest (or last oldest retransmitted).
2649          */
2650         struct rack_sendmap *rm;
2651
2652         l_rsm->r_end = r_rsm->r_end;
2653         if (l_rsm->r_dupack < r_rsm->r_dupack)
2654                 l_rsm->r_dupack = r_rsm->r_dupack;
2655         if (r_rsm->r_rtr_bytes)
2656                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
2657         if (r_rsm->r_in_tmap) {
2658                 /* This really should not happen */
2659                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
2660                 r_rsm->r_in_tmap = 0;
2661         }
2662         /* Now the flags */
2663         if (r_rsm->r_flags & RACK_HAS_FIN)
2664                 l_rsm->r_flags |= RACK_HAS_FIN;
2665         if (r_rsm->r_flags & RACK_TLP)
2666                 l_rsm->r_flags |= RACK_TLP;
2667         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
2668                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
2669         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
2670 #ifdef INVARIANTS
2671         if (rm != r_rsm) {
2672                 panic("removing head in rack:%p rsm:%p rm:%p",
2673                       rack, r_rsm, rm);
2674         }
2675 #endif
2676         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
2677                 /* Transfer the split limit to the map we free */
2678                 r_rsm->r_limit_type = l_rsm->r_limit_type;
2679                 l_rsm->r_limit_type = 0;
2680         }
2681         rack_free(rack, r_rsm);
2682         return(l_rsm);
2683 }
2684
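/*
 * Merging is the inverse of the clone above (again with made-up
 * numbers): with l_rsm covering [1000, 1724) and r_rsm covering
 * [1724, 2448), both already ACKed,
 *
 *      rsm = rack_merge_rsm(rack, l_rsm, r_rsm);
 *      // rsm == l_rsm and now covers [1000, 2448); r_rsm has been
 *      // removed from the RB tree and freed, after its dupack count,
 *      // retransmitted byte count and FIN/TLP/collapsed flags were
 *      // folded into l_rsm.
 */
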
2685 /*
2686  * TLP Timer, here we simply set up what segment we want to
2687  * have the TLP expire on, the normal rack_output() will then
2688  * send it out.
2689  *
2690  * We return 1, saying don't proceed with rack_output only
2691  * when all timers have been stopped (destroyed PCB?).
2692  */
2693 static int
2694 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2695 {
2696         /*
2697          * Tail Loss Probe.
2698          */
2699         struct rack_sendmap *rsm = NULL;
2700         struct rack_sendmap *insret;
2701         struct socket *so;
2702         uint32_t amm, old_prr_snd = 0;
2703         uint32_t out, avail;
2704         int collapsed_win = 0;
2705
2706         if (tp->t_timers->tt_flags & TT_STOPPED) {
2707                 return (1);
2708         }
2709         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2710                 /* It's not time yet */
2711                 return (0);
2712         }
2713         if (rack_progress_timeout_check(tp)) {
2714                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2715                 return (1);
2716         }
2717         /*
2718          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2719          * need to figure out how to force a full MSS segment out.
2720          */
2721         rack_log_to_event(rack, RACK_TO_FRM_TLP, 0);
2722         counter_u64_add(rack_tlp_tot, 1);
2723         if (rack->r_state && (rack->r_state != tp->t_state))
2724                 rack_set_state(tp, rack);
2725         so = tp->t_inpcb->inp_socket;
2726 #ifdef KERN_TLS
2727         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
2728                 /*
2729                  * For hardware TLS we do *not* want to send
2730                  * new data, lets instead just do a retransmission.
2731                  */
2732                 goto need_retran;
2733         }
2734 #endif
2735         avail = sbavail(&so->so_snd);
2736         out = tp->snd_max - tp->snd_una;
2737         rack->tlp_timer_up = 1;
2738         if (out > tp->snd_wnd) {
2739                 /* special case, we need a retransmission */
2740                 collapsed_win = 1;
2741                 goto need_retran;
2742         }
2743         /*
2744          * If we are in recovery we can jazz out a segment if new data is
2745          * present simply by setting rc_prr_sndcnt to a segment.
2746          */
2747         if ((avail > out) &&
2748             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2749                 /* New data is available */
2750                 amm = avail - out;
2751                 if (amm > ctf_fixed_maxseg(tp)) {
2752                         amm = ctf_fixed_maxseg(tp);
2753                 } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) {
2754                         /* not enough to fill an MTU and no-delay is off */
2755                         goto need_retran;
2756                 }
2757                 if (IN_RECOVERY(tp->t_flags)) {
2758                         /* Unlikely */
2759                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2760                         if (out + amm <= tp->snd_wnd) {
2761                                 rack->r_ctl.rc_prr_sndcnt = amm;
2762                                 rack_log_to_prr(rack, 4);
2763                         } else
2764                                 goto need_retran;
2765                 } else {
2766                         /* Set the send-new override */
2767                         if (out + amm <= tp->snd_wnd)
2768                                 rack->r_ctl.rc_tlp_new_data = amm;
2769                         else
2770                                 goto need_retran;
2771                 }
2772                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2773                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2774                 rack->r_ctl.rc_tlpsend = NULL;
2775                 counter_u64_add(rack_tlp_newdata, 1);
2776                 goto send;
2777         }
2778 need_retran:
2779         /*
2780          * Ok we need to arrange the last un-acked segment to be re-sent, or
2781          * optionally the first un-acked segment.
2782          */
2783         if (collapsed_win == 0) {
2784                 if (rack_always_send_oldest)
2785                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2786                 else {
2787                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2788                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2789                                 rsm = rack_find_high_nonack(rack, rsm);
2790                         }
2791                 }
2792                 if (rsm == NULL) {
2793                         counter_u64_add(rack_tlp_does_nada, 1);
2794 #ifdef TCP_BLACKBOX
2795                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2796 #endif
2797                         goto out;
2798                 }
2799         } else {
2800                 /*
2801                  * We must find the last segment
2802                  * that was acceptable to the client.
2803                  */
2804                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
2805                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
2806                                 /* Found one */
2807                                 break;
2808                         }
2809                 }
2810                 if (rsm == NULL) {
2811                         /* None? if so send the first */
2812                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2813                         if (rsm == NULL) {
2814                                 counter_u64_add(rack_tlp_does_nada, 1);
2815 #ifdef TCP_BLACKBOX
2816                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2817 #endif
2818                                 goto out;
2819                         }
2820                 }
2821         }
2822         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
2823                 /*
2824                  * We need to split this last segment in two.
2825                  */
2826                 struct rack_sendmap *nrsm;
2827
2828
2829                 nrsm = rack_alloc_full_limit(rack);
2830                 if (nrsm == NULL) {
2831                         /*
2832                          * No memory to split, we will just exit and punt
2833                          * off to the RXT timer.
2834                          */
2835                         counter_u64_add(rack_tlp_does_nada, 1);
2836                         goto out;
2837                 }
2838                 rack_clone_rsm(rack, nrsm, rsm,
2839                                (rsm->r_end - ctf_fixed_maxseg(tp)));
2840                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
2841 #ifdef INVARIANTS
2842                 if (insret != NULL) {
2843                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
2844                               nrsm, insret, rack, rsm);
2845                 }
2846 #endif
2847                 if (rsm->r_in_tmap) {
2848                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2849                         nrsm->r_in_tmap = 1;
2850                 }
2851                 rsm->r_flags &= (~RACK_HAS_FIN);
2852                 rsm = nrsm;
2853         }
2854         rack->r_ctl.rc_tlpsend = rsm;
2855         rack->r_ctl.rc_tlp_rtx_out = 1;
2856         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2857                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2858                 tp->t_rxtshift++;
2859         } else {
2860                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2861                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2862         }
2863 send:
2864         rack->r_ctl.rc_tlp_send_cnt++;
2865         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2866                 /*
2867                  * Can't [re]transmit a segment we have sent the maximum
2868                  * number of times without hearing from the peer. We need
2869                  * the retransmit timer to take over.
2870                  */
2871         restore:
2872                 rack->r_ctl.rc_tlpsend = NULL;
2873                 if (rsm)
2874                         rsm->r_flags &= ~RACK_TLP;
2875                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2876                 rack_log_to_prr(rack, 5);
2877                 counter_u64_add(rack_tlp_retran_fail, 1);
2878                 goto out;
2879         } else if (rsm) {
2880                 rsm->r_flags |= RACK_TLP;
2881         }
2882         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2883             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2884                 /*
2885                  * We don't want to send a single segment more than the max
2886                  * either.
2887                  */
2888                 goto restore;
2889         }
2890         rack->r_timer_override = 1;
2891         rack->r_tlp_running = 1;
2892         rack->rc_tlp_in_progress = 1;
2893         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2894         return (0);
2895 out:
2896         rack->tlp_timer_up = 0;
2897         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2898         return (0);
2899 }
2900
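/*
 * Rough sketch of the new-data TLP sizing above (numbers are
 * illustrative only): with avail = 20000 bytes in the socket buffer,
 * out = 18000 bytes outstanding and a 1448 byte fixed maxseg,
 *
 *      amm = avail - out;                      // 2000
 *      if (amm > ctf_fixed_maxseg(tp))
 *              amm = ctf_fixed_maxseg(tp);     // clamp to 1448
 *
 * so rc_tlp_new_data (or rc_prr_sndcnt when in recovery) is set to
 * one segment, provided out + amm still fits inside snd_wnd;
 * otherwise we fall back to retransmitting the last (or oldest)
 * unacked segment via the need_retran path.
 */
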
2901 /*
2902  * Delayed ack Timer, here we simply need to set up the
2903  * ACK_NOW flag and remove the DELACK flag. From there
2904  * the output routine will send the ack out.
2905  *
2906  * We only return 1, saying don't proceed, if all timers
2907  * are stopped (destroyed PCB?).
2908  */
2909 static int
2910 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2911 {
2912         if (tp->t_timers->tt_flags & TT_STOPPED) {
2913                 return (1);
2914         }
2915         rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0);
2916         tp->t_flags &= ~TF_DELACK;
2917         tp->t_flags |= TF_ACKNOW;
2918         KMOD_TCPSTAT_INC(tcps_delack);
2919         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2920         return (0);
2921 }
2922
2923 /*
2924  * Persists timer, here we simply need to set up the
2925  * FORCE-DATA flag; the output routine will then send
2926  * the one byte probe.
2927  *
2928  * We only return 1, saying don't proceed, if all timers
2929  * are stopped (destroyed PCB?).
2930  */
2931 static int
2932 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2933 {
2934         struct tcptemp *t_template;
2935         struct inpcb *inp;
2936         int32_t retval = 1;
2937
2938         inp = tp->t_inpcb;
2939
2940         if (tp->t_timers->tt_flags & TT_STOPPED) {
2941                 return (1);
2942         }
2943         if (rack->rc_in_persist == 0)
2944                 return (0);
2945         if (rack_progress_timeout_check(tp)) {
2946                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2947                 return (1);
2948         }
2949         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2950         /*
2951          * Persistence timer into zero window. Force a byte to be output, if
2952          * possible.
2953          */
2954         KMOD_TCPSTAT_INC(tcps_persisttimeo);
2955         /*
2956          * Hack: if the peer is dead/unreachable, we do not time out if the
2957          * window is closed.  After a full backoff, drop the connection if
2958          * the idle time (no responses to probes) reaches the maximum
2959          * backoff that we would use if retransmitting.
2960          */
2961         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2962             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2963             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2964                 KMOD_TCPSTAT_INC(tcps_persistdrop);
2965                 retval = 1;
2966                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2967                 goto out;
2968         }
2969         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2970             tp->snd_una == tp->snd_max)
2971                 rack_exit_persist(tp, rack);
2972         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2973         /*
2974          * If the user has closed the socket then drop a persisting
2975          * connection after a much reduced timeout.
2976          */
2977         if (tp->t_state > TCPS_CLOSE_WAIT &&
2978             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2979                 retval = 1;
2980                 KMOD_TCPSTAT_INC(tcps_persistdrop);
2981                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2982                 goto out;
2983         }
2984         t_template = tcpip_maketemplate(rack->rc_inp);
2985         if (t_template) {
2986                 tcp_respond(tp, t_template->tt_ipgen,
2987                             &t_template->tt_t, (struct mbuf *)NULL,
2988                             tp->rcv_nxt, tp->snd_una - 1, 0);
2989                 /* This sends an ack */
2990                 if (tp->t_flags & TF_DELACK)
2991                         tp->t_flags &= ~TF_DELACK;
2992                 free(t_template, M_TEMP);
2993         }
2994         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2995                 tp->t_rxtshift++;
2996 out:
2997         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0);
2998         rack_start_hpts_timer(rack, tp, cts,
2999                               0, 0, 0);
3000         return (retval);
3001 }
3002
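/*
 * The persist drop test above can be read as a sketch (conditions
 * restated, no new values): once t_rxtshift has reached
 * TCP_MAXRXTSHIFT, the probing connection is dropped when
 *
 *      ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 *      ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff
 *
 * i.e. the peer has not answered a window probe for roughly as long
 * as a fully backed-off retransmit sequence would take. A separate,
 * shorter limit (TCPTV_PERSMAX) applies once the local socket has
 * been closed.
 */
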
3003 /*
3004  * If a keepalive goes off, we had no other timers
3005  * happening. We always return 1 here since this
3006  * routine either drops the connection or sends
3007  * out a segment to elicit a response.
3008  */
3009 static int
3010 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3011 {
3012         struct tcptemp *t_template;
3013         struct inpcb *inp;
3014
3015         if (tp->t_timers->tt_flags & TT_STOPPED) {
3016                 return (1);
3017         }
3018         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
3019         inp = tp->t_inpcb;
3020         rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0);
3021         /*
3022          * Keep-alive timer went off; send something or drop connection if
3023          * idle for too long.
3024          */
3025         KMOD_TCPSTAT_INC(tcps_keeptimeo);
3026         if (tp->t_state < TCPS_ESTABLISHED)
3027                 goto dropit;
3028         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
3029             tp->t_state <= TCPS_CLOSING) {
3030                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
3031                         goto dropit;
3032                 /*
3033                  * Send a packet designed to force a response if the peer is
3034                  * up and reachable: either an ACK if the connection is
3035                  * still alive, or an RST if the peer has closed the
3036                  * connection due to timeout or reboot. Using sequence
3037                  * number tp->snd_una-1 causes the transmitted zero-length
3038                  * segment to lie outside the receive window; by the
3039                  * protocol spec, this requires the correspondent TCP to
3040                  * respond.
3041                  */
3042                 KMOD_TCPSTAT_INC(tcps_keepprobe);
3043                 t_template = tcpip_maketemplate(inp);
3044                 if (t_template) {
3045                         tcp_respond(tp, t_template->tt_ipgen,
3046                             &t_template->tt_t, (struct mbuf *)NULL,
3047                             tp->rcv_nxt, tp->snd_una - 1, 0);
3048                         free(t_template, M_TEMP);
3049                 }
3050         }
3051         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
3052         return (1);
3053 dropit:
3054         KMOD_TCPSTAT_INC(tcps_keepdrops);
3055         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
3056         return (1);
3057 }
3058
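/*
 * Keep-alive arithmetic sketch (illustrative, assuming the stock
 * defaults of a 2 hour TP_KEEPIDLE and TP_MAXIDLE of
 * TP_KEEPCNT(tp) * TP_KEEPINTVL(tp), i.e. 8 probes 75 seconds apart):
 * the connection is dropped once
 *
 *      ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)
 *
 * which with those defaults is two hours of idle time plus ten
 * minutes of unanswered probes; until then each expiry sends a
 * snd_una - 1 probe via tcp_respond() to force an ACK or RST.
 */
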
3059 /*
3060  * Retransmit helper function, clears up all the ack
3061  * flags and takes care of important bookkeeping.
3062  */
3063 static void
3064 rack_remxt_tmr(struct tcpcb *tp)
3065 {
3066         /*
3067          * The retransmit timer went off, all sack'd blocks must be
3068          * un-acked.
3069          */
3070         struct rack_sendmap *rsm, *trsm = NULL;
3071         struct tcp_rack *rack;
3072         int32_t cnt = 0;
3073
3074         rack = (struct tcp_rack *)tp->t_fb_ptr;
3075         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
3076         rack_log_to_event(rack, RACK_TO_FRM_TMR, 0);
3077         if (rack->r_state && (rack->r_state != tp->t_state))
3078                 rack_set_state(tp, rack);
3079         /*
3080          * Ideally we would like to be able to
3081          * mark SACK-PASS on anything not acked here.
3082          * However, if we do that we would burst out
3083          * all that data 1ms apart. This would be unwise,
3084          * so for now we will just let the normal rxt timer
3085          * and tlp timer take care of it.
3086          */
3087         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3088                 if (rsm->r_flags & RACK_ACKED) {
3089                         cnt++;
3090                         rsm->r_dupack = 0;
3091                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3092                         if (rsm->r_in_tmap == 0) {
3093                                 /* We must re-add it back to the tlist */
3094                                 if (trsm == NULL) {
3095                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3096                                 } else {
3097                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
3098                                 }
3099                                 rsm->r_in_tmap = 1;
3100                         }
3101                 }
3102                 trsm = rsm;
3103                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
3104         }
3105         /* Clear the count (we just un-acked them) */
3106         rack->r_ctl.rc_sacked = 0;
3107         /* Clear the tlp rtx mark */
3108         rack->r_ctl.rc_tlp_rtx_out = 0;
3109         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
3110         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3111         rack->r_ctl.rc_prr_sndcnt = 0;
3112         rack_log_to_prr(rack, 6);
3113         rack->r_timer_override = 1;
3114 }
3115
3116 /*
3117  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
3118  * we will setup to retransmit the lowest seq number outstanding.
3119  */
3120 static int
3121 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3122 {
3123         int32_t rexmt;
3124         struct inpcb *inp;
3125         int32_t retval = 0;
3126         bool isipv6;
3127
3128         inp = tp->t_inpcb;
3129         if (tp->t_timers->tt_flags & TT_STOPPED) {
3130                 return (1);
3131         }
3132         if (rack_progress_timeout_check(tp)) {
3133                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
3134                 return (1);
3135         }
3136         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
3137         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
3138             (tp->snd_una == tp->snd_max)) {
3139                 /* Nothing outstanding .. nothing to do */
3140                 return (0);
3141         }
3142         /*
3143          * Retransmission timer went off.  Message has not been acked within
3144          * retransmit interval.  Back off to a longer retransmit interval
3145          * and retransmit one segment.
3146          */
3147         rack_remxt_tmr(tp);
3148         if ((rack->r_ctl.rc_resend == NULL) ||
3149             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
3150                 /*
3151                  * If the rwnd collapsed on
3152                  * the one we are retransmitting
3153                  * it does not count against the
3154                  * rxt count.
3155                  */
3156                 tp->t_rxtshift++;
3157         }
3158         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
3159                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
3160                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
3161                 retval = 1;
3162                 tcp_set_inp_to_drop(rack->rc_inp,
3163                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
3164                 goto out;
3165         }
3166         if (tp->t_state == TCPS_SYN_SENT) {
3167                 /*
3168                  * If the SYN was retransmitted, indicate CWND to be limited
3169                  * to 1 segment in cc_conn_init().
3170                  */
3171                 tp->snd_cwnd = 1;
3172         } else if (tp->t_rxtshift == 1) {
3173                 /*
3174                  * first retransmit; record ssthresh and cwnd so they can be
3175                  * recovered if this turns out to be a "bad" retransmit. A
3176                  * retransmit is considered "bad" if an ACK for this segment
3177                  * is received within RTT/2 interval; the assumption here is
3178                  * that the ACK was already in flight.  See "On Estimating
3179                  * End-to-End Network Path Properties" by Allman and Paxson
3180                  * for more details.
3181                  */
3182                 tp->snd_cwnd_prev = tp->snd_cwnd;
3183                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
3184                 tp->snd_recover_prev = tp->snd_recover;
3185                 if (IN_FASTRECOVERY(tp->t_flags))
3186                         tp->t_flags |= TF_WASFRECOVERY;
3187                 else
3188                         tp->t_flags &= ~TF_WASFRECOVERY;
3189                 if (IN_CONGRECOVERY(tp->t_flags))
3190                         tp->t_flags |= TF_WASCRECOVERY;
3191                 else
3192                         tp->t_flags &= ~TF_WASCRECOVERY;
3193                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
3194                 tp->t_flags |= TF_PREVVALID;
3195         } else
3196                 tp->t_flags &= ~TF_PREVVALID;
3197         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
3198         if ((tp->t_state == TCPS_SYN_SENT) ||
3199             (tp->t_state == TCPS_SYN_RECEIVED))
3200                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
3201         else
3202                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
3203         TCPT_RANGESET(tp->t_rxtcur, rexmt,
3204            max(MSEC_2_TICKS(rack_rto_min), rexmt),
3205            MSEC_2_TICKS(rack_rto_max));
3206         /*
3207          * We enter the path for PLMTUD if the connection is established
3208          * or in FIN_WAIT_1 state; the reason for the latter is that if
3209          * the amount of data we send is very small, we could send it in
3210          * a couple of packets and proceed straight to FIN. In that
3211          * case we won't catch the ESTABLISHED state.
3212          */
3213 #ifdef INET6
3214         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
3215 #else
3216         isipv6 = false;
3217 #endif
3218         if (((V_tcp_pmtud_blackhole_detect == 1) ||
3219             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
3220             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
3221             ((tp->t_state == TCPS_ESTABLISHED) ||
3222             (tp->t_state == TCPS_FIN_WAIT_1))) {
3223
3224                 /*
3225                  * Idea here is that each stage of the MTU probe (usually
3226                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
3227                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
3228                  * should take care of that.
3229                  */
3230                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
3231                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
3232                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
3233                     tp->t_rxtshift % 2 == 0)) {
3234                         /*
3235                          * Enter Path MTU Black-hole Detection mechanism: -
3236                          * Disable Path MTU Discovery (IP "DF" bit). -
3237                          * Reduce MTU to lower value than what we negotiated
3238                          * with peer.
3239                          */
3240                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
3241                                 /* Record that we may have found a black hole. */
3242                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
3243                                 /* Keep track of previous MSS. */
3244                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
3245                         }
3246
3247                         /*
3248                          * Reduce the MSS to blackhole value or to the
3249                          * default in an attempt to retransmit.
3250                          */
3251 #ifdef INET6
3252                         if (isipv6 &&
3253                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3254                                 /* Use the sysctl tuneable blackhole MSS. */
3255                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3256                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3257                         } else if (isipv6) {
3258                                 /* Use the default MSS. */
3259                                 tp->t_maxseg = V_tcp_v6mssdflt;
3260                                 /*
3261                                  * Disable Path MTU Discovery when we switch
3262                                  * to minmss.
3263                                  */
3264                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3265                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3266                         }
3267 #endif
3268 #if defined(INET6) && defined(INET)
3269                         else
3270 #endif
3271 #ifdef INET
3272                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3273                                 /* Use the sysctl tuneable blackhole MSS. */
3274                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3275                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3276                         } else {
3277                                 /* Use the default MSS. */
3278                                 tp->t_maxseg = V_tcp_mssdflt;
3279                                 /*
3280                                  * Disable Path MTU Discovery when we switch
3281                                  * to minmss.
3282                                  */
3283                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3284                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3285                         }
3286 #endif
3287                 } else {
3288                         /*
3289                          * If further retransmissions are still unsuccessful
3290                          * with a lowered MTU, maybe this isn't a blackhole
3291                          * and we restore the previous MSS and blackhole
3292                          * detection flags. The limit '6' is determined by
3293                          * giving each probe stage (1448, 1188, 524) 2
3294                          * chances to recover.
3295                          */
3296                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3297                             (tp->t_rxtshift >= 6)) {
3298                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3299                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3300                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3301                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3302                         }
3303                 }
3304         }
3305         /*
3306          * If we backed off this far, our srtt estimate is probably bogus.
3307          * Clobber it so we'll take the next rtt measurement as our srtt;
3308          * move the current srtt into rttvar to keep the current retransmit
3309          * times until then.
3310          */
3311         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3312 #ifdef INET6
3313                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3314                         in6_losing(tp->t_inpcb);
3315                 else
3316 #endif
3317                         in_losing(tp->t_inpcb);
3318                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3319                 tp->t_srtt = 0;
3320         }
3321         if (rack_use_sack_filter)
3322                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3323         tp->snd_recover = tp->snd_max;
3324         tp->t_flags |= TF_ACKNOW;
3325         tp->t_rtttime = 0;
3326         rack_cong_signal(tp, NULL, CC_RTO);
3327 out:
3328         return (retval);
3329 }
3330
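/*
 * Back-off sketch for the RXT computation above (made-up numbers):
 * with TCP_REXMTVAL(tp) working out to 200 ticks and t_rxtshift = 3,
 *
 *      rexmt = TCP_REXMTVAL(tp) * tcp_backoff[3];      // 200 * 8
 *      TCPT_RANGESET(tp->t_rxtcur, rexmt,
 *          max(MSEC_2_TICKS(rack_rto_min), rexmt),
 *          MSEC_2_TICKS(rack_rto_max));
 *
 * so t_rxtcur becomes 1600 ticks, clamped between rack_rto_min and
 * rack_rto_max; SYN_SENT/SYN_RECEIVED connections use the fixed
 * RACK_INITIAL_RTO as the base instead of the measured RTO.
 */
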
3331 static int
3332 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3333 {
3334         int32_t ret = 0;
3335         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3336
3337         if (timers == 0) {
3338                 return (0);
3339         }
3340         if (tp->t_state == TCPS_LISTEN) {
3341                 /* no timers on listen sockets */
3342                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3343                         return (0);
3344                 return (1);
3345         }
3346         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3347                 uint32_t left;
3348
3349                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3350                         ret = -1;
3351                         rack_log_to_processing(rack, cts, ret, 0);
3352                         return (0);
3353                 }
3354                 if (hpts_calling == 0) {
3355                         ret = -2;
3356                         rack_log_to_processing(rack, cts, ret, 0);
3357                         return (0);
3358                 }
3359                 /*
3360                  * Ok, our timer went off early and we are not paced;
3361                  * false alarm, go back to sleep.
3362                  */
3363                 ret = -3;
3364                 left = rack->r_ctl.rc_timer_exp - cts;
3365                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3366                 rack_log_to_processing(rack, cts, ret, left);
3367                 rack->rc_last_pto_set = 0;
3368                 return (1);
3369         }
3370         rack->rc_tmr_stopped = 0;
3371         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3372         if (timers & PACE_TMR_DELACK) {
3373                 ret = rack_timeout_delack(tp, rack, cts);
3374         } else if (timers & PACE_TMR_RACK) {
3375                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3376                 ret = rack_timeout_rack(tp, rack, cts);
3377         } else if (timers & PACE_TMR_TLP) {
3378                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3379                 ret = rack_timeout_tlp(tp, rack, cts);
3380         } else if (timers & PACE_TMR_RXT) {
3381                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3382                 ret = rack_timeout_rxt(tp, rack, cts);
3383         } else if (timers & PACE_TMR_PERSIT) {
3384                 ret = rack_timeout_persist(tp, rack, cts);
3385         } else if (timers & PACE_TMR_KEEP) {
3386                 ret = rack_timeout_keepalive(tp, rack, cts);
3387         }
3388         rack_log_to_processing(rack, cts, ret, timers);
3389         return (ret);
3390 }
3391
3392 static void
3393 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3394 {
3395         uint8_t hpts_removed = 0;
3396
3397         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3398             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3399                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3400                 hpts_removed = 1;
3401         }
3402         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3403                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3404                 if (rack->rc_inp->inp_in_hpts &&
3405                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3406                         /*
3407                  * Canceling timers when we have no output being
3408                          * paced. We also must remove ourselves from the
3409                          * hpts.
3410                          */
3411                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3412                         hpts_removed = 1;
3413                 }
3414                 rack_log_to_cancel(rack, hpts_removed, line);
3415                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3416         }
3417 }
3418
3419 static void
3420 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3421 {
3422         return;
3423 }
3424
3425 static int
3426 rack_stopall(struct tcpcb *tp)
3427 {
3428         struct tcp_rack *rack;
3429         rack = (struct tcp_rack *)tp->t_fb_ptr;
3430         rack->t_timers_stopped = 1;
3431         return (0);
3432 }
3433
3434 static void
3435 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3436 {
3437         return;
3438 }
3439
3440 static int
3441 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3442 {
3443         return (0);
3444 }
3445
3446 static void
3447 rack_stop_all_timers(struct tcpcb *tp)
3448 {
3449         struct tcp_rack *rack;
3450
3451         /*
3452          * Assure no timers are running.
3453          */
3454         if (tcp_timer_active(tp, TT_PERSIST)) {
3455                 /* We enter in persists, set the flag appropriately */
3456                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3457                 rack->rc_in_persist = 1;
3458         }
3459         tcp_timer_suspend(tp, TT_PERSIST);
3460         tcp_timer_suspend(tp, TT_REXMT);
3461         tcp_timer_suspend(tp, TT_KEEP);
3462         tcp_timer_suspend(tp, TT_DELACK);
3463 }
3464
3465 static void
3466 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3467     struct rack_sendmap *rsm, uint32_t ts)
3468 {
3469         int32_t idx;
3470
3471         rsm->r_rtr_cnt++;
3472         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3473         rsm->r_dupack = 0;
3474         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3475                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3476                 rsm->r_flags |= RACK_OVERMAX;
3477         }
3478         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3479                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3480                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3481         }
3482         idx = rsm->r_rtr_cnt - 1;
3483         rsm->r_tim_lastsent[idx] = ts;
3484         if (rsm->r_flags & RACK_ACKED) {
3485                 /* Probably MTU discovery messing with us */
3486                 rsm->r_flags &= ~RACK_ACKED;
3487                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3488         }
3489         if (rsm->r_in_tmap) {
3490                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3491                 rsm->r_in_tmap = 0;
3492         }
3493         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3494         rsm->r_in_tmap = 1;
3495         if (rsm->r_flags & RACK_SACK_PASSED) {
3496                 /* We have retransmitted due to the SACK pass */
3497                 rsm->r_flags &= ~RACK_SACK_PASSED;
3498                 rsm->r_flags |= RACK_WAS_SACKPASS;
3499         }
3500 }
3501
3502
3503 static uint32_t
3504 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3505     struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
3506 {
3507         /*
3508          * We (re-)transmitted starting at rsm->r_start for some length
3509          * (possibly less than r_end).
3510          */
3511         struct rack_sendmap *nrsm, *insret;
3512         uint32_t c_end;
3513         int32_t len;
3514
3515         len = *lenp;
3516         c_end = rsm->r_start + len;
3517         if (SEQ_GEQ(c_end, rsm->r_end)) {
3518                 /*
3519                  * We retransmitted the whole piece, or more than the
3520                  * whole piece slopping over into the next rsm.
3521                  */
3522                 rack_update_rsm(tp, rack, rsm, ts);
3523                 if (c_end == rsm->r_end) {
3524                         *lenp = 0;
3525                         return (0);
3526                 } else {
3527                         int32_t act_len;
3528
3529                         /* Hangs over the end, return what's left */
3530                         act_len = rsm->r_end - rsm->r_start;
3531                         *lenp = (len - act_len);
3532                         return (rsm->r_end);
3533                 }
3534                 /* We don't get out of this block. */
3535         }
3536         /*
3537          * Here we retransmitted less than the whole thing which means we
3538          * have to split this into what was transmitted and what was not.
3539          */
3540         nrsm = rack_alloc_full_limit(rack);
3541         if (nrsm == NULL) {
3542                 /*
3543                  * We can't get memory, so lets not proceed.
3544                  */
3545                 *lenp = 0;
3546                 return (0);
3547         }
3548         /*
3549          * So here we are going to take the original rsm and make it what we
3550          * retransmitted. nrsm will be the tail portion we did not
3551          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3552          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3553          * 1, 6 and the new piece will be 6, 11.
3554          */
3555         rack_clone_rsm(rack, nrsm, rsm, c_end);
3556         nrsm->r_dupack = 0;
3557         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
3558         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3559 #ifdef INVARIANTS
3560         if (insret != NULL) {
3561                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3562                       nrsm, insret, rack, rsm);
3563         }
3564 #endif
3565         if (rsm->r_in_tmap) {
3566                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3567                 nrsm->r_in_tmap = 1;
3568         }
3569         rsm->r_flags &= (~RACK_HAS_FIN);
3570         rack_update_rsm(tp, rack, rsm, ts);
3571         *lenp = 0;
3572         return (0);
3573 }
3574
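/*
 * Worked example for rack_update_entry() above (hypothetical
 * numbers): rsm covers [1000, 2448) and we retransmit 724 bytes, so
 * c_end = 1724 < r_end. A new map entry is cloned at c_end:
 *
 *      rack_clone_rsm(rack, nrsm, rsm, c_end);
 *      // rsm now covers the retransmitted part [1000, 1724) and has
 *      // its r_rtr_cnt/r_tim_lastsent updated via rack_update_rsm();
 *      // nrsm covers the untouched tail [1724, 2448).
 *
 * Had we retransmitted past r_end instead, the function would return
 * r_end and leave the remaining length in *lenp so the caller can
 * continue with the next map entry.
 */
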
3575
3576 static void
3577 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3578     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3579     uint8_t pass, struct rack_sendmap *hintrsm)
3580 {
3581         struct tcp_rack *rack;
3582         struct rack_sendmap *rsm, *nrsm, *insret, fe;
3583         register uint32_t snd_max, snd_una;
3584
3585         /*
3586          * Add to the RACK log of packets in flight or retransmitted. If
3587          * there is a TS option we will use the TS echoed, if not we will
3588          * grab a TS.
3589          *
3590          * Retransmissions will increment the count and move the ts to its
3591          * proper place. Note that if options do not include TS's then we
3592          * won't be able to effectively use the ACK for an RTT on a retran.
3593          *
3594          * Notes about r_start and r_end. Let's consider a send starting at
3595          * sequence 1 for 10 bytes. In such an example the r_start would be
3596          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3597          * This means that r_end is actually the first sequence for the next
3598          * slot (11).
3599          *
3600          */
3601         /*
3602          * If err is set what do we do XXXrrs? should we not add the thing?
3603          * -- i.e. return if err != 0 or should we pretend we sent it? --
3604          * i.e. proceed with add ** do this for now.
3605          */
3606         INP_WLOCK_ASSERT(tp->t_inpcb);
3607         if (err)
3608                 /*
3609                  * We don't log errors -- we could but snd_max does not
3610                  * advance in this case either.
3611                  */
3612                 return;
3613
3614         if (th_flags & TH_RST) {
3615                 /*
3616                  * We don't log resets and we return immediately from
3617                  * sending
3618                  */
3619                 return;
3620         }
3621         rack = (struct tcp_rack *)tp->t_fb_ptr;
3622         snd_una = tp->snd_una;
3623         if (SEQ_LEQ((seq_out + len), snd_una)) {
3624                 /* Are we sending an old segment to induce an ack (keep-alive)? */
3625                 return;
3626         }
3627         if (SEQ_LT(seq_out, snd_una)) {
3628                 /* huh? should we panic? */
3629                 uint32_t end;
3630
3631                 end = seq_out + len;
3632                 seq_out = snd_una;
3633                 if (SEQ_GEQ(end, seq_out))
3634                         len = end - seq_out;
3635                 else
3636                         len = 0;
3637         }
3638         snd_max = tp->snd_max;
3639         if (th_flags & (TH_SYN | TH_FIN)) {
3640                 /*
3641                  * The call to rack_log_output is made before bumping
3642                  * snd_max. This means we can record one extra byte on a SYN
3643                  * or FIN if seq_out is adding more on and a FIN is present
3644                  * (and we are not resending).
3645                  */
3646                 if (th_flags & TH_SYN)
3647                         len++;
3648                 if (th_flags & TH_FIN)
3649                         len++;
3650                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3651                         /*
3652                          * The add/update has not been done for the FIN/SYN
3653                          * yet.
3654                          */
3655                         snd_max = tp->snd_nxt;
3656                 }
3657         }
3658         if (len == 0) {
3659                 /* We don't log zero window probes */
3660                 return;
3661         }
3662         rack->r_ctl.rc_time_last_sent = ts;
3663         if (IN_RECOVERY(tp->t_flags)) {
3664                 rack->r_ctl.rc_prr_out += len;
3665         }
3666         /* First question is it a retransmission or new? */
3667         if (seq_out == snd_max) {
3668                 /* Its new */
3669 again:
3670                 rsm = rack_alloc(rack);
3671                 if (rsm == NULL) {
3672                         /*
3673                          * Hmm out of memory and the tcb got destroyed while
3674                          * we tried to wait.
3675                          */
3676                         return;
3677                 }
3678                 if (th_flags & TH_FIN) {
3679                         rsm->r_flags = RACK_HAS_FIN;
3680                 } else {
3681                         rsm->r_flags = 0;
3682                 }
3683                 rsm->r_tim_lastsent[0] = ts;
3684                 rsm->r_rtr_cnt = 1;
3685                 rsm->r_rtr_bytes = 0;
3686                 if (th_flags & TH_SYN) {
3687                         /* The data space is one beyond snd_una */
3688                         rsm->r_start = seq_out + 1;
3689                         rsm->r_end = rsm->r_start + (len - 1);
3690                 } else {
3691                         /* Normal case */
3692                         rsm->r_start = seq_out;
3693                         rsm->r_end = rsm->r_start + len;
3694                 }
3695                 rsm->r_dupack = 0;
3696                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3697                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
3698 #ifdef INVARIANTS
3699                 if (insret != NULL) {
3700                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3701                               nrsm, insret, rack, rsm);
3702                 }
3703 #endif
3704                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3705                 rsm->r_in_tmap = 1;
3706                 return;
3707         }
3708         /*
3709          * If we reach here its a retransmission and we need to find it.
3710          */
3711         memset(&fe, 0, sizeof(fe));
3712 more:
3713         if (hintrsm && (hintrsm->r_start == seq_out)) {
3714                 rsm = hintrsm;
3715                 hintrsm = NULL;
3716         } else {
3717                 /* No hints sorry */
3718                 rsm = NULL;
3719         }
3720         if ((rsm) && (rsm->r_start == seq_out)) {
3721                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3722                 if (len == 0) {
3723                         return;
3724                 } else {
3725                         goto more;
3726                 }
3727         }
3728         /* Ok it was not the last pointer go through it the hard way. */
3729 refind:
3730         fe.r_start = seq_out;
3731         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3732         if (rsm) {
3733                 if (rsm->r_start == seq_out) {
3734                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3735                         if (len == 0) {
3736                                 return;
3737                         } else {
3738                                 goto refind;
3739                         }
3740                 }
3741                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3742                         /* Transmitted within this piece */
3743                         /*
3744                          * Ok we must split off the front and then let the
3745                          * update do the rest
3746                          */
3747                         nrsm = rack_alloc_full_limit(rack);
3748                         if (nrsm == NULL) {
3749                                 rack_update_rsm(tp, rack, rsm, ts);
3750                                 return;
3751                         }
3752                         /*
3753                          * copy rsm to nrsm and then trim the front of rsm
3754                          * to not include this part.
3755                          */
3756                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
3757                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3758 #ifdef INVARIANTS
3759                         if (insret != NULL) {
3760                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3761                                       nrsm, insret, rack, rsm);
3762                         }
3763 #endif
3764                         if (rsm->r_in_tmap) {
3765                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3766                                 nrsm->r_in_tmap = 1;
3767                         }
3768                         rsm->r_flags &= (~RACK_HAS_FIN);
3769                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3770                         if (len == 0) {
3771                                 return;
3772                         } else if (len > 0)
3773                                 goto refind;
3774                 }
3775         }
3776         /*
3777          * Hmm, not found in the map; did they retransmit both old data
3778          * and on into new data?
3779          */
3780         if (seq_out == tp->snd_max) {
3781                 goto again;
3782         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3783 #ifdef INVARIANTS
3784                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3785                     seq_out, len, tp->snd_una, tp->snd_max);
3786                 printf("Starting Dump of all rack entries\n");
3787                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3788                         printf("rsm:%p start:%u end:%u\n",
3789                             rsm, rsm->r_start, rsm->r_end);
3790                 }
3791                 printf("Dump complete\n");
3792                 panic("seq_out not found rack:%p tp:%p",
3793                     rack, tp);
3794 #endif
3795         } else {
3796 #ifdef INVARIANTS
3797                 /*
3798                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
3799                  * flag)
3800                  */
3801                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3802                     seq_out, len, tp->snd_max, tp);
3803 #endif
3804         }
3805 }
3806
3807 /*
3808  * Record one of the RTT updates from an ack into
3809  * our sample structure.
3810  */
3811 static void
3812 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3813 {
3814         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3815             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3816                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3817         }
3818         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3819             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3820                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3821         }
3822         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3823         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3824         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3825 }
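
/*
 * Worked example (illustrative only): if a single ACK yields three RTT
 * samples of 40, 55 and 42 ticks, three calls to tcp_rack_xmit_timer()
 * leave rs_rtt_lowest == 40, rs_rtt_highest == 55, rs_rtt_tot == 137
 * and rs_rtt_cnt == 3.  tcp_rack_xmit_timer_commit() below then
 * reduces these to one value (lowest, highest or average) according
 * to rc_rate_sample_method.
 */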
3826
3827 /*
3828  * Collect new round-trip time estimate
3829  * and update averages and current timeout.
3830  */
3831 static void
3832 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3833 {
3834         int32_t delta;
3835         uint32_t o_srtt, o_var;
3836         int32_t rtt;
3837
3838         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3839                 /* No valid sample */
3840                 return;
3841         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3842                 /* We are to use the lowest RTT seen in a single ack */
3843                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3844         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3845                 /* We are to use the highest RTT seen in a single ack */
3846                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3847         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3848                 /* We are to use the average RTT seen in a single ack */
3849                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3850                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3851         } else {
3852 #ifdef INVARIANTS
3853                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3854 #endif
3855                 return;
3856         }
3857         if (rtt == 0)
3858                 rtt = 1;
3859         rack_log_rtt_sample(rack, rtt);
3860         o_srtt = tp->t_srtt;
3861         o_var = tp->t_rttvar;
3862         rack = (struct tcp_rack *)tp->t_fb_ptr;
3863         if (tp->t_srtt != 0) {
3864                 /*
3865                  * srtt is stored as fixed point with 5 bits after the
3866                  * binary point (i.e., scaled by 32).  The following magic is
3867                  * equivalent to the smoothing algorithm in rfc793 with an
3868                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3869                  * Adjust rtt to origin 0.
3870                  */
3871                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3872                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3873
3874                 tp->t_srtt += delta;
3875                 if (tp->t_srtt <= 0)
3876                         tp->t_srtt = 1;
3877
3878                 /*
3879                  * We accumulate a smoothed rtt variance (actually, a
3880                  * smoothed mean difference), then set the retransmit timer
3881                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3882                  * is stored as fixed point with 4 bits after the binary
3883                  * point (scaled by 16).  The following is equivalent to
3884                  * rfc793 smoothing with an alpha of .75 (rttvar =
3885                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3886                  * wired-in beta.
3887                  */
3888                 if (delta < 0)
3889                         delta = -delta;
3890                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3891                 tp->t_rttvar += delta;
3892                 if (tp->t_rttvar <= 0)
3893                         tp->t_rttvar = 1;
3894                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3895                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3896         } else {
3897                 /*
3898                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3899                  * variance to half the rtt (so our first retransmit happens
3900                  * at 3*rtt).
3901                  */
3902                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3903                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3904                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3905         }
3906         KMOD_TCPSTAT_INC(tcps_rttupdated);
3907         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3908         tp->t_rttupdated++;
3909 #ifdef STATS
3910         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3911 #endif
3912         tp->t_rxtshift = 0;
3913
3914         /*
3915          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3916          * way we do the smoothing, srtt and rttvar will each average +1/2
3917          * tick of bias.  When we compute the retransmit timer, we want 1/2
3918          * tick of rounding and 1 extra tick because of +-1/2 tick
3919          * uncertainty in the firing of the timer.  The bias will give us
3920          * exactly the 1.5 tick we need.  But, because the bias is
3921          * statistical, we have to test that we don't drop below the minimum
3922          * feasible timer (which is 2 ticks).
3923          */
3924         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3925            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3926         tp->t_softerror = 0;
3927 }
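
/*
 * Worked example (illustrative; assumes the stock values TCP_RTT_SHIFT == 5,
 * TCP_RTTVAR_SHIFT == 4 and TCP_DELTA_SHIFT == 2): with t_srtt == 3200
 * (100 ticks scaled by 32), t_rttvar == 160 (10 ticks scaled by 16) and a
 * new sample rtt == 116 ticks:
 *
 *   delta    = ((116 - 1) << 2) - (3200 >> 3) = 460 - 400 = 60
 *   t_srtt   = 3200 + 60 = 3260   (~101.9 ticks, roughly srtt + (rtt - srtt)/8)
 *   delta    = 60 - (160 >> 2) = 20
 *   t_rttvar = 160 + 20 = 180     (~11.3 ticks, roughly 3/4 rttvar + |err|/4)
 *
 * The srtt + 4 * rttvar value computed by TCP_REXMTVAL() then comes to
 * roughly 147 ticks, which TCPT_RANGESET() above clamps between the
 * bounds derived from rack_rto_min and rack_rto_max.
 */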
3928
3929 static void
3930 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3931     uint32_t t, uint32_t cts)
3932 {
3933         /*
3934          * For this RSM, we acknowledged the data from a previous
3935          * transmission, not the last one we made. This means we did a false
3936          * retransmit.
3937          */
3938         struct tcp_rack *rack;
3939
3940         if (rsm->r_flags & RACK_HAS_FIN) {
3941                 /*
3942                  * The FIN is often sent multiple times once we
3943                  * have everything outstanding ack'd. We ignore this case
3944                  * since it is over now.
3945                  */
3946                 return;
3947         }
3948         if (rsm->r_flags & RACK_TLP) {
3949                 /*
3950                  * We expect TLP's to have this occur.
3951                  */
3952                 return;
3953         }
3954         rack = (struct tcp_rack *)tp->t_fb_ptr;
3955         /* should we undo cc changes and exit recovery? */
3956         if (IN_RECOVERY(tp->t_flags)) {
3957                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3958                         /*
3959                          * Undo what we ratcheted down and exit recovery if
3960                          * possible
3961                          */
3962                         EXIT_RECOVERY(tp->t_flags);
3963                         tp->snd_recover = tp->snd_una;
3964                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3965                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3966                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3967                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3968                 }
3969         }
3970         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3971                 /*
3972                  * We retransmitted based on a sack and the earlier
3973                  * retransmission ack'd it - re-ordering is occurring.
3974                  */
3975                 counter_u64_add(rack_reorder_seen, 1);
3976                 rack->r_ctl.rc_reorder_ts = cts;
3977         }
3978         counter_u64_add(rack_badfr, 1);
3979         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3980 }
3981
3982
3983 static int
3984 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3985     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3986 {
3987         int32_t i;
3988         uint32_t t;
3989
3990         if (rsm->r_flags & RACK_ACKED)
3991                 /* Already done */
3992                 return (0);
3993
3994
3995         if ((rsm->r_rtr_cnt == 1) ||
3996             ((ack_type == CUM_ACKED) &&
3997             (to->to_flags & TOF_TS) &&
3998             (to->to_tsecr) &&
3999             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
4000             ) {
4001                 /*
4002                  * We will only find a matching timestamp if it is cum-acked.
4003                  * But if it was sent only once it is for-sure matching
4004                  * :-)
4005                  */
4006                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4007                 if ((int)t <= 0)
4008                         t = 1;
4009                 if (!tp->t_rttlow || tp->t_rttlow > t)
4010                         tp->t_rttlow = t;
4011                 if (!rack->r_ctl.rc_rack_min_rtt ||
4012                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4013                         rack->r_ctl.rc_rack_min_rtt = t;
4014                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
4015                                 rack->r_ctl.rc_rack_min_rtt = 1;
4016                         }
4017                 }
4018                 tcp_rack_xmit_timer(rack, t + 1);
4019                 if ((rsm->r_flags & RACK_TLP) &&
4020                     (!IN_RECOVERY(tp->t_flags))) {
4021                         /* Segment was a TLP and our retrans matched */
4022                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
4023                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
4024                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4025                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4026                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
4027                                 /*
4028                                  * When we enter recovery we need to assure
4029                                  * we send one packet.
4030                                  */
4031                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4032                                 rack_log_to_prr(rack, 7);
4033                         }
4034                 }
4035                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4036                         /* New more recent rack_tmit_time */
4037                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4038                         rack->rc_rack_rtt = t;
4039                 }
4040                 return (1);
4041         }
4042         /*
4043          * We clear the soft/rxtshift since we got an ack.
4044          * There is no assurance we will call the commit() function
4045          * so we need to clear these to avoid incorrect handling.
4046          */
4047         tp->t_rxtshift = 0;
4048         tp->t_softerror = 0;
4049         if ((to->to_flags & TOF_TS) &&
4050             (ack_type == CUM_ACKED) &&
4051             (to->to_tsecr) &&
4052             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
4053                 /*
4054                  * Now which timestamp does it match? In this block the ACK
4055                  * must be coming from a previous transmission.
4056                  */
4057                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
4058                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
4059                                 t = cts - rsm->r_tim_lastsent[i];
4060                                 if ((int)t <= 0)
4061                                         t = 1;
4062                                 if ((i + 1) < rsm->r_rtr_cnt) {
4063                                         /* Likely */
4064                                         rack_earlier_retran(tp, rsm, t, cts);
4065                                 }
4066                                 if (!tp->t_rttlow || tp->t_rttlow > t)
4067                                         tp->t_rttlow = t;
4068                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4069                                         rack->r_ctl.rc_rack_min_rtt = t;
4070                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
4071                                                 rack->r_ctl.rc_rack_min_rtt = 1;
4072                                         }
4073                                 }
4074                                 /*
4075                                  * Note the following calls to
4076                                  * tcp_rack_xmit_timer() are being commented
4077                                  * out for now. They give us no more accuracy
4078                                  * and often lead to a wrong choice. We have
4079                                  * enough samples that have not been
4080                                  * retransmitted. I leave the commented out
4081                                  * code in here in case in the future we
4082                  * decide to add it back (though I can't foresee
4083                                  * doing that). That way we will easily see
4084                                  * where they need to be placed.
4085                                  */
4086                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
4087                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4088                                         /* New more recent rack_tmit_time */
4089                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4090                                         rack->rc_rack_rtt = t;
4091                                 }
4092                                 return (1);
4093                         }
4094                 }
4095                 goto ts_not_found;
4096         } else {
4097                 /*
4098                  * Ok, it is a SACK block that we retransmitted, or a Windows
4099                  * machine without timestamps. We can tell nothing from the
4100                  * timestamp since it is not there, or it reflects the time the
4101                  * peer last received a segment that moved its cum-ack point forward.
4102                  */
4103 ts_not_found:
4104                 i = rsm->r_rtr_cnt - 1;
4105                 t = cts - rsm->r_tim_lastsent[i];
4106                 if ((int)t <= 0)
4107                         t = 1;
4108                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4109                         /*
4110                          * We retransmitted and the ack came back in less
4111                          * than the smallest rtt we have observed. We most
4112                          * likely did an improper retransmit as outlined in
4113                          * 4.2 Step 3 point 2 in the rack-draft.
4114                          */
4115                         i = rsm->r_rtr_cnt - 2;
4116                         t = cts - rsm->r_tim_lastsent[i];
4117                         rack_earlier_retran(tp, rsm, t, cts);
4118                 } else if (rack->r_ctl.rc_rack_min_rtt) {
4119                         /*
4120                          * We retransmitted it and the retransmit did the
4121                          * job.
4122                          */
4123                         if (!rack->r_ctl.rc_rack_min_rtt ||
4124                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4125                                 rack->r_ctl.rc_rack_min_rtt = t;
4126                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
4127                                         rack->r_ctl.rc_rack_min_rtt = 1;
4128                                 }
4129                         }
4130                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
4131                                 /* New more recent rack_tmit_time */
4132                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
4133                                 rack->rc_rack_rtt = t;
4134                         }
4135                         return (1);
4136                 }
4137         }
4138         return (0);
4139 }
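
/*
 * Illustrative example: suppose an rsm was transmitted three times with
 * r_tim_lastsent[] == { 1000, 1200, 1400 } and a cumulative ACK arrives
 * at cts == 1450 echoing a timestamp of 1200.  The timestamp loop in
 * rack_update_rtt() matches i == 1, computes t == 250 and, because a
 * later retransmission exists (i + 1 < r_rtr_cnt), calls
 * rack_earlier_retran() to treat the last retransmit as a false one
 * (possibly undoing the cwnd/ssthresh reduction and exiting recovery).
 */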
4140
4141 /*
4142  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
4143  */
4144 static void
4145 rack_log_sack_passed(struct tcpcb *tp,
4146     struct tcp_rack *rack, struct rack_sendmap *rsm)
4147 {
4148         struct rack_sendmap *nrsm;
4149
4150         nrsm = rsm;
4151         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
4152             rack_head, r_tnext) {
4153                 if (nrsm == rsm) {
4154                         /* Skip the original segment; it is acked */
4155                         continue;
4156                 }
4157                 if (nrsm->r_flags & RACK_ACKED) {
4158                         /*
4159                          * Skip ack'd segments, though we
4160                          * should not see these, since tmap
4161                          * should not have ack'd segments.
4162                          */
4163                         continue;
4164                 }
4165                 if (nrsm->r_flags & RACK_SACK_PASSED) {
4166                         /*
4167                          * We found one that is already marked
4168                          * passed, we have been here before and
4169                          * so all others below this are marked.
4170                          */
4171                         break;
4172                 }
4173                 nrsm->r_flags |= RACK_SACK_PASSED;
4174                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
4175         }
4176 }
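
/*
 * Illustrative example: if segments A, B, C and D sit on rc_tmap in
 * send order and D is newly SACKed, walking backwards from D marks C,
 * B and A with RACK_SACK_PASSED (already-acked entries are skipped).
 * A later walk for a newer segment stops at the first entry that is
 * already marked, since everything sent before it was marked on the
 * earlier pass.
 */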
4177
4178 static uint32_t
4179 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
4180                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
4181 {
4182         uint32_t start, end, changed = 0;
4183         struct rack_sendmap stack_map;
4184         struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
4185         int32_t used_ref = 1;
4186         int moved = 0;
4187
4188         start = sack->start;
4189         end = sack->end;
4190         rsm = *prsm;
4191         memset(&fe, 0, sizeof(fe));
4192 do_rest_ofb:
4193         if ((rsm == NULL) ||
4194             (SEQ_LT(end, rsm->r_start)) ||
4195             (SEQ_GEQ(start, rsm->r_end)) ||
4196             (SEQ_LT(start, rsm->r_start))) {
4197                 /*
4198                  * We are not in the right spot,
4199                  * find the correct spot in the tree.
4200                  */
4201                 used_ref = 0;
4202                 fe.r_start = start;
4203                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4204                 moved++;
4205         }
4206         if (rsm == NULL) {
4207                 /* TSNH */
4208                 goto out;
4209         }
4210         /* Ok we have an ACK for some piece of this rsm */
4211         if (rsm->r_start != start) {
4212                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4213                         /**
4214                          * Need to split this in two pieces the before and after,
4215                          * the before remains in the map, the after must be
4216                          * added. In other words we have:
4217                          * rsm        |--------------|
4218                          * sackblk        |------->
4219                          * rsm will become
4220                          *     rsm    |---|
4221                          * and nrsm will be  the sacked piece
4222                          *     nrsm       |----------|
4223                          *
4224                          * But before we start down that path let's
4225                          * see if the sack spans over on top of
4226                          * the next guy and it is already sacked.
4227                          */
4228                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4229                         if (next && (next->r_flags & RACK_ACKED) &&
4230                             SEQ_GEQ(end, next->r_start)) {
4231                                 /**
4232                                  * So the next one is already acked, and
4233                                  * we can thus by hookery use our stack_map
4234                                  * to reflect the piece being sacked and
4235                                  * then adjust the two tree entries moving
4236                                  * the start and ends around. So we start like:
4237                                  *  rsm     |------------|             (not-acked)
4238                                  *  next                 |-----------| (acked)
4239                                  *  sackblk        |-------->
4240                                  *  We want to end like so:
4241                                  *  rsm     |------|                   (not-acked)
4242                                  *  next           |-----------------| (acked)
4243                                  *  nrsm           |-----|
4244                                  * Where nrsm is a temporary stack piece we
4245                                  * use to update all the gizmos.
4246                                  */
4247                                 /* Copy up our fudge block */
4248                                 nrsm = &stack_map;
4249                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4250                                 /* Now adjust our tree blocks */
4251                                 rsm->r_end = start;
4252                                 next->r_start = start;
4253                                 /* Clear out the dup ack count of the remainder */
4254                                 rsm->r_dupack = 0;
4255                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4256                                 /* Now lets make sure our fudge block is right */
4257                                 nrsm->r_start = start;
4258                                 /* Now lets update all the stats and such */
4259                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4260                                 changed += (nrsm->r_end - nrsm->r_start);
4261                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4262                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
4263                                         counter_u64_add(rack_reorder_seen, 1);
4264                                         rack->r_ctl.rc_reorder_ts = cts;
4265                                 }
4266                                 /*
4267                                  * Now we want to go up from rsm (the
4268                                  * one left un-acked) to the next one
4269                                  * in the tmap. We do this so when
4270                                  * we walk backwards we include marking
4271                                  * sack-passed on rsm (The one passed in
4272                                  * is skipped since it is generally called
4273                                  * on something sacked before removing it
4274                                  * from the tmap).
4275                                  */
4276                                 if (rsm->r_in_tmap) {
4277                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
4278                                         /*
4279                                          * Now that we have the next
4280                                          * one walk backwards from there.
4281                                          */
4282                                         if (nrsm && nrsm->r_in_tmap)
4283                                                 rack_log_sack_passed(tp, rack, nrsm);
4284                                 }
4285                                 /* Now are we done? */
4286                                 if (SEQ_LT(end, next->r_end) ||
4287                                     (end == next->r_end)) {
4288                                         /* Done with block */
4289                                         goto out;
4290                                 }
4291                                 counter_u64_add(rack_sack_used_next_merge, 1);
4292                                 /* Position for the next block */
4293                                 start = next->r_end;
4294                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
4295                                 if (rsm == NULL)
4296                                         goto out;
4297                         } else {
4298                                 /**
4299                                  * We can't use any hookery here, so we
4300                                  * need to split the map. We enter like
4301                                  * so:
4302                                  *  rsm      |--------|
4303                                  *  sackblk       |----->
4304                                  * We will add the new block nrsm and
4305                                  * that will be the new portion, and then
4306                                  * fall through after resetting rsm. So we
4307                                  * split and look like this:
4308                                  *  rsm      |----|
4309                                  *  sackblk       |----->
4310                                  *  nrsm          |---|
4311                                  * We then fall through resetting
4312                                  * rsm to nrsm, so the next block
4313                                  * picks it up.
4314                                  */
4315                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4316                                 if (nrsm == NULL) {
4317                                         /*
4318                                          * failed XXXrrs what can we do but lose the sack
4319                                          * info?
4320                                          */
4321                                         goto out;
4322                                 }
4323                                 counter_u64_add(rack_sack_splits, 1);
4324                                 rack_clone_rsm(rack, nrsm, rsm, start);
4325                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4326 #ifdef INVARIANTS
4327                                 if (insret != NULL) {
4328                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4329                                               nrsm, insret, rack, rsm);
4330                                 }
4331 #endif
4332                                 if (rsm->r_in_tmap) {
4333                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4334                                         nrsm->r_in_tmap = 1;
4335                                 }
4336                                 rsm->r_flags &= (~RACK_HAS_FIN);
4337                                 /* Position us to point to the new nrsm that starts the sack blk */
4338                                 rsm = nrsm;
4339                         }
4340                 } else {
4341                         /* Already sacked this piece */
4342                         counter_u64_add(rack_sack_skipped_acked, 1);
4343                         moved++;
4344                         if (end == rsm->r_end) {
4345                                 /* Done with block */
4346                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4347                                 goto out;
4348                         } else if (SEQ_LT(end, rsm->r_end)) {
4349                                 /* A partial sack to an already sacked block */
4350                                 moved++;
4351                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4352                                 goto out;
4353                         } else {
4354                                 /*
4355                                  * The end goes beyond this guy
4356                                  * repostion the start to the
4357                                  * next block.
4358                                  */
4359                                 start = rsm->r_end;
4360                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4361                                 if (rsm == NULL)
4362                                         goto out;
4363                         }
4364                 }
4365         }
4366         if (SEQ_GEQ(end, rsm->r_end)) {
4367                 /**
4368                  * The end of this block is either beyond this guy or right
4369                  * at this guy. I.e.:
4370                  *  rsm ---                 |-----|
4371                  *  end                     |-----|
4372                  *  <or>
4373                  *  end                     |---------|
4374                  */
4375                 if (rsm->r_flags & RACK_TLP)
4376                         rack->r_ctl.rc_tlp_rtx_out = 0;
4377                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4378                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4379                         changed += (rsm->r_end - rsm->r_start);
4380                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4381                         if (rsm->r_in_tmap) /* should be true */
4382                                 rack_log_sack_passed(tp, rack, rsm);
4383                         /* Is reordering occurring? */
4384                         if (rsm->r_flags & RACK_SACK_PASSED) {
4385                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4386                                 counter_u64_add(rack_reorder_seen, 1);
4387                                 rack->r_ctl.rc_reorder_ts = cts;
4388                         }
4389                         rsm->r_flags |= RACK_ACKED;
4390                         rsm->r_flags &= ~RACK_TLP;
4391                         if (rsm->r_in_tmap) {
4392                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4393                                 rsm->r_in_tmap = 0;
4394                         }
4395                 } else {
4396                         counter_u64_add(rack_sack_skipped_acked, 1);
4397                         moved++;
4398                 }
4399                 if (end == rsm->r_end) {
4400                         /* This block only - done, setup for next  */
4401                         goto out;
4402                 }
4403                 /*
4404                  * There is more not covered by this rsm; move on
4405                  * to the next block in the RB tree.
4406                  */
4407                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4408                 start = rsm->r_end;
4409                 rsm = nrsm;
4410                 if (rsm == NULL)
4411                         goto out;
4412                 goto do_rest_ofb;
4413         }
4414         /**
4415          * The end of this sack block is smaller than
4416          * our rsm i.e.:
4417          *  rsm ---                 |-----|
4418          *  end                     |--|
4419          */
4420         if ((rsm->r_flags & RACK_ACKED) == 0) {
4421                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4422                 if (prev && (prev->r_flags & RACK_ACKED)) {
4423                         /**
4424                          * Goal, we want the right remainder of rsm to shrink
4425                          * in place and span from (rsm->r_start = end) to rsm->r_end.
4426                          * We want to expand prev to go all the way
4427                          * to prev->r_end <- end.
4428                          * so in the tree we have before:
4429                          *   prev     |--------|         (acked)
4430                          *   rsm               |-------| (non-acked)
4431                          *   sackblk           |-|
4432                          * We churn it so we end up with
4433                          *   prev     |----------|       (acked)
4434                          *   rsm                 |-----| (non-acked)
4435                          *   nrsm              |-| (temporary)
4436                          */
4437                         nrsm = &stack_map;
4438                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4439                         prev->r_end = end;
4440                         rsm->r_start = end;
4441                         /* Now adjust nrsm (stack copy) to be
4442                          * the one that is the small
4443                          * piece that was "sacked".
4444                          */
4445                         nrsm->r_end = end;
4446                         rsm->r_dupack = 0;
4447                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4448                         /*
4449                          * Now nrsm is our new little piece
4450                          * that is acked (which was merged
4451                          * to prev). Update the rtt and changed
4452                          * based on that. Also check for reordering.
4453                          */
4454                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4455                         changed += (nrsm->r_end - nrsm->r_start);
4456                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4457                         if (nrsm->r_flags & RACK_SACK_PASSED) {
4458                                 counter_u64_add(rack_reorder_seen, 1);
4459                                 rack->r_ctl.rc_reorder_ts = cts;
4460                         }
4461                         rsm = prev;
4462                         counter_u64_add(rack_sack_used_prev_merge, 1);
4463                 } else {
4464                         /**
4465                          * This is the case where our previous
4466                          * block is not acked either, so we must
4467                          * split the block in two.
4468                          */
4469                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4470                         if (nrsm == NULL) {
4471                                 /* failed rrs what can we do but lose the sack info? */
4472                                 goto out;
4473                         }
4474                         /**
4475                          * In this case nrsm becomes
4476                          * nrsm->r_start = end;
4477                          * nrsm->r_end = rsm->r_end;
4478                          * which is un-acked.
4479                          * <and>
4480                          * rsm->r_end = nrsm->r_start;
4481                          * i.e. the remaining un-acked
4482                          * piece is left on the left
4483                          * hand side.
4484                          *
4485                          * So we start like this
4486                          * rsm      |----------| (not acked)
4487                          * sackblk  |---|
4488                          * build it so we have
4489                          * rsm      |---|         (acked)
4490                          * nrsm         |------|  (not acked)
4491                          */
4492                         counter_u64_add(rack_sack_splits, 1);
4493                         rack_clone_rsm(rack, nrsm, rsm, end);
4494                         rsm->r_flags &= (~RACK_HAS_FIN);
4495                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4496 #ifdef INVARIANTS
4497                         if (insret != NULL) {
4498                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4499                                       nrsm, insret, rack, rsm);
4500                         }
4501 #endif
4502                         if (rsm->r_in_tmap) {
4503                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4504                                 nrsm->r_in_tmap = 1;
4505                         }
4506                         nrsm->r_dupack = 0;
4507                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
4508                         if (rsm->r_flags & RACK_TLP)
4509                                 rack->r_ctl.rc_tlp_rtx_out = 0;
4510                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4511                         changed += (rsm->r_end - rsm->r_start);
4512                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4513                         if (rsm->r_in_tmap) /* should be true */
4514                                 rack_log_sack_passed(tp, rack, rsm);
4515                         /* Is reordering occurring? */
4516                         if (rsm->r_flags & RACK_SACK_PASSED) {
4517                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4518                                 counter_u64_add(rack_reorder_seen, 1);
4519                                 rack->r_ctl.rc_reorder_ts = cts;
4520                         }
4521                         rsm->r_flags |= RACK_ACKED;
4522                         rsm->r_flags &= ~RACK_TLP;
4523                         if (rsm->r_in_tmap) {
4524                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4525                                 rsm->r_in_tmap = 0;
4526                         }
4527                 }
4528         } else if (start != end){
4529                 /*
4530                  * The block was already acked.
4531                  */
4532                 counter_u64_add(rack_sack_skipped_acked, 1);
4533                 moved++;
4534         }
4535 out:
4536         if (rsm && (rsm->r_flags & RACK_ACKED)) {
4537                 /*
4538                  * Now can we merge where we worked
4539                  * with either the previous or
4540                  * next block?
4541                  */
4542                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4543                 while (next) {
4544                     if (next->r_flags & RACK_ACKED) {
4545                         /* yep this and next can be merged */
4546                         rsm = rack_merge_rsm(rack, rsm, next);
4547                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4548                     } else
4549                             break;
4550                 }
4551                 /* Now what about the previous? */
4552                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4553                 while (prev) {
4554                     if (prev->r_flags & RACK_ACKED) {
4555                         /* yep the previous and this can be merged */
4556                         rsm = rack_merge_rsm(rack, prev, rsm);
4557                         prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4558                     } else
4559                             break;
4560                 }
4561         }
4562         if (used_ref == 0) {
4563                 counter_u64_add(rack_sack_proc_all, 1);
4564         } else {
4565                 counter_u64_add(rack_sack_proc_short, 1);
4566         }
4567         /* Save off the next one for quick reference. */
4568         if (rsm)
4569                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4570         else
4571                 nrsm = NULL;
4572         *prsm = rack->r_ctl.rc_sacklast = nrsm;
4573         /* Pass back the moved. */
4574         *moved_two = moved;
4575         return (changed);
4576 }
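
/*
 * Illustrative example of the split paths above: given an un-acked rsm
 * covering [1000, 2000) and a SACK block of [1200, 1600) with both
 * neighbours un-acked, the map ends up as three pieces:
 *
 *   [1000, 1200)  original rsm, still un-acked
 *   [1200, 1600)  marked RACK_ACKED, 400 bytes added to rc_sacked
 *   [1600, 2000)  newly cloned rsm, still un-acked
 *
 * When an adjacent entry is already acked, the stack_map ("hookery")
 * paths instead grow that neighbour over the sacked range so that no
 * new map entry needs to be allocated.
 */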
4577
4578 static void inline
4579 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4580 {
4581         struct rack_sendmap *tmap;
4582
4583         tmap = NULL;
4584         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4585                 /* It is no longer sacked, mark it so */
4586                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4587 #ifdef INVARIANTS
4588                 if (rsm->r_in_tmap) {
4589                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4590                               rack, rsm, rsm->r_flags);
4591                 }
4592 #endif
4593                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4594                 /* Rebuild it into our tmap */
4595                 if (tmap == NULL) {
4596                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4597                         tmap = rsm;
4598                 } else {
4599                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4600                         tmap = rsm;
4601                 }
4602                 tmap->r_in_tmap = 1;
4603                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4604         }
4605         /*
4606          * Now let's possibly clear the sack filter so we start
4607          * recognizing sacks that cover this area.
4608          */
4609         if (rack_use_sack_filter)
4610                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4611
4612 }
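
/*
 * Illustrative example: the peer previously SACKed [2000, 3000), so that
 * rsm is marked RACK_ACKED and is off rc_tmap.  If a later segment then
 * carries a bare cumulative ACK of 2000, rack_log_ack() below sees th_ack
 * land on the start of an acked rsm and calls rack_peer_reneges(), which
 * strips the ACKED/SACK flags, subtracts those bytes from rc_sacked,
 * re-links the affected rsms at the front of the transmit map in order,
 * and clears the sack filter so future SACKs covering that range are
 * honored again.
 */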
4613
4614 static void
4615 rack_do_decay(struct tcp_rack *rack)
4616 {
4617 #ifdef NETFLIX_EXP_DETECTION
4618         struct timeval res;
4619
4620 #define timersub(tvp, uvp, vvp)                                         \
4621         do {                                                            \
4622                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
4623                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
4624                 if ((vvp)->tv_usec < 0) {                               \
4625                         (vvp)->tv_sec--;                                \
4626                         (vvp)->tv_usec += 1000000;                      \
4627                 }                                                       \
4628         } while (0)
4629
4630         timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res);
4631 #undef timersub
4632
4633         rack->r_ctl.input_pkt++;
4634         if ((rack->rc_in_persist) ||
4635             (res.tv_sec >= 1) ||
4636             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
4637                 /*
4638                  * Check for decay of non-SAD;
4639                  * we want all SAD detection metrics to
4640                  * decay by 1/4 for each second (or more) that has passed.
4641                  */
4642                 uint32_t pkt_delta;
4643
4644                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
4645                 /* Update our saved tracking values */
4646                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
4647                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
4648                 /* Now do we escape without decay? */
4649                 if (rack->rc_in_persist ||
4650                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
4651                     (pkt_delta < tcp_sad_low_pps)){
4652                         /*
4653                          * We don't decay idle connections
4654                          * or ones that have a low input pps.
4655                          */
4656                         return;
4657                 }
4658                 /* Decay the counters */
4659                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
4660                                                         tcp_sad_decay_val);
4661                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
4662                                                          tcp_sad_decay_val);
4663                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
4664                                                                tcp_sad_decay_val);
4665                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
4666                                                                 tcp_sad_decay_val);
4667         }
4668 #endif
4669 }
4670
4671 static void
4672 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4673 {
4674         uint32_t changed, entered_recovery = 0;
4675         struct tcp_rack *rack;
4676         struct rack_sendmap *rsm, *rm;
4677         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4678         register uint32_t th_ack;
4679         int32_t i, j, k, num_sack_blks = 0;
4680         uint32_t cts, acked, ack_point, sack_changed = 0;
4681         int loop_start = 0, moved_two = 0;
4682
4683         INP_WLOCK_ASSERT(tp->t_inpcb);
4684         if (th->th_flags & TH_RST) {
4685                 /* We don't log resets */
4686                 return;
4687         }
4688         rack = (struct tcp_rack *)tp->t_fb_ptr;
4689         cts = tcp_ts_getticks();
4690         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4691         changed = 0;
4692         th_ack = th->th_ack;
4693         if (rack->sack_attack_disable == 0)
4694                 rack_do_decay(rack);
4695         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
4696                 /*
4697                  * You only get credit for
4698                  * MSS and greater (and you get extra
4699                  * credit for larger cum-ack moves).
4700                  */
4701                 int ac;
4702
4703                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
4704                 rack->r_ctl.ack_count += ac;
4705                 counter_u64_add(rack_ack_total, ac);
4706         }
4707         if (rack->r_ctl.ack_count > 0xfff00000) {
4708                 /*
4709                  * reduce the number to keep us under
4710                  * a uint32_t.
4711                  */
4712                 rack->r_ctl.ack_count /= 2;
4713                 rack->r_ctl.sack_count /= 2;
4714         }
4715         if (SEQ_GT(th_ack, tp->snd_una)) {
4716                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4717                 tp->t_acktime = ticks;
4718         }
4719         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4720                 changed = th_ack - rsm->r_start;
4721         if (changed) {
4722                 /*
4723                  * The ACK point is advancing to th_ack, we must drop off
4724                  * the packets in the rack log and calculate any eligible
4725                  * RTT's.
4726                  */
4727                 rack->r_wanted_output++;
4728         more:
4729                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4730                 if (rsm == NULL) {
4731                         if ((th_ack - 1) == tp->iss) {
4732                                 /*
4733                                  * For the SYN incoming case we will not
4734                                  * have called tcp_output for the sending of
4735                                  * the SYN, so there will be no map. All
4736                                  * other cases should probably be a panic.
4737                                  */
4738                                 goto proc_sack;
4739                         }
4740                         if (tp->t_flags & TF_SENTFIN) {
4741                                 /* if we send a FIN we will not have a map */
4742                                 goto proc_sack;
4743                         }
4744 #ifdef INVARIANTS
4745                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4746                               tp,
4747                               th, tp->t_state, rack,
4748                               tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4749 #endif
4750                         goto proc_sack;
4751                 }
4752                 if (SEQ_LT(th_ack, rsm->r_start)) {
4753                         /* Huh map is missing this */
4754 #ifdef INVARIANTS
4755                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4756                                rsm->r_start,
4757                                th_ack, tp->t_state, rack->r_state);
4758 #endif
4759                         goto proc_sack;
4760                 }
4761                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4762                 /* Now do we consume the whole thing? */
4763                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4764                         /* It is all consumed. */
4765                         uint32_t left;
4766
4767                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4768                         rsm->r_rtr_bytes = 0;
4769                         if (rsm->r_flags & RACK_TLP)
4770                                 rack->r_ctl.rc_tlp_rtx_out = 0;
4771                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4772 #ifdef INVARIANTS
4773                         if (rm != rsm) {
4774                                 panic("removing head in rack:%p rsm:%p rm:%p",
4775                                       rack, rsm, rm);
4776                         }
4777 #endif
4778                         if (rsm->r_in_tmap) {
4779                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4780                                 rsm->r_in_tmap = 0;
4781                         }
4782                         if (rsm->r_flags & RACK_ACKED) {
4783                                 /*
4784                                  * It was acked on the scoreboard -- remove
4785                                  * it from total
4786                                  */
4787                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4788                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4789                                 /*
4790                                  * There are segments ACKED on the
4791                                  * scoreboard further up. We are seeing
4792                                  * reordering.
4793                                  */
4794                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4795                                 counter_u64_add(rack_reorder_seen, 1);
4796                                 rsm->r_flags |= RACK_ACKED;
4797                                 rack->r_ctl.rc_reorder_ts = cts;
4798                         }
4799                         left = th_ack - rsm->r_end;
4800                         if (rsm->r_rtr_cnt > 1) {
4801                                 /*
4802                                  * Technically we should make r_rtr_cnt be
4803                                  * monotonically increasing and just mod it to
4804                                  * the timestamp it is replacing.. that way
4805                                  * we would have the last 3 retransmits. Now
4806                                  * rc_loss_count will be wrong if we
4807                                  * retransmit something more than 2 times in
4808                                  * recovery :(
4809                                  */
4810                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4811                         }
4812                         /* Free back to zone */
4813                         rack_free(rack, rsm);
4814                         if (left) {
4815                                 goto more;
4816                         }
4817                         goto proc_sack;
4818                 }
4819                 if (rsm->r_flags & RACK_ACKED) {
4820                         /*
4821                          * It was acked on the scoreboard -- remove it from
4822                          * total for the part being cum-acked.
4823                          */
4824                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4825                 }
4826                 /*
4827                  * Clear the dup ack count for
4828                  * the piece that remains.
4829                  */
4830                 rsm->r_dupack = 0;
4831                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4832                 if (rsm->r_rtr_bytes) {
4833                         /*
4834                          * It was retransmitted adjust the
4835                          * sack holes for what was acked.
4836                          */
4837                         int ack_am;
4838
4839                         ack_am = (th_ack - rsm->r_start);
4840                         if (ack_am >= rsm->r_rtr_bytes) {
4841                                 rack->r_ctl.rc_holes_rxt -= ack_am;
4842                                 rsm->r_rtr_bytes -= ack_am;
4843                         }
4844                 }
4845                 /* Update where the piece starts */
4846                 rsm->r_start = th_ack;
4847         }
4848 proc_sack:
4849         /* Check for reneging */
4850         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4851         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4852                 /*
4853                  * The peer has moved snd_una up to
4854                  * the edge of this send, i.e. one
4855                  * that it had previously acked. The only
4856                  * way that can be true is if the peer threw
4857                  * away data (space issues) that it had
4858                  * previously sacked (else it would have
4859                  * given us snd_una up to rsm->r_end).
4860                  * We need to undo the acked markings here.
4861                  *
4862                  * Note we have to look to make sure th_ack is
4863                  * our rsm->r_start in case we get an old ack
4864                  * where th_ack is behind snd_una.
4865                  */
4866                 rack_peer_reneges(rack, rsm, th->th_ack);
4867         }
4868         if ((to->to_flags & TOF_SACK) == 0) {
4869                 /* We are done, nothing left */
4870                 goto out;
4871         }
4872         /* Sack block processing */
4873         if (SEQ_GT(th_ack, tp->snd_una))
4874                 ack_point = th_ack;
4875         else
4876                 ack_point = tp->snd_una;
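        /*
         * Pull each SACK option block out and sanity check it: only blocks
         * that start above the ack point and end at or below snd_max are
         * queued for scoreboard work. A block wholly at or below th_ack is
         * a D-SACK report and (when stats are compiled in) is only recorded.
         */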
4877         for (i = 0; i < to->to_nsacks; i++) {
4878                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4879                       &sack, sizeof(sack));
4880                 sack.start = ntohl(sack.start);
4881                 sack.end = ntohl(sack.end);
4882                 if (SEQ_GT(sack.end, sack.start) &&
4883                     SEQ_GT(sack.start, ack_point) &&
4884                     SEQ_LT(sack.start, tp->snd_max) &&
4885                     SEQ_GT(sack.end, ack_point) &&
4886                     SEQ_LEQ(sack.end, tp->snd_max)) {
4887                         sack_blocks[num_sack_blks] = sack;
4888                         num_sack_blks++;
4889 #ifdef NETFLIX_STATS
4890                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4891                            SEQ_LEQ(sack.end, th_ack)) {
4892                         /*
4893                          * It's a D-SACK block.
4894                          */
4895                         tcp_record_dsack(sack.start, sack.end);
4896 #endif
4897                 }
4898
4899         }
4900         /*
4901          * Sort the SACK blocks so we can update the rack scoreboard with
4902          * just one pass.
4903          */
4904         if (rack_use_sack_filter) {
4905                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
4906                                                  num_sack_blks, th->th_ack);
4907                 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
4908         }
4909         if (num_sack_blks == 0)  {
4910                 /* Nothing to sack (DSACKs?) */
4911                 goto out_with_totals;
4912         }
4913         if (num_sack_blks < 2) {
4914                 /* Only one, we don't need to sort */
4915                 goto do_sack_work;
4916         }
4917         /* Sort the sacks */
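        /*
         * A simple exchange sort by ascending end sequence is fine here;
         * a TCP option carries at most a handful of SACK blocks, so the
         * O(n^2) cost is negligible.
         */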
4918         for (i = 0; i < num_sack_blks; i++) {
4919                 for (j = i + 1; j < num_sack_blks; j++) {
4920                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4921                                 sack = sack_blocks[i];
4922                                 sack_blocks[i] = sack_blocks[j];
4923                                 sack_blocks[j] = sack;
4924                         }
4925                 }
4926         }
4927         /*
4928          * Now are any of the sack block ends the same (yes some
4929          * implementations send these)?
4930          */
4931 again:
4932         if (num_sack_blks == 0)
4933                 goto out_with_totals;
4934         if (num_sack_blks > 1) {
4935                 for (i = 0; i < num_sack_blks; i++) {
4936                         for (j = i + 1; j < num_sack_blks; j++) {
4937                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4938                                         /*
4939                                          * Ok these two have the same end; we
4940                                          * keep the one with the smaller start
4941                                          * (it covers more), throw away the
4942                                          * other and start again.
4943                                          */
4944                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4945                                                 /*
4946                                                  * The second block covers
4947                                                  * more area, use that
4948                                                  */
4949                                                 sack_blocks[i].start = sack_blocks[j].start;
4950                                         }
4951                                         /*
4952                                          * Now collapse out the dup-sack and
4953                                          * lower the count
4954                                          */
4955                                         for (k = (j + 1); k < num_sack_blks; k++) {
4956                                                 sack_blocks[j].start = sack_blocks[k].start;
4957                                                 sack_blocks[j].end = sack_blocks[k].end;
4958                                                 j++;
4959                                         }
4960                                         num_sack_blks--;
4961                                         goto again;
4962                                 }
4963                         }
4964                 }
4965         }
4966 do_sack_work:
4967         /*
4968          * First let's look to see if
4969          * we have retransmitted and
4970          * can use the next transmit.
4971          */
4972         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4973         if (rsm &&
4974             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
4975             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
4976                 /*
4977                  * We probably did the FR and the next
4978                  * SACK coming in continues as we would expect.
4979                  */
4980                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
4981                 if (acked) {
4982                         rack->r_wanted_output++;
4983                         changed += acked;
4984                         sack_changed += acked;
4985                 }
4986                 if (num_sack_blks == 1) {
4987                         /*
4988                          * This is what we would expect from
4989                          * a normal implementation to happen
4990                          * after we have retransmitted the FR,
4991                          * i.e. the sack-filter pushes down
4992                          * to 1 block and the next to be retransmitted
4993                          * is the sequence in the sack block (as more
4994                          * are acked). Count this as ACK'd data to boost
4995                          * up the chances of recovering from any false positives.
4996                          */
4997                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
4998                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
4999                         counter_u64_add(rack_express_sack, 1);
5000                         if (rack->r_ctl.ack_count > 0xfff00000) {
5001                                 /*
5002                                  * reduce the number to keep us under
5003                                  * a uint32_t.
5004                                  */
5005                                 rack->r_ctl.ack_count /= 2;
5006                                 rack->r_ctl.sack_count /= 2;
5007                         }
5008                         goto out_with_totals;
5009                 } else {
5010                         /*
5011                          * Start the loop through the
5012                          * rest of blocks, past the first block.
5013                          */
5014                         moved_two = 0;
5015                         loop_start = 1;
5016                 }
5017         }
5018         /* It's a sack of some sort */
5019         rack->r_ctl.sack_count++;
5020         if (rack->r_ctl.sack_count > 0xfff00000) {
5021                 /*
5022                  * reduce the number to keep us under
5023                  * a uint32_t.
5024                  */
5025                 rack->r_ctl.ack_count /= 2;
5026                 rack->r_ctl.sack_count /= 2;
5027         }
5028         counter_u64_add(rack_sack_total, 1);
5029         if (rack->sack_attack_disable) {
5030                 /* An attacker disablement is in place */
5031                 if (num_sack_blks > 1) {
5032                         rack->r_ctl.sack_count += (num_sack_blks - 1);
5033                         rack->r_ctl.sack_moved_extra++;
5034                         counter_u64_add(rack_move_some, 1);
5035                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
5036                                 rack->r_ctl.sack_moved_extra /= 2;
5037                                 rack->r_ctl.sack_noextra_move /= 2;
5038                         }
5039                 }
5040                 goto out;
5041         }
5042         rsm = rack->r_ctl.rc_sacklast;
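        /*
         * Start the walk from the cached rc_sacklast rsm; the blocks were
         * sorted into ascending order above, so rack_proc_sack_blk() can
         * usually continue forward from where the previous block left off
         * instead of searching the tree from scratch for every block.
         */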
5043         for (i = loop_start; i < num_sack_blks; i++) {
5044                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
5045                 if (acked) {
5046                         rack->r_wanted_output++;
5047                         changed += acked;
5048                         sack_changed += acked;
5049                 }
5050                 if (moved_two) {
5051                         /*
5052                          * If we did not get a SACK for at least a MSS and
5053                          * had to move at all, or if we moved more than our
5054                          * threshold, it counts against the "extra" move.
5055                          */
5056                         rack->r_ctl.sack_moved_extra += moved_two;
5057                         counter_u64_add(rack_move_some, 1);
5058                 } else {
5059                         /*
5060                          * else we did not have to move
5061                          * any more than we would expect.
5062                          */
5063                         rack->r_ctl.sack_noextra_move++;
5064                         counter_u64_add(rack_move_none, 1);
5065                 }
5066                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
5067                         /*
5068                          * If the SACK was not a full MSS then
5069                          * we add to sack_count the number of
5070                          * MSS's (or possibly more than
5071                          * a MSS if it's a TSO send) we had to skip by.
5072                          */
5073                         rack->r_ctl.sack_count += moved_two;
5074                         counter_u64_add(rack_sack_total, moved_two);
5075                 }
5076                 /*
5077                  * Now we need to set up for the next
5078                  * round. First we make sure we won't
5079                  * exceed the size of our uint32_t on
5080                  * the various counts, and then clear out
5081                  * moved_two.
5082                  */
5083                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
5084                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
5085                         rack->r_ctl.sack_moved_extra /= 2;
5086                         rack->r_ctl.sack_noextra_move /= 2;
5087                 }
5088                 if (rack->r_ctl.sack_count > 0xfff00000) {
5089                         rack->r_ctl.ack_count /= 2;
5090                         rack->r_ctl.sack_count /= 2;
5091                 }
5092                 moved_two = 0;
5093         }
5094 out_with_totals:
5095         if (num_sack_blks > 1) {
5096                 /*
5097                  * You get an extra stroke if
5098                  * you have more than one sack-blk, this
5099                  * could be where we are skipping forward
5100                  * and the sack-filter is still working, or
5101                  * it could be an attacker constantly
5102                  * moving us.
5103                  */
5104                 rack->r_ctl.sack_moved_extra++;
5105                 counter_u64_add(rack_move_some, 1);
5106         }
5107 out:
5108 #ifdef NETFLIX_EXP_DETECTION
5109         if ((rack->do_detection || tcp_force_detection) &&
5110             tcp_sack_to_ack_thresh &&
5111             tcp_sack_to_move_thresh &&
5112             ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
5113                 /*
5114                  * We have thresholds set to find
5115                  * possible attackers and disable sack.
5116                  * Check them.
5117                  */
5118                 uint64_t ackratio, moveratio, movetotal;
5119
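                /*
                 * Both ratios are computed per-mille (scaled by 1000):
                 * ackratio relates the SACK work done (sack_count) to the
                 * cumulative ACK progress (ack_count), and moveratio is the
                 * share of map moves that were "extra". They are compared
                 * against the sysctl thresholds below to decide whether
                 * SACK processing should be disabled.
                 */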
5120                 /* Log detecting */
5121                 rack_log_sad(rack, 1);
5122                 ackratio = (uint64_t)(rack->r_ctl.sack_count);
5123                 ackratio *= (uint64_t)(1000);
5124                 if (rack->r_ctl.ack_count)
5125                         ackratio /= (uint64_t)(rack->r_ctl.ack_count);
5126                 else {
5127                         /* We really should not hit here */
5128                         ackratio = 1000;
5129                 }
5130                 if ((rack->sack_attack_disable  == 0) &&
5131                     (ackratio > rack_highest_sack_thresh_seen))
5132                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
5133                 movetotal = rack->r_ctl.sack_moved_extra;
5134                 movetotal += rack->r_ctl.sack_noextra_move;
5135                 moveratio = rack->r_ctl.sack_moved_extra;
5136                 moveratio *= (uint64_t)1000;
5137                 if (movetotal)
5138                         moveratio /= movetotal;
5139                 else {
5140                         /* No moves, that's pretty good */
5141                         moveratio = 0;
5142                 }
5143                 if ((rack->sack_attack_disable == 0) &&
5144                     (moveratio > rack_highest_move_thresh_seen))
5145                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
5146                 if (rack->sack_attack_disable == 0) {
5147                         if ((ackratio > tcp_sack_to_ack_thresh) &&
5148                             (moveratio > tcp_sack_to_move_thresh)) {
5149                                 /* Disable sack processing */
5150                                 rack->sack_attack_disable = 1;
5151                                 if (rack->r_rep_attack == 0) {
5152                                         rack->r_rep_attack = 1;
5153                                         counter_u64_add(rack_sack_attacks_detected, 1);
5154                                 }
5155                                 if (tcp_attack_on_turns_on_logging) {
5156                                         /*
5157                                          * Turn on logging, used for debugging
5158                                          * false positives.
5159                                          */
5160                                         rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
5161                                 }
5162                                 /* Clamp the cwnd at flight size */
5163                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
5164                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
5165                                 rack_log_sad(rack, 2);
5166                         }
5167                 } else {
5168                         /* We are sack-disabled, check for false positives */
5169                         if ((ackratio <= tcp_restoral_thresh) ||
5170                             (rack->r_ctl.rc_num_maps_alloced  < tcp_map_minimum)) {
5171                                 rack->sack_attack_disable  = 0;
5172                                 rack_log_sad(rack, 3);
5173                                 /* Restart counting */
5174                                 rack->r_ctl.sack_count = 0;
5175                                 rack->r_ctl.sack_moved_extra = 0;
5176                                 rack->r_ctl.sack_noextra_move = 1;
5177                                 rack->r_ctl.ack_count = max(1,
5178                                       (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
5179
5180                                 if (rack->r_rep_reverse == 0) {
5181                                         rack->r_rep_reverse = 1;
5182                                         counter_u64_add(rack_sack_attacks_reversed, 1);
5183                                 }
5184                                 /* Restore the cwnd */
5185                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
5186                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
5187                         }
5188                 }
5189         }
5190 #endif
5191         if (changed) {
5192                 /* Something changed, cancel the rack timer */
5193                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5194         }
5195         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
5196                 /*
5197                  * Ok we have a high probability that we need to go into
5198                  * recovery since we have data sack'd
5199                  */
5200                 struct rack_sendmap *rsm;
5201                 uint32_t tsused;
5202
5203                 tsused = tcp_ts_getticks();
5204                 rsm = tcp_rack_output(tp, rack, tsused);
5205                 if (rsm) {
5206                         /* Enter recovery */
5207                         rack->r_ctl.rc_rsm_start = rsm->r_start;
5208                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
5209                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
5210                         entered_recovery = 1;
5211                         rack_cong_signal(tp, NULL, CC_NDUPACK);
5212                         /*
5213                          * When we enter recovery we need to assure we send
5214                          * one packet.
5215                          */
5216                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5217                         rack_log_to_prr(rack, 8);
5218                         rack->r_timer_override = 1;
5219                 }
5220         }
5221         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
5222                 /* Deal with changed and PRR here (in recovery only) */
5223                 uint32_t pipe, snd_una;
5224
5225                 rack->r_ctl.rc_prr_delivered += changed;
5226                 /* Compute prr_sndcnt */
5227                 if (SEQ_GT(tp->snd_una, th_ack)) {
5228                         snd_una = tp->snd_una;
5229                 } else {
5230                         snd_una = th_ack;
5231                 }
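                /*
                 * PRR (RFC 6937): "pipe" estimates the data still in flight
                 * (outstanding minus SACKed, plus retransmitted holes). While
                 * pipe is above ssthresh we ration new sends proportionally,
                 * roughly sndcnt = prr_delivered * ssthresh / RecoverFS minus
                 * what has already gone out; once pipe falls to or below
                 * ssthresh we use the limited-transmit style bound below.
                 */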
5232                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
5233                 if (pipe > tp->snd_ssthresh) {
5234                         long sndcnt;
5235
5236                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
5237                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
5238                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
5239                         else {
5240                                 rack->r_ctl.rc_prr_sndcnt = 0;
5241                                 rack_log_to_prr(rack, 9);
5242                                 sndcnt = 0;
5243                         }
5244                         sndcnt++;
5245                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
5246                                 sndcnt -= rack->r_ctl.rc_prr_out;
5247                         else
5248                                 sndcnt = 0;
5249                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
5250                         rack_log_to_prr(rack, 10);
5251                 } else {
5252                         uint32_t limit;
5253
5254                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
5255                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
5256                         else
5257                                 limit = 0;
5258                         if (changed > limit)
5259                                 limit = changed;
5260                         limit += ctf_fixed_maxseg(tp);
5261                         if (tp->snd_ssthresh > pipe) {
5262                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
5263                                 rack_log_to_prr(rack, 11);
5264                         } else {
5265                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
5266                                 rack_log_to_prr(rack, 12);
5267                         }
5268                 }
5269                 if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) {
5270                         rack->r_timer_override = 1;
5271                 }
5272         }
5273 }
5274
5275 static void
5276 rack_strike_dupack(struct tcp_rack *rack)
5277 {
5278         struct rack_sendmap *rsm;
5279
5280         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
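        /*
         * Charge the duplicate ACK against the oldest outstanding rsm; once
         * its count reaches DUP_ACK_THRESHOLD we ask for output so the
         * retransmit logic can consider that segment for recovery.
         */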
5281         if (rsm && (rsm->r_dupack < 0xff)) {
5282                 rsm->r_dupack++;
5283                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
5284                         rack->r_wanted_output = 1;
5285                         rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
5286                 } else {
5287                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
5288                 }
5289         }
5290 }
5291
5292 /*
5293  * Return value of 1, we do not need to call rack_process_data().
5294  * Return value of 0, rack_process_data can be called.
5295  * For ret_val: if it's 0 the TCP is locked, if it's non-zero
5296  * it's unlocked and probably unsafe to touch the TCB.
5297  */
5298 static int
5299 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5300     struct tcpcb *tp, struct tcpopt *to,
5301     uint32_t tiwin, int32_t tlen,
5302     int32_t * ofia, int32_t thflags, int32_t * ret_val)
5303 {
5304         int32_t ourfinisacked = 0;
5305         int32_t nsegs, acked_amount;
5306         int32_t acked;
5307         struct mbuf *mfree;
5308         struct tcp_rack *rack;
5309         int32_t recovery = 0;
5310
5311         rack = (struct tcp_rack *)tp->t_fb_ptr;
5312         if (SEQ_GT(th->th_ack, tp->snd_max)) {
5313                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
5314                 rack->r_wanted_output++;
5315                 return (1);
5316         }
5317         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
5318                 if (rack->rc_in_persist)
5319                         tp->t_rxtshift = 0;
5320                 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
5321                         rack_strike_dupack(rack);
5322                 rack_log_ack(tp, to, th);
5323         }
5324         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5325                 /*
5326                  * Old ack, behind (or duplicate to) the last one rcv'd
5327                  * Note: Should mark that reordering is occurring! We should also
5328                  * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
5329                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
5330                  * retran and> ack 3
5331                  */
5332                 return (0);
5333         }
5334         /*
5335          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
5336          * something we sent.
5337          */
5338         if (tp->t_flags & TF_NEEDSYN) {
5339                 /*
5340                  * T/TCP: Connection was half-synchronized, and our SYN has
5341                  * been ACK'd (so connection is now fully synchronized).  Go
5342                  * to non-starred state, increment snd_una for ACK of SYN,
5343                  * and check if we can do window scaling.
5344                  */
5345                 tp->t_flags &= ~TF_NEEDSYN;
5346                 tp->snd_una++;
5347                 /* Do window scaling? */
5348                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5349                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5350                         tp->rcv_scale = tp->request_r_scale;
5351                         /* Send window already scaled. */
5352                 }
5353         }
5354         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5355         INP_WLOCK_ASSERT(tp->t_inpcb);
5356
5357         acked = BYTES_THIS_ACK(tp, th);
5358         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5359         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
5360
5361         /*
5362          * If we just performed our first retransmit, and the ACK arrives
5363          * within our recovery window, then it was a mistake to do the
5364          * retransmit in the first place.  Recover our original cwnd and
5365          * ssthresh, and proceed to transmit where we left off.
5366          */
5367         if (tp->t_flags & TF_PREVVALID) {
5368                 tp->t_flags &= ~TF_PREVVALID;
5369                 if (tp->t_rxtshift == 1 &&
5370                     (int)(ticks - tp->t_badrxtwin) < 0)
5371                         rack_cong_signal(tp, th, CC_RTO_ERR);
5372         }
5373         /*
5374          * If we have a timestamp reply, update smoothed round trip time. If
5375          * no timestamp is present but transmit timer is running and timed
5376          * sequence number was acked, update smoothed round trip time. Since
5377          * we now have an rtt measurement, cancel the timer backoff (cf.,
5378          * Phil Karn's retransmit alg.). Recompute the initial retransmit
5379          * timer.
5380          *
5381          * Some boxes send broken timestamp replies during the SYN+ACK
5382          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5383          * and blow up the retransmit timer.
5384          */
5385         /*
5386          * If all outstanding data is acked, stop retransmit timer and
5387          * remember to restart (more output or persist). If there is more
5388          * data to be acked, restart retransmit timer, using current
5389          * (possibly backed-off) value.
5390          */
5391         if (th->th_ack == tp->snd_max) {
5392                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5393                 rack->r_wanted_output++;
5394         }
5395         if (acked == 0) {
5396                 if (ofia)
5397                         *ofia = ourfinisacked;
5398                 return (0);
5399         }
5400         if (rack->r_ctl.rc_early_recovery) {
5401                 if (IN_RECOVERY(tp->t_flags)) {
5402                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5403                             (SEQ_LT(th->th_ack, tp->snd_max))) {
5404                                 tcp_rack_partialack(tp, th);
5405                         } else {
5406                                 rack_post_recovery(tp, th);
5407                                 recovery = 1;
5408                         }
5409                 }
5410         }
5411         /*
5412          * Let the congestion control algorithm update congestion control
5413          * related information. This typically means increasing the
5414          * congestion window.
5415          */
5416         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
5417         SOCKBUF_LOCK(&so->so_snd);
5418         acked_amount = min(acked, (int)sbavail(&so->so_snd));
5419         tp->snd_wnd -= acked_amount;
5420         mfree = sbcut_locked(&so->so_snd, acked_amount);
5421         if ((sbused(&so->so_snd) == 0) &&
5422             (acked > acked_amount) &&
5423             (tp->t_state >= TCPS_FIN_WAIT_1)) {
5424                 ourfinisacked = 1;
5425         }
5426         /* NB: sowwakeup_locked() does an implicit unlock. */
5427         sowwakeup_locked(so);
5428         m_freem(mfree);
5429         if (rack->r_ctl.rc_early_recovery == 0) {
5430                 if (IN_RECOVERY(tp->t_flags)) {
5431                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5432                             (SEQ_LT(th->th_ack, tp->snd_max))) {
5433                                 tcp_rack_partialack(tp, th);
5434                         } else {
5435                                 rack_post_recovery(tp, th);
5436                         }
5437                 }
5438         }
5439         tp->snd_una = th->th_ack;
5440         if (SEQ_GT(tp->snd_una, tp->snd_recover))
5441                 tp->snd_recover = tp->snd_una;
5442
5443         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
5444                 tp->snd_nxt = tp->snd_una;
5445         }
5446         if (tp->snd_una == tp->snd_max) {
5447                 /* Nothing left outstanding */
5448                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5449                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
5450                         tp->t_acktime = 0;
5451                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5452                 /* Set need output so persist might get set */
5453                 rack->r_wanted_output++;
5454                 if (rack_use_sack_filter)
5455                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5456                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
5457                     (sbavail(&so->so_snd) == 0) &&
5458                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
5459                         /*
5460                          * The socket was gone and the
5461                          * peer sent data, time to
5462                          * reset him.
5463                          */
5464                         *ret_val = 1;
5465                         tp = tcp_close(tp);
5466                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
5467                         return (1);
5468                 }
5469         }
5470         if (ofia)
5471                 *ofia = ourfinisacked;
5472         return (0);
5473 }
5474
5475 static void
5476 rack_collapsed_window(struct tcp_rack *rack)
5477 {
5478         /*
5479          * Now we must walk the
5480          * send map and divide the
5481          * ones left stranded. These
5482          * guys can't cause us to abort
5483          * the connection and are really
5484                  * "unsent". However, if a buggy
5485                  * client actually did keep some
5486                  * of the data, i.e. collapsed the win,
5487                  * refused to ack, and then opened
5488                  * the win and acked that data, we would
5489                  * get into an ack war; so the simpler
5490                  * method of just pretending we
5491                  * did not send those segments
5492                  * won't work.
5493          */
5494         struct rack_sendmap *rsm, *nrsm, fe, *insret;
5495         tcp_seq max_seq;
5496         uint32_t maxseg;
5497
5498         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
5499         maxseg = ctf_fixed_maxseg(rack->rc_tp);
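        /*
         * max_seq is the right edge of the window the peer is now offering
         * (snd_una + snd_wnd). Everything in the send map at or beyond it
         * was stranded by the collapse and gets flagged RACK_RWND_COLLAPSED;
         * rack_un_collapse_window() clears the flags once the peer opens
         * the window back up.
         */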
5500         memset(&fe, 0, sizeof(fe));
5501         fe.r_start = max_seq;
5502         /* Find the first seq past or at maxseq */
5503         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
5504         if (rsm == NULL) {
5505                 /* Nothing to do, strange */
5506                 rack->rc_has_collapsed = 0;
5507                 return;
5508         }
5509         /*
5510          * Now do we need to split at
5511          * the collapse point?
5512          */
5513         if (SEQ_GT(max_seq, rsm->r_start)) {
5514                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
5515                 if (nrsm == NULL) {
5516                         /* We can't get a rsm, mark all? */
5517                         nrsm = rsm;
5518                         goto no_split;
5519                 }
5520                 /* Clone it */
5521                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
5522                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
5523 #ifdef INVARIANTS
5524                 if (insret != NULL) {
5525                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
5526                               nrsm, insret, rack, rsm);
5527                 }
5528 #endif
5529                 if (rsm->r_in_tmap) {
5530                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5531                         nrsm->r_in_tmap = 1;
5532                 }
5533                 /*
5534                  * Set the new RSM as the
5535                  * collapsed starting point
5536                  */
5537                 rsm = nrsm;
5538         }
5539 no_split:
5540         counter_u64_add(rack_collapsed_win, 1);
5541         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
5542                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
5543                 rack->rc_has_collapsed = 1;
5544         }
5545 }
5546
5547 static void
5548 rack_un_collapse_window(struct tcp_rack *rack)
5549 {
5550         struct rack_sendmap *rsm;
5551
5552         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
5553                 if (rsm->r_flags & RACK_RWND_COLLAPSED)
5554                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
5555                 else
5556                         break;
5557         }
5558         rack->rc_has_collapsed = 0;
5559 }
5560
5561 /*
5562  * Return value of 1, the TCB is unlocked and most
5563  * likely gone; return value of 0, the TCP is still
5564  * locked.
5565  */
5566 static int
5567 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
5568     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
5569     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5570 {
5571         /*
5572          * Update window information. Don't look at window if no ACK: TAC's
5573          * send garbage on first SYN.
5574          */
5575         int32_t nsegs;
5576         int32_t tfo_syn;
5577         struct tcp_rack *rack;
5578
5579         rack = (struct tcp_rack *)tp->t_fb_ptr;
5580         INP_WLOCK_ASSERT(tp->t_inpcb);
5581         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5582         if ((thflags & TH_ACK) &&
5583             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
5584             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
5585             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
5586                 /* keep track of pure window updates */
5587                 if (tlen == 0 &&
5588                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
5589                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
5590                 tp->snd_wnd = tiwin;
5591                 tp->snd_wl1 = th->th_seq;
5592                 tp->snd_wl2 = th->th_ack;
5593                 if (tp->snd_wnd > tp->max_sndwnd)
5594                         tp->max_sndwnd = tp->snd_wnd;
5595                 rack->r_wanted_output++;
5596         } else if (thflags & TH_ACK) {
5597                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
5598                         tp->snd_wnd = tiwin;
5599                         tp->snd_wl1 = th->th_seq;
5600                         tp->snd_wl2 = th->th_ack;
5601                 }
5602         }
5603         if (tp->snd_wnd < ctf_outstanding(tp))
5604                 /* The peer collapsed the window */
5605                 rack_collapsed_window(rack);
5606         else if (rack->rc_has_collapsed)
5607                 rack_un_collapse_window(rack);
5608         /* Was persist timer active and now we have window space? */
5609         if ((rack->rc_in_persist != 0) &&
5610             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
5611                                 rack->r_ctl.rc_pace_min_segs))) {
5612                 rack_exit_persist(tp, rack);
5613                 tp->snd_nxt = tp->snd_max;
5614                 /* Make sure we output to start the timer */
5615                 rack->r_wanted_output++;
5616         }
5617         /* Do we enter persists? */
5618         if ((rack->rc_in_persist == 0) &&
5619             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
5620             TCPS_HAVEESTABLISHED(tp->t_state) &&
5621             (tp->snd_max == tp->snd_una) &&
5622             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
5623             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
5624                 /*
5625                  * Here the rwnd is less than
5626                  * the pacing size, we are established,
5627                  * nothing is outstanding, and there is
5628                  * data to send. Enter persists.
5629                  */
5630                 tp->snd_nxt = tp->snd_una;
5631                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
5632         }
5633         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
5634                 m_freem(m);
5635                 return (0);
5636         }
5637         /*
5638          * Process segments with URG.
5639          */
5640         if ((thflags & TH_URG) && th->th_urp &&
5641             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5642                 /*
5643                  * This is a kludge, but if we receive and accept random
5644                  * urgent pointers, we'll crash in soreceive.  It's hard to
5645                  * imagine someone actually wanting to send this much urgent
5646                  * data.
5647                  */
5648                 SOCKBUF_LOCK(&so->so_rcv);
5649                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
5650                         th->th_urp = 0; /* XXX */
5651                         thflags &= ~TH_URG;     /* XXX */
5652                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
5653                         goto dodata;    /* XXX */
5654                 }
5655                 /*
5656                  * If this segment advances the known urgent pointer, then
5657                  * mark the data stream.  This should not happen in
5658                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
5659                  * FIN has been received from the remote side. In these
5660                  * states we ignore the URG.
5661                  *
5662                  * According to RFC961 (Assigned Protocols), the urgent
5663                  * pointer points to the last octet of urgent data.  We
5664                  * continue, however, to consider it to indicate the first
5665                  * octet of data past the urgent section as the original
5666                  * spec states (in one of two places).
5667                  */
5668                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
5669                         tp->rcv_up = th->th_seq + th->th_urp;
5670                         so->so_oobmark = sbavail(&so->so_rcv) +
5671                             (tp->rcv_up - tp->rcv_nxt) - 1;
5672                         if (so->so_oobmark == 0)
5673                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
5674                         sohasoutofband(so);
5675                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
5676                 }
5677                 SOCKBUF_UNLOCK(&so->so_rcv);
5678                 /*
5679                  * Remove out of band data so doesn't get presented to user.
5680                  * This can happen independent of advancing the URG pointer,
5681                  * but if two URG's are pending at once, some out-of-band
5682                  * data may creep in... ick.
5683                  */
5684                 if (th->th_urp <= (uint32_t) tlen &&
5685                     !(so->so_options & SO_OOBINLINE)) {
5686                         /* hdr drop is delayed */
5687                         tcp_pulloutofband(so, th, m, drop_hdrlen);
5688                 }
5689         } else {
5690                 /*
5691                  * If no out of band data is expected, pull receive urgent
5692                  * pointer along with the receive window.
5693                  */
5694                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
5695                         tp->rcv_up = tp->rcv_nxt;
5696         }
5697 dodata:                         /* XXX */
5698         INP_WLOCK_ASSERT(tp->t_inpcb);
5699
5700         /*
5701          * Process the segment text, merging it into the TCP sequencing
5702          * queue, and arranging for acknowledgment of receipt if necessary.
5703          * This process logically involves adjusting tp->rcv_wnd as data is
5704          * presented to the user (this happens in tcp_usrreq.c, case
5705          * PRU_RCVD).  If a FIN has already been received on this connection
5706          * then we just ignore the text.
5707          */
5708         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
5709                    IS_FASTOPEN(tp->t_flags));
5710         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
5711             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5712                 tcp_seq save_start = th->th_seq;
5713                 tcp_seq save_rnxt  = tp->rcv_nxt;
5714                 int     save_tlen  = tlen;
5715
5716                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5717                 /*
5718                  * Insert segment which includes th into TCP reassembly
5719                  * queue with control block tp.  Set thflags to whether
5720                  * reassembly now includes a segment with FIN.  This handles
5721                  * the common case inline (segment is the next to be
5722                  * received on an established connection, and the queue is
5723                  * empty), avoiding linkage into and removal from the queue
5724                  * and repetition of various conversions. Set DELACK for
5725                  * segments received in order, but ack immediately when
5726                  * segments are out of order (so fast retransmit can work).
5727                  */
5728                 if (th->th_seq == tp->rcv_nxt &&
5729                     SEGQ_EMPTY(tp) &&
5730                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
5731                     tfo_syn)) {
5732 #ifdef NETFLIX_SB_LIMITS
5733                         u_int mcnt, appended;
5734
5735                         if (so->so_rcv.sb_shlim) {
5736                                 mcnt = m_memcnt(m);
5737                                 appended = 0;
5738                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5739                                     CFO_NOSLEEP, NULL) == false) {
5740                                         counter_u64_add(tcp_sb_shlim_fails, 1);
5741                                         m_freem(m);
5742                                         return (0);
5743                                 }
5744                         }
5745 #endif
5746                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
5747                                 rack_timer_cancel(tp, rack,
5748                                     rack->r_ctl.rc_rcvtime, __LINE__);
5749                                 tp->t_flags |= TF_DELACK;
5750                         } else {
5751                                 rack->r_wanted_output++;
5752                                 tp->t_flags |= TF_ACKNOW;
5753                         }
5754                         tp->rcv_nxt += tlen;
5755                         thflags = th->th_flags & TH_FIN;
5756                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
5757                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
5758                         SOCKBUF_LOCK(&so->so_rcv);
5759                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5760                                 m_freem(m);
5761                         } else
5762 #ifdef NETFLIX_SB_LIMITS
5763                                 appended =
5764 #endif
5765                                         sbappendstream_locked(&so->so_rcv, m, 0);
5766                         /* NB: sorwakeup_locked() does an implicit unlock. */
5767                         sorwakeup_locked(so);
5768 #ifdef NETFLIX_SB_LIMITS
5769                         if (so->so_rcv.sb_shlim && appended != mcnt)
5770                                 counter_fo_release(so->so_rcv.sb_shlim,
5771                                     mcnt - appended);
5772 #endif
5773                 } else {
5774                         /*
5775                          * XXX: Due to the header drop above "th" is
5776                          * theoretically invalid by now.  Fortunately
5777                          * m_adj() doesn't actually free any mbufs when
5778                          * trimming from the head.
5779                          */
5780                         tcp_seq temp = save_start;
5781                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
5782                         tp->t_flags |= TF_ACKNOW;
5783                 }
5784                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
5785                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
5786                                 /*
5787                                  * DSACK actually handled in the fastpath
5788                                  * above.
5789                                  */
5790                                 tcp_update_sack_list(tp, save_start,
5791                                     save_start + save_tlen);
5792                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
5793                                 if ((tp->rcv_numsacks >= 1) &&
5794                                     (tp->sackblks[0].end == save_start)) {
5795                                         /*
5796                                          * Partial overlap, recorded at todrop
5797                                          * above.
5798                                          */
5799                                         tcp_update_sack_list(tp,
5800                                             tp->sackblks[0].start,
5801                                             tp->sackblks[0].end);
5802                                 } else {
5803                                         tcp_update_dsack_list(tp, save_start,
5804                                             save_start + save_tlen);
5805                                 }
5806                         } else if (tlen >= save_tlen) {
5807                                 /* Update of sackblks. */
5808                                 tcp_update_dsack_list(tp, save_start,
5809                                     save_start + save_tlen);
5810                         } else if (tlen > 0) {
5811                                 tcp_update_dsack_list(tp, save_start,
5812                                     save_start + tlen);
5813                         }
5814                 }
5815         } else {
5816                 m_freem(m);
5817                 thflags &= ~TH_FIN;
5818         }
5819
5820         /*
5821          * If FIN is received ACK the FIN and let the user know that the
5822          * connection is closing.
5823          */
5824         if (thflags & TH_FIN) {
5825                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5826                         socantrcvmore(so);
5827                         /*
5828                          * If connection is half-synchronized (ie NEEDSYN
5829                          * flag on) then delay ACK, so it may be piggybacked
5830                          * when SYN is sent. Otherwise, since we received a
5831                          * FIN then no more input can be expected, send ACK
5832                          * now.
5833                          */
5834                         if (tp->t_flags & TF_NEEDSYN) {
5835                                 rack_timer_cancel(tp, rack,
5836                                     rack->r_ctl.rc_rcvtime, __LINE__);
5837                                 tp->t_flags |= TF_DELACK;
5838                         } else {
5839                                 tp->t_flags |= TF_ACKNOW;
5840                         }
5841                         tp->rcv_nxt++;
5842                 }
5843                 switch (tp->t_state) {
5844
5845                         /*
5846                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
5847                          * CLOSE_WAIT state.
5848                          */
5849                 case TCPS_SYN_RECEIVED:
5850                         tp->t_starttime = ticks;
5851                         /* FALLTHROUGH */
5852                 case TCPS_ESTABLISHED:
5853                         rack_timer_cancel(tp, rack,
5854                             rack->r_ctl.rc_rcvtime, __LINE__);
5855                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
5856                         break;
5857
5858                         /*
5859                          * If still in FIN_WAIT_1 STATE FIN has not been
5860                          * acked so enter the CLOSING state.
5861                          */
5862                 case TCPS_FIN_WAIT_1:
5863                         rack_timer_cancel(tp, rack,
5864                             rack->r_ctl.rc_rcvtime, __LINE__);
5865                         tcp_state_change(tp, TCPS_CLOSING);
5866                         break;
5867
5868                         /*
5869                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
5870                          * starting the time-wait timer, turning off the
5871                          * other standard timers.
5872                          */
5873                 case TCPS_FIN_WAIT_2:
5874                         rack_timer_cancel(tp, rack,
5875                             rack->r_ctl.rc_rcvtime, __LINE__);
5876                         tcp_twstart(tp);
5877                         return (1);
5878                 }
5879         }
5880         /*
5881          * Return any desired output.
5882          */
5883         if ((tp->t_flags & TF_ACKNOW) ||
5884             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
5885                 rack->r_wanted_output++;
5886         }
5887         INP_WLOCK_ASSERT(tp->t_inpcb);
5888         return (0);
5889 }
5890
5891 /*
5892  * Here nothing is really faster, it's just that we
5893  * have broken out the fast-data path also just like
5894  * the fast-ack.
5895  */
5896 static int
5897 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
5898     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5899     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
5900 {
5901         int32_t nsegs;
5902         int32_t newsize = 0;    /* automatic sockbuf scaling */
5903         struct tcp_rack *rack;
5904 #ifdef NETFLIX_SB_LIMITS
5905         u_int mcnt, appended;
5906 #endif
5907 #ifdef TCPDEBUG
5908         /*
5909          * The size of tcp_saveipgen must be the size of the max ip header,
5910          * now IPv6.
5911          */
5912         u_char tcp_saveipgen[IP6_HDR_LEN];
5913         struct tcphdr tcp_savetcp;
5914         short ostate = 0;
5915
5916 #endif
5917         /*
5918          * If last ACK falls within this segment's sequence numbers, record
5919          * the timestamp. NOTE that the test is modified according to the
5920          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5921          */
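        /*
         * The checks below implement header prediction for incoming data:
         * the segment must be the next in sequence, nothing may be in
         * retransmission, the window must be unchanged, no SYN/FIN may be
         * pending, the timestamp must not be old, the ACK must advance
         * nothing new, and the data must fit in the receive buffer. Any
         * miss returns 0 so the caller takes the slow path.
         */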
5922         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5923                 return (0);
5924         }
5925         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5926                 return (0);
5927         }
5928         if (tiwin && tiwin != tp->snd_wnd) {
5929                 return (0);
5930         }
5931         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5932                 return (0);
5933         }
5934         if (__predict_false((to->to_flags & TOF_TS) &&
5935             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5936                 return (0);
5937         }
5938         if (__predict_false((th->th_ack != tp->snd_una))) {
5939                 return (0);
5940         }
5941         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5942                 return (0);
5943         }
5944         if ((to->to_flags & TOF_TS) != 0 &&
5945             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5946                 tp->ts_recent_age = tcp_ts_getticks();
5947                 tp->ts_recent = to->to_tsval;
5948         }
5949         rack = (struct tcp_rack *)tp->t_fb_ptr;
5950         /*
5951          * This is a pure, in-sequence data packet with nothing on the
5952          * reassembly queue and we have enough buffer space to take it.
5953          */
5954         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5955
5956 #ifdef NETFLIX_SB_LIMITS
5957         if (so->so_rcv.sb_shlim) {
5958                 mcnt = m_memcnt(m);
5959                 appended = 0;
5960                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5961                     CFO_NOSLEEP, NULL) == false) {
5962                         counter_u64_add(tcp_sb_shlim_fails, 1);
5963                         m_freem(m);
5964                         return (1);
5965                 }
5966         }
5967 #endif
5968         /* Clean receiver SACK report if present */
5969         if (tp->rcv_numsacks)
5970                 tcp_clean_sackreport(tp);
5971         KMOD_TCPSTAT_INC(tcps_preddat);
5972         tp->rcv_nxt += tlen;
5973         /*
5974          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5975          */
5976         tp->snd_wl1 = th->th_seq;
5977         /*
5978          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5979          */
5980         tp->rcv_up = tp->rcv_nxt;
5981         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
5982         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
5983 #ifdef TCPDEBUG
5984         if (so->so_options & SO_DEBUG)
5985                 tcp_trace(TA_INPUT, ostate, tp,
5986                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
5987 #endif
5988         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5989
5990         /* Add data to socket buffer. */
5991         SOCKBUF_LOCK(&so->so_rcv);
5992         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5993                 m_freem(m);
5994         } else {
5995                 /*
5996                  * Set new socket buffer size. Give up when limit is
5997                  * reached.
5998                  */
5999                 if (newsize)
6000                         if (!sbreserve_locked(&so->so_rcv,
6001                             newsize, so, NULL))
6002                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
6003                 m_adj(m, drop_hdrlen);  /* delayed header drop */
6004 #ifdef NETFLIX_SB_LIMITS
6005                 appended =
6006 #endif
6007                         sbappendstream_locked(&so->so_rcv, m, 0);
6008                 ctf_calc_rwin(so, tp);
6009         }
6010         /* NB: sorwakeup_locked() does an implicit unlock. */
6011         sorwakeup_locked(so);
6012 #ifdef NETFLIX_SB_LIMITS
6013         if (so->so_rcv.sb_shlim && mcnt != appended)
6014                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
6015 #endif
6016         if (DELAY_ACK(tp, tlen)) {
6017                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6018                 tp->t_flags |= TF_DELACK;
6019         } else {
6020                 tp->t_flags |= TF_ACKNOW;
6021                 rack->r_wanted_output++;
6022         }
6023         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
6024                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
6025         return (1);
6026 }
6027
6028 /*
6029  * This subfunction is used to try to highly optimize the
6030  * fast path. We again allow window updates that are
6031  * in sequence to remain in the fast-path. We also add
6032  * in the __predict's to attempt to help the compiler.
6033  * Note that if we return a 0, then we can *not* process
6034  * it and the caller should push the packet into the
6035  * slow-path.
6036  */
6037 static int
6038 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6039     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6040     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts, uint8_t iptos)
6041 {
6042         int32_t acked;
6043         int32_t nsegs;
6044
6045 #ifdef TCPDEBUG
6046         /*
6047          * The size of tcp_saveipgen must be the size of the max ip header,
6048          * currently IPv6.
6049          */
6050         u_char tcp_saveipgen[IP6_HDR_LEN];
6051         struct tcphdr tcp_savetcp;
6052         short ostate = 0;
6053
6054 #endif
6055         struct tcp_rack *rack;
6056
6057         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
6058                 /* Old ack, behind (or duplicate to) the last one rcv'd */
6059                 return (0);
6060         }
6061         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
6062                 /* Above what we have sent? */
6063                 return (0);
6064         }
6065         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
6066                 /* We are retransmitting */
6067                 return (0);
6068         }
6069         if (__predict_false(tiwin == 0)) {
6070                 /* zero window */
6071                 return (0);
6072         }
6073         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
6074                 /* We need a SYN or a FIN, unlikely.. */
6075                 return (0);
6076         }
6077         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
6078                 /* Timestamp is behind .. old ack with seq wrap? */
6079                 return (0);
6080         }
6081         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
6082                 /* Still recovering */
6083                 return (0);
6084         }
6085         rack = (struct tcp_rack *)tp->t_fb_ptr;
6086         if (rack->r_ctl.rc_sacked) {
6087                 /* We have sack holes on our scoreboard */
6088                 return (0);
6089         }
6090         /* Ok if we reach here, we can process a fast-ack */
6091         nsegs = max(1, m->m_pkthdr.lro_nsegs);
6092         rack_log_ack(tp, to, th);
6093         /*
6094          * We made progress, clear the tlp
6095          * out flag so we could start a TLP
6096          * again.
6097          */
6098         rack->r_ctl.rc_tlp_rtx_out = 0;
6099         /* Did the window get updated? */
6100         if (tiwin != tp->snd_wnd) {
6101                 tp->snd_wnd = tiwin;
6102                 tp->snd_wl1 = th->th_seq;
6103                 if (tp->snd_wnd > tp->max_sndwnd)
6104                         tp->max_sndwnd = tp->snd_wnd;
6105         }
6106         /* Do we exit persists? */
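             /*
              * The peer's window has recovered to at least the smaller of
              * half the largest window it has advertised (rc_high_rwnd,
              * presumably the high-water mark of the receive window) and
              * one pacing segment, so normal transmission can resume.
              */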
6107         if ((rack->rc_in_persist != 0) &&
6108             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
6109                                rack->r_ctl.rc_pace_min_segs))) {
6110                 rack_exit_persist(tp, rack);
6111         }
6112         /* Do we enter persists? */
6113         if ((rack->rc_in_persist == 0) &&
6114             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
6115             TCPS_HAVEESTABLISHED(tp->t_state) &&
6116             (tp->snd_max == tp->snd_una) &&
6117             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
6118             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
6119                 /*
6120                  * Here the rwnd is less than
6121                  * the pacing size, we are established,
6122                  * nothing is outstanding, and there is
6123                  * data to send. Enter persists.
6124                  */
6125                 tp->snd_nxt = tp->snd_una;
6126                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
6127         }
6128         /*
6129          * If last ACK falls within this segment's sequence numbers, record
6130          * the timestamp. NOTE that the test is modified according to the
6131          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
6132          */
6133         if ((to->to_flags & TOF_TS) != 0 &&
6134             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
6135                 tp->ts_recent_age = tcp_ts_getticks();
6136                 tp->ts_recent = to->to_tsval;
6137         }
6138         /*
6139          * This is a pure ack for outstanding data.
6140          */
6141         KMOD_TCPSTAT_INC(tcps_predack);
6142
6143         /*
6144          * "bad retransmit" recovery.
6145          */
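             /*
              * TF_PREVVALID means state was saved before an RTO
              * retransmission. If the first retransmission is being acked
              * while we are still inside the bad-retransmit window
              * (t_badrxtwin), the RTO was most likely spurious, and
              * CC_RTO_ERR lets congestion control undo its reaction.
              */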
6146         if (tp->t_flags & TF_PREVVALID) {
6147                 tp->t_flags &= ~TF_PREVVALID;
6148                 if (tp->t_rxtshift == 1 &&
6149                     (int)(ticks - tp->t_badrxtwin) < 0)
6150                         rack_cong_signal(tp, th, CC_RTO_ERR);
6151         }
6152         /*
6153          * Recalculate the transmit timer / rtt.
6154          *
6155          * Some boxes send broken timestamp replies during the SYN+ACK
6156          * phase, ignore timestamps of 0 or we could calculate a huge RTT
6157          * and blow up the retransmit timer.
6158          */
6159         acked = BYTES_THIS_ACK(tp, th);
6160
6161 #ifdef TCP_HHOOK
6162         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
6163         hhook_run_tcp_est_in(tp, th, to);
6164 #endif
6165
6166         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
6167         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
6168         sbdrop(&so->so_snd, acked);
6169         /*
6170          * Let the congestion control algorithm update congestion control
6171          * related information. This typically means increasing the
6172          * congestion window.
6173          */
6174         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
6175
6176         tp->snd_una = th->th_ack;
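             /*
              * If the advertised window no longer covers the data already
              * in flight, the peer has collapsed its window; record that
              * so the outstanding data past the window can be handled
              * specially, and clear the state once the window opens again.
              */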
6177         if (tp->snd_wnd < ctf_outstanding(tp)) {
6178                 /* The peer collapsed the window */
6179                 rack_collapsed_window(rack);
6180         } else if (rack->rc_has_collapsed)
6181                 rack_un_collapse_window(rack);
6182
6183         /*
6184          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
6185          */
6186         tp->snd_wl2 = th->th_ack;
6187         tp->t_dupacks = 0;
6188         m_freem(m);
6189         /* ND6_HINT(tp);         *//* Some progress has been made. */
6190
6191         /*
6192          * If all outstanding data are acked, stop retransmit timer,
6193          * otherwise restart timer using current (possibly backed-off)
6194          * value. If process is waiting for space, wakeup/selwakeup/signal.
6195          * If data are ready to send, let tcp_output decide between more
6196          * output or persist.
6197          */
6198 #ifdef TCPDEBUG
6199         if (so->so_options & SO_DEBUG)
6200                 tcp_trace(TA_INPUT, ostate, tp,
6201                     (void *)tcp_saveipgen,
6202                     &tcp_savetcp, 0);
6203 #endif
6204         if (tp->snd_una == tp->snd_max) {
6205                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
6206                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
6207                         tp->t_acktime = 0;
6208                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6209         }
6210         /* Wake up the socket if we have room to write more */
6211         sowwakeup(so);
6212         if (sbavail(&so->so_snd)) {
6213                 rack->r_wanted_output++;
6214         }
6215         return (1);
6216 }
6217
6218 /*
6219  * Return value of 1, the TCB is unlocked and most
6220  * likely gone, return value of 0, the TCP is still
6221  * locked.
6222  */
6223 static int
6224 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
6225     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6226     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t tos)
6227 {
6228         int32_t ret_val = 0;
6229         int32_t todrop;
6230         int32_t ourfinisacked = 0;
6231         struct tcp_rack *rack;
6232
6233         ctf_calc_rwin(so, tp);
6234         /*
6235          * If the state is SYN_SENT: if seg contains an ACK, but not for our
6236          * SYN, drop the input. if seg contains a RST, then drop the
6237          * connection. if seg does not contain SYN, then drop it. Otherwise
6238          * this is an acceptable SYN segment initialize tp->rcv_nxt and
6239          * tp->irs if seg contains ack then advance tp->snd_una if seg
6240          * contains an ECE and ECN support is enabled, the stream is ECN
6241          * capable. if SYN has been acked change to ESTABLISHED else
6242          * SYN_RCVD state arrange for segment to be acked (eventually)
6243          * continue processing rest of data/controls, beginning with URG
6244          */
6245         if ((thflags & TH_ACK) &&
6246             (SEQ_LEQ(th->th_ack, tp->iss) ||
6247             SEQ_GT(th->th_ack, tp->snd_max))) {
6248                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6249                 return (1);
6250         }
6251         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
6252                 TCP_PROBE5(connect__refused, NULL, tp,
6253                     mtod(m, const char *), tp, th);
6254                 tp = tcp_drop(tp, ECONNREFUSED);
6255                 ctf_do_drop(m, tp);
6256                 return (1);
6257         }
6258         if (thflags & TH_RST) {
6259                 ctf_do_drop(m, tp);
6260                 return (1);
6261         }
6262         if (!(thflags & TH_SYN)) {
6263                 ctf_do_drop(m, tp);
6264                 return (1);
6265         }
6266         tp->irs = th->th_seq;
6267         tcp_rcvseqinit(tp);
6268         rack = (struct tcp_rack *)tp->t_fb_ptr;
6269         if (thflags & TH_ACK) {
6270                 int tfo_partial = 0;
6271
6272                 KMOD_TCPSTAT_INC(tcps_connects);
6273                 soisconnected(so);
6274 #ifdef MAC
6275                 mac_socketpeer_set_from_mbuf(m, so);
6276 #endif
6277                 /* Do window scaling on this connection? */
6278                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6279                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6280                         tp->rcv_scale = tp->request_r_scale;
6281                 }
6282                 tp->rcv_adv += min(tp->rcv_wnd,
6283                     TCP_MAXWIN << tp->rcv_scale);
6284                 /*
6285                  * If not all the data that was sent in the TFO SYN
6286                  * has been acked, resend the remainder right away.
6287                  */
6288                 if (IS_FASTOPEN(tp->t_flags) &&
6289                     (tp->snd_una != tp->snd_max)) {
6290                         tp->snd_nxt = th->th_ack;
6291                         tfo_partial = 1;
6292                 }
6293                 /*
6294                  * If there's data, delay ACK; if there's also a FIN, ACKNOW
6295                  * will be turned on later.
6296                  */
6297                 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
6298                         rack_timer_cancel(tp, rack,
6299                                           rack->r_ctl.rc_rcvtime, __LINE__);
6300                         tp->t_flags |= TF_DELACK;
6301                 } else {
6302                         rack->r_wanted_output++;
6303                         tp->t_flags |= TF_ACKNOW;
6304                 }
6305
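                     /*
                      * The peer answered our ECN setup with ECE set and CWR
                      * clear on the SYN|ACK; when ECN is administratively
                      * enabled, remember that this connection may use ECN.
                      */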
6306                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
6307                     (V_tcp_do_ecn == 1)) {
6308                         tp->t_flags2 |= TF2_ECN_PERMIT;
6309                         KMOD_TCPSTAT_INC(tcps_ecn_shs);
6310                 }
6311                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
6312                         /*
6313                          * We advance snd_una for the
6314                          * fast open case. If th_ack is
6315                          * acknowledging data beyond
6316                          * snd_una we can't just call
6317                          * ack-processing since the
6318                          * data stream in our send-map
6319                          * will start at snd_una + 1 (one
6320                          * beyond the SYN). If it's just
6321                          * equal we don't need to do that
6322                          * and there is no send_map.
6323                          */
6324                         tp->snd_una++;
6325                 }
6326                 /*
6327                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
6328                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
6329                  */
6330                 tp->t_starttime = ticks;
6331                 if (tp->t_flags & TF_NEEDFIN) {
6332                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
6333                         tp->t_flags &= ~TF_NEEDFIN;
6334                         thflags &= ~TH_SYN;
6335                 } else {
6336                         tcp_state_change(tp, TCPS_ESTABLISHED);
6337                         TCP_PROBE5(connect__established, NULL, tp,
6338                             mtod(m, const char *), tp, th);
6339                         cc_conn_init(tp);
6340                 }
6341         } else {
6342                 /*
6343                  * Received initial SYN in SYN-SENT[*] state => simultaneous
6344                  * open.  If segment contains CC option and there is a
6345                  * cached CC, apply TAO test. If it succeeds, connection is
6346                  * half-synchronized. Otherwise, do 3-way handshake:
6347                  * SYN-SENT -> SYN-RECEIVED, SYN-SENT* -> SYN-RECEIVED*. If
6348                  * there was no CC option, clear cached CC value.
6349                  */
6350                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
6351                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
6352         }
6353         INP_WLOCK_ASSERT(tp->t_inpcb);
6354         /*
6355          * Advance th->th_seq to correspond to first data byte. If data,
6356          * trim to stay within window, dropping FIN if necessary.
6357          */
6358         th->th_seq++;
6359         if (tlen > tp->rcv_wnd) {
6360                 todrop = tlen - tp->rcv_wnd;
6361                 m_adj(m, -todrop);
6362                 tlen = tp->rcv_wnd;
6363                 thflags &= ~TH_FIN;
6364                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
6365                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
6366         }
6367         tp->snd_wl1 = th->th_seq - 1;
6368         tp->rcv_up = th->th_seq;
6369         /*
6370          * Client side of transaction: already sent SYN and data. If the
6371          * remote host used T/TCP to validate the SYN, our data will be
6372          * ACK'd; if so, enter normal data segment processing in the middle
6373          * of step 5, ack processing. Otherwise, goto step 6.
6374          */
6375         if (thflags & TH_ACK) {
6376                 /* For syn-sent we need to possibly update the rtt */
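                     /*
                      * The echoed timestamp (to_tsecr) is our own tick value
                      * from when the SYN went out, so the difference is a
                      * round-trip sample in ticks; the +1 presumably keeps a
                      * zero sample out of the RTT machinery.
                      */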
6377                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6378                         uint32_t t;
6379
6380                         t = tcp_ts_getticks() - to->to_tsecr;
6381                         if (!tp->t_rttlow || tp->t_rttlow > t)
6382                                 tp->t_rttlow = t;
6383                         tcp_rack_xmit_timer(rack, t + 1);
6384                         tcp_rack_xmit_timer_commit(rack, tp);
6385                 }
6386                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
6387                         return (ret_val);
6388                 /* We may have changed to FIN_WAIT_1 above */
6389                 if (tp->t_state == TCPS_FIN_WAIT_1) {
6390                         /*
6391                          * In FIN_WAIT_1 STATE in addition to the processing
6392                          * for the ESTABLISHED state if our FIN is now
6393                          * acknowledged then enter FIN_WAIT_2.
6394                          */
6395                         if (ourfinisacked) {
6396                                 /*
6397                                  * If we can't receive any more data, then
6398                                  * closing user can proceed. Starting the
6399                                  * timer is contrary to the specification,
6400                                  * but if we don't get a FIN we'll hang
6401                                  * forever.
6402                                  *
6403                                  * XXXjl: we should release the tp also, and
6404                                  * use a compressed state.
6405                                  */
6406                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6407                                         soisdisconnected(so);
6408                                         tcp_timer_activate(tp, TT_2MSL,
6409                                             (tcp_fast_finwait2_recycle ?
6410                                             tcp_finwait2_timeout :
6411                                             TP_MAXIDLE(tp)));
6412                                 }
6413                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6414                         }
6415                 }
6416         }
6417         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6418            tiwin, thflags, nxt_pkt));
6419 }
6420
6421 /*
6422  * Return value of 1, the TCB is unlocked and most
6423  * likely gone, return value of 0, the TCP is still
6424  * locked.
6425  */
6426 static int
6427 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
6428     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6429     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6430 {
6431         struct tcp_rack *rack;
6432         int32_t ret_val = 0;
6433         int32_t ourfinisacked = 0;
6434
6435         ctf_calc_rwin(so, tp);
6436         if ((thflags & TH_ACK) &&
6437             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
6438             SEQ_GT(th->th_ack, tp->snd_max))) {
6439                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6440                 return (1);
6441         }
6442         rack = (struct tcp_rack *)tp->t_fb_ptr;
6443         if (IS_FASTOPEN(tp->t_flags)) {
6444                 /*
6445                  * When a TFO connection is in SYN_RECEIVED, the
6446                  * only valid packets are the initial SYN, a
6447                  * retransmit/copy of the initial SYN (possibly with
6448                  * a subset of the original data), a valid ACK, a
6449                  * FIN, or a RST.
6450                  */
6451                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
6452                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6453                         return (1);
6454                 } else if (thflags & TH_SYN) {
6455                         /* non-initial SYN is ignored */
6456                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
6457                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
6458                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
6459                                 ctf_do_drop(m, NULL);
6460                                 return (0);
6461                         }
6462                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
6463                         ctf_do_drop(m, NULL);
6464                         return (0);
6465                 }
6466         }
6467         if ((thflags & TH_RST) ||
6468             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6469                 return (ctf_process_rst(m, th, so, tp));
6470         /*
6471          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6472          * it's less than ts_recent, drop it.
6473          */
6474         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6475             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6476                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6477                         return (ret_val);
6478         }
6479         /*
6480          * In the SYN-RECEIVED state, validate that the packet belongs to
6481          * this connection before trimming the data to fit the receive
6482          * window.  Check the sequence number versus IRS since we know the
6483          * sequence numbers haven't wrapped.  This is a partial fix for the
6484          * "LAND" DoS attack.
6485          */
6486         if (SEQ_LT(th->th_seq, tp->irs)) {
6487                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6488                 return (1);
6489         }
6490         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6491                 return (ret_val);
6492         }
6493         /*
6494          * If last ACK falls within this segment's sequence numbers, record
6495          * its timestamp. NOTE: 1) That the test incorporates suggestions
6496          * from the latest proposal of the tcplw@cray.com list (Braden
6497          * 1993/04/26). 2) That updating only on newer timestamps interferes
6498          * with our earlier PAWS tests, so this check should be solely
6499          * predicated on the sequence space of this segment. 3) That we
6500          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6501          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6502          * SEG.Len, This modified check allows us to overcome RFC1323's
6503          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6504          * p.869. In such cases, we can still calculate the RTT correctly
6505          * when RCV.NXT == Last.ACK.Sent.
6506          */
6507         if ((to->to_flags & TOF_TS) != 0 &&
6508             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6509             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6510             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6511                 tp->ts_recent_age = tcp_ts_getticks();
6512                 tp->ts_recent = to->to_tsval;
6513         }
6514         tp->snd_wnd = tiwin;
6515         /*
6516          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6517          * is on (half-synchronized state), then queue data for later
6518          * processing; else drop segment and return.
6519          */
6520         if ((thflags & TH_ACK) == 0) {
6521                 if (IS_FASTOPEN(tp->t_flags)) {
6522                         cc_conn_init(tp);
6523                 }
6524                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6525                     tiwin, thflags, nxt_pkt));
6526         }
6527         KMOD_TCPSTAT_INC(tcps_connects);
6528         soisconnected(so);
6529         /* Do window scaling? */
6530         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6531             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6532                 tp->rcv_scale = tp->request_r_scale;
6533         }
6534         /*
6535          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
6536          * FIN-WAIT-1
6537          */
6538         tp->t_starttime = ticks;
6539         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
6540                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
6541                 tp->t_tfo_pending = NULL;
6542
6543                 /*
6544                  * Account for the ACK of our SYN prior to
6545                  * regular ACK processing below.
6546                  */
6547                 tp->snd_una++;
6548         }
6549         if (tp->t_flags & TF_NEEDFIN) {
6550                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
6551                 tp->t_flags &= ~TF_NEEDFIN;
6552         } else {
6553                 tcp_state_change(tp, TCPS_ESTABLISHED);
6554                 TCP_PROBE5(accept__established, NULL, tp,
6555                     mtod(m, const char *), tp, th);
6556                 /*
6557                  * TFO connections call cc_conn_init() during SYN
6558                  * processing.  Calling it again here for such connections
6559                  * is not harmless as it would undo the snd_cwnd reduction
6560                  * that occurs when a TFO SYN|ACK is retransmitted.
6561                  */
6562                 if (!IS_FASTOPEN(tp->t_flags))
6563                         cc_conn_init(tp);
6564         }
6565         /*
6566          * If segment contains data or ACK, will call tcp_reass() later; if
6567          * not, do so now to pass queued data to user.
6568          */
6569         if (tlen == 0 && (thflags & TH_FIN) == 0)
6570                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
6571                     (struct mbuf *)0);
6572         tp->snd_wl1 = th->th_seq - 1;
6573         /* For syn-recv we need to possibly update the rtt */
6574         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6575                 uint32_t t;
6576
6577                 t = tcp_ts_getticks() - to->to_tsecr;
6578                 if (!tp->t_rttlow || tp->t_rttlow > t)
6579                         tp->t_rttlow = t;
6580                 tcp_rack_xmit_timer(rack, t + 1);
6581                 tcp_rack_xmit_timer_commit(rack, tp);
6582         }
6583         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6584                 return (ret_val);
6585         }
6586         if (tp->t_state == TCPS_FIN_WAIT_1) {
6587                 /* We could have gone to FIN_WAIT_1 (or EST) above */
6588                 /*
6589                  * In FIN_WAIT_1 STATE in addition to the processing for the
6590                  * ESTABLISHED state if our FIN is now acknowledged then
6591                  * enter FIN_WAIT_2.
6592                  */
6593                 if (ourfinisacked) {
6594                         /*
6595                          * If we can't receive any more data, then closing
6596                          * user can proceed. Starting the timer is contrary
6597                          * to the specification, but if we don't get a FIN
6598                          * we'll hang forever.
6599                          *
6600                          * XXXjl: we should release the tp also, and use a
6601                          * compressed state.
6602                          */
6603                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6604                                 soisdisconnected(so);
6605                                 tcp_timer_activate(tp, TT_2MSL,
6606                                     (tcp_fast_finwait2_recycle ?
6607                                     tcp_finwait2_timeout :
6608                                     TP_MAXIDLE(tp)));
6609                         }
6610                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
6611                 }
6612         }
6613         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6614             tiwin, thflags, nxt_pkt));
6615 }
6616
6617 /*
6618  * Return value of 1, the TCB is unlocked and most
6619  * likely gone, return value of 0, the TCP is still
6620  * locked.
6621  */
6622 static int
6623 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
6624     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6625     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6626 {
6627         int32_t ret_val = 0;
6628
6629         /*
6630          * Header prediction: check for the two common cases of a
6631          * uni-directional data xfer.  If the packet has no control flags,
6632          * is in-sequence, the window didn't change and we're not
6633          * retransmitting, it's a candidate.  If the length is zero and the
6634          * ack moved forward, we're the sender side of the xfer.  Just free
6635          * the data acked & wake any higher level process that was blocked
6636          * waiting for space.  If the length is non-zero and the ack didn't
6637          * move, we're the receiver side.  If we're getting packets in-order
6638          * (the reassembly queue is empty), add the data to the socket
6639          * buffer and note that we need a delayed ack. Make sure that the
6640          * hidden state-flags are also off. Since we check for
6641          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
6642          */
6643         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
6644             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
6645             __predict_true(SEGQ_EMPTY(tp)) &&
6646             __predict_true(th->th_seq == tp->rcv_nxt)) {
6647                 struct tcp_rack *rack;
6648
6649                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6650                 if (tlen == 0) {
6651                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
6652                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime, iptos)) {
6653                                 return (0);
6654                         }
6655                 } else {
6656                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
6657                             tiwin, nxt_pkt, iptos)) {
6658                                 return (0);
6659                         }
6660                 }
6661         }
6662         ctf_calc_rwin(so, tp);
6663
6664         if ((thflags & TH_RST) ||
6665             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6666                 return (ctf_process_rst(m, th, so, tp));
6667
6668         /*
6669          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6670          * synchronized state.
6671          */
6672         if (thflags & TH_SYN) {
6673                 ctf_challenge_ack(m, th, tp, &ret_val);
6674                 return (ret_val);
6675         }
6676         /*
6677          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6678          * it's less than ts_recent, drop it.
6679          */
6680         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6681             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6682                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6683                         return (ret_val);
6684         }
6685         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6686                 return (ret_val);
6687         }
6688         /*
6689          * If last ACK falls within this segment's sequence numbers, record
6690          * its timestamp. NOTE: 1) That the test incorporates suggestions
6691          * from the latest proposal of the tcplw@cray.com list (Braden
6692          * 1993/04/26). 2) That updating only on newer timestamps interferes
6693          * with our earlier PAWS tests, so this check should be solely
6694          * predicated on the sequence space of this segment. 3) That we
6695          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6696          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6697          * SEG.Len, This modified check allows us to overcome RFC1323's
6698          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6699          * p.869. In such cases, we can still calculate the RTT correctly
6700          * when RCV.NXT == Last.ACK.Sent.
6701          */
6702         if ((to->to_flags & TOF_TS) != 0 &&
6703             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6704             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6705             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6706                 tp->ts_recent_age = tcp_ts_getticks();
6707                 tp->ts_recent = to->to_tsval;
6708         }
6709         /*
6710          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6711          * is on (half-synchronized state), then queue data for later
6712          * processing; else drop segment and return.
6713          */
6714         if ((thflags & TH_ACK) == 0) {
6715                 if (tp->t_flags & TF_NEEDSYN) {
6716
6717                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6718                             tiwin, thflags, nxt_pkt));
6719
6720                 } else if (tp->t_flags & TF_ACKNOW) {
6721                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6722                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6723                         return (ret_val);
6724                 } else {
6725                         ctf_do_drop(m, NULL);
6726                         return (0);
6727                 }
6728         }
6729         /*
6730          * Ack processing.
6731          */
6732         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6733                 return (ret_val);
6734         }
6735         if (sbavail(&so->so_snd)) {
6736                 if (rack_progress_timeout_check(tp)) {
6737                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6738                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6739                         return (1);
6740                 }
6741         }
6742         /* State changes only happen in rack_process_data() */
6743         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6744             tiwin, thflags, nxt_pkt));
6745 }
6746
6747 /*
6748  * Return value of 1, the TCB is unlocked and most
6749  * likely gone, return value of 0, the TCP is still
6750  * locked.
6751  */
6752 static int
6753 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
6754     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6755     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6756 {
6757         int32_t ret_val = 0;
6758
6759         ctf_calc_rwin(so, tp);
6760         if ((thflags & TH_RST) ||
6761             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6762                 return (ctf_process_rst(m, th, so, tp));
6763         /*
6764          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6765          * synchronized state.
6766          */
6767         if (thflags & TH_SYN) {
6768                 ctf_challenge_ack(m, th, tp, &ret_val);
6769                 return (ret_val);
6770         }
6771         /*
6772          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6773          * it's less than ts_recent, drop it.
6774          */
6775         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6776             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6777                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6778                         return (ret_val);
6779         }
6780         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6781                 return (ret_val);
6782         }
6783         /*
6784          * If last ACK falls within this segment's sequence numbers, record
6785          * its timestamp. NOTE: 1) That the test incorporates suggestions
6786          * from the latest proposal of the tcplw@cray.com list (Braden
6787          * 1993/04/26). 2) That updating only on newer timestamps interferes
6788          * with our earlier PAWS tests, so this check should be solely
6789          * predicated on the sequence space of this segment. 3) That we
6790          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6791          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6792          * SEG.Len, This modified check allows us to overcome RFC1323's
6793          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6794          * p.869. In such cases, we can still calculate the RTT correctly
6795          * when RCV.NXT == Last.ACK.Sent.
6796          */
6797         if ((to->to_flags & TOF_TS) != 0 &&
6798             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6799             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6800             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6801                 tp->ts_recent_age = tcp_ts_getticks();
6802                 tp->ts_recent = to->to_tsval;
6803         }
6804         /*
6805          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6806          * is on (half-synchronized state), then queue data for later
6807          * processing; else drop segment and return.
6808          */
6809         if ((thflags & TH_ACK) == 0) {
6810                 if (tp->t_flags & TF_NEEDSYN) {
6811                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6812                             tiwin, thflags, nxt_pkt));
6813
6814                 } else if (tp->t_flags & TF_ACKNOW) {
6815                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6816                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6817                         return (ret_val);
6818                 } else {
6819                         ctf_do_drop(m, NULL);
6820                         return (0);
6821                 }
6822         }
6823         /*
6824          * Ack processing.
6825          */
6826         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6827                 return (ret_val);
6828         }
6829         if (sbavail(&so->so_snd)) {
6830                 if (rack_progress_timeout_check(tp)) {
6831                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6832                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6833                         return (1);
6834                 }
6835         }
6836         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6837             tiwin, thflags, nxt_pkt));
6838 }
6839
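     /*
      * Handle data arriving after the socket has been closed. Unless the
      * stack has been told to allow data after close (rc_allow_data_af_clo)
      * and there is still unsent data of our own in the send buffer, the
      * connection is closed and answered with a reset (return 1). Otherwise
      * the segment's payload is swallowed: rcv_nxt is advanced past it,
      * TF2_DROP_AF_DATA arranges for a follow-up reset, and processing
      * continues (return 0).
      */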
6840 static int
6841 rack_check_data_after_close(struct mbuf *m,
6842     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
6843 {
6844         struct tcp_rack *rack;
6845
6846         rack = (struct tcp_rack *)tp->t_fb_ptr;
6847         if (rack->rc_allow_data_af_clo == 0) {
6848         close_now:
6849                 tp = tcp_close(tp);
6850                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
6851                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
6852                 return (1);
6853         }
6854         if (sbavail(&so->so_snd) == 0)
6855                 goto close_now;
6856         /* Ok we allow data that is ignored and a followup reset */
6857         tp->rcv_nxt = th->th_seq + *tlen;
6858         tp->t_flags2 |= TF2_DROP_AF_DATA;
6859         rack->r_wanted_output = 1;
6860         *tlen = 0;
6861         return (0);
6862 }
6863
6864 /*
6865  * Return value of 1, the TCB is unlocked and most
6866  * likely gone, return value of 0, the TCP is still
6867  * locked.
6868  */
6869 static int
6870 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
6871     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6872     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6873 {
6874         int32_t ret_val = 0;
6875         int32_t ourfinisacked = 0;
6876
6877         ctf_calc_rwin(so, tp);
6878
6879         if ((thflags & TH_RST) ||
6880             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6881                 return (ctf_process_rst(m, th, so, tp));
6882         /*
6883          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6884          * synchronized state.
6885          */
6886         if (thflags & TH_SYN) {
6887                 ctf_challenge_ack(m, th, tp, &ret_val);
6888                 return (ret_val);
6889         }
6890         /*
6891          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6892          * it's less than ts_recent, drop it.
6893          */
6894         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6895             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6896                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6897                         return (ret_val);
6898         }
6899         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6900                 return (ret_val);
6901         }
6902         /*
6903          * If new data are received on a connection after the user processes
6904          * are gone, then RST the other end.
6905          */
6906         if ((so->so_state & SS_NOFDREF) && tlen) {
6907                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6908                         return (1);
6909         }
6910         /*
6911          * If last ACK falls within this segment's sequence numbers, record
6912          * its timestamp. NOTE: 1) That the test incorporates suggestions
6913          * from the latest proposal of the tcplw@cray.com list (Braden
6914          * 1993/04/26). 2) That updating only on newer timestamps interferes
6915          * with our earlier PAWS tests, so this check should be solely
6916          * predicated on the sequence space of this segment. 3) That we
6917          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6918          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6919          * SEG.Len, This modified check allows us to overcome RFC1323's
6920          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6921          * p.869. In such cases, we can still calculate the RTT correctly
6922          * when RCV.NXT == Last.ACK.Sent.
6923          */
6924         if ((to->to_flags & TOF_TS) != 0 &&
6925             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6926             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6927             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6928                 tp->ts_recent_age = tcp_ts_getticks();
6929                 tp->ts_recent = to->to_tsval;
6930         }
6931         /*
6932          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6933          * is on (half-synchronized state), then queue data for later
6934          * processing; else drop segment and return.
6935          */
6936         if ((thflags & TH_ACK) == 0) {
6937                 if (tp->t_flags & TF_NEEDSYN) {
6938                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6939                             tiwin, thflags, nxt_pkt));
6940                 } else if (tp->t_flags & TF_ACKNOW) {
6941                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6942                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6943                         return (ret_val);
6944                 } else {
6945                         ctf_do_drop(m, NULL);
6946                         return (0);
6947                 }
6948         }
6949         /*
6950          * Ack processing.
6951          */
6952         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6953                 return (ret_val);
6954         }
6955         if (ourfinisacked) {
6956                 /*
6957                  * If we can't receive any more data, then closing user can
6958                  * proceed. Starting the timer is contrary to the
6959                  * specification, but if we don't get a FIN we'll hang
6960                  * forever.
6961                  *
6962                  * XXXjl: we should release the tp also, and use a
6963                  * compressed state.
6964                  */
6965                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6966                         soisdisconnected(so);
6967                         tcp_timer_activate(tp, TT_2MSL,
6968                             (tcp_fast_finwait2_recycle ?
6969                             tcp_finwait2_timeout :
6970                             TP_MAXIDLE(tp)));
6971                 }
6972                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6973         }
6974         if (sbavail(&so->so_snd)) {
6975                 if (rack_progress_timeout_check(tp)) {
6976                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6977                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6978                         return (1);
6979                 }
6980         }
6981         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6982             tiwin, thflags, nxt_pkt));
6983 }
6984
6985 /*
6986  * Return value of 1, the TCB is unlocked and most
6987  * likely gone, return value of 0, the TCP is still
6988  * locked.
6989  */
6990 static int
6991 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
6992     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6993     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6994 {
6995         int32_t ret_val = 0;
6996         int32_t ourfinisacked = 0;
6997
6998         ctf_calc_rwin(so, tp);
6999
7000         if ((thflags & TH_RST) ||
7001             (tp->t_fin_is_rst && (thflags & TH_FIN)))
7002                 return (ctf_process_rst(m, th, so, tp));
7003         /*
7004          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7005          * synchronized state.
7006          */
7007         if (thflags & TH_SYN) {
7008                 ctf_challenge_ack(m, th, tp, &ret_val);
7009                 return (ret_val);
7010         }
7011         /*
7012          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7013          * it's less than ts_recent, drop it.
7014          */
7015         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7016             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7017                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7018                         return (ret_val);
7019         }
7020         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7021                 return (ret_val);
7022         }
7023         /*
7024          * If new data are received on a connection after the user processes
7025          * are gone, then RST the other end.
7026          */
7027         if ((so->so_state & SS_NOFDREF) && tlen) {
7028                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7029                         return (1);
7030         }
7031         /*
7032          * If last ACK falls within this segment's sequence numbers, record
7033          * its timestamp. NOTE: 1) That the test incorporates suggestions
7034          * from the latest proposal of the tcplw@cray.com list (Braden
7035          * 1993/04/26). 2) That updating only on newer timestamps interferes
7036          * with our earlier PAWS tests, so this check should be solely
7037          * predicated on the sequence space of this segment. 3) That we
7038          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7039          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7040          * SEG.Len, This modified check allows us to overcome RFC1323's
7041          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7042          * p.869. In such cases, we can still calculate the RTT correctly
7043          * when RCV.NXT == Last.ACK.Sent.
7044          */
7045         if ((to->to_flags & TOF_TS) != 0 &&
7046             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7047             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7048             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7049                 tp->ts_recent_age = tcp_ts_getticks();
7050                 tp->ts_recent = to->to_tsval;
7051         }
7052         /*
7053          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7054          * is on (half-synchronized state), then queue data for later
7055          * processing; else drop segment and return.
7056          */
7057         if ((thflags & TH_ACK) == 0) {
7058                 if (tp->t_flags & TF_NEEDSYN) {
7059                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7060                             tiwin, thflags, nxt_pkt));
7061                 } else if (tp->t_flags & TF_ACKNOW) {
7062                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7063                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7064                         return (ret_val);
7065                 } else {
7066                         ctf_do_drop(m, NULL);
7067                         return (0);
7068                 }
7069         }
7070         /*
7071          * Ack processing.
7072          */
7073         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7074                 return (ret_val);
7075         }
7076         if (ourfinisacked) {
7077                 tcp_twstart(tp);
7078                 m_freem(m);
7079                 return (1);
7080         }
7081         if (sbavail(&so->so_snd)) {
7082                 if (rack_progress_timeout_check(tp)) {
7083                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7084                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7085                         return (1);
7086                 }
7087         }
7088         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7089             tiwin, thflags, nxt_pkt));
7090 }
7091
7092 /*
7093  * Return value of 1, the TCB is unlocked and most
7094  * likely gone, return value of 0, the TCP is still
7095  * locked.
7096  */
7097 static int
7098 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
7099     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7100     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7101 {
7102         int32_t ret_val = 0;
7103         int32_t ourfinisacked = 0;
7104
7105         ctf_calc_rwin(so, tp);
7106
7107         if ((thflags & TH_RST) ||
7108             (tp->t_fin_is_rst && (thflags & TH_FIN)))
7109                 return (ctf_process_rst(m, th, so, tp));
7110         /*
7111          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7112          * synchronized state.
7113          */
7114         if (thflags & TH_SYN) {
7115                 ctf_challenge_ack(m, th, tp, &ret_val);
7116                 return (ret_val);
7117         }
7118         /*
7119          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7120          * it's less than ts_recent, drop it.
7121          */
7122         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7123             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7124                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7125                         return (ret_val);
7126         }
7127         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7128                 return (ret_val);
7129         }
7130         /*
7131          * If new data are received on a connection after the user processes
7132          * are gone, then RST the other end.
7133          */
7134         if ((so->so_state & SS_NOFDREF) && tlen) {
7135                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7136                         return (1);
7137         }
7138         /*
7139          * If last ACK falls within this segment's sequence numbers, record
7140          * its timestamp. NOTE: 1) That the test incorporates suggestions
7141          * from the latest proposal of the tcplw@cray.com list (Braden
7142          * 1993/04/26). 2) That updating only on newer timestamps interferes
7143          * with our earlier PAWS tests, so this check should be solely
7144          * predicated on the sequence space of this segment. 3) That we
7145          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7146          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7147          * SEG.Len, This modified check allows us to overcome RFC1323's
7148          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7149          * p.869. In such cases, we can still calculate the RTT correctly
7150          * when RCV.NXT == Last.ACK.Sent.
7151          */
7152         if ((to->to_flags & TOF_TS) != 0 &&
7153             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7154             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7155             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7156                 tp->ts_recent_age = tcp_ts_getticks();
7157                 tp->ts_recent = to->to_tsval;
7158         }
7159         /*
7160          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7161          * is on (half-synchronized state), then queue data for later
7162          * processing; else drop segment and return.
7163          */
7164         if ((thflags & TH_ACK) == 0) {
7165                 if (tp->t_flags & TF_NEEDSYN) {
7166                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7167                             tiwin, thflags, nxt_pkt));
7168                 } else if (tp->t_flags & TF_ACKNOW) {
7169                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7170                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7171                         return (ret_val);
7172                 } else {
7173                         ctf_do_drop(m, NULL);
7174                         return (0);
7175                 }
7176         }
7177         /*
7178          * case TCPS_LAST_ACK: Ack processing.
7179          */
7180         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7181                 return (ret_val);
7182         }
7183         if (ourfinisacked) {
7184                 tp = tcp_close(tp);
7185                 ctf_do_drop(m, tp);
7186                 return (1);
7187         }
7188         if (sbavail(&so->so_snd)) {
7189                 if (rack_progress_timeout_check(tp)) {
7190                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7191                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7192                         return (1);
7193                 }
7194         }
7195         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7196             tiwin, thflags, nxt_pkt));
7197 }
7198
7199
7200 /*
7201  * Return value of 1, the TCB is unlocked and most
7202  * likely gone, return value of 0, the TCP is still
7203  * locked.
7204  */
7205 static int
7206 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
7207     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7208     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7209 {
7210         int32_t ret_val = 0;
7211         int32_t ourfinisacked = 0;
7212
7213         ctf_calc_rwin(so, tp);
7214
7215         /* Reset receive buffer auto scaling when not in bulk receive mode. */
7216         if ((thflags & TH_RST) ||
7217             (tp->t_fin_is_rst && (thflags & TH_FIN)))
7218                 return (ctf_process_rst(m, th, so, tp));
7219         /*
7220          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7221          * synchronized state.
7222          */
7223         if (thflags & TH_SYN) {
7224                 ctf_challenge_ack(m, th, tp, &ret_val);
7225                 return (ret_val);
7226         }
7227         /*
7228          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7229          * it's less than ts_recent, drop it.
7230          */
7231         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7232             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7233                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7234                         return (ret_val);
7235         }
7236         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7237                 return (ret_val);
7238         }
7239         /*
7240          * If new data are received on a connection after the user processes
7241          * are gone, then RST the other end.
7242          */
7243         if ((so->so_state & SS_NOFDREF) &&
7244             tlen) {
7245                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7246                         return (1);
7247         }
7248         /*
7249          * If last ACK falls within this segment's sequence numbers, record
7250          * its timestamp. NOTE: 1) That the test incorporates suggestions
7251          * from the latest proposal of the tcplw@cray.com list (Braden
7252          * 1993/04/26). 2) That updating only on newer timestamps interferes
7253          * with our earlier PAWS tests, so this check should be solely
7254          * predicated on the sequence space of this segment. 3) That we
7255          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7256          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7257          * SEG.Len, This modified check allows us to overcome RFC1323's
7258          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7259          * p.869. In such cases, we can still calculate the RTT correctly
7260          * when RCV.NXT == Last.ACK.Sent.
7261          */
7262         if ((to->to_flags & TOF_TS) != 0 &&
7263             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7264             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7265             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7266                 tp->ts_recent_age = tcp_ts_getticks();
7267                 tp->ts_recent = to->to_tsval;
7268         }
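        /*
         * Concrete illustration of the relaxed bound above (example values
         * only): for a pure ACK with tlen == 0, no SYN/FIN, and th_seq equal
         * to last_ack_sent, both SEQ_LEQ() tests hold and ts_recent is
         * updated, whereas RFC1323's strict Last.ACK.Sent < SEG.SEQ + SEG.Len
         * would have skipped the update and cost us that RTT sample.
         */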
7269         /*
7270          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7271          * is on (half-synchronized state), then queue data for later
7272          * processing; else drop segment and return.
7273          */
7274         if ((thflags & TH_ACK) == 0) {
7275                 if (tp->t_flags & TF_NEEDSYN) {
7276                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7277                             tiwin, thflags, nxt_pkt));
7278                 } else if (tp->t_flags & TF_ACKNOW) {
7279                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7280                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7281                         return (ret_val);
7282                 } else {
7283                         ctf_do_drop(m, NULL);
7284                         return (0);
7285                 }
7286         }
7287         /*
7288          * Ack processing.
7289          */
7290         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7291                 return (ret_val);
7292         }
7293         if (sbavail(&so->so_snd)) {
7294                 if (rack_progress_timeout_check(tp)) {
7295                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7296                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7297                         return (1);
7298                 }
7299         }
7300         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7301             tiwin, thflags, nxt_pkt));
7302 }
7303
7304
7305 static void inline
7306 rack_clear_rate_sample(struct tcp_rack *rack)
7307 {
7308         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
7309         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
7310         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
7311 }
7312
7313 static void
7314 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack)
7315 {
7316         uint32_t tls_seg = 0;
7317
7318 #ifdef KERN_TLS
7319         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
7320                 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd);
7321                 rack->r_ctl.rc_pace_min_segs = tls_seg;
7322         } else
7323 #endif
7324                 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
7325         rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs;
7326         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES)
7327                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
7328 #ifdef KERN_TLS
7329         if (tls_seg != 0) {
7330                 if (rack_hw_tls_max_seg > 1) {
7331                         rack->r_ctl.rc_pace_max_segs /= tls_seg;
7332                         if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs)
7333                                 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
7334                 } else {
7335                         rack->r_ctl.rc_pace_max_segs = 1;
7336                 }
7337                 if (rack->r_ctl.rc_pace_max_segs == 0)
7338                         rack->r_ctl.rc_pace_max_segs = 1;
7339                 rack->r_ctl.rc_pace_max_segs *= tls_seg;
7340         }
7341 #endif
7342         rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2);
7343 }
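/*
 * Illustrative sketch only (hypothetical values, not defaults): with
 * tls_seg = 4096, ctf_fixed_maxseg(tp) = 1460, rc_pace_max_segs = 40 and
 * rack_hw_tls_max_seg = 8, the routine above first computes 1460 * 40 =
 * 58400 bytes, divides by tls_seg (58400 / 4096 = 14), clamps that to
 * rack_hw_tls_max_seg (8) and multiplies back by tls_seg, yielding a
 * pace-out burst of 8 * 4096 = 32768 bytes aligned to whole TLS records.
 */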
7344
7345 static int
7346 rack_init(struct tcpcb *tp)
7347 {
7348         struct tcp_rack *rack = NULL;
7349         struct rack_sendmap *insret;
7350
7351         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
7352         if (tp->t_fb_ptr == NULL) {
7353                 /*
7354                  * We need to allocate memory but can't. The INP and INP_INFO
7355                  * locks are held and they are recursive (this happens during
7356                  * setup), so a scheme to drop the locks fails. :(
7357                  *
7358                  */
7359                 return (ENOMEM);
7360         }
7361         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
7362
7363         rack = (struct tcp_rack *)tp->t_fb_ptr;
7364         RB_INIT(&rack->r_ctl.rc_mtree);
7365         TAILQ_INIT(&rack->r_ctl.rc_free);
7366         TAILQ_INIT(&rack->r_ctl.rc_tmap);
7367         rack->rc_tp = tp;
7368         if (tp->t_inpcb) {
7369                 rack->rc_inp = tp->t_inpcb;
7370         }
7371         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
7372         /* Probably not needed but let's be sure */
7373         rack_clear_rate_sample(rack);
7374         rack->r_cpu = 0;
7375         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
7376         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
7377         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
7378         rack->rc_pace_reduce = rack_slot_reduction;
7379         if (use_rack_cheat)
7380                 rack->use_rack_cheat = 1;
7381         if (V_tcp_delack_enabled)
7382                 tp->t_delayed_ack = 1;
7383         else
7384                 tp->t_delayed_ack = 0;
7385         rack->rc_pace_max_segs = rack_hptsi_segments;
7386         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
7387         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
7388         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
7389         rack->r_enforce_min_pace = rack_min_pace_time;
7390         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
7391         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
7392         rack->r_ctl.rc_early_recovery = rack_early_recovery;
7393         rack->rc_always_pace = rack_pace_every_seg;
7394         rack_set_pace_segments(tp, rack);
7395         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
7396         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
7397         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
7398         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
7399         rack->r_ctl.rc_min_to = rack_min_to;
7400         rack->rack_per_of_gp = rack_per_of_gp;
7401         microuptime(&rack->r_ctl.rc_last_ack);
7402         rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
7403         rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks();
7404         /* Do we force on detection? */
7405 #ifdef NETFLIX_EXP_DETECTION
7406         if (tcp_force_detection)
7407                 rack->do_detection = 1;
7408         else
7409 #endif
7410                 rack->do_detection = 0;
7411         if (tp->snd_una != tp->snd_max) {
7412                 /* Create a send map for the current outstanding data */
7413                 struct rack_sendmap *rsm;
7414
7415                 rsm = rack_alloc(rack);
7416                 if (rsm == NULL) {
7417                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7418                         tp->t_fb_ptr = NULL;
7419                         return (ENOMEM);
7420                 }
7421                 rsm->r_flags = RACK_OVERMAX;
7422                 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
7423                 rsm->r_rtr_cnt = 1;
7424                 rsm->r_rtr_bytes = 0;
7425                 rsm->r_start = tp->snd_una;
7426                 rsm->r_end = tp->snd_max;
7427                 rsm->r_dupack = 0;
7428                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7429 #ifdef INVARIANTS
7430                 if (insret != NULL) {
7431                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
7432                               insret, rack, rsm);
7433                 }
7434 #endif
7435                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7436                 rsm->r_in_tmap = 1;
7437         }
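        /*
         * Example of the seeding above (illustrative numbers): if the
         * connection is handed to rack while snd_una = 1000 and
         * snd_max = 5380, a single RACK_OVERMAX sendmap entry covering
         * [1000, 5380) is inserted into the RB tree and the tail queue, so
         * the 4380 bytes already in flight from the previous stack can be
         * timed and, if need be, retransmitted by rack.
         */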
7438         rack_stop_all_timers(tp);
7439         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7440         return (0);
7441 }
7442
7443 static int
7444 rack_handoff_ok(struct tcpcb *tp)
7445 {
7446         if ((tp->t_state == TCPS_CLOSED) ||
7447             (tp->t_state == TCPS_LISTEN)) {
7448                 /* Sure, no problem, though it may not stick */
7449                 return (0);
7450         }
7451         if ((tp->t_state == TCPS_SYN_SENT) ||
7452             (tp->t_state == TCPS_SYN_RECEIVED)) {
7453                 /*
7454                  * We really don't know; you have to get to ESTAB or beyond
7455                  * to tell.
7456                  */
7457                 return (EAGAIN);
7458         }
7459         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
7460                 return (0);
7461         }
7462         /*
7463          * If we reach here we don't do SACK on this connection so we can
7464          * never do rack.
7465          */
7466         return (EINVAL);
7467 }
7468
7469 static void
7470 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
7471 {
7472         if (tp->t_fb_ptr) {
7473                 struct tcp_rack *rack;
7474                 struct rack_sendmap *rsm, *nrsm, *rm;
7475                 if (tp->t_inpcb) {
7476                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
7477                         tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
7478                 }
7479                 rack = (struct tcp_rack *)tp->t_fb_ptr;
7480 #ifdef TCP_BLACKBOX
7481                 tcp_log_flowend(tp);
7482 #endif
7483                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
7484                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7485 #ifdef INVARIANTS
7486                         if (rm != rsm) {
7487                                 panic("At fini, rack:%p rsm:%p rm:%p",
7488                                       rack, rsm, rm);
7489                         }
7490 #endif
7491                         uma_zfree(rack_zone, rsm);
7492                 }
7493                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7494                 while (rsm) {
7495                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
7496                         uma_zfree(rack_zone, rsm);
7497                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7498                 }
7499                 rack->rc_free_cnt = 0;
7500                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7501                 tp->t_fb_ptr = NULL;
7502         }
7503         /* Make sure snd_nxt is correctly set */
7504         tp->snd_nxt = tp->snd_max;
7505 }
7506
7507
7508 static void
7509 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
7510 {
7511         switch (tp->t_state) {
7512         case TCPS_SYN_SENT:
7513                 rack->r_state = TCPS_SYN_SENT;
7514                 rack->r_substate = rack_do_syn_sent;
7515                 break;
7516         case TCPS_SYN_RECEIVED:
7517                 rack->r_state = TCPS_SYN_RECEIVED;
7518                 rack->r_substate = rack_do_syn_recv;
7519                 break;
7520         case TCPS_ESTABLISHED:
7521                 rack_set_pace_segments(tp, rack);
7522                 rack->r_state = TCPS_ESTABLISHED;
7523                 rack->r_substate = rack_do_established;
7524                 break;
7525         case TCPS_CLOSE_WAIT:
7526                 rack->r_state = TCPS_CLOSE_WAIT;
7527                 rack->r_substate = rack_do_close_wait;
7528                 break;
7529         case TCPS_FIN_WAIT_1:
7530                 rack->r_state = TCPS_FIN_WAIT_1;
7531                 rack->r_substate = rack_do_fin_wait_1;
7532                 break;
7533         case TCPS_CLOSING:
7534                 rack->r_state = TCPS_CLOSING;
7535                 rack->r_substate = rack_do_closing;
7536                 break;
7537         case TCPS_LAST_ACK:
7538                 rack->r_state = TCPS_LAST_ACK;
7539                 rack->r_substate = rack_do_lastack;
7540                 break;
7541         case TCPS_FIN_WAIT_2:
7542                 rack->r_state = TCPS_FIN_WAIT_2;
7543                 rack->r_substate = rack_do_fin_wait_2;
7544                 break;
7545         case TCPS_LISTEN:
7546         case TCPS_CLOSED:
7547         case TCPS_TIME_WAIT:
7548         default:
7549                 break;
7550         };
7551 }
7552
7553
7554 static void
7555 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
7556 {
7557         /*
7558          * We received an ack, and then did not
7559          * call send or were bounced out because the
7560          * hpts was running. Now a timer is up as well; is
7561          * it the right timer?
7562          */
7563         struct rack_sendmap *rsm;
7564         int tmr_up;
7565
7566         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7567         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
7568                 return;
7569         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7570         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
7571             (tmr_up == PACE_TMR_RXT)) {
7572                 /* Should be an RXT */
7573                 return;
7574         }
7575         if (rsm == NULL) {
7576                 /* Nothing outstanding? */
7577                 if (tp->t_flags & TF_DELACK) {
7578                         if (tmr_up == PACE_TMR_DELACK)
7579                                 /* We are supposed to have delayed ack up and we do */
7580                                 return;
7581                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
7582                         /*
7583                          * If we hit ENOBUFS then we would expect the possibility
7584                          * of nothing outstanding and the RXT up (and the hptsi timer).
7585                          */
7586                         return;
7587                 } else if (((V_tcp_always_keepalive ||
7588                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7589                             (tp->t_state <= TCPS_CLOSING)) &&
7590                            (tmr_up == PACE_TMR_KEEP) &&
7591                            (tp->snd_max == tp->snd_una)) {
7592                         /* We should have keep alive up and we do */
7593                         return;
7594                 }
7595         }
7596         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
7597                    ((tmr_up == PACE_TMR_TLP) ||
7598                     (tmr_up == PACE_TMR_RACK) ||
7599                     (tmr_up == PACE_TMR_RXT))) {
7600                 /*
7601                  * Either a Rack, TLP or RXT is fine if we
7602                  * have outstanding data.
7603                  */
7604                 return;
7605         } else if (tmr_up == PACE_TMR_DELACK) {
7606                 /*
7607                  * If the delayed ack was going to go off
7608                  * before the rtx/tlp/rack timer was going to
7609                  * expire, then that would be the timer in control.
7610                  * Note we don't check the time here, trusting the
7611                  * code is correct.
7612                  */
7613                 return;
7614         }
7615         /*
7616          * OK, the timer originally started is not what we want now.
7617          * We will force the hpts to be stopped, if running, and restart
7618          * with the slot set to what was in the saved slot.
7619          */
7620         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
7621         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7622 }
7623
7624 static int
7625 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
7626     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
7627     int32_t nxt_pkt, struct timeval *tv)
7628 {
7629         int32_t thflags, retval, did_out = 0;
7630         int32_t way_out = 0;
7631         uint32_t cts;
7632         uint32_t tiwin;
7633         struct tcpopt to;
7634         struct tcp_rack *rack;
7635         struct rack_sendmap *rsm;
7636         int32_t prev_state = 0;
7637
7638         if (m->m_flags & M_TSTMP_LRO) {
7639                 tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7640                 tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7641         }
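        /*
         * The LRO timestamp above is in nanoseconds; for example, a
         * hypothetical rcv_tstmp of 1234567890123456789 splits into
         * tv_sec = 1234567890 and tv_usec = 123456789 / 1000 = 123456.
         */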
7642         cts = tcp_tv_to_mssectick(tv);
7643         rack = (struct tcp_rack *)tp->t_fb_ptr;
7644
7645         kern_prefetch(rack, &prev_state);
7646         prev_state = 0;
7647         thflags = th->th_flags;
7648
7649         NET_EPOCH_ASSERT();
7650         INP_WLOCK_ASSERT(tp->t_inpcb);
7651         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
7652             __func__));
7653         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
7654             __func__));
7655         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
7656                 union tcp_log_stackspecific log;
7657                 struct timeval tv;
7658
7659                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
7660                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
7661                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
7662                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
7663                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
7664                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
7665                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
7666                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
7667                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
7668                     tlen, &log, true, &tv);
7669         }
7670         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
7671                 way_out = 4;
7672                 retval = 0;
7673                 goto done_with_input;
7674         }
7675         /*
7676          * If a segment with the ACK-bit set arrives in the SYN-SENT state
7677          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
7678          */
7679         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
7680             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
7681                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7682                 return(1);
7683         }
7684         /*
7685          * Segment received on connection. Reset idle time and keep-alive
7686          * timer. XXX: This should be done after segment validation to
7687          * ignore broken/spoofed segs.
7688          */
7689         if (tp->t_idle_reduce &&
7690              (tp->snd_max == tp->snd_una) &&
7691              ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
7692                 counter_u64_add(rack_input_idle_reduces, 1);
7693                 rack_cc_after_idle(tp);
7694         }
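        /*
         * Illustrative case (made-up numbers): with t_rxtcur = 600 ticks, a
         * connection that is fully acked (snd_max == snd_una) and has been
         * silent for at least 600 ticks runs rack_cc_after_idle() here, so
         * its congestion state is refreshed before this segment is processed.
         */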
7695         tp->t_rcvtime = ticks;
7696
7697         /*
7698          * Unscale the window into a 32-bit value. For the SYN_SENT state
7699          * the scale is zero.
7700          */
7701         tiwin = th->th_win << tp->snd_scale;
7702 #ifdef STATS
7703         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
7704 #endif
7705         if (tiwin > rack->r_ctl.rc_high_rwnd)
7706                 rack->r_ctl.rc_high_rwnd = tiwin;
7707         /*
7708          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
7709          * this to occur after we've validated the segment.
7710          */
7711         if (tp->t_flags2 & TF2_ECN_PERMIT) {
7712                 if (thflags & TH_CWR) {
7713                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
7714                         tp->t_flags |= TF_ACKNOW;
7715                 }
7716                 switch (iptos & IPTOS_ECN_MASK) {
7717                 case IPTOS_ECN_CE:
7718                         tp->t_flags2 |= TF2_ECN_SND_ECE;
7719                         KMOD_TCPSTAT_INC(tcps_ecn_ce);
7720                         break;
7721                 case IPTOS_ECN_ECT0:
7722                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
7723                         break;
7724                 case IPTOS_ECN_ECT1:
7725                         KMOD_TCPSTAT_INC(tcps_ecn_ect1);
7726                         break;
7727                 }
7728
7729                 /* Process a packet differently from RFC3168. */
7730                 cc_ecnpkt_handler(tp, th, iptos);
7731
7732                 /* Congestion experienced. */
7733                 if (thflags & TH_ECE) {
7734                         rack_cong_signal(tp, th, CC_ECN);
7735                 }
7736         }
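        /*
         * Sketch of the ECN handling above (illustrative): a segment
         * arriving with IPTOS_ECN_CE set causes TF2_ECN_SND_ECE to be set so
         * our next ACK echoes ECE; conversely a segment carrying TH_CWR
         * clears TF2_ECN_SND_ECE and forces an immediate ACK, and one
         * carrying TH_ECE feeds rack_cong_signal(tp, th, CC_ECN).
         */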
7737         /*
7738          * Parse options on any incoming segment.
7739          */
7740         tcp_dooptions(&to, (u_char *)(th + 1),
7741             (th->th_off << 2) - sizeof(struct tcphdr),
7742             (thflags & TH_SYN) ? TO_SYN : 0);
7743
7744         /*
7745          * If echoed timestamp is later than the current time, fall back to
7746          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
7747          * were used when this connection was established.
7748          */
7749         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
7750                 to.to_tsecr -= tp->ts_offset;
7751                 if (TSTMP_GT(to.to_tsecr, cts))
7752                         to.to_tsecr = 0;
7753         }
7754         /*
7755          * If it's the first time in, we need to take care of options and
7756          * verify we can do SACK for rack!
7757          */
7758         if (rack->r_state == 0) {
7759                 /* Should be init'd by rack_init() */
7760                 KASSERT(rack->rc_inp != NULL,
7761                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
7762                 if (rack->rc_inp == NULL) {
7763                         rack->rc_inp = tp->t_inpcb;
7764                 }
7765
7766                 /*
7767                  * Process options only when we get SYN/ACK back. The SYN
7768                  * case for incoming connections is handled in tcp_syncache.
7769                  * According to RFC1323 the window field in a SYN (i.e., a
7770                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
7771                  * this is traditional behavior, may need to be cleaned up.
7772                  */
7773                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
7774                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
7775                         if ((to.to_flags & TOF_SCALE) &&
7776                             (tp->t_flags & TF_REQ_SCALE)) {
7777                                 tp->t_flags |= TF_RCVD_SCALE;
7778                                 tp->snd_scale = to.to_wscale;
7779                         }
7780                         /*
7781                          * Initial send window.  It will be updated with the
7782                          * next incoming segment to the scaled value.
7783                          */
7784                         tp->snd_wnd = th->th_win;
7785                         if (to.to_flags & TOF_TS) {
7786                                 tp->t_flags |= TF_RCVD_TSTMP;
7787                                 tp->ts_recent = to.to_tsval;
7788                                 tp->ts_recent_age = cts;
7789                         }
7790                         if (to.to_flags & TOF_MSS)
7791                                 tcp_mss(tp, to.to_mss);
7792                         if ((tp->t_flags & TF_SACK_PERMIT) &&
7793                             (to.to_flags & TOF_SACKPERM) == 0)
7794                                 tp->t_flags &= ~TF_SACK_PERMIT;
7795                         if (IS_FASTOPEN(tp->t_flags)) {
7796                                 if (to.to_flags & TOF_FASTOPEN) {
7797                                         uint16_t mss;
7798
7799                                         if (to.to_flags & TOF_MSS)
7800                                                 mss = to.to_mss;
7801                                         else
7802                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
7803                                                         mss = TCP6_MSS;
7804                                                 else
7805                                                         mss = TCP_MSS;
7806                                         tcp_fastopen_update_cache(tp, mss,
7807                                             to.to_tfo_len, to.to_tfo_cookie);
7808                                 } else
7809                                         tcp_fastopen_disable_path(tp);
7810                         }
7811                 }
7812                 /*
7813                  * At this point we are at the initial call. Here we decide
7814                  * if we are doing RACK or not. We do this by seeing if
7815                  * TF_SACK_PERMIT is set; if not, rack is *not* possible and
7816                  * we switch to the default code.
7817                  */
7818                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
7819                         tcp_switch_back_to_default(tp);
7820                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
7821                             tlen, iptos);
7822                         return (1);
7823                 }
7824                 /* Set the flag */
7825                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
7826                 tcp_set_hpts(tp->t_inpcb);
7827                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
7828         }
7829         /*
7830          * This is the one exception case where we set the rack state
7831          * always. All other times (timers etc) we must have a rack-state
7832          * set (so we assure we have done the checks above for SACK).
7833          */
7834         memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval));
7835         rack->r_ctl.rc_rcvtime = cts;
7836         if (rack->r_state != tp->t_state)
7837                 rack_set_state(tp, rack);
7838         if (SEQ_GT(th->th_ack, tp->snd_una) &&
7839             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
7840                 kern_prefetch(rsm, &prev_state);
7841         prev_state = rack->r_state;
7842         rack->r_ctl.rc_tlp_send_cnt = 0;
7843         rack_clear_rate_sample(rack);
7844         retval = (*rack->r_substate) (m, th, so,
7845             tp, &to, drop_hdrlen,
7846             tlen, tiwin, thflags, nxt_pkt, iptos);
7847 #ifdef INVARIANTS
7848         if ((retval == 0) &&
7849             (tp->t_inpcb == NULL)) {
7850                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
7851                     retval, tp, prev_state);
7852         }
7853 #endif
7854         if (retval == 0) {
7855                 /*
7856                  * If retval is 1 the tcb is unlocked and most likely the tp
7857                  * is gone.
7858                  */
7859                 INP_WLOCK_ASSERT(tp->t_inpcb);
7860                 if (rack->set_pacing_done_a_iw == 0) {
7861                         /* How much has been acked? */
7862                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
7863                                 /* We have enough to set in the pacing segment size */
7864                                 rack->set_pacing_done_a_iw = 1;
7865                                 rack_set_pace_segments(tp, rack);
7866                         }
7867                 }
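                /*
                 * With a hypothetical fixed maxseg of 1460 bytes, the check
                 * above fires once snd_una has advanced more than
                 * 10 * 1460 = 14600 bytes past iss, i.e. roughly an initial
                 * window's worth of data has been acked, at which point the
                 * pacing segment sizes are recomputed exactly once.
                 */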
7868                 tcp_rack_xmit_timer_commit(rack, tp);
7869                 if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) {
7870                         if (rack->r_wanted_output != 0) {
7871                                 did_out = 1;
7872                                 (void)tp->t_fb->tfb_tcp_output(tp);
7873                         }
7874                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
7875                 }
7876                 if ((nxt_pkt == 0) &&
7877                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
7878                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
7879                      (tp->t_flags & TF_DELACK) ||
7880                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7881                       (tp->t_state <= TCPS_CLOSING)))) {
7882                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
7883                         if ((tp->snd_max == tp->snd_una) &&
7884                             ((tp->t_flags & TF_DELACK) == 0) &&
7885                             (rack->rc_inp->inp_in_hpts) &&
7886                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
7887                                 /* keep alive not needed while we still have hptsi output pending */
7888                                 ;
7889                         } else {
7890                                 if (rack->rc_inp->inp_in_hpts) {
7891                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7892                                         counter_u64_add(rack_per_timer_hole, 1);
7893                                 }
7894                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7895                         }
7896                         way_out = 1;
7897                 } else if (nxt_pkt == 0) {
7898                         /* Do we have the correct timer running? */
7899                         rack_timer_audit(tp, rack, &so->so_snd);
7900                         way_out = 2;
7901                 }
7902         done_with_input:
7903                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
7904                 if (did_out)
7905                         rack->r_wanted_output = 0;
7906 #ifdef INVARIANTS
7907                 if (tp->t_inpcb == NULL) {
7908                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
7909                               did_out,
7910                               retval, tp, prev_state);
7911                 }
7912 #endif
7913         }
7914         return (retval);
7915 }
7916
7917 void
7918 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
7919     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
7920 {
7921         struct timeval tv;
7922
7923         /* First let's see if we have old packets */
7924         if (tp->t_in_pkt) {
7925                 if (ctf_do_queued_segments(so, tp, 1)) {
7926                         m_freem(m);
7927                         return;
7928                 }
7929         }
7930         if (m->m_flags & M_TSTMP_LRO) {
7931                 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7932                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7933         } else {
7934                 /* Should not happen; should we kassert instead? */
7935                 tcp_get_usecs(&tv);
7936         }
7937         if (rack_do_segment_nounlock(m, th, so, tp,
7938                                     drop_hdrlen, tlen, iptos, 0, &tv) == 0)
7939                 INP_WUNLOCK(tp->t_inpcb);
7940 }
7941
7942 struct rack_sendmap *
7943 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
7944 {
7945         struct rack_sendmap *rsm = NULL;
7946         int32_t idx;
7947         uint32_t srtt = 0, thresh = 0, ts_low = 0;
7948
7949         /* Return the next guy to be re-transmitted */
7950         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
7951                 return (NULL);
7952         }
7953         if (tp->t_flags & TF_SENTFIN) {
7954                 /* retran the end FIN? */
7955                 return (NULL);
7956         }
7957         /* OK, let's look at this one */
7958         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7959         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
7960                 goto check_it;
7961         }
7962         rsm = rack_find_lowest_rsm(rack);
7963         if (rsm == NULL) {
7964                 return (NULL);
7965         }
7966 check_it:
7967         if (rsm->r_flags & RACK_ACKED) {
7968                 return (NULL);
7969         }
7970         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
7971                 /* It's not yet ready */
7972                 return (NULL);
7973         }
7974         srtt = rack_grab_rtt(tp, rack);
7975         idx = rsm->r_rtr_cnt - 1;
7976         ts_low = rsm->r_tim_lastsent[idx];
7977         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
7978         if ((tsused == ts_low) ||
7979             (TSTMP_LT(tsused, ts_low))) {
7980                 /* No time since sending */
7981                 return (NULL);
7982         }
7983         if ((tsused - ts_low) < thresh) {
7984                 /* It has not been long enough yet */
7985                 return (NULL);
7986         }
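        /*
         * Hypothetical timing sketch: if this rsm was last sent at
         * ts_low = 1000 ticks, the caller passes tsused = 1030 and
         * rack_calc_thresh_rack() returned thresh = 45, then
         * (tsused - ts_low) = 30 < 45 and the check above returns NULL;
         * the RACK timer simply re-evaluates the same rsm a little later.
         */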
7987         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
7988             ((rsm->r_flags & RACK_SACK_PASSED) &&
7989              (rack->sack_attack_disable == 0))) {
7990                 /*
7991                  * We have passed the dup-ack threshold <or>
7992                  * a SACK has indicated this is missing.
7993                  * Note that if you are a declared attacker
7994                  * it is only the dup-ack threshold that
7995                  * will cause retransmits.
7996                  */
7997                 /* log retransmit reason */
7998                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
7999                 return (rsm);
8000         }
8001         return (NULL);
8002 }
8003
8004 static int32_t
8005 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len)
8006 {
8007         int32_t slot = 0;
8008
8009         if ((rack->rack_per_of_gp == 0) ||
8010             (rack->rc_always_pace == 0)) {
8011                 /*
8012                  * We use the most optimistic possible cwnd/srtt for
8013                  * sending calculations. This will make our
8014                  * calculation anticipate getting more through
8015                  * quicker than is actually possible. But that's OK;
8016                  * we don't want the peer to have a gap in data sending.
8017                  */
8018                 uint32_t srtt, cwnd, tr_perms = 0;
8019
8020 old_method:
8021                 if (rack->r_ctl.rc_rack_min_rtt)
8022                         srtt = rack->r_ctl.rc_rack_min_rtt;
8023                 else
8024                         srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8025                 if (rack->r_ctl.rc_rack_largest_cwnd)
8026                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8027                 else
8028                         cwnd = tp->snd_cwnd;
8029                 tr_perms = cwnd / srtt;
8030                 if (tr_perms == 0) {
8031                         tr_perms = ctf_fixed_maxseg(tp);
8032                 }
8033                 /*
8034                  * Calculate how long this will take to drain. If
8035                  * the calculation comes out to zero, that's OK; we
8036                  * will use send_a_lot to possibly spin around for
8037                  * more, increasing tot_len_this_send to the point
8038                  * that it is going to require a pace, or we hit the
8039                  * cwnd, in which case we are just waiting for
8040                  * an ACK.
8041                  */
8042                 slot = len / tr_perms;
8043                 /* Now do we reduce the time so we don't run dry? */
8044                 if (slot && rack->rc_pace_reduce) {
8045                         int32_t reduce;
8046
8047                         reduce = (slot / rack->rc_pace_reduce);
8048                         if (reduce < slot) {
8049                                 slot -= reduce;
8050                         } else
8051                                 slot = 0;
8052                 }
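                /*
                 * Worked example for the path above (hypothetical numbers):
                 * with cwnd = 100000 bytes and srtt = 50 ms, tr_perms is
                 * 100000 / 50 = 2000 bytes per ms.  Pacing len = 64000 bytes
                 * gives slot = 64000 / 2000 = 32 ms, and with
                 * rc_pace_reduce = 4 the reduction is 32 / 4 = 8 ms, so the
                 * send is finally scheduled 24 ms out.
                 */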
8053         } else {
8054                 int cnt;
8055                 uint64_t bw_est, bw_raise, res, lentim;
8056
8057                 bw_est = 0;
8058                 for (cnt=0; cnt<RACK_GP_HIST; cnt++) {
8059                         if ((rack->r_ctl.rc_gp_hist_filled == 0) &&
8060                             (rack->r_ctl.rc_gp_history[cnt] == 0))
8061                                 break;
8062                         bw_est += rack->r_ctl.rc_gp_history[cnt];
8063                 }
8064                 if (bw_est == 0) {
8065                         /*
8066                          * No way yet to make a b/w estimate
8067                          * (no goodput est yet).
8068                          */
8069                         goto old_method;
8070                 }
8071                 /* Convert to bytes per second */
8072                 bw_est *= MSEC_IN_SECOND;
8073                 /*
8074                  * Now ratchet it up by our percentage. Note
8075                  * that the minimum you can do is 1 which would
8076                  * get you 101% of the average last N goodput estimates.
8077                  * The max you can do is 256 which would yield you
8078                  * 356% of the last N goodput estimates.
8079                  */
8080                 bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp;
8081                 bw_est += bw_raise;
8082                 /* average by the number we added */
8083                 bw_est /= cnt;
8084                 /* Now calculate a rate based on this b/w */
8085                 lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND;
8086                 res = lentim / bw_est;
8087                 slot = (uint32_t)res;
8088         }
8089         if (rack->r_enforce_min_pace &&
8090             (slot == 0)) {
8091                 /* We are enforcing a minimum pace time of 1ms */
8092                 slot = rack->r_enforce_min_pace;
8093         }
8094         if (slot)
8095                 counter_u64_add(rack_calc_nonzero, 1);
8096         else
8097                 counter_u64_add(rack_calc_zero, 1);
8098         return (slot);
8099 }
8100
8101 static int
8102 rack_output(struct tcpcb *tp)
8103 {
8104         struct socket *so;
8105         uint32_t recwin, sendwin;
8106         uint32_t sb_offset;
8107         int32_t len, flags, error = 0;
8108         struct mbuf *m;
8109         struct mbuf *mb;
8110         uint32_t if_hw_tsomaxsegcount = 0;
8111         uint32_t if_hw_tsomaxsegsize = 0;
8112         int32_t maxseg;
8113         long tot_len_this_send = 0;
8114         struct ip *ip = NULL;
8115 #ifdef TCPDEBUG
8116         struct ipovly *ipov = NULL;
8117 #endif
8118         struct udphdr *udp = NULL;
8119         struct tcp_rack *rack;
8120         struct tcphdr *th;
8121         uint8_t pass = 0;
8122         uint8_t wanted_cookie = 0;
8123         u_char opt[TCP_MAXOLEN];
8124         unsigned ipoptlen, optlen, hdrlen, ulen=0;
8125         uint32_t rack_seq;
8126
8127 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8128         unsigned ipsec_optlen = 0;
8129
8130 #endif
8131         int32_t idle, sendalot;
8132         int32_t sub_from_prr = 0;
8133         volatile int32_t sack_rxmit;
8134         struct rack_sendmap *rsm = NULL;
8135         int32_t tso, mtu;
8136         struct tcpopt to;
8137         int32_t slot = 0;
8138         int32_t sup_rack = 0;
8139         uint32_t cts;
8140         uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
8141         int32_t do_a_prefetch;
8142         int32_t prefetch_rsm = 0;
8143         int force_tso = 0;
8144         int32_t orig_len;
8145         int32_t prefetch_so_done = 0;
8146         struct tcp_log_buffer *lgb = NULL;
8147         struct inpcb *inp;
8148         struct sockbuf *sb;
8149 #ifdef INET6
8150         struct ip6_hdr *ip6 = NULL;
8151         int32_t isipv6;
8152 #endif
8153         uint8_t filled_all = 0;
8154         bool hw_tls = false;
8155
8156         /* setup and take the cache hits here */
8157         rack = (struct tcp_rack *)tp->t_fb_ptr;
8158         inp = rack->rc_inp;
8159         so = inp->inp_socket;
8160         sb = &so->so_snd;
8161         kern_prefetch(sb, &do_a_prefetch);
8162         do_a_prefetch = 1;
8163
8164 #ifdef KERN_TLS
8165         hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
8166 #endif
8167
8168         NET_EPOCH_ASSERT();
8169         INP_WLOCK_ASSERT(inp);
8170
8171 #ifdef TCP_OFFLOAD
8172         if (tp->t_flags & TF_TOE)
8173                 return (tcp_offload_output(tp));
8174 #endif
8175         maxseg = ctf_fixed_maxseg(tp);
8176         /*
8177          * For TFO connections in SYN_RECEIVED, only allow the initial
8178          * SYN|ACK and those sent by the retransmit timer.
8179          */
8180         if (IS_FASTOPEN(tp->t_flags) &&
8181             (tp->t_state == TCPS_SYN_RECEIVED) &&
8182             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
8183             (rack->r_ctl.rc_resend == NULL))         /* not a retransmit */
8184                 return (0);
8185 #ifdef INET6
8186         if (rack->r_state) {
8187                 /* Use the cache line loaded if possible */
8188                 isipv6 = rack->r_is_v6;
8189         } else {
8190                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
8191         }
8192 #endif
8193         cts = tcp_ts_getticks();
8194         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
8195             inp->inp_in_hpts) {
8196                 /*
8197                  * We are on the hpts for some timer but not hptsi output.
8198                  * Remove from the hpts unconditionally.
8199                  */
8200                 rack_timer_cancel(tp, rack, cts, __LINE__);
8201         }
8202         /* Mark that we have called rack_output(). */
8203         if ((rack->r_timer_override) ||
8204             (tp->t_flags & TF_FORCEDATA) ||
8205             (tp->t_state < TCPS_ESTABLISHED)) {
8206                 if (tp->t_inpcb->inp_in_hpts)
8207                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
8208         } else if (tp->t_inpcb->inp_in_hpts) {
8209                 /*
8210                  * On the hpts you can't pass even if ACKNOW is on; we will
8211                  * send when the hpts fires.
8212                  */
8213                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
8214                 return (0);
8215         }
8216         hpts_calling = inp->inp_hpts_calls;
8217         inp->inp_hpts_calls = 0;
8218         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8219                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
8220                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
8221                         return (0);
8222                 }
8223         }
8224         rack->r_wanted_output = 0;
8225         rack->r_timer_override = 0;
8226         /*
8227          * For TFO connections in SYN_SENT or SYN_RECEIVED,
8228          * only allow the initial SYN or SYN|ACK and those sent
8229          * by the retransmit timer.
8230          */
8231         if (IS_FASTOPEN(tp->t_flags) &&
8232             ((tp->t_state == TCPS_SYN_RECEIVED) ||
8233              (tp->t_state == TCPS_SYN_SENT)) &&
8234             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
8235             (tp->t_rxtshift == 0))              /* not a retransmit */
8236                 return (0);
8237         /*
8238          * Determine length of data that should be transmitted, and flags
8239          * that will be used. If there is some data or critical controls
8240          * (SYN, RST) to send, then transmit; otherwise, investigate
8241          * further.
8242          */
8243         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
8244         if (tp->t_idle_reduce) {
8245                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
8246                         rack_cc_after_idle(tp);
8247         }
8248         tp->t_flags &= ~TF_LASTIDLE;
8249         if (idle) {
8250                 if (tp->t_flags & TF_MORETOCOME) {
8251                         tp->t_flags |= TF_LASTIDLE;
8252                         idle = 0;
8253                 }
8254         }
8255 again:
8256         /*
8257          * If we've recently taken a timeout, snd_max will be greater than
8258          * snd_nxt.  There may be SACK information that allows us to avoid
8259          * resending already delivered data.  Adjust snd_nxt accordingly.
8260          */
8261         sendalot = 0;
8262         cts = tcp_ts_getticks();
8263         tso = 0;
8264         mtu = 0;
8265         sb_offset = tp->snd_max - tp->snd_una;
8266         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
8267
8268         flags = tcp_outflags[tp->t_state];
8269         while (rack->rc_free_cnt < rack_free_cache) {
8270                 rsm = rack_alloc(rack);
8271                 if (rsm == NULL) {
8272                         if (inp->inp_hpts_calls)
8273                                 /* Retry in a ms */
8274                                 slot = 1;
8275                         goto just_return_nolock;
8276                 }
8277                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
8278                 rack->rc_free_cnt++;
8279                 rsm = NULL;
8280         }
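        /*
         * The loop above tops up the per-connection reserve of sendmap
         * entries; with a hypothetical rack_free_cache of 2 it keeps two
         * spare entries on rc_free.  If even that preallocation fails while
         * we were called from the hpts, we give up on this pass and ask to
         * be retried in roughly a millisecond (slot = 1).
         */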
8281         if (inp->inp_hpts_calls)
8282                 inp->inp_hpts_calls = 0;
8283         sack_rxmit = 0;
8284         len = 0;
8285         rsm = NULL;
8286         if (flags & TH_RST) {
8287                 SOCKBUF_LOCK(sb);
8288                 goto send;
8289         }
8290         if (rack->r_ctl.rc_tlpsend) {
8291                 /* Tail loss probe */
8292                 long cwin;
8293                 long tlen;
8294
8295                 doing_tlp = 1;
8296                 /*
8297                  * Check if we can do a TLP with a RACK'd packet.
8298                  * This can happen if we are not doing the rack
8299                  * cheat and we skipped to a TLP and it
8300                  * went off.
8301                  */
8302                 rsm = tcp_rack_output(tp, rack, cts);
8303                 if (rsm == NULL)
8304                         rsm = rack->r_ctl.rc_tlpsend;
8305                 rack->r_ctl.rc_tlpsend = NULL;
8306                 sack_rxmit = 1;
8307                 tlen = rsm->r_end - rsm->r_start;
8308                 if (tlen > ctf_fixed_maxseg(tp))
8309                         tlen = ctf_fixed_maxseg(tp);
8310                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8311                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8312                     __func__, __LINE__,
8313                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8314                 sb_offset = rsm->r_start - tp->snd_una;
8315                 cwin = min(tp->snd_wnd, tlen);
8316                 len = cwin;
8317         } else if (rack->r_ctl.rc_resend) {
8318                 /* Retransmit timer */
8319                 rsm = rack->r_ctl.rc_resend;
8320                 rack->r_ctl.rc_resend = NULL;
8321                 len = rsm->r_end - rsm->r_start;
8322                 sack_rxmit = 1;
8323                 sendalot = 0;
8324                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8325                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8326                     __func__, __LINE__,
8327                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8328                 sb_offset = rsm->r_start - tp->snd_una;
8329                 if (len >= ctf_fixed_maxseg(tp)) {
8330                         len = ctf_fixed_maxseg(tp);
8331                 }
8332         } else if ((rack->rc_in_persist == 0) &&
8333             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
8334                 int maxseg;
8335
8336                 maxseg = ctf_fixed_maxseg(tp);
8337                 if ((!IN_RECOVERY(tp->t_flags)) &&
8338                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
8339                         /* Enter recovery if not induced by a time-out */
8340                         rack->r_ctl.rc_rsm_start = rsm->r_start;
8341                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8342                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8343                         rack_cong_signal(tp, NULL, CC_NDUPACK);
8344                         /*
8345                          * When we enter recovery we need to assure we send
8346                          * one packet.
8347                          */
8348                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
8349                         rack_log_to_prr(rack, 13);
8350                 }
8351 #ifdef INVARIANTS
8352                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
8353                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
8354                             tp, rack, rsm, rsm->r_start, tp->snd_una);
8355                 }
8356 #endif
8357                 len = rsm->r_end - rsm->r_start;
8358                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8359                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8360                     __func__, __LINE__,
8361                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8362                 sb_offset = rsm->r_start - tp->snd_una;
8363                 /* Can we send it within the PRR boundary? */
8364                 if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
8365                         /* It does not fit */
8366                         if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
8367                             (rack->r_ctl.rc_prr_sndcnt < maxseg)) {
8368                                 /*
8369                                  * prr is less than a segment and we
8370                                  * have more acks due in besides
8371                                  * what we need to resend. Let's not send,
8372                                  * to avoid sending small pieces of
8373                                  * what we need to retransmit.
8374                                  */
8375                                 len = 0;
8376                                 goto just_return_nolock;
8377                         }
8378                         len = rack->r_ctl.rc_prr_sndcnt;
8379                 }
8380                 sendalot = 0;
8381                 if (len >= maxseg) {
8382                         len = maxseg;
8383                 }
8384                 if (len > 0) {
8385                         sub_from_prr = 1;
8386                         sack_rxmit = 1;
8387                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
8388                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
8389                             min(len, ctf_fixed_maxseg(tp)));
8390                         counter_u64_add(rack_rtm_prr_retran, 1);
8391                 }
8392         }
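        /*
         * Sketch of the PRR clamp above (hypothetical numbers, assuming
         * use_rack_cheat is 0): if the chosen rsm covers 4380 bytes but
         * rc_prr_sndcnt is 1460 (one maxseg), len is first cut down to the
         * 1460-byte PRR allowance and then capped at maxseg, so a single
         * segment is retransmitted and flagged via sub_from_prr so it is
         * charged against the PRR budget.
         */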
8393         /*
8394          * Enforce a connection sendmap count limit if set,
8395          * as long as we are not retransmitting.
8396          */
8397         if ((rsm == NULL) &&
8398             (rack->do_detection == 0) &&
8399             (V_tcp_map_entries_limit > 0) &&
8400             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
8401                 counter_u64_add(rack_to_alloc_limited, 1);
8402                 if (!rack->alloc_limit_reported) {
8403                         rack->alloc_limit_reported = 1;
8404                         counter_u64_add(rack_alloc_limited_conns, 1);
8405                 }
8406                 goto just_return_nolock;
8407         }
8408         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
8409                 /* we are retransmitting the fin */
8410                 len--;
8411                 if (len) {
8412                         /*
8413                          * When retransmitting data do *not* include the
8414                          * FIN. This could happen from a TLP probe.
8415                          */
8416                         flags &= ~TH_FIN;
8417                 }
8418         }
8419 #ifdef INVARIANTS
8420         /* For debugging */
8421         rack->r_ctl.rc_rsm_at_retran = rsm;
8422 #endif
8423         /*
8424          * Get standard flags, and add SYN or FIN if requested by 'hidden'
8425          * state flags.
8426          */
8427         if (tp->t_flags & TF_NEEDFIN)
8428                 flags |= TH_FIN;
8429         if (tp->t_flags & TF_NEEDSYN)
8430                 flags |= TH_SYN;
8431         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
8432                 void *end_rsm;
8433                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
8434                 if (end_rsm)
8435                         kern_prefetch(end_rsm, &prefetch_rsm);
8436                 prefetch_rsm = 1;
8437         }
8438         SOCKBUF_LOCK(sb);
8439         /*
8440          * If in persist timeout with window of 0, send 1 byte. Otherwise,
8441          * if the window is small but nonzero and the timer has expired, we
8442          * will send what we can and go to transmit state.
8443          */
8444         if (tp->t_flags & TF_FORCEDATA) {
8445                 if (sendwin == 0) {
8446                         /*
8447                          * If we still have some data to send, then clear
8448                          * the FIN bit.  Usually this would happen below
8449                          * when it realizes that we aren't sending all the
8450                          * data.  However, if we have exactly 1 byte of
8451                          * unsent data, then it won't clear the FIN bit
8452                          * below, and if we are in persist state, we wind up
8453                          * sending the packet without recording that we sent
8454                          * the FIN bit.
8455                          *
8456                          * We can't just blindly clear the FIN bit, because
8457                          * if we don't have any more data to send then the
8458                          * probe will be the FIN itself.
8459                          */
8460                         if (sb_offset < sbused(sb))
8461                                 flags &= ~TH_FIN;
8462                         sendwin = 1;
8463                 } else {
8464                         if ((rack->rc_in_persist != 0) &&
8465                             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
8466                                                rack->r_ctl.rc_pace_min_segs)))
8467                                 rack_exit_persist(tp, rack);
8468                         /*
8469                          * If we are dropping persist mode then we need to
8470                          * correct snd_nxt/snd_max and off.
8471                          */
8472                         tp->snd_nxt = tp->snd_max;
8473                         sb_offset = tp->snd_nxt - tp->snd_una;
8474                 }
8475         }
8476         /*
8477          * If snd_nxt == snd_max and we have transmitted a FIN, the
8478          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
8479          * negative length.  This can also occur when TCP opens up its
8480          * congestion window while receiving additional duplicate acks after
8481          * fast-retransmit because TCP will reset snd_nxt to snd_max after
8482          * the fast-retransmit.
8483          *
8484          * In the normal retransmit-FIN-only case, however, snd_nxt will be
8485          * set to snd_una, the sb_offset will be 0, and the length may wind
8486          * up 0.
8487          *
8488          * If sack_rxmit is true we are retransmitting from the scoreboard
8489          * in which case len is already set.
8490          */
8491         if (sack_rxmit == 0) {
8492                 uint32_t avail;
8493
8494                 avail = sbavail(sb);
8495                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
8496                         sb_offset = tp->snd_nxt - tp->snd_una;
8497                 else
8498                         sb_offset = 0;
8499                 if (IN_RECOVERY(tp->t_flags) == 0) {
8500                         if (rack->r_ctl.rc_tlp_new_data) {
8501                                 /* TLP is forcing out new data */
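                                /*
                                 * The TLP length is clamped first to the data
                                 * actually available past sb_offset and then
                                 * to the peer's advertised window (snd_wnd).
                                 */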
8502                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
8503                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
8504                                 }
8505                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
8506                                         len = tp->snd_wnd;
8507                                 else
8508                                         len = rack->r_ctl.rc_tlp_new_data;
8509                                 rack->r_ctl.rc_tlp_new_data = 0;
8510                                 new_data_tlp = doing_tlp = 1;
8511                         } else {
8512                                 if (sendwin > avail) {
8513                                         /* use the available */
8514                                         if (avail > sb_offset) {
8515                                                 len = (int32_t)(avail - sb_offset);
8516                                         } else {
8517                                                 len = 0;
8518                                         }
8519                                 } else {
8520                                         if (sendwin > sb_offset) {
8521                                                 len = (int32_t)(sendwin - sb_offset);
8522                                         } else {
8523                                                 len = 0;
8524                                         }
8525                                 }
8526                         }
8527                 } else {
8528                         uint32_t outstanding;
8529
8530                         /*
8531                          * We are inside of a SACK recovery episode and are
8532                          * sending new data, having retransmitted all the
8533                          * data possible so far in the scoreboard.
8534                          */
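                        /*
                         * The new-data length chosen below is bounded by the
                         * peer's window (snd_wnd minus what is outstanding),
                         * by what remains in the socket buffer, and by the
                         * PRR send credit (rc_prr_sndcnt).
                         */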
8535                         outstanding = tp->snd_max - tp->snd_una;
8536                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
8537                                 if (tp->snd_wnd > outstanding) {
8538                                         len = tp->snd_wnd - outstanding;
8539                                         /* Check to see if we have the data */
8540                                         if (((sb_offset + len) > avail) &&
8541                                             (avail > sb_offset))
8542                                                 len = avail - sb_offset;
8543                                         else
8544                                                 len = 0;
8545                                 } else
8546                                         len = 0;
8547                         } else if (avail > sb_offset)
8548                                 len = avail - sb_offset;
8549                         else
8550                                 len = 0;
8551                         if (len > 0) {
8552                                 if (len > rack->r_ctl.rc_prr_sndcnt)
8553                                         len = rack->r_ctl.rc_prr_sndcnt;
8554                                 if (len > 0) {
8555                                         sub_from_prr = 1;
8556                                         counter_u64_add(rack_rtm_prr_newdata, 1);
8557                                 }
8558                         }
8559                         if (len > ctf_fixed_maxseg(tp)) {
8560                                 /*
8561                                  * We should never send more than one MSS when
8562                                  * retransmitting or sending new data in PRR
8563                                  * mode unless the override flag is on. Most
8564                                  * likely the PRR algorithm is not going to
8565                                  * let us send much anyway :-)
8566                                  */
8567                                 if (rack->r_ctl.rc_prr_sendalot == 0)
8568                                         len = ctf_fixed_maxseg(tp);
8569                         } else if (len < ctf_fixed_maxseg(tp)) {
8570                                 /*
8571                                  * Do we send anything? The idea here is that if the
8572                                  * send empties the socket buffer we want to
8573                                  * do it. However, if not, then let's just wait
8574                                  * for our prr_sndcnt to get bigger.
8575                                  */
8576                                 long leftinsb;
8577
8578                                 leftinsb = sbavail(sb) - sb_offset;
8579                                 if (leftinsb > len) {
8580                                         /* This send does not empty the sb */
8581                                         len = 0;
8582                                 }
8583                         }
8584                 }
8585         }
8586         if (prefetch_so_done == 0) {
8587                 kern_prefetch(so, &prefetch_so_done);
8588                 prefetch_so_done = 1;
8589         }
8590         /*
8591          * Lop off SYN bit if it has already been sent.  However, if this is
8592          * SYN-SENT state and if segment contains data and if we don't know
8593          * that foreign host supports TAO, suppress sending segment.
8594          */
8595         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
8596             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
8597                 if (tp->t_state != TCPS_SYN_RECEIVED)
8598                         flags &= ~TH_SYN;
8599                 /*
8600                  * When sending additional segments following a TFO SYN|ACK,
8601                  * do not include the SYN bit.
8602                  */
8603                 if (IS_FASTOPEN(tp->t_flags) &&
8604                     (tp->t_state == TCPS_SYN_RECEIVED))
8605                         flags &= ~TH_SYN;
8606                 sb_offset--, len++;
8607         }
8608         /*
8609          * Be careful not to send data and/or FIN on SYN segments. This
8610          * measure is needed to prevent interoperability problems with not
8611          * fully conformant TCP implementations.
8612          */
8613         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
8614                 len = 0;
8615                 flags &= ~TH_FIN;
8616         }
8617         /*
8618          * On TFO sockets, ensure no data is sent in the following cases:
8619          *
8620          *  - When retransmitting SYN|ACK on a passively-created socket
8621          *
8622          *  - When retransmitting SYN on an actively created socket
8623          *
8624          *  - When sending a zero-length cookie (cookie request) on an
8625          *    actively created socket
8626          *
8627          *  - When the socket is in the CLOSED state (RST is being sent)
8628          */
8629         if (IS_FASTOPEN(tp->t_flags) &&
8630             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
8631              ((tp->t_state == TCPS_SYN_SENT) &&
8632               (tp->t_tfo_client_cookie_len == 0)) ||
8633              (flags & TH_RST))) {
8634                 sack_rxmit = 0;
8635                 len = 0;
8636         }
8637         /* Without fast-open there should never be data sent on a SYN */
8638         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
8639                 len = 0;
8640         orig_len = len;
8641         if (len <= 0) {
8642                 /*
8643                  * If FIN has been sent but not acked, but we haven't been
8644                  * called to retransmit, len will be < 0.  Otherwise, window
8645                  * shrank after we sent into it.  If window shrank to 0,
8646                  * cancel pending retransmit, pull snd_nxt back to (closed)
8647                  * window, and set the persist timer if it isn't already
8648                  * going.  If the window didn't close completely, just wait
8649                  * for an ACK.
8650                  *
8651                  * We also do a general check here to ensure that we will
8652                  * set the persist timer when we have data to send, but a
8653                  * 0-byte window. This makes sure the persist timer is set
8654                  * even if the packet hits one of the "goto send" lines
8655                  * below.
8656                  */
8657                 len = 0;
8658                 if ((tp->snd_wnd == 0) &&
8659                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8660                     (tp->snd_una == tp->snd_max) &&
8661                     (sb_offset < (int)sbavail(sb))) {
8662                         tp->snd_nxt = tp->snd_una;
8663                         rack_enter_persist(tp, rack, cts);
8664                 }
8665         } else if ((rsm == NULL) &&
8666                    ((doing_tlp == 0) || (new_data_tlp == 1)) &&
8667                    (len < rack->r_ctl.rc_pace_max_segs)) {
8668                 /*
8669                  * We are not sending a full segment for
8670                  * some reason. Should we not send anything (think
8671                  * sws or persists)?
8672                  */
8673                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8674                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8675                     (len < (int)(sbavail(sb) - sb_offset))) {
8676                         /*
8677                          * Here the rwnd is less than
8678                          * the pacing size, this is not a retransmit,
8679                          * we are established, and
8680                          * the send is not the last data in the socket buffer,
8681                          * so we send nothing and may enter persist.
8682                          */
8683                         len = 0;
8684                         if (tp->snd_max == tp->snd_una) {
8685                                 /*
8686                                  * Nothing is outstanding, so we can
8687                                  * go into persist.
8688                                  */
8689                                 rack_enter_persist(tp, rack, cts);
8690                                 tp->snd_nxt = tp->snd_una;
8691                         }
8692                 } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) &&
8693                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8694                            (len < (int)(sbavail(sb) - sb_offset)) &&
8695                            (len < rack->r_ctl.rc_pace_min_segs)) {
8696                         /*
8697                          * Here we are not retransmitting, and
8698                          * the cwnd is not so small that we could
8699                          * not send at least a min size segment (the rxt
8700                          * timer has not gone off). We have 2 or more
8701                          * segments already in flight, it's not the tail end
8702                          * of the socket buffer, and the cwnd is blocking
8703                          * us from sending out a minimum pacing segment size.
8704                          * Let's not send anything.
8705                          */
8706                         len = 0;
8707                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
8708                             min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8709                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8710                            (len < (int)(sbavail(sb) - sb_offset)) &&
8711                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
8712                         /*
8713                          * Here we have a send window, but we have
8714                          * filled it up and we can't send another pacing segment.
8715                          * We also have more than 2 segments in flight
8716                          * and this send does not complete the sb (i.e. we allow
8717                          * the last bytes of the sb to go out even if
8718                          * they do not make a full pacing segment).
8719                          */
8720                         len = 0;
8721                 }
8722         }
8723         /* len will be >= 0 after this point. */
8724         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
8725         tcp_sndbuf_autoscale(tp, so, sendwin);
8726         /*
8727          * Decide if we can use TCP Segmentation Offloading (if supported by
8728          * hardware).
8729          *
8730          * TSO may only be used if we are in a pure bulk sending state.  The
8731          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
8732          * options prevent using TSO.  With TSO the TCP header is the same
8733          * (except for the sequence number) for all generated packets.  This
8734          * makes it impossible to transmit any options which vary per
8735          * generated segment or packet.
8736          *
8737          * IPv4 handling has a clear separation of ip options and ip header
8738          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
8739          * the right thing below to provide length of just ip options and thus
8740          * checking for ipoptlen is enough to decide if ip options are present.
8741          */
8742
8743 #ifdef INET6
8744         if (isipv6)
8745                 ipoptlen = ip6_optlen(tp->t_inpcb);
8746         else
8747 #endif
8748                 if (tp->t_inpcb->inp_options)
8749                         ipoptlen = tp->t_inpcb->inp_options->m_len -
8750                             offsetof(struct ipoption, ipopt_list);
8751                 else
8752                         ipoptlen = 0;
8753 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8754         /*
8755          * Pre-calculate here as we save another lookup into the darknesses
8756          * of IPsec that way and can actually decide if TSO is ok.
8757          */
8758 #ifdef INET6
8759         if (isipv6 && IPSEC_ENABLED(ipv6))
8760                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
8761 #ifdef INET
8762         else
8763 #endif
8764 #endif                          /* INET6 */
8765 #ifdef INET
8766         if (IPSEC_ENABLED(ipv4))
8767                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
8768 #endif                          /* INET */
8769 #endif
8770
8771 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8772         ipoptlen += ipsec_optlen;
8773 #endif
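        /*
         * TSO is only considered for a bulk send of more than one full
         * segment with no UDP tunneling port, no MD5 signature, no SACK
         * blocks to advertise, no SACK retransmission and no IP options.
         */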
8774         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) &&
8775             (tp->t_port == 0) &&
8776             ((tp->t_flags & TF_SIGNATURE) == 0) &&
8777             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
8778             ipoptlen == 0)
8779                 tso = 1;
8780         {
8781                 uint32_t outstanding;
8782
8783                 outstanding = tp->snd_max - tp->snd_una;
8784                 if (tp->t_flags & TF_SENTFIN) {
8785                         /*
8786                          * If we sent a fin, snd_max is 1 higher than
8787                          * snd_una
8788                          */
8789                         outstanding--;
8790                 }
8791                 if (sack_rxmit) {
8792                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
8793                                 flags &= ~TH_FIN;
8794                 } else {
8795                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
8796                             sbused(sb)))
8797                                 flags &= ~TH_FIN;
8798                 }
8799         }
8800         recwin = sbspace(&so->so_rcv);
8801
8802         /*
8803          * Sender silly window avoidance.   We transmit under the following
8804          * conditions when len is non-zero:
8805          *
8806          * - We have a full segment (or more with TSO)
8807          * - This is the last buffer in a write()/send() and we are idle or running NODELAY
8808          * - We've timed out (e.g. the persist timer)
8809          * - We have more than 1/2 the maximum send window's worth of data (the receiver may be limiting the window)
8810          * - We need to retransmit
8811          */
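        /*
         * Each "goto send" below records a distinct value in 'pass' so the
         * reason for transmitting can be told apart later (presumably in
         * logging/diagnostics).
         */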
8812         if (len) {
8813                 if (len >= ctf_fixed_maxseg(tp)) {
8814                         pass = 1;
8815                         goto send;
8816                 }
8817                 /*
8818                  * NOTE! on localhost connections an 'ack' from the remote
8819                  * end may occur synchronously with the output and cause us
8820                  * to flush a buffer queued with moretocome.  XXX
8821                  *
8822                  */
8823                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
8824                     (idle || (tp->t_flags & TF_NODELAY)) &&
8825                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
8826                     (tp->t_flags & TF_NOPUSH) == 0) {
8827                         pass = 2;
8828                         goto send;
8829                 }
8830                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
8831                         pass = 3;
8832                         goto send;
8833                 }
8834                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
8835                         goto send;
8836                 }
8837                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
8838                         pass = 4;
8839                         goto send;
8840                 }
8841                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
8842                         pass = 5;
8843                         goto send;
8844                 }
8845                 if (sack_rxmit) {
8846                         pass = 6;
8847                         goto send;
8848                 }
8849         }
8850         /*
8851          * Sending of standalone window updates.
8852          *
8853          * Window updates are important when we close our window due to a
8854          * full socket buffer and are opening it again after the application
8855          * reads data from it.  Once the window has opened again and the
8856          * remote end starts to send again the ACK clock takes over and
8857          * provides the most current window information.
8858          *
8859          * We must avoid the silly window syndrome whereby every read from
8860          * the receive buffer, no matter how small, causes a window update
8861          * to be sent.  We also should avoid sending a flurry of window
8862          * updates when the socket buffer had queued a lot of data and the
8863          * application is doing small reads.
8864          *
8865          * Prevent a flurry of pointless window updates by only sending an
8866          * update when we can increase the advertised window by more than
8867          * 1/4th of the socket buffer capacity.  When the buffer is getting
8868          * full or is very small be more aggressive and send an update
8869          * whenever we can increase by two mss sized segments. In all other
8870          * situations the ACK's to new incoming data will carry further
8871          * window increases.
8872          *
8873          * Don't send an independent window update if a delayed ACK is
8874          * pending (it will get piggy-backed on it) or the remote side
8875          * already has done a half-close and won't send more data.  Skip
8876          * this if the connection is in T/TCP half-open state.
8877          */
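        /*
         * For example, with a 64KB receive buffer an update goes out once
         * the window can grow by more than 16KB (1/4 of the buffer); when
         * the buffer is small or nearly full, an increase of two MSS-sized
         * segments is enough.
         */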
8878         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
8879             !(tp->t_flags & TF_DELACK) &&
8880             !TCPS_HAVERCVDFIN(tp->t_state)) {
8881                 /*
8882                  * "adv" is the amount we could increase the window, taking
8883                  * into account that we are limited by TCP_MAXWIN <<
8884                  * tp->rcv_scale.
8885                  */
8886                 int32_t adv;
8887                 int oldwin;
8888
8889                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
8890                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
8891                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
8892                         adv -= oldwin;
8893                 } else
8894                         oldwin = 0;
8895
8896                 /*
8897                  * If the new window size ends up being the same as the old
8898                  * size when it is scaled, then don't force a window update.
8899                  */
8900                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
8901                         goto dontupdate;
8902
8903                 if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) &&
8904                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
8905                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
8906                      so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) {
8907                         pass = 7;
8908                         goto send;
8909                 }
8910                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
8911                         goto send;
8912         }
8913 dontupdate:
8914
8915         /*
8916          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
8917          * is also a catch-all for the retransmit timer timeout case.
8918          */
8919         if (tp->t_flags & TF_ACKNOW) {
8920                 pass = 8;
8921                 goto send;
8922         }
8923         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
8924                 pass = 9;
8925                 goto send;
8926         }
8927         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
8928                 pass = 10;
8929                 goto send;
8930         }
8931         /*
8932          * If our state indicates that FIN should be sent and we have not
8933          * yet done so, then we need to send.
8934          */
8935         if ((flags & TH_FIN) &&
8936             (tp->snd_nxt == tp->snd_una)) {
8937                 pass = 11;
8938                 goto send;
8939         }
8940         /*
8941          * No reason to send a segment, just return.
8942          */
8943 just_return:
8944         SOCKBUF_UNLOCK(sb);
8945 just_return_nolock:
8946         if (tot_len_this_send == 0)
8947                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
8948         if (slot) {
8949                 /* set the rack tcb into the slot N */
8950                 counter_u64_add(rack_paced_segments, 1);
8951         } else if (tot_len_this_send) {
8952                 counter_u64_add(rack_unpaced_segments, 1);
8953         }
8954         /* Check if we need to go into persists or not */
8955         if ((rack->rc_in_persist == 0) &&
8956             (tp->snd_max == tp->snd_una) &&
8957             TCPS_HAVEESTABLISHED(tp->t_state) &&
8958             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8959             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) &&
8960             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) {
8961                 /* Yes, let's make sure to move to persist before the timer starts */
8962                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
8963         }
8964         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
8965         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
8966         tp->t_flags &= ~TF_FORCEDATA;
8967         return (0);
8968
8969 send:
8970         if ((flags & TH_FIN) &&
8971             sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8972                 /*
8973                  * We do not transmit a FIN
8974                  * with data outstanding. We
8975                  * need to make it so all data
8976                  * is acked first.
8977                  */
8978                 flags &= ~TH_FIN;
8979         }
8980         if (doing_tlp == 0) {
8981                 /*
8982                  * The data is not a TLP, and it's not the rxt firing. If it is the
8983                  * rxt firing, we want to leave the tlp_in_progress flag on
8984                  * so we don't send another TLP. It has to be a rack timer
8985                  * or normal send (response to acked data) to clear the tlp
8986                  * in progress flag.
8987                  */
8988                 rack->rc_tlp_in_progress = 0;
8989         }
8990         SOCKBUF_LOCK_ASSERT(sb);
8991         if (len > 0) {
8992                 if (len >= ctf_fixed_maxseg(tp))
8993                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
8994                 else
8995                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
8996         }
8997         /*
8998          * Before ESTABLISHED, force sending of initial options unless TCP
8999          * Before ESTABLISHED, force sending of initial options unless TCP is
9000          * set not to do any options. NOTE: we assume that the IP/TCP header
9001          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
9002          * + optlen <= MCLBYTES
9003          */
9004         optlen = 0;
9005 #ifdef INET6
9006         if (isipv6)
9007                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
9008         else
9009 #endif
9010                 hdrlen = sizeof(struct tcpiphdr);
9011
9012         /*
9013          * Compute options for segment. We only have to care about SYN and
9014          * established connection segments.  Options for SYN-ACK segments
9015          * are handled in TCP syncache.
9016          */
9017         to.to_flags = 0;
9018         if ((tp->t_flags & TF_NOOPT) == 0) {
9019                 /* Maximum segment size. */
9020                 if (flags & TH_SYN) {
9021                         tp->snd_nxt = tp->iss;
9022                         to.to_mss = tcp_mssopt(&inp->inp_inc);
9023 #ifdef NETFLIX_TCPOUDP
9024                         if (tp->t_port)
9025                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
9026 #endif
9027                         to.to_flags |= TOF_MSS;
9028
9029                         /*
9030                          * On SYN or SYN|ACK transmits on TFO connections,
9031                          * only include the TFO option if it is not a
9032                          * retransmit, as the presence of the TFO option may
9033                          * have caused the original SYN or SYN|ACK to have
9034                          * been dropped by a middlebox.
9035                          */
9036                         if (IS_FASTOPEN(tp->t_flags) &&
9037                             (tp->t_rxtshift == 0)) {
9038                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
9039                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
9040                                         to.to_tfo_cookie =
9041                                             (u_int8_t *)&tp->t_tfo_cookie.server;
9042                                         to.to_flags |= TOF_FASTOPEN;
9043                                         wanted_cookie = 1;
9044                                 } else if (tp->t_state == TCPS_SYN_SENT) {
9045                                         to.to_tfo_len =
9046                                             tp->t_tfo_client_cookie_len;
9047                                         to.to_tfo_cookie =
9048                                             tp->t_tfo_cookie.client;
9049                                         to.to_flags |= TOF_FASTOPEN;
9050                                         wanted_cookie = 1;
9051                                         /*
9052                                          * If we wind up having more data to
9053                                          * send with the SYN than can fit in
9054                                          * one segment, don't send any more
9055                                          * until the SYN|ACK comes back from
9056                                          * the other end.
9057                                          */
9058                                         sendalot = 0;
9059                                 }
9060                         }
9061                 }
9062                 /* Window scaling. */
9063                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
9064                         to.to_wscale = tp->request_r_scale;
9065                         to.to_flags |= TOF_SCALE;
9066                 }
9067                 /* Timestamps. */
9068                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
9069                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
9070                         to.to_tsval = cts + tp->ts_offset;
9071                         to.to_tsecr = tp->ts_recent;
9072                         to.to_flags |= TOF_TS;
9073                 }
9074                 /* Set receive buffer autosizing timestamp. */
9075                 if (tp->rfbuf_ts == 0 &&
9076                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
9077                         tp->rfbuf_ts = tcp_ts_getticks();
9078                 /* Selective ACK's. */
9079                 if (flags & TH_SYN)
9080                         to.to_flags |= TOF_SACKPERM;
9081                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9082                     tp->rcv_numsacks > 0) {
9083                         to.to_flags |= TOF_SACK;
9084                         to.to_nsacks = tp->rcv_numsacks;
9085                         to.to_sacks = (u_char *)tp->sackblks;
9086                 }
9087 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9088                 /* TCP-MD5 (RFC2385). */
9089                 if (tp->t_flags & TF_SIGNATURE)
9090                         to.to_flags |= TOF_SIGNATURE;
9091 #endif                          /* TCP_SIGNATURE */
9092
9093                 /* Processing the options. */
9094                 hdrlen += optlen = tcp_addoptions(&to, opt);
9095                 /*
9096                  * If we wanted a TFO option to be added, but it was unable
9097                  * to fit, ensure no data is sent.
9098                  */
9099                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
9100                     !(to.to_flags & TOF_FASTOPEN))
9101                         len = 0;
9102         }
9103 #ifdef NETFLIX_TCPOUDP
9104         if (tp->t_port) {
9105                 if (V_tcp_udp_tunneling_port == 0) {
9106                         /* The port was removed?? */
9107                         SOCKBUF_UNLOCK(&so->so_snd);
9108                         return (EHOSTUNREACH);
9109                 }
9110                 hdrlen += sizeof(struct udphdr);
9111         }
9112 #endif
9113 #ifdef INET6
9114         if (isipv6)
9115                 ipoptlen = ip6_optlen(tp->t_inpcb);
9116         else
9117 #endif
9118         if (tp->t_inpcb->inp_options)
9119                 ipoptlen = tp->t_inpcb->inp_options->m_len -
9120                     offsetof(struct ipoption, ipopt_list);
9121         else
9122                 ipoptlen = 0;
9123 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
9124         ipoptlen += ipsec_optlen;
9125 #endif
9126
9127 #ifdef KERN_TLS
9128         /* Force TSO so that TLS offload can get the MSS. */
9129         if (sb->sb_flags & SB_TLS_IFNET) {
9130                 force_tso = 1;
9131         }
9132 #endif
9133         /*
9134          * Adjust data length if insertion of options will bump the packet
9135          * length beyond the t_maxseg length. Clear the FIN bit because we
9136          * cut off the tail of the segment.
9137          */
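        /*
         * With TSO the length is clamped to the hardware's TSO limits and
         * trimmed to whole segments; without TSO it is simply cut back to
         * what fits in one maximum segment and sendalot schedules another
         * pass for the remainder.
         */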
9138         if (len + optlen + ipoptlen > tp->t_maxseg) {
9139                 if (tso) {
9140                         uint32_t if_hw_tsomax;
9141                         uint32_t moff;
9142                         int32_t max_len;
9143
9144                         /* extract TSO information */
9145                         if_hw_tsomax = tp->t_tsomax;
9146                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
9147                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
9148                         KASSERT(ipoptlen == 0,
9149                             ("%s: TSO can't do IP options", __func__));
9150
9151                         /*
9152                          * Check if we should limit by maximum payload
9153                          * length:
9154                          */
9155                         if (if_hw_tsomax != 0) {
9156                                 /* compute maximum TSO length */
9157                                 max_len = (if_hw_tsomax - hdrlen -
9158                                     max_linkhdr);
9159                                 if (max_len <= 0) {
9160                                         len = 0;
9161                                 } else if (len > max_len) {
9162                                         sendalot = 1;
9163                                         len = max_len;
9164                                 }
9165                         }
9166                         /*
9167                          * Prevent the last segment from being fractional
9168                          * unless the send sockbuf can be emptied:
9169                          */
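                        /*
                         * For example, if len covers ten full (t_maxseg - optlen)
                         * segments plus a partial one and this send does not drain
                         * the socket buffer, the partial remainder is trimmed off
                         * and sendalot triggers another pass for it.
                         */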
9170                         max_len = (tp->t_maxseg - optlen);
9171                         if (((sb_offset + len) < sbavail(sb)) &&
9172                             (hw_tls == 0)) {
9173                                 moff = len % (u_int)max_len;
9174                                 if (moff != 0) {
9175                                         len -= moff;
9176                                         sendalot = 1;
9177                                 }
9178                         }
9179                         /*
9180                          * In case there are too many small fragments don't
9181                          * use TSO:
9182                          */
9183                         if (len <= maxseg) {
9184                                 len = max_len;
9185                                 sendalot = 1;
9186                                 tso = 0;
9187                         }
9188                         /*
9189                          * Send the FIN in a separate segment after the bulk
9190                          * sending is done. We don't trust the TSO
9191                          * implementations to clear the FIN flag on all but
9192                          * the last segment.
9193                          */
9194                         if (tp->t_flags & TF_NEEDFIN)
9195                                 sendalot = 1;
9196
9197                 } else {
9198                         if (optlen + ipoptlen >= tp->t_maxseg) {
9199                                 /*
9200                                  * Since we don't have enough space to put
9201                                  * the IP header chain and the TCP header in
9202                                  * one packet as required by RFC 7112, don't
9203                                  * send it. Also ensure that at least one
9204                                  * byte of the payload can be put into the
9205                                  * TCP segment.
9206                                  */
9207                                 SOCKBUF_UNLOCK(&so->so_snd);
9208                                 error = EMSGSIZE;
9209                                 sack_rxmit = 0;
9210                                 goto out;
9211                         }
9212                         len = tp->t_maxseg - optlen - ipoptlen;
9213                         sendalot = 1;
9214                 }
9215         } else
9216                 tso = 0;
9217         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
9218             ("%s: len > IP_MAXPACKET", __func__));
9219 #ifdef DIAGNOSTIC
9220 #ifdef INET6
9221         if (max_linkhdr + hdrlen > MCLBYTES)
9222 #else
9223         if (max_linkhdr + hdrlen > MHLEN)
9224 #endif
9225                 panic("tcphdr too big");
9226 #endif
9227
9228         /*
9229          * This KASSERT is here to catch edge cases at a well defined place.
9230          * Before, those had triggered (random) panic conditions further
9231          * down.
9232          */
9233         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
9234         if ((len == 0) &&
9235             (flags & TH_FIN) &&
9236             (sbused(sb))) {
9237                 /*
9238                  * We have outstanding data; don't send a FIN by itself!
9239                  */
9240                 goto just_return;
9241         }
9242         /*
9243          * Grab a header mbuf, attaching a copy of data to be transmitted,
9244          * and initialize the header from the template for sends on this
9245          * connection.
9246          */
9247         if (len) {
9248                 uint32_t max_val;
9249                 uint32_t moff;
9250
9251                 if (rack->rc_pace_max_segs)
9252                         max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp);
9253                 else
9254                         max_val = len;
9255                 if (rack->r_ctl.rc_pace_max_segs < max_val)
9256                         max_val = rack->r_ctl.rc_pace_max_segs;
9257                 /*
9258                  * We allow hptsi (pacing) to impose a limit on how much we send.
9259                  */
9260                 if (len > max_val) {
9261                         len = max_val;
9262                 }
9263 #ifdef INET6
9264                 if (MHLEN < hdrlen + max_linkhdr)
9265                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
9266                 else
9267 #endif
9268                         m = m_gethdr(M_NOWAIT, MT_DATA);
9269
9270                 if (m == NULL) {
9271                         SOCKBUF_UNLOCK(sb);
9272                         error = ENOBUFS;
9273                         sack_rxmit = 0;
9274                         goto out;
9275                 }
9276                 m->m_data += max_linkhdr;
9277                 m->m_len = hdrlen;
9278
9279                 /*
9280                  * Start the m_copy functions from the closest mbuf to the
9281                  * sb_offset in the socket buffer chain.
9282                  */
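                /*
                 * Payloads that fit in the header mbuf are copied in place
                 * with m_copydata(); larger (or TLS) payloads are chained on
                 * via tcp_m_copym() below.
                 */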
9283                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
9284                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
9285                         m_copydata(mb, moff, (int)len,
9286                             mtod(m, caddr_t)+hdrlen);
9287                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9288                                 sbsndptr_adv(sb, mb, len);
9289                         m->m_len += len;
9290                 } else {
9291                         struct sockbuf *msb;
9292
9293                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9294                                 msb = NULL;
9295                         else
9296                                 msb = sb;
9297                         m->m_next = tcp_m_copym(
9298 #ifdef NETFLIX_COPY_ARGS
9299                                 tp,
9300 #endif
9301                                 mb, moff, &len,
9302                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
9303                             ((rsm == NULL) ? hw_tls : 0)
9304 #ifdef NETFLIX_COPY_ARGS
9305                                 , &filled_all
9306 #endif
9307                                 );
9308                         if (len <= (tp->t_maxseg - optlen)) {
9309                                 /*
9310                                  * We must have run out of mbufs for the copy;
9311                                  * shorten it so TSO is no longer needed. Let's
9312                                  * not set sendalot since we are low on
9313                                  * mbufs.
9314                                  */
9315                                 tso = 0;
9316                         }
9317                         if (m->m_next == NULL) {
9318                                 SOCKBUF_UNLOCK(sb);
9319                                 (void)m_free(m);
9320                                 error = ENOBUFS;
9321                                 sack_rxmit = 0;
9322                                 goto out;
9323                         }
9324                 }
9325                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
9326                         KMOD_TCPSTAT_INC(tcps_sndprobe);
9327 #ifdef STATS
9328                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9329                                 stats_voi_update_abs_u32(tp->t_stats,
9330                                     VOI_TCP_RETXPB, len);
9331                         else
9332                                 stats_voi_update_abs_u64(tp->t_stats,
9333                                     VOI_TCP_TXPB, len);
9334 #endif
9335                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
9336                         if (rsm && (rsm->r_flags & RACK_TLP)) {
9337                                 /*
9338                                  * TLP should not count in retran count, but
9339                                  * in its own bin
9340                                  */
9341                                 counter_u64_add(rack_tlp_retran, 1);
9342                                 counter_u64_add(rack_tlp_retran_bytes, len);
9343                         } else {
9344                                 tp->t_sndrexmitpack++;
9345                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
9346                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
9347                         }
9348 #ifdef STATS
9349                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
9350                             len);
9351 #endif
9352                 } else {
9353                         KMOD_TCPSTAT_INC(tcps_sndpack);
9354                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
9355 #ifdef STATS
9356                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
9357                             len);
9358 #endif
9359                 }
9360                 /*
9361                  * If we're sending everything we've got, set PUSH. (This
9362                  * will keep happy those implementations which only give
9363                  * data to the user when a buffer fills or a PUSH comes in.)
9364                  */
9365                 if (sb_offset + len == sbused(sb) &&
9366                     sbused(sb) &&
9367                     !(flags & TH_SYN))
9368                         flags |= TH_PUSH;
9369
9370                 /*
9371                  * Are we doing pacing? If so, we must calculate the slot. We
9372                  * only do hptsi in ESTABLISHED, with no RESET being
9373                  * sent, and where we have data to send.
9374                  */
9375                 if (((tp->t_state == TCPS_ESTABLISHED) ||
9376                     (tp->t_state == TCPS_CLOSE_WAIT) ||
9377                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
9378                     ((tp->t_flags & TF_SENTFIN) == 0) &&
9379                     ((flags & TH_FIN) == 0))) &&
9380                     ((flags & TH_RST) == 0)) {
9381                         /* Get our pacing rate */
9382                         tot_len_this_send += len;
9383                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send);
9384                 }
9385                 SOCKBUF_UNLOCK(sb);
9386         } else {
9387                 SOCKBUF_UNLOCK(sb);
9388                 if (tp->t_flags & TF_ACKNOW)
9389                         KMOD_TCPSTAT_INC(tcps_sndacks);
9390                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
9391                         KMOD_TCPSTAT_INC(tcps_sndctrl);
9392                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
9393                         KMOD_TCPSTAT_INC(tcps_sndurg);
9394                 else
9395                         KMOD_TCPSTAT_INC(tcps_sndwinup);
9396
9397                 m = m_gethdr(M_NOWAIT, MT_DATA);
9398                 if (m == NULL) {
9399                         error = ENOBUFS;
9400                         sack_rxmit = 0;
9401                         goto out;
9402                 }
9403 #ifdef INET6
9404                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
9405                     MHLEN >= hdrlen) {
9406                         M_ALIGN(m, hdrlen);
9407                 } else
9408 #endif
9409                         m->m_data += max_linkhdr;
9410                 m->m_len = hdrlen;
9411         }
9412         SOCKBUF_UNLOCK_ASSERT(sb);
9413         m->m_pkthdr.rcvif = (struct ifnet *)0;
9414 #ifdef MAC
9415         mac_inpcb_create_mbuf(inp, m);
9416 #endif
9417 #ifdef INET6
9418         if (isipv6) {
9419                 ip6 = mtod(m, struct ip6_hdr *);
9420 #ifdef NETFLIX_TCPOUDP
9421                 if (tp->t_port) {
9422                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
9423                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9424                         udp->uh_dport = tp->t_port;
9425                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
9426                         udp->uh_ulen = htons(ulen);
9427                         th = (struct tcphdr *)(udp + 1);
9428                 } else
9429 #endif
9430                         th = (struct tcphdr *)(ip6 + 1);
9431                 tcpip_fillheaders(inp,
9432 #ifdef NETFLIX_TCPOUDP
9433                                   tp->t_port,
9434 #endif
9435                                   ip6, th);
9436         } else
9437 #endif                          /* INET6 */
9438         {
9439                 ip = mtod(m, struct ip *);
9440 #ifdef TCPDEBUG
9441                 ipov = (struct ipovly *)ip;
9442 #endif
9443 #ifdef NETFLIX_TCPOUDP
9444                 if (tp->t_port) {
9445                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
9446                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9447                         udp->uh_dport = tp->t_port;
9448                         ulen = hdrlen + len - sizeof(struct ip);
9449                         udp->uh_ulen = htons(ulen);
9450                         th = (struct tcphdr *)(udp + 1);
9451                 } else
9452 #endif
9453                         th = (struct tcphdr *)(ip + 1);
9454                 tcpip_fillheaders(inp,
9455 #ifdef NETFLIX_TCPOUDP
9456                                   tp->t_port,
9457 #endif
9458                                   ip, th);
9459         }
9460         /*
9461          * Fill in fields, remembering maximum advertised window for use in
9462          * delaying messages about window sizes. If resending a FIN, be sure
9463          * not to use a new sequence number.
9464          */
9465         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
9466             tp->snd_nxt == tp->snd_max)
9467                 tp->snd_nxt--;
9468         /*
9469          * If we are starting a connection, send ECN setup SYN packet. If we
9470          * are on a retransmit, we may resend those bits a number of times
9471          * as per RFC 3168.
9472          */
9473         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
9474                 if (tp->t_rxtshift >= 1) {
9475                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
9476                                 flags |= TH_ECE | TH_CWR;
9477                 } else
9478                         flags |= TH_ECE | TH_CWR;
9479         }
9480         if (tp->t_state == TCPS_ESTABLISHED &&
9481             (tp->t_flags2 & TF2_ECN_PERMIT)) {
9482                 /*
9483                  * If the peer has ECN, mark data packets with ECN capable
9484                  * transmission (ECT). Ignore pure ack packets,
9485                  * retransmissions and window probes.
9486                  */
9487                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
9488                     (sack_rxmit == 0) &&
9489                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
9490 #ifdef INET6
9491                         if (isipv6)
9492                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
9493                         else
9494 #endif
9495                                 ip->ip_tos |= IPTOS_ECN_ECT0;
9496                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
9497                 }
9498                 /*
9499                  * Reply with proper ECN notifications.
9500                  */
9501                 if (tp->t_flags2 & TF2_ECN_SND_CWR) {
9502                         flags |= TH_CWR;
9503                         tp->t_flags2 &= ~TF2_ECN_SND_CWR;
9504                 }
9505                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
9506                         flags |= TH_ECE;
9507         }
9508         /*
9509          * If we are doing retransmissions, then snd_nxt will not reflect
9510          * the first unsent octet.  For ACK only packets, we do not want the
9511          * sequence number of the retransmitted packet, we want the sequence
9512          * number of the next unsent octet.  So, if there is no data (and no
9513          * SYN or FIN), use snd_max instead of snd_nxt when filling in
9514          * ti_seq.  But if we are in persist state, snd_max might reflect
9515          * one byte beyond the right edge of the window, so use snd_nxt in
9516          * that case, since we know we aren't doing a retransmission.
9517          * (retransmit and persist are mutually exclusive...)
9518          */
9519         if (sack_rxmit == 0) {
9520                 if (len || (flags & (TH_SYN | TH_FIN)) ||
9521                     rack->rc_in_persist) {
9522                         th->th_seq = htonl(tp->snd_nxt);
9523                         rack_seq = tp->snd_nxt;
9524                 } else if (flags & TH_RST) {
9525                         /*
9526                          * For a Reset, send the last cumulative ack in sequence
9527                          * (this, like any other choice, may still generate a
9528                          * challenge ack if an ack-update packet is in
9529                          * flight).
9530                          */
9531                         th->th_seq = htonl(tp->snd_una);
9532                         rack_seq = tp->snd_una;
9533                 } else {
9534                         th->th_seq = htonl(tp->snd_max);
9535                         rack_seq = tp->snd_max;
9536                 }
9537         } else {
9538                 th->th_seq = htonl(rsm->r_start);
9539                 rack_seq = rsm->r_start;
9540         }
9541         th->th_ack = htonl(tp->rcv_nxt);
9542         if (optlen) {
9543                 bcopy(opt, th + 1, optlen);
9544                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
9545         }
9546         th->th_flags = flags;
9547         /*
9548          * Calculate receive window.  Don't shrink window, but avoid silly
9549          * window syndrome.
9550          * If a RST segment is sent, advertise a window of zero.
9551          */
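        /*
         * Concretely: suppress a window smaller than one full segment unless
         * it is at least 1/4 of the receive buffer, never offer less than
         * what was already advertised (rcv_adv - rcv_nxt), and cap the value
         * at TCP_MAXWIN << rcv_scale.
         */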
9552         if (flags & TH_RST) {
9553                 recwin = 0;
9554         } else {
9555                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
9556                     recwin < (long)ctf_fixed_maxseg(tp))
9557                         recwin = 0;
9558                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
9559                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
9560                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
9561                 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
9562                         recwin = (long)TCP_MAXWIN << tp->rcv_scale;
9563         }
9564
9565         /*
9566          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
9567          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
9568          * handled in syncache.
9569          */
9570         if (flags & TH_SYN)
9571                 th->th_win = htons((u_short)
9572                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
9573         else
9574                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
9575         /*
9576          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
9577          * window.  This may cause the remote transmitter to stall.  This
9578          * flag tells soreceive() to disable delayed acknowledgements when
9579          * draining the buffer.  This can occur if the receiver is
9580          * attempting to read more data than can be buffered prior to
9581          * transmitting on the connection.
9582          */
9583         if (th->th_win == 0) {
9584                 tp->t_sndzerowin++;
9585                 tp->t_flags |= TF_RXWIN0SENT;
9586         } else
9587                 tp->t_flags &= ~TF_RXWIN0SENT;
9588         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
9589                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
9590                 th->th_flags |= TH_URG;
9591         } else
9592                 /*
9593                  * If no urgent pointer to send, then we pull the urgent
9594                  * pointer to the left edge of the send window so that it
9595                  * doesn't drift into the send window on sequence number
9596                  * wraparound.
9597                  */
9598                 tp->snd_up = tp->snd_una;       /* drag it along */
9599
9600 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9601         if (to.to_flags & TOF_SIGNATURE) {
9602                 /*
9603                  * Calculate MD5 signature and put it into the place
9604                  * determined before.
9605                  * NOTE: since TCP options buffer doesn't point into
9606                  * mbuf's data, calculate offset and use it.
9607                  */
9608                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
9609                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
9610                         /*
9611                          * Do not send segment if the calculation of MD5
9612                          * digest has failed.
9613                          */
9614                         goto out;
9615                 }
9616         }
9617 #endif
9618
9619         /*
9620          * Put TCP length in extended header, and then checksum extended
9621          * header and data.
9622          */
9623         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
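        /*
         * Only the pseudo-header part of the checksum is computed here; the
         * CSUM_TCP/CSUM_UDP (and IPv6 variant) csum_flags ask the hardware,
         * or a software fallback, to finish the sum over the payload.
         */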
9624 #ifdef INET6
9625         if (isipv6) {
9626                 /*
9627                  * ip6_plen does not need to be filled in now; it will be filled
9628                  * in by ip6_output.
9629                  */
9630                 if (tp->t_port) {
9631                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
9632                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9633                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
9634                         th->th_sum = htons(0);
9635                         UDPSTAT_INC(udps_opackets);
9636                 } else {
9637                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
9638                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9639                         th->th_sum = in6_cksum_pseudo(ip6,
9640                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
9641                             0);
9642                 }
9643         }
9644 #endif
9645 #if defined(INET6) && defined(INET)
9646         else
9647 #endif
9648 #ifdef INET
9649         {
9650                 if (tp->t_port) {
9651                         m->m_pkthdr.csum_flags = CSUM_UDP;
9652                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9653                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
9654                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
9655                         th->th_sum = htons(0);
9656                         UDPSTAT_INC(udps_opackets);
9657                 } else {
9658                         m->m_pkthdr.csum_flags = CSUM_TCP;
9659                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9660                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
9661                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
9662                             IPPROTO_TCP + len + optlen));
9663                 }
9664                 /* IP version must be set here for ipv4/ipv6 checking later */
9665                 KASSERT(ip->ip_v == IPVERSION,
9666                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
9667         }
9668 #endif
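        /*
         * Note: the th_sum/uh_sum value computed above is only the
         * pseudo-header checksum. In the usual offload path, with
         * CSUM_TCP/CSUM_UDP (or their IPv6 variants) set, the NIC, or
         * the software fallback in the IP layer, completes the checksum
         * over the header and payload, using csum_data as the offset of
         * the checksum field.
         */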
9669         /*
9670          * Enable TSO and specify the size of the segments. The TCP pseudo
9671          * header checksum is always provided. XXX: Fixme: This is currently
9672          * not the case for IPv6.
9673          */
9674         if (tso || force_tso) {
9675                 KASSERT(force_tso || len > tp->t_maxseg - optlen,
9676                     ("%s: len <= tso_segsz", __func__));
9677                 m->m_pkthdr.csum_flags |= CSUM_TSO;
9678                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
9679         }
9680         KASSERT(len + hdrlen == m_length(m, NULL),
9681             ("%s: mbuf chain different than expected: %d + %u != %u",
9682             __func__, len, hdrlen, m_length(m, NULL)));
9683
9684 #ifdef TCP_HHOOK
9685         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
9686         hhook_run_tcp_est_out(tp, th, &to, len, tso);
9687 #endif
9688 #ifdef TCPDEBUG
9689         /*
9690          * Trace.
9691          */
9692         if (so->so_options & SO_DEBUG) {
9693                 u_short save = 0;
9694
9695 #ifdef INET6
9696                 if (!isipv6)
9697 #endif
9698                 {
9699                         save = ipov->ih_len;
9700                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
9701                               * (th->th_off << 2) */ );
9702                 }
9703                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
9704 #ifdef INET6
9705                 if (!isipv6)
9706 #endif
9707                         ipov->ih_len = save;
9708         }
9709 #endif                          /* TCPDEBUG */
9710
9711         /* We're getting ready to send; log now. */
9712         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
9713                 union tcp_log_stackspecific log;
9714                 struct timeval tv;
9715
9716                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
9717                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
9718                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
9719                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
9720                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
9721                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
9722                 log.u_bbr.flex4 = orig_len;
9723                 if (filled_all)
9724                         log.u_bbr.flex5 = 0x80000000;
9725                 else
9726                         log.u_bbr.flex5 = 0;
9727                 if (rsm || sack_rxmit) {
9728                         log.u_bbr.flex8 = 1;
9729                 } else {
9730                         log.u_bbr.flex8 = 0;
9731                 }
9732                 log.u_bbr.pkts_out = tp->t_maxseg;
9733                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
9734                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9735                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
9736                     len, &log, false, NULL, NULL, 0, &tv);
9737         } else
9738                 lgb = NULL;
9739
9740         /*
9741          * Fill in IP length and desired time to live and send to IP level.
9742          * There should be a better way to handle ttl and tos; we could keep
9743          * them in the template, but need a way to checksum without them.
9744          */
9745         /*
9746          * m->m_pkthdr.len should have been set before the checksum
9747          * calculation, because in6_cksum() needs it.
9748          */
9749 #ifdef INET6
9750         if (isipv6) {
9751                 /*
9752                  * We set the hop limit separately for every segment, since the
9753                  * user might want to change the value via setsockopt. Also, the
9754                  * desired default hop limit might be changed via Neighbor
9755                  * Discovery.
9756                  */
9757                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
9758
9759                 /*
9760                  * Set the packet size here for the benefit of DTrace
9761                  * probes. ip6_output() will set it properly; it's supposed
9762                  * to include the option header lengths as well.
9763                  */
9764                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
9765
9766                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
9767                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9768                 else
9769                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9770
9771                 if (tp->t_state == TCPS_SYN_SENT)
9772                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
9773
9774                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
9775                 /* TODO: IPv6 IP6TOS_ECT bit on */
9776                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
9777                     &inp->inp_route6,
9778                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
9779                     NULL, NULL, inp);
9780
9781                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
9782                         mtu = inp->inp_route6.ro_rt->rt_mtu;
9783         }
9784 #endif                          /* INET6 */
9785 #if defined(INET) && defined(INET6)
9786         else
9787 #endif
9788 #ifdef INET
9789         {
9790                 ip->ip_len = htons(m->m_pkthdr.len);
9791 #ifdef INET6
9792                 if (inp->inp_vflag & INP_IPV6PROTO)
9793                         ip->ip_ttl = in6_selecthlim(inp, NULL);
9794 #endif                          /* INET6 */
9795                 /*
9796                  * If we do path MTU discovery, then we set DF on every
9797                  * packet. This might not be the best thing to do according
9798                  * to RFC3390 Section 2. However, the TCP hostcache mitigates
9799                  * the problem so that it affects only the first TCP connection
9800                  * with a host.
9801                  *
9802                  * NB: Don't set DF on small MTU/MSS to have a safe
9803                  * fallback.
9804                  */
9805                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
9806                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9807                         if (tp->t_port == 0 || len < V_tcp_minmss) {
9808                                 ip->ip_off |= htons(IP_DF);
9809                         }
9810                 } else {
9811                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9812                 }
9813
9814                 if (tp->t_state == TCPS_SYN_SENT)
9815                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
9816
9817                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
9818
9819                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
9820                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
9821                     inp);
9822                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
9823                         mtu = inp->inp_route.ro_rt->rt_mtu;
9824         }
9825 #endif                          /* INET */
9826
9827 out:
9828         if (lgb) {
9829                 lgb->tlb_errno = error;
9830                 lgb = NULL;
9831         }
9832         /*
9833          * In transmit state, time the transmission and arrange for the
9834          * retransmit.  In persist state, just set snd_max.
9835          */
9836         if (error == 0) {
9837                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9838                     (tp->t_flags & TF_SACK_PERMIT) &&
9839                     tp->rcv_numsacks > 0)
9840                         tcp_clean_dsack_blocks(tp);
9841                 if (len == 0)
9842                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
9843                 else if (len == 1) {
9844                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
9845                 } else if (len > 1) {
9846                         int idx;
9847
9848                         idx = (len / ctf_fixed_maxseg(tp)) + 3;
9849                         if (idx >= TCP_MSS_ACCT_ATIMER)
9850                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
9851                         else
9852                                 counter_u64_add(rack_out_size[idx], 1);
9853                 }
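                /*
                 * For example (assuming ctf_fixed_maxseg() returns a
                 * typical 1460), a 4380 byte send is accounted to bucket
                 * (4380 / 1460) + 3 = 6, provided that index is below
                 * TCP_MSS_ACCT_ATIMER.
                 */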
9854                 if (hw_tls && len > 0) {
9855                         if (filled_all) {
9856                                 counter_u64_add(rack_tls_filled, 1);
9857                                 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1);
9858                         } else {
9859                                 if (rsm) {
9860                                         counter_u64_add(rack_tls_rxt, 1);
9861                                         rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1);
9862                                 } else if (doing_tlp) {
9863                                         counter_u64_add(rack_tls_tlp, 1);
9864                                         rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1);
9865                                 } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) {
9866                                         counter_u64_add(rack_tls_app, 1);
9867                                         rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1);
9868                                 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) {
9869                                         counter_u64_add(rack_tls_cwnd, 1);
9870                                         rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1);
9871                                 } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) {
9872                                         counter_u64_add(rack_tls_rwnd, 1);
9873                                         rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1);
9874                                 } else {
9875                                         rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1);
9876                                         counter_u64_add(rack_tls_other, 1);
9877                                 }
9878                         }
9879                 }
9880         }
9881         if (sub_from_prr && (error == 0)) {
9882                 if (rack->r_ctl.rc_prr_sndcnt >= len)
9883                         rack->r_ctl.rc_prr_sndcnt -= len;
9884                 else
9885                         rack->r_ctl.rc_prr_sndcnt = 0;
9886         }
9887         sub_from_prr = 0;
9888         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
9889             pass, rsm);
9890         if ((error == 0) &&
9891             (len > 0) &&
9892             (tp->snd_una == tp->snd_max))
9893                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
9894         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
9895             (rack->rc_in_persist == 0)) {
9896                 tcp_seq startseq = tp->snd_nxt;
9897
9898                 /*
9899                  * Advance snd_nxt over sequence space of this segment.
9900                  */
9901                 if (error)
9902                         /* We don't log or do anything with errors */
9903                         goto nomore;
9904
9905                 if (flags & (TH_SYN | TH_FIN)) {
9906                         if (flags & TH_SYN)
9907                                 tp->snd_nxt++;
9908                         if (flags & TH_FIN) {
9909                                 tp->snd_nxt++;
9910                                 tp->t_flags |= TF_SENTFIN;
9911                         }
9912                 }
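                /*
                 * (SYN and FIN each occupy one unit of sequence space,
                 * which is why snd_nxt is bumped for them above.)
                 */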
9913                 /* In the ENOBUFS case we do *not* update snd_max */
9914                 if (sack_rxmit)
9915                         goto nomore;
9916
9917                 tp->snd_nxt += len;
9918                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
9919                         if (tp->snd_una == tp->snd_max) {
9920                                 /*
9921                                  * Update the time: we just added data and
9922                                  * none was outstanding.
9923                                  */
9924                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9925                                 tp->t_acktime = ticks;
9926                         }
9927                         tp->snd_max = tp->snd_nxt;
9928                         /*
9929                          * Time this transmission if not a retransmission and
9930                          * not currently timing anything.
9931                          * This is only relevant in case of switching back to
9932                          * the base stack.
9933                          */
9934                         if (tp->t_rtttime == 0) {
9935                                 tp->t_rtttime = ticks;
9936                                 tp->t_rtseq = startseq;
9937                                 KMOD_TCPSTAT_INC(tcps_segstimed);
9938                         }
9939 #ifdef STATS
9940                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
9941                                 tp->t_flags |= TF_GPUTINPROG;
9942                                 tp->gput_seq = startseq;
9943                                 tp->gput_ack = startseq +
9944                                     ulmin(sbavail(sb) - sb_offset, sendwin);
9945                                 tp->gput_ts = tcp_ts_getticks();
9946                         }
9947 #endif
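                        /*
                         * The goodput measurement started above runs from
                         * gput_seq to gput_ack, i.e. over at most the
                         * smaller of the data remaining in the socket
                         * buffer and the send window.
                         */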
9948                 }
9949         } else {
9950                 /*
9951                  * Persist case: update snd_max, but since we are in persist
9952                  * mode (no window) we do not update snd_nxt.
9953                  */
9954                 int32_t xlen = len;
9955
9956                 if (error)
9957                         goto nomore;
9958
9959                 if (flags & TH_SYN)
9960                         ++xlen;
9961                 if (flags & TH_FIN) {
9962                         ++xlen;
9963                         tp->t_flags |= TF_SENTFIN;
9964                 }
9965                 /* In the ENOBUFS case we do *not* update snd_max */
9966                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
9967                         if (tp->snd_una == tp->snd_max) {
9968                                 /*
9969                                  * Update the time: we just added data and
9970                                  * none was outstanding.
9971                                  */
9972                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9973                                 tp->t_acktime = ticks;
9974                         }
9975                         tp->snd_max = tp->snd_nxt + len;
9976                 }
9977         }
9978 nomore:
9979         if (error) {
9980                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
9981                 /*
9982                  * Failures do not advance the seq counter above. For the
9983                  * case of ENOBUFS we will fall out and retry in 1ms with
9984                  * the hpts. Everything else will just have to retransmit
9985                  * with the timer.
9986                  *
9987                  * In any case, we do not want to loop around for another
9988                  * send without a good reason.
9989                  */
9990                 sendalot = 0;
9991                 switch (error) {
9992                 case EPERM:
9993                         tp->t_flags &= ~TF_FORCEDATA;
9994                         tp->t_softerror = error;
9995                         return (error);
9996                 case ENOBUFS:
9997                         if (slot == 0) {
9998                                 /*
9999                                  * Pace us right away so that we retry in a
10000                                  * short time.
10001                                  */
10002                                 slot = 1 + rack->rc_enobuf;
10003                                 if (rack->rc_enobuf < 255)
10004                                         rack->rc_enobuf++;
10005                                 if (slot > (rack->rc_rack_rtt / 2)) {
10006                                         slot = rack->rc_rack_rtt / 2;
10007                                 }
10008                                 if (slot < 10)
10009                                         slot = 10;
10010                         }
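                        /*
                         * Illustrative example: on the third consecutive
                         * ENOBUFS with rc_rack_rtt of 40, slot starts at
                         * 1 + 2 = 3, is not capped by 40 / 2 = 20, and is
                         * then raised to the floor of 10.
                         */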
10011                         counter_u64_add(rack_saw_enobuf, 1);
10012                         error = 0;
10013                         goto enobufs;
10014                 case EMSGSIZE:
10015                         /*
10016                          * For some reason the interface we used initially
10017                          * to send segments changed to another or lowered
10018                          * its MTU. If TSO was active we either got an
10019                          * interface without TSO capabilities or TSO was
10020                          * turned off. If we obtained the mtu from ip_output()
10021                          * then update it and try again.
10022                          */
10023                         if (tso)
10024                                 tp->t_flags &= ~TF_TSO;
10025                         if (mtu != 0) {
10026                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
10027                                 goto again;
10028                         }
10029                         slot = 10;
10030                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10031                         tp->t_flags &= ~TF_FORCEDATA;
10032                         return (error);
10033                 case ENETUNREACH:
10034                         counter_u64_add(rack_saw_enetunreach, 1);
10035                 case EHOSTDOWN:
10036                 case EHOSTUNREACH:
10037                 case ENETDOWN:
10038                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
10039                                 tp->t_softerror = error;
10040                         }
10041                         /* FALLTHROUGH */
10042                 default:
10043                         slot = 10;
10044                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10045                         tp->t_flags &= ~TF_FORCEDATA;
10046                         return (error);
10047                 }
10048         } else {
10049                 rack->rc_enobuf = 0;
10050         }
10051         KMOD_TCPSTAT_INC(tcps_sndtotal);
10052
10053         /*
10054          * Data sent (as far as we can tell). If this advertises a larger
10055          * window than any other segment, then remember the size of the
10056          * advertised window. Any pending ACK has now been sent.
10057          */
10058         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
10059                 tp->rcv_adv = tp->rcv_nxt + recwin;
10060         tp->last_ack_sent = tp->rcv_nxt;
10061         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
10062 enobufs:
10063         rack->r_tlp_running = 0;
10064         if (flags & TH_RST) {
10065                 /*
10066                  * We don't send again after sending a RST.
10067                  */
10068                 slot = 0;
10069                 sendalot = 0;
10070         }
10071         if (rsm && (slot == 0)) {
10072                 /*
10073                  * This is possibly a dup-ack retransmission, so
10074                  * let's ensure we wait at least the minimum rack
10075                  * time; if it is a rack resend then the rack
10076                  * timeout will also be set to this.
10077                  */
10078                 slot = rack->r_ctl.rc_min_to;
10079         }
10080         if (slot) {
10081                 /* set the rack tcb into the slot N */
10082                 counter_u64_add(rack_paced_segments, 1);
10083         } else if (sendalot) {
10084                 if (len)
10085                         counter_u64_add(rack_unpaced_segments, 1);
10086                 sack_rxmit = 0;
10087                 tp->t_flags &= ~TF_FORCEDATA;
10088                 goto again;
10089         } else if (len) {
10090                 counter_u64_add(rack_unpaced_segments, 1);
10091         }
10092         tp->t_flags &= ~TF_FORCEDATA;
10093         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
10094         return (error);
10095 }
10096
10097 /*
10098  * rack_ctloutput() must drop the inpcb lock before performing copyin on
10099  * socket option arguments.  When it re-acquires the lock after the copy, it
10100  * has to revalidate that the connection is still valid for the socket
10101  * option.
10102  */
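/*
 * A hypothetical userspace consumer (assuming the rack stack has already
 * been attached to the socket, e.g. via TCP_FUNCTION_BLK) might set one
 * of these options roughly as follows:
 *
 *	int one = 1;
 *	(void)setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *	    &one, sizeof(one));
 *
 * The copyin of that argument is done by sooptcopyin() with the inpcb
 * lock dropped, which is why the connection is revalidated afterwards.
 */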
10103 static int
10104 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
10105     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
10106 {
10107         struct epoch_tracker et;
10108         int32_t error = 0, optval;
10109
10110         switch (sopt->sopt_name) {
10111         case TCP_RACK_PROP_RATE:
10112         case TCP_RACK_PROP:
10113         case TCP_RACK_TLP_REDUCE:
10114         case TCP_RACK_EARLY_RECOV:
10115         case TCP_RACK_PACE_ALWAYS:
10116         case TCP_DELACK:
10117         case TCP_RACK_PACE_REDUCE:
10118         case TCP_RACK_PACE_MAX_SEG:
10119         case TCP_RACK_PRR_SENDALOT:
10120         case TCP_RACK_MIN_TO:
10121         case TCP_RACK_EARLY_SEG:
10122         case TCP_RACK_REORD_THRESH:
10123         case TCP_RACK_REORD_FADE:
10124         case TCP_RACK_TLP_THRESH:
10125         case TCP_RACK_PKT_DELAY:
10126         case TCP_RACK_TLP_USE:
10127         case TCP_RACK_TLP_INC_VAR:
10128         case TCP_RACK_IDLE_REDUCE_HIGH:
10129         case TCP_RACK_MIN_PACE:
10130         case TCP_RACK_GP_INCREASE:
10131         case TCP_BBR_RACK_RTT_USE:
10132         case TCP_BBR_USE_RACK_CHEAT:
10133         case TCP_RACK_DO_DETECTION:
10134         case TCP_DATA_AFTER_CLOSE:
10135                 break;
10136         default:
10137                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10138                 break;
10139         }
10140         INP_WUNLOCK(inp);
10141         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
10142         if (error)
10143                 return (error);
10144         INP_WLOCK(inp);
10145         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
10146                 INP_WUNLOCK(inp);
10147                 return (ECONNRESET);
10148         }
10149         tp = intotcpcb(inp);
10150         rack = (struct tcp_rack *)tp->t_fb_ptr;
10151         switch (sopt->sopt_name) {
10152         case TCP_RACK_DO_DETECTION:
10153                 RACK_OPTS_INC(tcp_rack_do_detection);
10154                 if (optval == 0)
10155                         rack->do_detection = 0;
10156                 else
10157                         rack->do_detection = 1;
10158                 break;
10159         case TCP_RACK_PROP_RATE:
10160                 if ((optval <= 0) || (optval >= 100)) {
10161                         error = EINVAL;
10162                         break;
10163                 }
10164                 RACK_OPTS_INC(tcp_rack_prop_rate);
10165                 rack->r_ctl.rc_prop_rate = optval;
10166                 break;
10167         case TCP_RACK_TLP_USE:
10168                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
10169                         error = EINVAL;
10170                         break;
10171                 }
10172                 RACK_OPTS_INC(tcp_tlp_use);
10173                 rack->rack_tlp_threshold_use = optval;
10174                 break;
10175         case TCP_RACK_PROP:
10176                 /* RACK proportional rate reduction (bool) */
10177                 RACK_OPTS_INC(tcp_rack_prop);
10178                 rack->r_ctl.rc_prop_reduce = optval;
10179                 break;
10180         case TCP_RACK_TLP_REDUCE:
10181                 /* RACK TLP cwnd reduction (bool) */
10182                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
10183                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
10184                 break;
10185         case TCP_RACK_EARLY_RECOV:
10186                 /* Should recovery happen early (bool) */
10187                 RACK_OPTS_INC(tcp_rack_early_recov);
10188                 rack->r_ctl.rc_early_recovery = optval;
10189                 break;
10190         case TCP_RACK_PACE_ALWAYS:
10191                 /* Use the always pace method (bool)  */
10192                 RACK_OPTS_INC(tcp_rack_pace_always);
10193                 if (optval > 0)
10194                         rack->rc_always_pace = 1;
10195                 else
10196                         rack->rc_always_pace = 0;
10197                 break;
10198         case TCP_RACK_PACE_REDUCE:
10199                 /* RACK Hptsi reduction factor (divisor) */
10200                 RACK_OPTS_INC(tcp_rack_pace_reduce);
10201                 if (optval)
10202                         /* Must be non-zero */
10203                         rack->rc_pace_reduce = optval;
10204                 else
10205                         error = EINVAL;
10206                 break;
10207         case TCP_RACK_PACE_MAX_SEG:
10208                 /* Max segments in a pace */
10209                 RACK_OPTS_INC(tcp_rack_max_seg);
10210                 rack->rc_pace_max_segs = optval;
10211                 rack_set_pace_segments(tp, rack);
10212                 break;
10213         case TCP_RACK_PRR_SENDALOT:
10214                 /* Allow PRR to send more than one seg */
10215                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
10216                 rack->r_ctl.rc_prr_sendalot = optval;
10217                 break;
10218         case TCP_RACK_MIN_TO:
10219                 /* Minimum time between rack t-o's in ms */
10220                 RACK_OPTS_INC(tcp_rack_min_to);
10221                 rack->r_ctl.rc_min_to = optval;
10222                 break;
10223         case TCP_RACK_EARLY_SEG:
10224                 /* If early recovery max segments */
10225                 RACK_OPTS_INC(tcp_rack_early_seg);
10226                 rack->r_ctl.rc_early_recovery_segs = optval;
10227                 break;
10228         case TCP_RACK_REORD_THRESH:
10229                 /* RACK reorder threshold (shift amount) */
10230                 RACK_OPTS_INC(tcp_rack_reord_thresh);
10231                 if ((optval > 0) && (optval < 31))
10232                         rack->r_ctl.rc_reorder_shift = optval;
10233                 else
10234                         error = EINVAL;
10235                 break;
10236         case TCP_RACK_REORD_FADE:
10237                 /* Does reordering fade after ms time */
10238                 RACK_OPTS_INC(tcp_rack_reord_fade);
10239                 rack->r_ctl.rc_reorder_fade = optval;
10240                 break;
10241         case TCP_RACK_TLP_THRESH:
10242                 /* RACK TLP threshold, i.e. srtt+(srtt/N) */
10243                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
10244                 if (optval)
10245                         rack->r_ctl.rc_tlp_threshold = optval;
10246                 else
10247                         error = EINVAL;
10248                 break;
10249         case TCP_BBR_USE_RACK_CHEAT:
10250                 RACK_OPTS_INC(tcp_rack_cheat);
10251                 if (optval)
10252                         rack->use_rack_cheat = 1;
10253                 else
10254                         rack->use_rack_cheat = 0;
10255                 break;
10256         case TCP_RACK_PKT_DELAY:
10257                 /* RACK added ms i.e. rack-rtt + reord + N */
10258                 RACK_OPTS_INC(tcp_rack_pkt_delay);
10259                 rack->r_ctl.rc_pkt_delay = optval;
10260                 break;
10261         case TCP_RACK_TLP_INC_VAR:
10262                 /* Does TLP include rtt variance in t-o */
10263                 error = EINVAL;
10264                 break;
10265         case TCP_RACK_IDLE_REDUCE_HIGH:
10266                 error = EINVAL;
10267                 break;
10268         case TCP_DELACK:
10269                 if (optval == 0)
10270                         tp->t_delayed_ack = 0;
10271                 else
10272                         tp->t_delayed_ack = 1;
10273                 if (tp->t_flags & TF_DELACK) {
10274                         tp->t_flags &= ~TF_DELACK;
10275                         tp->t_flags |= TF_ACKNOW;
10276                         NET_EPOCH_ENTER(et);
10277                         rack_output(tp);
10278                         NET_EPOCH_EXIT(et);
10279                 }
10280                 break;
10281         case TCP_RACK_MIN_PACE:
10282                 RACK_OPTS_INC(tcp_rack_min_pace);
10283                 if (optval > 3)
10284                         rack->r_enforce_min_pace = 3;
10285                 else
10286                         rack->r_enforce_min_pace = optval;
10287                 break;
10288         case TCP_RACK_GP_INCREASE:
10289                 if ((optval >= 0) &&
10290                     (optval <= 256))
10291                         rack->rack_per_of_gp = optval;
10292                 else
10293                         error = EINVAL;
10294
10295                 break;
10296         case TCP_BBR_RACK_RTT_USE:
10297                 if ((optval != USE_RTT_HIGH) &&
10298                     (optval != USE_RTT_LOW) &&
10299                     (optval != USE_RTT_AVG))
10300                         error = EINVAL;
10301                 else
10302                         rack->r_ctl.rc_rate_sample_method = optval;
10303                 break;
10304         case TCP_DATA_AFTER_CLOSE:
10305                 if (optval)
10306                         rack->rc_allow_data_af_clo = 1;
10307                 else
10308                         rack->rc_allow_data_af_clo = 0;
10309                 break;
10310         default:
10311                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10312                 break;
10313         }
10314 #ifdef NETFLIX_STATS
10315         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
10316 #endif
10317         INP_WUNLOCK(inp);
10318         return (error);
10319 }
10320
10321 static int
10322 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
10323     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
10324 {
10325         int32_t error, optval;
10326
10327         /*
10328          * Because all our options are either boolean or an int, we can just
10329          * pull everything into optval and then unlock and copy. If we ever
10330          * add an option that is not an int, then this will have quite an
10331          * impact on this routine.
10332          */
10333         error = 0;
10334         switch (sopt->sopt_name) {
10335         case TCP_RACK_DO_DETECTION:
10336                 optval = rack->do_detection;
10337                 break;
10338
10339         case TCP_RACK_PROP_RATE:
10340                 optval = rack->r_ctl.rc_prop_rate;
10341                 break;
10342         case TCP_RACK_PROP:
10343                 /* RACK proportional rate reduction (bool) */
10344                 optval = rack->r_ctl.rc_prop_reduce;
10345                 break;
10346         case TCP_RACK_TLP_REDUCE:
10347                 /* RACK TLP cwnd reduction (bool) */
10348                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
10349                 break;
10350         case TCP_RACK_EARLY_RECOV:
10351                 /* Should recovery happen early (bool) */
10352                 optval = rack->r_ctl.rc_early_recovery;
10353                 break;
10354         case TCP_RACK_PACE_REDUCE:
10355                 /* RACK Hptsi reduction factor (divisor) */
10356                 optval = rack->rc_pace_reduce;
10357                 break;
10358         case TCP_RACK_PACE_MAX_SEG:
10359                 /* Max segments in a pace */
10360                 optval = rack->rc_pace_max_segs;
10361                 break;
10362         case TCP_RACK_PACE_ALWAYS:
10363                 /* Use the always pace method */
10364                 optval = rack->rc_always_pace;
10365                 break;
10366         case TCP_RACK_PRR_SENDALOT:
10367                 /* Allow PRR to send more than one seg */
10368                 optval = rack->r_ctl.rc_prr_sendalot;
10369                 break;
10370         case TCP_RACK_MIN_TO:
10371                 /* Minimum time between rack t-o's in ms */
10372                 optval = rack->r_ctl.rc_min_to;
10373                 break;
10374         case TCP_RACK_EARLY_SEG:
10375                 /* If early recovery max segments */
10376                 optval = rack->r_ctl.rc_early_recovery_segs;
10377                 break;
10378         case TCP_RACK_REORD_THRESH:
10379                 /* RACK reorder threshold (shift amount) */
10380                 optval = rack->r_ctl.rc_reorder_shift;
10381                 break;
10382         case TCP_RACK_REORD_FADE:
10383                 /* Does reordering fade after ms time */
10384                 optval = rack->r_ctl.rc_reorder_fade;
10385                 break;
10386         case TCP_BBR_USE_RACK_CHEAT:
10387                 /* Do we use the rack cheat for rxt */
10388                 optval = rack->use_rack_cheat;
10389                 break;
10390         case TCP_RACK_TLP_THRESH:
10391                 /* RACK TLP threshold, i.e. srtt+(srtt/N) */
10392                 optval = rack->r_ctl.rc_tlp_threshold;
10393                 break;
10394         case TCP_RACK_PKT_DELAY:
10395                 /* RACK added ms i.e. rack-rtt + reord + N */
10396                 optval = rack->r_ctl.rc_pkt_delay;
10397                 break;
10398         case TCP_RACK_TLP_USE:
10399                 optval = rack->rack_tlp_threshold_use;
10400                 break;
10401         case TCP_RACK_TLP_INC_VAR:
10402                 /* Does TLP include rtt variance in t-o */
10403                 error = EINVAL;
10404                 break;
10405         case TCP_RACK_IDLE_REDUCE_HIGH:
10406                 error = EINVAL;
10407                 break;
10408         case TCP_RACK_MIN_PACE:
10409                 optval = rack->r_enforce_min_pace;
10410                 break;
10411         case TCP_RACK_GP_INCREASE:
10412                 optval = rack->rack_per_of_gp;
10413                 break;
10414         case TCP_BBR_RACK_RTT_USE:
10415                 optval = rack->r_ctl.rc_rate_sample_method;
10416                 break;
10417         case TCP_DELACK:
10418                 optval = tp->t_delayed_ack;
10419                 break;
10420         case TCP_DATA_AFTER_CLOSE:
10421                 optval = rack->rc_allow_data_af_clo;
10422                 break;
10423         default:
10424                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10425                 break;
10426         }
10427         INP_WUNLOCK(inp);
10428         if (error == 0) {
10429                 error = sooptcopyout(sopt, &optval, sizeof optval);
10430         }
10431         return (error);
10432 }
10433
10434 static int
10435 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
10436 {
10437         int32_t error = EINVAL;
10438         struct tcp_rack *rack;
10439
10440         rack = (struct tcp_rack *)tp->t_fb_ptr;
10441         if (rack == NULL) {
10442                 /* Huh? */
10443                 goto out;
10444         }
10445         if (sopt->sopt_dir == SOPT_SET) {
10446                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
10447         } else if (sopt->sopt_dir == SOPT_GET) {
10448                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
10449         }
10450 out:
10451         INP_WUNLOCK(inp);
10452         return (error);
10453 }
10454
10455
10456 static struct tcp_function_block __tcp_rack = {
10457         .tfb_tcp_block_name = __XSTRING(STACKNAME),
10458         .tfb_tcp_output = rack_output,
10459         .tfb_do_queued_segments = ctf_do_queued_segments,
10460         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
10461         .tfb_tcp_do_segment = rack_do_segment,
10462         .tfb_tcp_ctloutput = rack_ctloutput,
10463         .tfb_tcp_fb_init = rack_init,
10464         .tfb_tcp_fb_fini = rack_fini,
10465         .tfb_tcp_timer_stop_all = rack_stopall,
10466         .tfb_tcp_timer_activate = rack_timer_activate,
10467         .tfb_tcp_timer_active = rack_timer_active,
10468         .tfb_tcp_timer_stop = rack_timer_stop,
10469         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
10470         .tfb_tcp_handoff_ok = rack_handoff_ok
10471 };
10472
10473 static const char *rack_stack_names[] = {
10474         __XSTRING(STACKNAME),
10475 #ifdef STACKALIAS
10476         __XSTRING(STACKALIAS),
10477 #endif
10478 };
10479
10480 static int
10481 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
10482 {
10483         memset(mem, 0, size);
10484         return (0);
10485 }
10486
10487 static void
10488 rack_dtor(void *mem, int32_t size, void *arg)
10489 {
10490
10491 }
10492
10493 static bool rack_mod_inited = false;
10494
10495 static int
10496 tcp_addrack(module_t mod, int32_t type, void *data)
10497 {
10498         int32_t err = 0;
10499         int num_stacks;
10500
10501         switch (type) {
10502         case MOD_LOAD:
10503                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
10504                     sizeof(struct rack_sendmap),
10505                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
10506
10507                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
10508                     sizeof(struct tcp_rack),
10509                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
10510
10511                 sysctl_ctx_init(&rack_sysctl_ctx);
10512                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
10513                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
10514                     OID_AUTO,
10515 #ifdef STACKALIAS
10516                     __XSTRING(STACKALIAS),
10517 #else
10518                     __XSTRING(STACKNAME),
10519 #endif
10520                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
10521                     "");
10522                 if (rack_sysctl_root == NULL) {
10523                         printf("Failed to add sysctl node\n");
10524                         err = EFAULT;
10525                         goto free_uma;
10526                 }
10527                 rack_init_sysctls();
10528                 num_stacks = nitems(rack_stack_names);
10529                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
10530                     rack_stack_names, &num_stacks);
10531                 if (err) {
10532                         printf("Failed to register %s stack name for "
10533                             "%s module\n", rack_stack_names[num_stacks],
10534                             __XSTRING(MODNAME));
10535                         sysctl_ctx_free(&rack_sysctl_ctx);
10536 free_uma:
10537                         uma_zdestroy(rack_zone);
10538                         uma_zdestroy(rack_pcb_zone);
10539                         rack_counter_destroy();
10540                         printf("Failed to register rack module -- err:%d\n", err);
10541                         return (err);
10542                 }
10543                 tcp_lro_reg_mbufq();
10544                 rack_mod_inited = true;
10545                 break;
10546         case MOD_QUIESCE:
10547                 err = deregister_tcp_functions(&__tcp_rack, true, false);
10548                 break;
10549         case MOD_UNLOAD:
10550                 err = deregister_tcp_functions(&__tcp_rack, false, true);
10551                 if (err == EBUSY)
10552                         break;
10553                 if (rack_mod_inited) {
10554                         uma_zdestroy(rack_zone);
10555                         uma_zdestroy(rack_pcb_zone);
10556                         sysctl_ctx_free(&rack_sysctl_ctx);
10557                         rack_counter_destroy();
10558                         rack_mod_inited = false;
10559                 }
10560                 tcp_lro_dereg_mbufq();
10561                 err = 0;
10562                 break;
10563         default:
10564                 return (EOPNOTSUPP);
10565         }
10566         return (err);
10567 }
10568
10569 static moduledata_t tcp_rack = {
10570         .name = __XSTRING(MODNAME),
10571         .evhand = tcp_addrack,
10572         .priv = 0
10573 };
10574
10575 MODULE_VERSION(MODNAME, 1);
10576 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
10577 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
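
/*
 * Usage sketch (assuming a stock build, where the kld is named tcp_rack
 * and STACKNAME is "rack"):
 *
 *	kldload tcp_rack
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * or select the stack per socket with the TCP_FUNCTION_BLK socket option.
 */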