1 /*-
2  * Copyright (c) 2016-9 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 #include "opt_ratelimit.h"
35 #include "opt_kern_tls.h"
36 #include <sys/param.h>
37 #include <sys/arb.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
40 #ifdef TCP_HHOOK
41 #include <sys/hhook.h>
42 #endif
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/mbuf.h>
48 #include <sys/proc.h>           /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #ifdef KERN_TLS
52 #include <sys/ktls.h>
53 #endif
54 #include <sys/sysctl.h>
55 #include <sys/systm.h>
56 #ifdef STATS
57 #include <sys/qmath.h>
58 #include <sys/tree.h>
59 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
60 #endif
61 #include <sys/refcount.h>
62 #include <sys/tree.h>
63 #include <sys/queue.h>
64 #include <sys/smp.h>
65 #include <sys/kthread.h>
66 #include <sys/kern_prefetch.h>
67
68 #include <vm/uma.h>
69
70 #include <net/route.h>
71 #include <net/vnet.h>
72
73 #define TCPSTATES               /* for logging */
74
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
80 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
81 #include <netinet/ip_var.h>
82 #include <netinet/ip6.h>
83 #include <netinet6/in6_pcb.h>
84 #include <netinet6/ip6_var.h>
85 #include <netinet/tcp.h>
86 #define TCPOUTFLAGS
87 #include <netinet/tcp_fsm.h>
88 #include <netinet/tcp_log_buf.h>
89 #include <netinet/tcp_seq.h>
90 #include <netinet/tcp_timer.h>
91 #include <netinet/tcp_var.h>
92 #include <netinet/tcp_hpts.h>
93 #include <netinet/tcpip.h>
94 #include <netinet/cc/cc.h>
95 #include <netinet/tcp_fastopen.h>
96 #include <netinet/tcp_lro.h>
97 #ifdef TCPDEBUG
98 #include <netinet/tcp_debug.h>
99 #endif                          /* TCPDEBUG */
100 #ifdef TCP_OFFLOAD
101 #include <netinet/tcp_offload.h>
102 #endif
103 #ifdef INET6
104 #include <netinet6/tcp6_var.h>
105 #endif
106
107 #include <netipsec/ipsec_support.h>
108
109 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
110 #include <netipsec/ipsec.h>
111 #include <netipsec/ipsec6.h>
112 #endif                          /* IPSEC */
113
114 #include <netinet/udp.h>
115 #include <netinet/udp_var.h>
116 #include <machine/in_cksum.h>
117
118 #ifdef MAC
119 #include <security/mac/mac_framework.h>
120 #endif
121 #include "sack_filter.h"
122 #include "tcp_rack.h"
123 #include "rack_bbr_common.h"
124
125 uma_zone_t rack_zone;
126 uma_zone_t rack_pcb_zone;
127
128 #ifndef TICKS2SBT
129 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
130 #endif
131
132 struct sysctl_ctx_list rack_sysctl_ctx;
133 struct sysctl_oid *rack_sysctl_root;
134
135 #define CUM_ACKED 1
136 #define SACKED 2
137
138 /*
139  * The RACK module incorporates a number of
140  * TCP ideas that have been put out into the IETF
141  * over the last few years:
142  * - Matt Mathis's Rate Halving which slowly drops
143  *    the congestion window so that the ack clock can
144  *    be maintained during a recovery.
145  * - Yuchung Cheng's RACK TCP (for which it is named) that
146  *    will stop us using the number of dup acks and instead
147  *    use time as the gauge of when we retransmit.
148  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
149  *    of Dukkipati et al.
150  * RACK depends on SACK, so if an endpoint arrives that
151  * cannot do SACK the state machine below will shuttle the
152  * connection back to using the "default" TCP stack that is
153  * in FreeBSD.
154  *
155  * To implement RACK the original TCP stack was first decomposed
156  * into a functional state machine with individual states
157  * for each of the possible TCP connection states. The do_segment
158  * function's role in life is to mandate that the connection supports SACK
159  * initially and then ensure that the RACK state matches the connection
160  * state before calling the state's do_segment function. Each
161  * state is simplified due to the fact that the original do_segment
162  * has been decomposed and we *know* what state we are in (no
163  * switches on the state) and all tests for SACK are gone. This
164  * greatly simplifies what each state does.
165  *
166  * TCP output is also overwritten with a new version since it
167  * must maintain the new rack scoreboard.
168  *
169  */
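/*
 * A minimal sketch of the handoff described above (assuming the
 * rack->r_substate function pointer used by this stack): once SACK
 * support is confirmed, rack_set_state(), declared below, points the
 * connection at the per-state handler matching tp->t_state, roughly:
 *
 *     switch (tp->t_state) {
 *     case TCPS_ESTABLISHED:
 *             rack->r_substate = rack_do_established;
 *             break;
 *     case TCPS_FIN_WAIT_1:
 *             rack->r_substate = rack_do_fin_wait_1;
 *             break;
 *     ... one case per TCP state ...
 *     }
 */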
170 static int32_t rack_tlp_thresh = 1;
171 static int32_t rack_reorder_thresh = 2;
172 static int32_t rack_reorder_fade = 60000;       /* 0 - never fade, def 60,000
173                                                  * - 60 seconds */
174 /* Attack threshold detections */
175 static uint32_t rack_highest_sack_thresh_seen = 0;
176 static uint32_t rack_highest_move_thresh_seen = 0;
177
178 static int32_t rack_pkt_delay = 1;
179 static int32_t rack_min_pace_time = 0;
180 static int32_t rack_early_recovery = 1;
181 static int32_t rack_send_a_lot_in_prr = 1;
182 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
183 static int32_t rack_verbose_logging = 0;
184 static int32_t rack_ignore_data_after_close = 1;
185 static int32_t use_rack_cheat = 1;
186 static int32_t rack_persist_min = 250;  /* 250ms */
187 static int32_t rack_persist_max = 1000; /* 1 Second */
188 static int32_t rack_sack_not_required = 0;      /* set to one to allow non-sack to use rack */
189 static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */
190
191 /*
192  * Currently regular tcp has a rto_min of 30ms;
193  * the backoff goes 12 times so that ends up
194  * being a total of 122.850 seconds before a
195  * connection is killed.
196  */
197 static int32_t rack_tlp_min = 10;
198 static int32_t rack_rto_min = 30;       /* 30ms same as main freebsd */
199 static int32_t rack_rto_max = 4000;     /* 4 seconds */
200 static const int32_t rack_free_cache = 2;
201 static int32_t rack_hptsi_segments = 40;
202 static int32_t rack_rate_sample_method = USE_RTT_LOW;
203 static int32_t rack_pace_every_seg = 0;
204 static int32_t rack_delayed_ack_time = 200;     /* 200ms */
205 static int32_t rack_slot_reduction = 4;
206 static int32_t rack_lower_cwnd_at_tlp = 0;
207 static int32_t rack_use_proportional_reduce = 0;
208 static int32_t rack_proportional_rate = 10;
209 static int32_t rack_tlp_max_resend = 2;
210 static int32_t rack_limited_retran = 0;
211 static int32_t rack_always_send_oldest = 0;
212 static int32_t rack_use_sack_filter = 1;
213 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
214 static int32_t rack_per_of_gp = 50;
215
216 /* Rack specific counters */
217 counter_u64_t rack_badfr;
218 counter_u64_t rack_badfr_bytes;
219 counter_u64_t rack_rtm_prr_retran;
220 counter_u64_t rack_rtm_prr_newdata;
221 counter_u64_t rack_timestamp_mismatch;
222 counter_u64_t rack_reorder_seen;
223 counter_u64_t rack_paced_segments;
224 counter_u64_t rack_unpaced_segments;
225 counter_u64_t rack_calc_zero;
226 counter_u64_t rack_calc_nonzero;
227 counter_u64_t rack_saw_enobuf;
228 counter_u64_t rack_saw_enetunreach;
229 counter_u64_t rack_per_timer_hole;
230
231 /* Tail loss probe counters */
232 counter_u64_t rack_tlp_tot;
233 counter_u64_t rack_tlp_newdata;
234 counter_u64_t rack_tlp_retran;
235 counter_u64_t rack_tlp_retran_bytes;
236 counter_u64_t rack_tlp_retran_fail;
237 counter_u64_t rack_to_tot;
238 counter_u64_t rack_to_arm_rack;
239 counter_u64_t rack_to_arm_tlp;
240 counter_u64_t rack_to_alloc;
241 counter_u64_t rack_to_alloc_hard;
242 counter_u64_t rack_to_alloc_emerg;
243 counter_u64_t rack_to_alloc_limited;
244 counter_u64_t rack_alloc_limited_conns;
245 counter_u64_t rack_split_limited;
246
247 counter_u64_t rack_sack_proc_all;
248 counter_u64_t rack_sack_proc_short;
249 counter_u64_t rack_sack_proc_restart;
250 counter_u64_t rack_sack_attacks_detected;
251 counter_u64_t rack_sack_attacks_reversed;
252 counter_u64_t rack_sack_used_next_merge;
253 counter_u64_t rack_sack_splits;
254 counter_u64_t rack_sack_used_prev_merge;
255 counter_u64_t rack_sack_skipped_acked;
256 counter_u64_t rack_ack_total;
257 counter_u64_t rack_express_sack;
258 counter_u64_t rack_sack_total;
259 counter_u64_t rack_move_none;
260 counter_u64_t rack_move_some;
261
262 counter_u64_t rack_used_tlpmethod;
263 counter_u64_t rack_used_tlpmethod2;
264 counter_u64_t rack_enter_tlp_calc;
265 counter_u64_t rack_input_idle_reduces;
266 counter_u64_t rack_collapsed_win;
267 counter_u64_t rack_tlp_does_nada;
268
269 /* Counters for HW TLS */
270 counter_u64_t rack_tls_rwnd;
271 counter_u64_t rack_tls_cwnd;
272 counter_u64_t rack_tls_app;
273 counter_u64_t rack_tls_other;
274 counter_u64_t rack_tls_filled;
275 counter_u64_t rack_tls_rxt;
276 counter_u64_t rack_tls_tlp;
277
278 /* Temp CPU counters */
279 counter_u64_t rack_find_high;
280
281 counter_u64_t rack_progress_drops;
282 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
283 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
284
285 static void
286 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
287
288 static int
289 rack_process_ack(struct mbuf *m, struct tcphdr *th,
290     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
291     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
292 static int
293 rack_process_data(struct mbuf *m, struct tcphdr *th,
294     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
295     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
296 static void
297 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
298     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
299 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
300 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
301     uint8_t limit_type);
302 static struct rack_sendmap *
303 rack_check_recovery_mode(struct tcpcb *tp,
304     uint32_t tsused);
305 static void
306 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
307     uint32_t type);
308 static void rack_counter_destroy(void);
309 static int
310 rack_ctloutput(struct socket *so, struct sockopt *sopt,
311     struct inpcb *inp, struct tcpcb *tp);
312 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
313 static void
314 rack_do_segment(struct mbuf *m, struct tcphdr *th,
315     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
316     uint8_t iptos);
317 static void rack_dtor(void *mem, int32_t size, void *arg);
318 static void
319 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
320     uint32_t t, uint32_t cts);
321 static struct rack_sendmap *
322 rack_find_high_nonack(struct tcp_rack *rack,
323     struct rack_sendmap *rsm);
324 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
325 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
326 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
327 static int
328 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
329     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
330 static int32_t rack_handoff_ok(struct tcpcb *tp);
331 static int32_t rack_init(struct tcpcb *tp);
332 static void rack_init_sysctls(void);
333 static void
334 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
335     struct tcphdr *th);
336 static void
337 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
338     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
339     uint8_t pass, struct rack_sendmap *hintrsm);
340 static void
341 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
342     struct rack_sendmap *rsm);
343 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
344 static int32_t rack_output(struct tcpcb *tp);
345
346 static uint32_t
347 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
348     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
349     uint32_t cts, int *moved_two);
350 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
351 static void rack_remxt_tmr(struct tcpcb *tp);
352 static int
353 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
354     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
355 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
356 static int32_t rack_stopall(struct tcpcb *tp);
357 static void
358 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
359     uint32_t delta);
360 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
361 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
362 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
363 static uint32_t
364 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
365     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
366 static void
367 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
368     struct rack_sendmap *rsm, uint32_t ts);
369 static int
370 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
371     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
372 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
373 static int
374 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
375     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
376     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
377 static int
378 rack_do_closing(struct mbuf *m, struct tcphdr *th,
379     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
380     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
381 static int
382 rack_do_established(struct mbuf *m, struct tcphdr *th,
383     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
384     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
385 static int
386 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
387     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
388     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
389 static int
390 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
391     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
392     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
393 static int
394 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
395     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
396     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
397 static int
398 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
399     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
400     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
401 static int
402 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
403     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
404     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
405 static int
406 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
407     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
408     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
409 struct rack_sendmap *
410 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
411     uint32_t tsused);
412 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
413 static void
414      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
415
416 int32_t rack_clear_counter=0;
417
418
419 static int
420 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
421 {
422         uint32_t stat;
423         int32_t error;
424
425         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
426         if (error || req->newptr == NULL)
427                 return error;
428
429         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
430         if (error)
431                 return (error);
432         if (stat == 1) {
433 #ifdef INVARIANTS
434                 printf("Clearing RACK counters\n");
435 #endif
436                 counter_u64_zero(rack_badfr);
437                 counter_u64_zero(rack_badfr_bytes);
438                 counter_u64_zero(rack_rtm_prr_retran);
439                 counter_u64_zero(rack_rtm_prr_newdata);
440                 counter_u64_zero(rack_timestamp_mismatch);
441                 counter_u64_zero(rack_reorder_seen);
442                 counter_u64_zero(rack_tlp_tot);
443                 counter_u64_zero(rack_tlp_newdata);
444                 counter_u64_zero(rack_tlp_retran);
445                 counter_u64_zero(rack_tlp_retran_bytes);
446                 counter_u64_zero(rack_tlp_retran_fail);
447                 counter_u64_zero(rack_to_tot);
448                 counter_u64_zero(rack_to_arm_rack);
449                 counter_u64_zero(rack_to_arm_tlp);
450                 counter_u64_zero(rack_paced_segments);
451                 counter_u64_zero(rack_calc_zero);
452                 counter_u64_zero(rack_calc_nonzero);
453                 counter_u64_zero(rack_unpaced_segments);
454                 counter_u64_zero(rack_saw_enobuf);
455                 counter_u64_zero(rack_saw_enetunreach);
456                 counter_u64_zero(rack_per_timer_hole);
457                 counter_u64_zero(rack_to_alloc_hard);
458                 counter_u64_zero(rack_to_alloc_emerg);
459                 counter_u64_zero(rack_sack_proc_all);
460                 counter_u64_zero(rack_sack_proc_short);
461                 counter_u64_zero(rack_sack_proc_restart);
462                 counter_u64_zero(rack_to_alloc);
463                 counter_u64_zero(rack_to_alloc_limited);
464                 counter_u64_zero(rack_alloc_limited_conns);
465                 counter_u64_zero(rack_split_limited);
466                 counter_u64_zero(rack_find_high);
467                 counter_u64_zero(rack_tls_rwnd);
468                 counter_u64_zero(rack_tls_cwnd);
469                 counter_u64_zero(rack_tls_app);
470                 counter_u64_zero(rack_tls_other);
471                 counter_u64_zero(rack_tls_filled);
472                 counter_u64_zero(rack_tls_rxt);
473                 counter_u64_zero(rack_tls_tlp);
474                 counter_u64_zero(rack_sack_attacks_detected);
475                 counter_u64_zero(rack_sack_attacks_reversed);
476                 counter_u64_zero(rack_sack_used_next_merge);
477                 counter_u64_zero(rack_sack_used_prev_merge);
478                 counter_u64_zero(rack_sack_splits);
479                 counter_u64_zero(rack_sack_skipped_acked);
480                 counter_u64_zero(rack_ack_total);
481                 counter_u64_zero(rack_express_sack);
482                 counter_u64_zero(rack_sack_total);
483                 counter_u64_zero(rack_move_none);
484                 counter_u64_zero(rack_move_some);
485                 counter_u64_zero(rack_used_tlpmethod);
486                 counter_u64_zero(rack_used_tlpmethod2);
487                 counter_u64_zero(rack_enter_tlp_calc);
488                 counter_u64_zero(rack_progress_drops);
489                 counter_u64_zero(rack_tlp_does_nada);
490                 counter_u64_zero(rack_collapsed_win);
491
492         }
493         rack_clear_counter = 0;
494         return (0);
495 }
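/*
 * A usage sketch for the handler above (assuming the stack registers
 * its sysctl root under net.inet.tcp as "rack" at module load): writing
 * 1 to the node zeroes every counter listed above, e.g. from userland:
 *
 *     sysctl net.inet.tcp.rack.clear=1
 */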
496
497
498
499 static void
500 rack_init_sysctls(void)
501 {
502         struct sysctl_oid *rack_counters;
503         struct sysctl_oid *rack_attack;
504         
505         SYSCTL_ADD_S32(&rack_sysctl_ctx,
506             SYSCTL_CHILDREN(rack_sysctl_root),
507             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
508             &rack_rate_sample_method , USE_RTT_LOW,
509             "What method should we use for rate sampling 0=high, 1=low ");
510         SYSCTL_ADD_S32(&rack_sysctl_ctx,
511             SYSCTL_CHILDREN(rack_sysctl_root),
512             OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
513             &rack_hw_tls_max_seg , 0,
514             "Do we have a multiplier of TLS records we can send as a max (0=1 TLS record)? ");
515         SYSCTL_ADD_S32(&rack_sysctl_ctx,
516             SYSCTL_CHILDREN(rack_sysctl_root),
517             OID_AUTO, "data_after_close", CTLFLAG_RW,
518             &rack_ignore_data_after_close, 0,
519             "Do we hold off sending a RST until all pending data is ack'd");
520         SYSCTL_ADD_S32(&rack_sysctl_ctx,
521             SYSCTL_CHILDREN(rack_sysctl_root),
522             OID_AUTO, "cheat_rxt", CTLFLAG_RW,
523             &use_rack_cheat, 1,
524             "Do we use the rxt cheat for rack?");
525
526         SYSCTL_ADD_U32(&rack_sysctl_ctx,
527             SYSCTL_CHILDREN(rack_sysctl_root),
528             OID_AUTO, "persmin", CTLFLAG_RW,
529             &rack_persist_min, 250,
530             "What is the minimum time in milliseconds between persists");
531         SYSCTL_ADD_U32(&rack_sysctl_ctx,
532             SYSCTL_CHILDREN(rack_sysctl_root),
533             OID_AUTO, "persmax", CTLFLAG_RW,
534             &rack_persist_max, 1000,
535             "What is the largest delay in milliseconds between persists");
536         SYSCTL_ADD_S32(&rack_sysctl_ctx,
537             SYSCTL_CHILDREN(rack_sysctl_root),
538             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
539             &rack_sack_not_required, 0,
540             "Do we allow rack to run on connections not supporting SACK?");
541         SYSCTL_ADD_S32(&rack_sysctl_ctx,
542             SYSCTL_CHILDREN(rack_sysctl_root),
543             OID_AUTO, "tlpmethod", CTLFLAG_RW,
544             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
545             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
546         SYSCTL_ADD_S32(&rack_sysctl_ctx,
547             SYSCTL_CHILDREN(rack_sysctl_root),
548             OID_AUTO, "gp_percentage", CTLFLAG_RW,
549             &rack_per_of_gp, 50,
550             "Do we pace to percentage of goodput (0=old method)?");
551         SYSCTL_ADD_S32(&rack_sysctl_ctx,
552             SYSCTL_CHILDREN(rack_sysctl_root),
553             OID_AUTO, "min_pace_time", CTLFLAG_RW,
554             &rack_min_pace_time, 0,
555             "Should we enforce a minimum pace time of 1ms");
556         SYSCTL_ADD_S32(&rack_sysctl_ctx,
557             SYSCTL_CHILDREN(rack_sysctl_root),
558             OID_AUTO, "bb_verbose", CTLFLAG_RW,
559             &rack_verbose_logging, 0,
560             "Should RACK black box logging be verbose");
561         SYSCTL_ADD_S32(&rack_sysctl_ctx,
562             SYSCTL_CHILDREN(rack_sysctl_root),
563             OID_AUTO, "sackfiltering", CTLFLAG_RW,
564             &rack_use_sack_filter, 1,
565             "Do we use sack filtering?");
566         SYSCTL_ADD_S32(&rack_sysctl_ctx,
567             SYSCTL_CHILDREN(rack_sysctl_root),
568             OID_AUTO, "delayed_ack", CTLFLAG_RW,
569             &rack_delayed_ack_time, 200,
570             "Delayed ack time (200ms)");
571         SYSCTL_ADD_S32(&rack_sysctl_ctx,
572             SYSCTL_CHILDREN(rack_sysctl_root),
573             OID_AUTO, "tlpminto", CTLFLAG_RW,
574             &rack_tlp_min, 10,
575             "TLP minimum timeout per the specification (10ms)");
576         SYSCTL_ADD_S32(&rack_sysctl_ctx,
577             SYSCTL_CHILDREN(rack_sysctl_root),
578             OID_AUTO, "send_oldest", CTLFLAG_RW,
579             &rack_always_send_oldest, 1,
580             "Should we always send the oldest TLP and RACK-TLP");
581         SYSCTL_ADD_S32(&rack_sysctl_ctx,
582             SYSCTL_CHILDREN(rack_sysctl_root),
583             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
584             &rack_limited_retran, 0,
585             "How many times can a rack timeout drive out sends");
586         SYSCTL_ADD_S32(&rack_sysctl_ctx,
587             SYSCTL_CHILDREN(rack_sysctl_root),
588             OID_AUTO, "minrto", CTLFLAG_RW,
589             &rack_rto_min, 0,
590             "Minimum RTO in ms -- set with caution below 1000 due to TLP");
591         SYSCTL_ADD_S32(&rack_sysctl_ctx,
592             SYSCTL_CHILDREN(rack_sysctl_root),
593             OID_AUTO, "maxrto", CTLFLAG_RW,
594             &rack_rto_max, 0,
595             "Maximum RTO in ms -- should be at least as large as min_rto");
596         SYSCTL_ADD_S32(&rack_sysctl_ctx,
597             SYSCTL_CHILDREN(rack_sysctl_root),
598             OID_AUTO, "tlp_retry", CTLFLAG_RW,
599             &rack_tlp_max_resend, 2,
600             "How many times does TLP retry a single segment or multiple with no ACK");
601         SYSCTL_ADD_S32(&rack_sysctl_ctx,
602             SYSCTL_CHILDREN(rack_sysctl_root),
603             OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
604             &rack_use_proportional_reduce, 0,
605             "Should we proportionally reduce cwnd based on the number of losses");
606         SYSCTL_ADD_S32(&rack_sysctl_ctx,
607             SYSCTL_CHILDREN(rack_sysctl_root),
608             OID_AUTO, "recovery_prop", CTLFLAG_RW,
609             &rack_proportional_rate, 10,
610             "What percent reduction per loss");
611         SYSCTL_ADD_S32(&rack_sysctl_ctx,
612             SYSCTL_CHILDREN(rack_sysctl_root),
613             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
614             &rack_lower_cwnd_at_tlp, 0,
615             "When a TLP completes a retran should we enter recovery?");
616         SYSCTL_ADD_S32(&rack_sysctl_ctx,
617             SYSCTL_CHILDREN(rack_sysctl_root),
618             OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
619             &rack_slot_reduction, 4,
620             "When setting a slot should we reduce by divisor");
621         SYSCTL_ADD_S32(&rack_sysctl_ctx,
622             SYSCTL_CHILDREN(rack_sysctl_root),
623             OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
624             &rack_pace_every_seg, 0,
625             "Should we use the original pacing mechanism that did not pace much?");
626         SYSCTL_ADD_S32(&rack_sysctl_ctx,
627             SYSCTL_CHILDREN(rack_sysctl_root),
628             OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
629             &rack_hptsi_segments, 40,
630             "Should we pace out only a limited number of segments");
631         SYSCTL_ADD_S32(&rack_sysctl_ctx,
632             SYSCTL_CHILDREN(rack_sysctl_root),
633             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
634             &rack_send_a_lot_in_prr, 1,
635             "Send a lot in prr");
636         SYSCTL_ADD_S32(&rack_sysctl_ctx,
637             SYSCTL_CHILDREN(rack_sysctl_root),
638             OID_AUTO, "minto", CTLFLAG_RW,
639             &rack_min_to, 1,
640             "Minimum rack timeout in milliseconds");
641         SYSCTL_ADD_S32(&rack_sysctl_ctx,
642             SYSCTL_CHILDREN(rack_sysctl_root),
643             OID_AUTO, "earlyrecovery", CTLFLAG_RW,
644             &rack_early_recovery, 1,
645             "Do we do early recovery with rack");
646         SYSCTL_ADD_S32(&rack_sysctl_ctx,
647             SYSCTL_CHILDREN(rack_sysctl_root),
648             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
649             &rack_reorder_thresh, 2,
650             "What factor for rack will be added when seeing reordering (shift right)");
651         SYSCTL_ADD_S32(&rack_sysctl_ctx,
652             SYSCTL_CHILDREN(rack_sysctl_root),
653             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
654             &rack_tlp_thresh, 1,
655             "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
656         SYSCTL_ADD_S32(&rack_sysctl_ctx,
657             SYSCTL_CHILDREN(rack_sysctl_root),
658             OID_AUTO, "reorder_fade", CTLFLAG_RW,
659             &rack_reorder_fade, 0,
660             "Does reorder detection fade, if so how many ms (0 means never)");
661         SYSCTL_ADD_S32(&rack_sysctl_ctx,
662             SYSCTL_CHILDREN(rack_sysctl_root),
663             OID_AUTO, "pktdelay", CTLFLAG_RW,
664             &rack_pkt_delay, 1,
665             "Extra RACK time (in ms) besides reordering thresh");
666
667         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
668             SYSCTL_CHILDREN(rack_sysctl_root),
669             OID_AUTO,
670             "stats",
671             CTLFLAG_RW, 0,
672             "Rack Counters");
673         rack_badfr = counter_u64_alloc(M_WAITOK);
674         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
675             SYSCTL_CHILDREN(rack_counters),
676             OID_AUTO, "badfr", CTLFLAG_RD,
677             &rack_badfr, "Total number of bad FRs");
678         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
679         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
680             SYSCTL_CHILDREN(rack_counters),
681             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
682             &rack_badfr_bytes, "Total bytes of bad FRs");
683         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
684         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
685             SYSCTL_CHILDREN(rack_counters),
686             OID_AUTO, "prrsndret", CTLFLAG_RD,
687             &rack_rtm_prr_retran,
688             "Total number of prr based retransmits");
689         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
690         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
691             SYSCTL_CHILDREN(rack_counters),
692             OID_AUTO, "prrsndnew", CTLFLAG_RD,
693             &rack_rtm_prr_newdata,
694             "Total number of prr based new transmits");
695         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
696         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
697             SYSCTL_CHILDREN(rack_counters),
698             OID_AUTO, "tsnf", CTLFLAG_RD,
699             &rack_timestamp_mismatch,
700             "Total number of times we could not find the reported timestamp");
701         rack_find_high = counter_u64_alloc(M_WAITOK);
702         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
703             SYSCTL_CHILDREN(rack_counters),
704             OID_AUTO, "findhigh", CTLFLAG_RD,
705             &rack_find_high,
706             "Total number of FIN causing find-high");
707         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
708         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
709             SYSCTL_CHILDREN(rack_counters),
710             OID_AUTO, "reordering", CTLFLAG_RD,
711             &rack_reorder_seen,
712             "Total number of times we added delay due to reordering");
713         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
714         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
715             SYSCTL_CHILDREN(rack_counters),
716             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
717             &rack_tlp_tot,
718             "Total number of tail loss probe expirations");
719         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
720         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
721             SYSCTL_CHILDREN(rack_counters),
722             OID_AUTO, "tlp_new", CTLFLAG_RD,
723             &rack_tlp_newdata,
724             "Total number of tail loss probe sending new data");
725
726         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
727         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
728             SYSCTL_CHILDREN(rack_counters),
729             OID_AUTO, "tlp_retran", CTLFLAG_RD,
730             &rack_tlp_retran,
731             "Total number of tail loss probe sending retransmitted data");
732         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
733         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
734             SYSCTL_CHILDREN(rack_counters),
735             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
736             &rack_tlp_retran_bytes,
737             "Total bytes of tail loss probe sending retransmitted data");
738         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
739         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
740             SYSCTL_CHILDREN(rack_counters),
741             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
742             &rack_tlp_retran_fail,
743             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
744         rack_to_tot = counter_u64_alloc(M_WAITOK);
745         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
746             SYSCTL_CHILDREN(rack_counters),
747             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
748             &rack_to_tot,
749             "Total number of times the rack timeout expired");
750         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
751         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
752             SYSCTL_CHILDREN(rack_counters),
753             OID_AUTO, "arm_rack", CTLFLAG_RD,
754             &rack_to_arm_rack,
755             "Total number of times the rack timer was armed");
756         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
757         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
758             SYSCTL_CHILDREN(rack_counters),
759             OID_AUTO, "arm_tlp", CTLFLAG_RD,
760             &rack_to_arm_tlp,
761             "Total number of times the tlp timer was armed");
762
763         rack_calc_zero = counter_u64_alloc(M_WAITOK);
764         rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
765         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
766             SYSCTL_CHILDREN(rack_counters),
767             OID_AUTO, "calc_zero", CTLFLAG_RD,
768             &rack_calc_zero,
769             "Total number of times pacing time worked out to zero?");
770         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
771             SYSCTL_CHILDREN(rack_counters),
772             OID_AUTO, "calc_nonzero", CTLFLAG_RD,
773             &rack_calc_nonzero,
774             "Total number of times pacing time worked out to non-zero?");
775         rack_paced_segments = counter_u64_alloc(M_WAITOK);
776         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
777             SYSCTL_CHILDREN(rack_counters),
778             OID_AUTO, "paced", CTLFLAG_RD,
779             &rack_paced_segments,
780             "Total number of times a segment send caused hptsi");
781         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
782         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
783             SYSCTL_CHILDREN(rack_counters),
784             OID_AUTO, "unpaced", CTLFLAG_RD,
785             &rack_unpaced_segments,
786             "Total number of times a segment did not cause hptsi");
787         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
788         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
789             SYSCTL_CHILDREN(rack_counters),
790             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
791             &rack_saw_enobuf,
792             "Total number of times a send returned ENOBUFS");
793         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
794         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
795             SYSCTL_CHILDREN(rack_counters),
796             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
797             &rack_saw_enetunreach,
798             "Total number of times a send returned ENETUNREACH");
799         rack_to_alloc = counter_u64_alloc(M_WAITOK);
800         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
801             SYSCTL_CHILDREN(rack_counters),
802             OID_AUTO, "allocs", CTLFLAG_RD,
803             &rack_to_alloc,
804             "Total allocations of tracking structures");
805         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
806         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
807             SYSCTL_CHILDREN(rack_counters),
808             OID_AUTO, "allochard", CTLFLAG_RD,
809             &rack_to_alloc_hard,
810             "Total allocations done with sleeping the hard way");
811         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
812         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
813             SYSCTL_CHILDREN(rack_counters),
814             OID_AUTO, "allocemerg", CTLFLAG_RD,
815             &rack_to_alloc_emerg,
816             "Total allocations done from emergency cache");
817         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
818         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
819             SYSCTL_CHILDREN(rack_counters),
820             OID_AUTO, "alloc_limited", CTLFLAG_RD,
821             &rack_to_alloc_limited,
822             "Total allocations dropped due to limit");
823         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
824         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
825             SYSCTL_CHILDREN(rack_counters),
826             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
827             &rack_alloc_limited_conns,
828             "Connections with allocations dropped due to limit");
829         rack_split_limited = counter_u64_alloc(M_WAITOK);
830         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
831             SYSCTL_CHILDREN(rack_counters),
832             OID_AUTO, "split_limited", CTLFLAG_RD,
833             &rack_split_limited,
834             "Split allocations dropped due to limit");
835         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
836         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
837             SYSCTL_CHILDREN(rack_counters),
838             OID_AUTO, "sack_long", CTLFLAG_RD,
839             &rack_sack_proc_all,
840             "Total times we had to walk whole list for sack processing");
841
842         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
843         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
844             SYSCTL_CHILDREN(rack_counters),
845             OID_AUTO, "sack_restart", CTLFLAG_RD,
846             &rack_sack_proc_restart,
847             "Total times we had to walk whole list due to a restart");
848         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
849         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
850             SYSCTL_CHILDREN(rack_counters),
851             OID_AUTO, "sack_short", CTLFLAG_RD,
852             &rack_sack_proc_short,
853             "Total times we took shortcut for sack processing");
854         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
855         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
856             SYSCTL_CHILDREN(rack_counters),
857             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
858             &rack_enter_tlp_calc,
859             "Total times we called calc-tlp");
860         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
861         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
862             SYSCTL_CHILDREN(rack_counters),
863             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
864             &rack_used_tlpmethod,
865             "Total number of times we hit TLP method 1");
866         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
867         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
868             SYSCTL_CHILDREN(rack_counters),
869             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
870             &rack_used_tlpmethod2,
871             "Total number of times we hit TLP method 2");
872         /* Sack Attacker detection stuff */
873         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
874             SYSCTL_CHILDREN(rack_sysctl_root),
875             OID_AUTO,
876             "sack_attack",
877             CTLFLAG_RW, 0,
878             "Rack Sack Attack Counters and Controls");
879         SYSCTL_ADD_U32(&rack_sysctl_ctx,
880             SYSCTL_CHILDREN(rack_attack),
881             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
882             &rack_highest_sack_thresh_seen, 0,
883             "Highest sack to ack ratio seen");
884         SYSCTL_ADD_U32(&rack_sysctl_ctx,
885             SYSCTL_CHILDREN(rack_attack),
886             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
887             &rack_highest_move_thresh_seen, 0,
888             "Highest move to non-move ratio seen");
889         rack_ack_total = counter_u64_alloc(M_WAITOK);
890         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
891             SYSCTL_CHILDREN(rack_attack),
892             OID_AUTO, "acktotal", CTLFLAG_RD,
893             &rack_ack_total,
894             "Total number of Ack's");
895
896         rack_express_sack = counter_u64_alloc(M_WAITOK);
897         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
898             SYSCTL_CHILDREN(rack_attack),
899             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
900             &rack_express_sack,
901             "Total number of express SACKs");
902         rack_sack_total = counter_u64_alloc(M_WAITOK);
903         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
904             SYSCTL_CHILDREN(rack_attack),
905             OID_AUTO, "sacktotal", CTLFLAG_RD,
906             &rack_sack_total,
907             "Total number of SACK's");
908         rack_move_none = counter_u64_alloc(M_WAITOK);
909         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
910             SYSCTL_CHILDREN(rack_attack),
911             OID_AUTO, "move_none", CTLFLAG_RD,
912             &rack_move_none,
913             "Total number of SACK index reuse of positions under threshold");
914         rack_move_some = counter_u64_alloc(M_WAITOK);
915         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
916             SYSCTL_CHILDREN(rack_attack),
917             OID_AUTO, "move_some", CTLFLAG_RD,
918             &rack_move_some,
919             "Total number of SACK index reuse of positions over threshold");
920         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
921         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
922             SYSCTL_CHILDREN(rack_attack),
923             OID_AUTO, "attacks", CTLFLAG_RD,
924             &rack_sack_attacks_detected,
925             "Total number of SACK attackers that had sack disabled");
926         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
927         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
928             SYSCTL_CHILDREN(rack_attack),
929             OID_AUTO, "reversed", CTLFLAG_RD,
930             &rack_sack_attacks_reversed,
931             "Total number of SACK attackers that were later determined false positive");
932         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
933         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
934             SYSCTL_CHILDREN(rack_attack),
935             OID_AUTO, "nextmerge", CTLFLAG_RD,
936             &rack_sack_used_next_merge,
937             "Total number of times we used the next merge");
938         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
939         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
940             SYSCTL_CHILDREN(rack_attack),
941             OID_AUTO, "prevmerge", CTLFLAG_RD,
942             &rack_sack_used_prev_merge,
943             "Total number of times we used the prev merge");
944         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
945         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
946             SYSCTL_CHILDREN(rack_attack),
947             OID_AUTO, "skipacked", CTLFLAG_RD,
948             &rack_sack_skipped_acked,
949             "Total number of times we skipped previously sacked");
950         rack_sack_splits = counter_u64_alloc(M_WAITOK);
951         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
952             SYSCTL_CHILDREN(rack_attack),
953             OID_AUTO, "ofsplit", CTLFLAG_RD,
954             &rack_sack_splits,
955             "Total number of times we did the old-fashioned tree split");
956         rack_progress_drops = counter_u64_alloc(M_WAITOK);
957         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
958             SYSCTL_CHILDREN(rack_counters),
959             OID_AUTO, "prog_drops", CTLFLAG_RD,
960             &rack_progress_drops,
961             "Total number of progress drops");
962         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
963         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
964             SYSCTL_CHILDREN(rack_counters),
965             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
966             &rack_input_idle_reduces,
967             "Total number of idle reductions on input");
968         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
969         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
970             SYSCTL_CHILDREN(rack_counters),
971             OID_AUTO, "collapsed_win", CTLFLAG_RD,
972             &rack_collapsed_win,
973             "Total number of collapsed windows");
974         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
975         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
976             SYSCTL_CHILDREN(rack_counters),
977             OID_AUTO, "tlp_nada", CTLFLAG_RD,
978             &rack_tlp_does_nada,
979             "Total number of nada tlp calls");
980
981         rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
982         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
983             SYSCTL_CHILDREN(rack_counters),
984             OID_AUTO, "tls_rwnd", CTLFLAG_RD,
985             &rack_tls_rwnd,
986             "Total hdwr tls rwnd limited");
987
988         rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
989         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
990             SYSCTL_CHILDREN(rack_counters),
991             OID_AUTO, "tls_cwnd", CTLFLAG_RD,
992             &rack_tls_cwnd,
993             "Total hdwr tls cwnd limited");
994
995         rack_tls_app = counter_u64_alloc(M_WAITOK);
996         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
997             SYSCTL_CHILDREN(rack_counters),
998             OID_AUTO, "tls_app", CTLFLAG_RD,
999             &rack_tls_app,
1000             "Total hdwr tls app limited");
1001
1002         rack_tls_other = counter_u64_alloc(M_WAITOK);
1003         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1004             SYSCTL_CHILDREN(rack_counters),
1005             OID_AUTO, "tls_other", CTLFLAG_RD,
1006             &rack_tls_other,
1007             "Total hdwr tls other limited");
1008
1009         rack_tls_filled = counter_u64_alloc(M_WAITOK);
1010         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1011             SYSCTL_CHILDREN(rack_counters),
1012             OID_AUTO, "tls_filled", CTLFLAG_RD,
1013             &rack_tls_filled,
1014             "Total hdwr tls filled");
1015
1016         rack_tls_rxt = counter_u64_alloc(M_WAITOK);
1017         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1018             SYSCTL_CHILDREN(rack_counters),
1019             OID_AUTO, "tls_rxt", CTLFLAG_RD,
1020             &rack_tls_rxt,
1021             "Total hdwr rxt");
1022
1023         rack_tls_tlp = counter_u64_alloc(M_WAITOK);
1024         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1025             SYSCTL_CHILDREN(rack_counters),
1026             OID_AUTO, "tls_tlp", CTLFLAG_RD,
1027             &rack_tls_tlp,
1028             "Total hdwr tls tlp");
1029         rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1030         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1031             SYSCTL_CHILDREN(rack_counters),
1032             OID_AUTO, "timer_hole", CTLFLAG_RD,
1033             &rack_per_timer_hole,
1034             "Total number of persist starts in a timer hole");
1035
1036         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1037         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1038             OID_AUTO, "outsize", CTLFLAG_RD,
1039             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1040         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1041         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1042             OID_AUTO, "opts", CTLFLAG_RD,
1043             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1044         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1045             SYSCTL_CHILDREN(rack_sysctl_root),
1046             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1047             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1048 }
1049
1050 static __inline int
1051 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1052 {
1053         if (SEQ_GEQ(b->r_start, a->r_start) &&
1054             SEQ_LT(b->r_start, a->r_end)) {
1055                 /* 
1056                  * The entry b is within the
1057                  * block a. i.e.:
1058                  * a --   |-------------|
1059                  * b --   |----|
1060                  * <or>
1061                  * b --       |------|
1062                  * <or>
1063                  * b --       |-----------|
1064                  */
1065                 return (0);
1066         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1067                 /* 
1068                  * b falls at or beyond the end
1069                  * of a, so a is said to be
1070                  * smaller than b.
1071                  * i.e:
1072                  * a --   |------|
1073                  * b --          |--------|
1074                  * or 
1075                  * b --              |-----|     
1076                  */
1077                 return (1);
1078         }
1079         /*
1080          * What's left is where a is
1081          * larger than b. i.e:
1082          * a --         |-------|
1083          * b --  |---|
1084          * or even possibly
1085          * b --   |--------------|
1086          */
1087         return (-1);
1088 }
1089
1090 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1091 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
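/*
 * A minimal lookup sketch for the comparator above (assuming the tree
 * head field r_ctl.rc_mtree used by this stack): because rb_map_cmp()
 * returns 0 whenever b->r_start lies inside a's range, a key entry
 * whose r_start is set to a sequence number finds the sendmap block
 * covering that sequence, e.g.:
 *
 *     struct rack_sendmap fe, *rsm;
 *
 *     memset(&fe, 0, sizeof(fe));
 *     fe.r_start = th_ack;
 *     rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 */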
1092
1093 static inline int32_t
1094 rack_progress_timeout_check(struct tcpcb *tp)
1095 {
1096         if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
1097                 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
1098                         /*
1099                          * There is an assumption that the caller
1100                          * will drop the connection so we will
1101                          * increment the counters here.
1102                          */
1103                         struct tcp_rack *rack;
1104                         rack = (struct tcp_rack *)tp->t_fb_ptr;
1105                         counter_u64_add(rack_progress_drops, 1);
1106 #ifdef NETFLIX_STATS
1107                         TCPSTAT_INC(tcps_progdrops);
1108 #endif
1109                         rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
1110                         return (1);
1111                 }
1112         }
1113         return (0);
1114 }
1115
1116
1117
1118 static void
1119 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
1120 {
1121         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1122                 union tcp_log_stackspecific log;
1123                 struct timeval tv;
1124                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1125                 log.u_bbr.flex1 = tsused;
1126                 log.u_bbr.flex2 = thresh;
1127                 log.u_bbr.flex3 = rsm->r_flags;
1128                 log.u_bbr.flex4 = rsm->r_dupack;
1129                 log.u_bbr.flex5 = rsm->r_start;
1130                 log.u_bbr.flex6 = rsm->r_end;
1131                 log.u_bbr.flex8 = mod;
1132                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1133                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1134                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1135                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1136                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1137                     &rack->rc_inp->inp_socket->so_rcv,
1138                     &rack->rc_inp->inp_socket->so_snd,
1139                     BBR_LOG_SETTINGS_CHG, 0,
1140                     0, &log, false, &tv);
1141         }
1142 }
1143
1144
1145
1146 static void
1147 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
1148 {
1149         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1150                 union tcp_log_stackspecific log;
1151                 struct timeval tv;
1152
1153                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1154                 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
1155                 log.u_bbr.flex2 = to;
1156                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1157                 log.u_bbr.flex4 = slot;
1158                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
1159                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1160                 log.u_bbr.flex7 = rack->rc_in_persist;
1161                 log.u_bbr.flex8 = which;
1162                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1163                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1164                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1165                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1166                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1167                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1168                     &rack->rc_inp->inp_socket->so_rcv,
1169                     &rack->rc_inp->inp_socket->so_snd,
1170                     BBR_LOG_TIMERSTAR, 0,
1171                     0, &log, false, &tv);
1172         }
1173 }
1174
1175 static void
1176 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no)
1177 {
1178         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1179                 union tcp_log_stackspecific log;
1180                 struct timeval tv;
1181
1182                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1183                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1184                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1185                 log.u_bbr.flex8 = to_num;
1186                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
1187                 log.u_bbr.flex2 = rack->rc_rack_rtt;
1188                 log.u_bbr.flex3 = no;
1189                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1190                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1191                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1192                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1193                     &rack->rc_inp->inp_socket->so_rcv,
1194                     &rack->rc_inp->inp_socket->so_snd,
1195                     BBR_LOG_RTO, 0,
1196                     0, &log, false, &tv);
1197         }
1198 }
1199
1200 static void
1201 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
1202     uint32_t o_srtt, uint32_t o_var)
1203 {
1204         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1205                 union tcp_log_stackspecific log;
1206                 struct timeval tv;
1207
1208                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1209                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1210                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1211                 log.u_bbr.flex1 = t;
1212                 log.u_bbr.flex2 = o_srtt;
1213                 log.u_bbr.flex3 = o_var;
1214                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
1215                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;           
1216                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
1217                 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
1218                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
1219                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
1220                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1221                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1222                 TCP_LOG_EVENTP(tp, NULL,
1223                     &rack->rc_inp->inp_socket->so_rcv,
1224                     &rack->rc_inp->inp_socket->so_snd,
1225                     BBR_LOG_BBRRTT, 0,
1226                     0, &log, false, &tv);
1227         }
1228 }
1229
1230 static void
1231 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
1232 {
1233         /* 
1234          * Log the rtt sample we are
1235          * applying to the srtt algorithm in
1236          * useconds.
1237          */
1238         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1239                 union tcp_log_stackspecific log;
1240                 struct timeval tv;
1241                 
1242                 /* Convert our ms value to microseconds */
1243                 memset(&log, 0, sizeof(log));
1244                 log.u_bbr.flex1 = rtt * 1000;
1245                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1246                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
1247                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1248                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
1249                 log.u_bbr.flex8 = rack->sack_attack_disable;
1250                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1251                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1252                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1253                     &rack->rc_inp->inp_socket->so_rcv,
1254                     &rack->rc_inp->inp_socket->so_snd,
1255                     TCP_LOG_RTT, 0,
1256                     0, &log, false, &tv);
1257         }
1258 }
1259
1260
1261 static inline void
1262 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
1263 {
1264         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
1265                 union tcp_log_stackspecific log;
1266                 struct timeval tv;
1267
1268                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1269                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1270                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1271                 log.u_bbr.flex1 = line;
1272                 log.u_bbr.flex2 = tick;
1273                 log.u_bbr.flex3 = tp->t_maxunacktime;
1274                 log.u_bbr.flex4 = tp->t_acktime;
1275                 log.u_bbr.flex8 = event;
1276                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1277                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1278                 TCP_LOG_EVENTP(tp, NULL,
1279                     &rack->rc_inp->inp_socket->so_rcv,
1280                     &rack->rc_inp->inp_socket->so_snd,
1281                     BBR_LOG_PROGRESS, 0,
1282                     0, &log, false, &tv);
1283         }
1284 }
1285
1286 static void
1287 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1288 {
1289         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1290                 union tcp_log_stackspecific log;
1291                 struct timeval tv;
1292
1293                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1294                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1295                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1296                 log.u_bbr.flex1 = slot;
1297                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
1298                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1299                 log.u_bbr.flex8 = rack->rc_in_persist;
1300                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1301                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1302                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1303                     &rack->rc_inp->inp_socket->so_rcv,
1304                     &rack->rc_inp->inp_socket->so_snd,
1305                     BBR_LOG_BBRSND, 0,
1306                     0, &log, false, &tv);
1307         }
1308 }
1309
1310 static void
1311 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1312 {
1313         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1314                 union tcp_log_stackspecific log;
1315                 struct timeval tv;
1316
1317                 memset(&log, 0, sizeof(log));
1318                 log.u_bbr.flex1 = did_out;
1319                 log.u_bbr.flex2 = nxt_pkt;
1320                 log.u_bbr.flex3 = way_out;
1321                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1322                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1323                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
1324                 log.u_bbr.flex7 = rack->r_wanted_output;
1325                 log.u_bbr.flex8 = rack->rc_in_persist;
1326                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1327                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1328                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1329                     &rack->rc_inp->inp_socket->so_rcv,
1330                     &rack->rc_inp->inp_socket->so_snd,
1331                     BBR_LOG_DOSEG_DONE, 0,
1332                     0, &log, false, &tv);
1333         }
1334 }
1335
1336 static void
1337 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
1338 {
1339         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
1340                 union tcp_log_stackspecific log;
1341                 struct timeval tv;
1342                 uint32_t cts;
1343
1344                 memset(&log, 0, sizeof(log));
1345                 cts = tcp_get_usecs(&tv);
1346                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
1347                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
1348                 log.u_bbr.flex4 = len;
1349                 log.u_bbr.flex5 = orig_len;
1350                 log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
1351                 log.u_bbr.flex7 = mod;
1352                 log.u_bbr.flex8 = frm;
1353                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1354                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1355                 TCP_LOG_EVENTP(tp, NULL,
1356                     &tp->t_inpcb->inp_socket->so_rcv,
1357                     &tp->t_inpcb->inp_socket->so_snd,
1358                     TCP_HDWR_TLS, 0,
1359                     0, &log, false, &tv);
1360         }
1361 }
1362         
1363 static void
1364 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1365 {
1366         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1367                 union tcp_log_stackspecific log;
1368                 struct timeval tv;
1369
1370                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1371                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1372                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1373                 log.u_bbr.flex1 = slot;
1374                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1375                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1376                 log.u_bbr.flex7 = hpts_calling;
1377                 log.u_bbr.flex8 = rack->rc_in_persist;
1378                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1379                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1380                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1381                     &rack->rc_inp->inp_socket->so_rcv,
1382                     &rack->rc_inp->inp_socket->so_snd,
1383                     BBR_LOG_JUSTRET, 0,
1384                     tlen, &log, false, &tv);
1385         }
1386 }
1387
1388 static void
1389 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1390 {
1391         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1392                 union tcp_log_stackspecific log;
1393                 struct timeval tv;
1394
1395                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1396                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1397                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1398                 log.u_bbr.flex1 = line;
1399                 log.u_bbr.flex2 = 0;
1400                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1401                 log.u_bbr.flex4 = 0;
1402                 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
1403                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1404                 log.u_bbr.flex8 = hpts_removed;
1405                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1406                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1407                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1408                     &rack->rc_inp->inp_socket->so_rcv,
1409                     &rack->rc_inp->inp_socket->so_snd,
1410                     BBR_LOG_TIMERCANC, 0,
1411                     0, &log, false, &tv);
1412         }
1413 }
1414
1415 static void
1416 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1417 {
1418         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1419                 union tcp_log_stackspecific log;
1420                 struct timeval tv;
1421
1422                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1423                 log.u_bbr.flex1 = timers;
1424                 log.u_bbr.flex2 = ret;
1425                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1426                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1427                 log.u_bbr.flex5 = cts;
1428                 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
1429                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1430                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1431                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1432                     &rack->rc_inp->inp_socket->so_rcv,
1433                     &rack->rc_inp->inp_socket->so_snd,
1434                     BBR_LOG_TO_PROCESS, 0,
1435                     0, &log, false, &tv);
1436         }
1437 }
1438
1439 static void
1440 rack_log_to_prr(struct tcp_rack *rack, int frm)
1441 {
1442         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1443                 union tcp_log_stackspecific log;
1444                 struct timeval tv;
1445
1446                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1447                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
1448                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
1449                 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
1450                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
1451                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
1452                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
1453                 log.u_bbr.flex8 = frm;
1454                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1455                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1456                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1457                     &rack->rc_inp->inp_socket->so_rcv,
1458                     &rack->rc_inp->inp_socket->so_snd,
1459                     BBR_LOG_BBRUPD, 0,
1460                     0, &log, false, &tv);
1461         }
1462 }
1463
1464 #ifdef NETFLIX_EXP_DETECTION
1465 static void
1466 rack_log_sad(struct tcp_rack *rack, int event)
1467 {
1468         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1469                 union tcp_log_stackspecific log;
1470                 struct timeval tv;
1471
1472                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1473                 log.u_bbr.flex1 = rack->r_ctl.sack_count;
1474                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
1475                 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
1476                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
1477                 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
1478                 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
1479                 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
1480                 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
1481                 log.u_bbr.lt_epoch |= rack->do_detection;
1482                 log.u_bbr.applimited = tcp_map_minimum;
1483                 log.u_bbr.flex7 = rack->sack_attack_disable;
1484                 log.u_bbr.flex8 = event;
1485                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1486                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
1487                 log.u_bbr.delivered = tcp_sad_decay_val;
1488                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1489                     &rack->rc_inp->inp_socket->so_rcv,
1490                     &rack->rc_inp->inp_socket->so_snd,
1491                     TCP_SAD_DETECTION, 0,
1492                     0, &log, false, &tv);
1493         }
1494 }
1495 #endif
1496
1497 static void
1498 rack_counter_destroy(void)
1499 {
1500         counter_u64_free(rack_badfr);
1501         counter_u64_free(rack_badfr_bytes);
1502         counter_u64_free(rack_rtm_prr_retran);
1503         counter_u64_free(rack_rtm_prr_newdata);
1504         counter_u64_free(rack_timestamp_mismatch);
1505         counter_u64_free(rack_reorder_seen);
1506         counter_u64_free(rack_tlp_tot);
1507         counter_u64_free(rack_tlp_newdata);
1508         counter_u64_free(rack_tlp_retran);
1509         counter_u64_free(rack_tlp_retran_bytes);
1510         counter_u64_free(rack_tlp_retran_fail);
1511         counter_u64_free(rack_to_tot);
1512         counter_u64_free(rack_to_arm_rack);
1513         counter_u64_free(rack_to_arm_tlp);
1514         counter_u64_free(rack_paced_segments);
1515         counter_u64_free(rack_unpaced_segments);
1516         counter_u64_free(rack_saw_enobuf);
1517         counter_u64_free(rack_saw_enetunreach);
1518         counter_u64_free(rack_to_alloc_hard);
1519         counter_u64_free(rack_to_alloc_emerg);
1520         counter_u64_free(rack_sack_proc_all);
1521         counter_u64_free(rack_sack_proc_short);
1522         counter_u64_free(rack_sack_proc_restart);
1523         counter_u64_free(rack_to_alloc);
1524         counter_u64_free(rack_to_alloc_limited);
1525         counter_u64_free(rack_alloc_limited_conns);
1526         counter_u64_free(rack_split_limited);
1527         counter_u64_free(rack_find_high);
1528         counter_u64_free(rack_enter_tlp_calc);
1529         counter_u64_free(rack_used_tlpmethod);
1530         counter_u64_free(rack_used_tlpmethod2);
1531         counter_u64_free(rack_progress_drops);
1532         counter_u64_free(rack_input_idle_reduces);
1533         counter_u64_free(rack_collapsed_win);
1534         counter_u64_free(rack_tlp_does_nada);
1535         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1536         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1537 }
1538
1539 static struct rack_sendmap *
1540 rack_alloc(struct tcp_rack *rack)
1541 {
1542         struct rack_sendmap *rsm;
1543
1544         rsm = uma_zalloc(rack_zone, M_NOWAIT);
1545         if (rsm) {
1546                 rack->r_ctl.rc_num_maps_alloced++;
1547                 counter_u64_add(rack_to_alloc, 1);
1548                 return (rsm);
1549         }
1550         if (rack->rc_free_cnt) {
1551                 counter_u64_add(rack_to_alloc_emerg, 1);
1552                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1553                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
1554                 rack->rc_free_cnt--;
1555                 return (rsm);
1556         }
1557         return (NULL);
1558 }
1559
1560 static struct rack_sendmap *
1561 rack_alloc_full_limit(struct tcp_rack *rack)
1562 {
1563         if ((V_tcp_map_entries_limit > 0) &&
1564             (rack->do_detection == 0) &&
1565             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
1566                 counter_u64_add(rack_to_alloc_limited, 1);
1567                 if (!rack->alloc_limit_reported) {
1568                         rack->alloc_limit_reported = 1;
1569                         counter_u64_add(rack_alloc_limited_conns, 1);
1570                 }
1571                 return (NULL);
1572         }
1573         return (rack_alloc(rack));
1574 }
1575
1576 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1577 static struct rack_sendmap *
1578 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1579 {
1580         struct rack_sendmap *rsm;
1581
1582         if (limit_type) {
1583                 /* currently there is only one limit type */
1584                 if (V_tcp_map_split_limit > 0 &&
1585                     (rack->do_detection == 0) &&
1586                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
1587                         counter_u64_add(rack_split_limited, 1);
1588                         if (!rack->alloc_limit_reported) {
1589                                 rack->alloc_limit_reported = 1;
1590                                 counter_u64_add(rack_alloc_limited_conns, 1);
1591                         }
1592                         return (NULL);
1593                 }
1594         }
1595
1596         /* allocate and mark in the limit type, if set */
1597         rsm = rack_alloc(rack);
1598         if (rsm != NULL && limit_type) {
1599                 rsm->r_limit_type = limit_type;
1600                 rack->r_ctl.rc_num_split_allocs++;
1601         }
1602         return (rsm);
1603 }
1604
1605 static void
1606 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1607 {
1608         if (rsm->r_limit_type) {
1609                 /* currently there is only one limit type */
1610                 rack->r_ctl.rc_num_split_allocs--;
1611         }
1612         if (rack->r_ctl.rc_tlpsend == rsm)
1613                 rack->r_ctl.rc_tlpsend = NULL;
1614         if (rack->r_ctl.rc_sacklast == rsm)
1615                 rack->r_ctl.rc_sacklast = NULL;
1616         if (rack->rc_free_cnt < rack_free_cache) {
1617                 memset(rsm, 0, sizeof(struct rack_sendmap));
1618                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
1619                 rsm->r_limit_type = 0;
1620                 rack->rc_free_cnt++;
1621                 return;
1622         }
1623         rack->r_ctl.rc_num_maps_alloced--;
1624         uma_zfree(rack_zone, rsm);
1625 }
1626
1627 /*
1628  * CC wrapper hook functions
1629  */
1630 static void
1631 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1632     uint16_t type, int32_t recovery)
1633 {
1634 #ifdef STATS
1635         int32_t gput;
1636 #endif
1637
1638         INP_WLOCK_ASSERT(tp->t_inpcb);
1639         tp->ccv->nsegs = nsegs;
1640         tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1641         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1642                 uint32_t max;
1643
1644                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
1645                 if (tp->ccv->bytes_this_ack > max) {
1646                         tp->ccv->bytes_this_ack = max;
1647                 }
1648         }
1649         if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
1650             (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
1651              (tp->snd_cwnd < (ctf_flight_size(tp, rack->r_ctl.rc_sacked) * 2))))
1652                 tp->ccv->flags |= CCF_CWND_LIMITED;
1653         else
1654                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
1655
1656         if (type == CC_ACK) {
1657 #ifdef STATS
1658                 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1659                     ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1660                 if ((tp->t_flags & TF_GPUTINPROG) &&
1661                     SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1662                         gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1663                             max(1, tcp_ts_getticks() - tp->gput_ts);
1664                         /* We store it in bytes per ms (or kbytes per sec) */
1665                         rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8;
1666                         rack->r_ctl.rc_gp_hist_idx++;
1667                         if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST)
1668                                 rack->r_ctl.rc_gp_hist_filled = 1;
1669                         rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST;
1670                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1671                             gput);
1672                         /*
1673                          * XXXLAS: This is a temporary hack, and should be
1674                          * chained off VOI_TCP_GPUT when stats(9) grows an
1675                          * API to deal with chained VOIs.
1676                          */
1677                         if (tp->t_stats_gput_prev > 0)
1678                                 stats_voi_update_abs_s32(tp->t_stats,
1679                                     VOI_TCP_GPUT_ND,
1680                                     ((gput - tp->t_stats_gput_prev) * 100) /
1681                                     tp->t_stats_gput_prev);
1682                         tp->t_flags &= ~TF_GPUTINPROG;
1683                         tp->t_stats_gput_prev = gput;
1684
1685                         if (tp->t_maxpeakrate) {
1686                                 /*
1687                                  * We update t_peakrate_thr. This gives us roughly
1688                                  * one update per round trip time.
1689                                  */
1690                                 tcp_update_peakrate_thr(tp);
1691                         }
1692                 }
1693 #endif
1694                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1695                         tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1696                             nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
1697                         if (tp->t_bytes_acked >= tp->snd_cwnd) {
1698                                 tp->t_bytes_acked -= tp->snd_cwnd;
1699                                 tp->ccv->flags |= CCF_ABC_SENTAWND;
1700                         }
1701                 } else {
1702                         tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1703                         tp->t_bytes_acked = 0;
1704                 }
1705         }
1706         if (CC_ALGO(tp)->ack_received != NULL) {
1707                 /* XXXLAS: Find a way to live without this */
1708                 tp->ccv->curack = th->th_ack;
1709                 CC_ALGO(tp)->ack_received(tp->ccv, type);
1710         }
1711 #ifdef STATS
1712         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1713 #endif
1714         if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1715                 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1716         }
1717         /* we enforce max peak rate if it is set. */
1718         if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1719                 tp->snd_cwnd = tp->t_peakrate_thr;
1720         }
1721 }
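/*
 * Added commentary (illustrative numbers, not from the original source):
 * the STATS goodput math above computes
 *     gput = (bytes_acked << 3) / elapsed_ticks
 * where the elapsed timestamp ticks have millisecond granularity, i.e.
 * gput is bits per ms, and gput / 8 (bytes per ms) is what lands in the
 * goodput history.  For example, 250000 bytes acked over 200 ms gives
 * gput = 10000 (~10 Mbit/s) and a stored history value of 1250 bytes/ms.
 */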
1722
1723 static void
1724 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1725 {
1726         struct tcp_rack *rack;
1727
1728         rack = (struct tcp_rack *)tp->t_fb_ptr;
1729         INP_WLOCK_ASSERT(tp->t_inpcb);
1730         if (rack->r_ctl.rc_prr_sndcnt > 0)
1731                 rack->r_wanted_output++;
1732 }
1733
1734 static void
1735 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1736 {
1737         struct tcp_rack *rack;
1738
1739         INP_WLOCK_ASSERT(tp->t_inpcb);
1740         rack = (struct tcp_rack *)tp->t_fb_ptr;
1741         if (CC_ALGO(tp)->post_recovery != NULL) {
1742                 tp->ccv->curack = th->th_ack;
1743                 CC_ALGO(tp)->post_recovery(tp->ccv);
1744         }
1745         /*
1746          * Here we can in theory adjust cwnd to be based on the number of
1747          * losses in the window (rack->r_ctl.rc_loss_count). This is done
1748          * based on the rack_use_proportional flag.
1749          */
1750         if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1751                 int32_t reduce;
1752
1753                 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1754                 if (reduce > 50) {
1755                         reduce = 50;
1756                 }
1757                 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1758         } else {
1759                 if (tp->snd_cwnd > tp->snd_ssthresh) {
1760                         /* Drop us down to the ssthresh (1/2 cwnd at loss) */
1761                         tp->snd_cwnd = tp->snd_ssthresh;
1762                 }
1763         }
1764         if (rack->r_ctl.rc_prr_sndcnt > 0) {
1765                 /* Suck the next prr cnt back into cwnd */
1766                 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1767                 rack->r_ctl.rc_prr_sndcnt = 0;
1768                 rack_log_to_prr(rack, 1);
1769         }
1770         tp->snd_recover = tp->snd_una;
1771         EXIT_RECOVERY(tp->t_flags);
1772
1773
1774 }
1775
1776 static void
1777 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1778 {
1779         struct tcp_rack *rack;
1780
1781         INP_WLOCK_ASSERT(tp->t_inpcb);
1782
1783         rack = (struct tcp_rack *)tp->t_fb_ptr;
1784         switch (type) {
1785         case CC_NDUPACK:
1786                 tp->t_flags &= ~TF_WASFRECOVERY;
1787                 tp->t_flags &= ~TF_WASCRECOVERY;
1788                 if (!IN_FASTRECOVERY(tp->t_flags)) {
1789                         rack->r_ctl.rc_tlp_rtx_out = 0;
1790                         rack->r_ctl.rc_prr_delivered = 0;
1791                         rack->r_ctl.rc_prr_out = 0;
1792                         rack->r_ctl.rc_loss_count = 0;
1793                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
1794                         rack_log_to_prr(rack, 2);
1795                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1796                         tp->snd_recover = tp->snd_max;
1797                         if (tp->t_flags2 & TF2_ECN_PERMIT)
1798                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
1799                 }
1800                 break;
1801         case CC_ECN:
1802                 if (!IN_CONGRECOVERY(tp->t_flags)) {
1803                         TCPSTAT_INC(tcps_ecn_rcwnd);
1804                         tp->snd_recover = tp->snd_max;
1805                         if (tp->t_flags2 & TF2_ECN_PERMIT)
1806                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
1807                 }
1808                 break;
1809         case CC_RTO:
1810                 tp->t_dupacks = 0;
1811                 tp->t_bytes_acked = 0;
1812                 EXIT_RECOVERY(tp->t_flags);
1813                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1814                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
1815                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
1816                 break;
1817         case CC_RTO_ERR:
1818                 TCPSTAT_INC(tcps_sndrexmitbad);
1819                 /* RTO was unnecessary, so reset everything. */
1820                 tp->snd_cwnd = tp->snd_cwnd_prev;
1821                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1822                 tp->snd_recover = tp->snd_recover_prev;
1823                 if (tp->t_flags & TF_WASFRECOVERY) {
1824                         ENTER_FASTRECOVERY(tp->t_flags);
1825                         tp->t_flags &= ~TF_WASFRECOVERY;
1826                 }
1827                 if (tp->t_flags & TF_WASCRECOVERY) {
1828                         ENTER_CONGRECOVERY(tp->t_flags);
1829                         tp->t_flags &= ~TF_WASCRECOVERY;
1830                 }
1831                 tp->snd_nxt = tp->snd_max;
1832                 tp->t_badrxtwin = 0;
1833                 break;
1834         }
1835
1836         if (CC_ALGO(tp)->cong_signal != NULL) {
1837                 if (th != NULL)
1838                         tp->ccv->curack = th->th_ack;
1839                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
1840         }
1841 }
1842
1843
1844
1845 static inline void
1846 rack_cc_after_idle(struct tcpcb *tp)
1847 {
1848         uint32_t i_cwnd;
1849
1850         INP_WLOCK_ASSERT(tp->t_inpcb);
1851
1852 #ifdef NETFLIX_STATS
1853         TCPSTAT_INC(tcps_idle_restarts);
1854         if (tp->t_state == TCPS_ESTABLISHED)
1855                 TCPSTAT_INC(tcps_idle_estrestarts);
1856 #endif
1857         if (CC_ALGO(tp)->after_idle != NULL)
1858                 CC_ALGO(tp)->after_idle(tp->ccv);
1859
1860         if (tp->snd_cwnd == 1)
1861                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
1862         else 
1863                 i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
1864
1865         /*
1866          * Being idle is no different from the initial window. If the cc
1867          * clamps it down below the initial window, raise it to the initial
1868          * window.
1869          */
1870         if (tp->snd_cwnd < i_cwnd) {
1871                 tp->snd_cwnd = i_cwnd;
1872         }
1873 }
1874
1875
1876 /*
1877  * Indicate whether this ack should be delayed.  We can delay the ack if
1878  * following conditions are met:
1879  *      - There is no delayed ack timer in progress.
1880  *      - Our last ack wasn't a 0-sized window. We never want to delay
1881  *        the ack that opens up a 0-sized window.
1882  *      - LRO wasn't used for this segment. We make sure by checking that the
1883  *        segment size is not larger than the MSS.
1884  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
1885  *        connection.
1886  */
1887 #define DELAY_ACK(tp, tlen)                      \
1888         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1889         ((tp->t_flags & TF_DELACK) == 0) &&      \
1890         (tlen <= tp->t_maxseg) &&                \
1891         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
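/*
 * Illustrative example (added, not in the original): with delayed acks
 * enabled, no DELACK timer pending and our last ack not having advertised
 * a zero window, a 1448-byte segment on a connection whose t_maxseg is
 * 1460 satisfies DELAY_ACK(tp, 1448), so the ack may be held for the
 * delayed-ack timer; a 2896-byte LRO-aggregated chunk fails the
 * (tlen <= tp->t_maxseg) test and is acked immediately.
 */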
1892
1893 static struct rack_sendmap *
1894 rack_find_lowest_rsm(struct tcp_rack *rack)
1895 {
1896         struct rack_sendmap *rsm;
1897
1898         /*
1899          * Walk the time-ordered transmitted list looking for an rsm that is
1900          * not acked. This will be the one that was sent the longest time
1901          * ago that is still outstanding.
1902          */
1903         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1904                 if (rsm->r_flags & RACK_ACKED) {
1905                         continue;
1906                 }
1907                 goto finish;
1908         }
1909 finish:
1910         return (rsm);
1911 }
1912
1913 static struct rack_sendmap *
1914 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1915 {
1916         struct rack_sendmap *prsm;
1917
1918         /*
1919          * Walk the sequence-ordered list backward until we arrive at
1920          * the highest seq not acked. In theory, when this is called it
1921          * should be the last segment (which it was not).
1922          */
1923         counter_u64_add(rack_find_high, 1);
1924         prsm = rsm;
1925         RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
1926                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1927                         continue;
1928                 }
1929                 return (prsm);
1930         }
1931         return (NULL);
1932 }
1933
1934
1935 static uint32_t
1936 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1937 {
1938         int32_t lro;
1939         uint32_t thresh;
1940
1941         /*
1942          * lro is the flag we use to determine if we have seen reordering.
1943          * If it gets set we have seen reordering. The reorder logic either
1944          * works in one of two ways:
1945          *
1946          * If reorder-fade is configured, then we track the last time we saw
1947          * re-ordering occur. If we reach the point where enough time has
1948          * passed we no longer consider reordering to be occurring.
1949          *
1950          * Or if reorder-fade is 0, then once we see reordering we consider
1951          * the connection to always be subject to reordering and just set lro
1952          * to 1.
1953          *
1954          * In the end if lro is non-zero we add the extra time for
1955          * reordering in.
1956          */
1957         if (srtt == 0)
1958                 srtt = 1;
1959         if (rack->r_ctl.rc_reorder_ts) {
1960                 if (rack->r_ctl.rc_reorder_fade) {
1961                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1962                                 lro = cts - rack->r_ctl.rc_reorder_ts;
1963                                 if (lro == 0) {
1964                                         /*
1965                                          * No time has passed since the last
1966                                          * reorder, mark it as reordering.
1967                                          */
1968                                         lro = 1;
1969                                 }
1970                         } else {
1971                                 /* Negative time? */
1972                                 lro = 0;
1973                         }
1974                         if (lro > rack->r_ctl.rc_reorder_fade) {
1975                                 /* Turn off reordering seen too */
1976                                 rack->r_ctl.rc_reorder_ts = 0;
1977                                 lro = 0;
1978                         }
1979                 } else {
1980                         /* Reordering does not fade */
1981                         lro = 1;
1982                 }
1983         } else {
1984                 lro = 0;
1985         }
1986         thresh = srtt + rack->r_ctl.rc_pkt_delay;
1987         if (lro) {
1988                 /* It must be set, if not you get 1/4 rtt */
1989                 if (rack->r_ctl.rc_reorder_shift)
1990                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1991                 else
1992                         thresh += (srtt >> 2);
1993         } else {
1994                 thresh += 1;
1995         }
1996         /* We don't let the rack timeout be above a RTO */
1997         if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1998                 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1999         }
2000         /* And we don't want it above the RTO max either */
2001         if (thresh > rack_rto_max) {
2002                 thresh = rack_rto_max;
2003         }
2004         return (thresh);
2005 }
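/*
 * Worked example for the threshold above (illustrative numbers only):
 * with srtt = 40 ms, rc_pkt_delay = 1 ms, reordering currently seen
 * (lro != 0) and rc_reorder_shift = 2, we get
 *     thresh = 40 + 1 + (40 >> 2) = 51 ms,
 * which is then capped at the current RTO and at rack_rto_max.  With no
 * reordering observed the extra term is just +1 ms.
 */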
2006
2007 static uint32_t
2008 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
2009                      struct rack_sendmap *rsm, uint32_t srtt)
2010 {
2011         struct rack_sendmap *prsm;
2012         uint32_t thresh, len;
2013         int maxseg;
2014         
2015         if (srtt == 0)
2016                 srtt = 1;
2017         if (rack->r_ctl.rc_tlp_threshold)
2018                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
2019         else
2020                 thresh = (srtt * 2);
2021         
2022         /* Get the previously sent packet, if any */
2023         maxseg = ctf_fixed_maxseg(tp);
2024         counter_u64_add(rack_enter_tlp_calc, 1);
2025         len = rsm->r_end - rsm->r_start;
2026         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
2027                 /* Exactly like the ID */
2028                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
2029                         uint32_t alt_thresh;
2030                         /*
2031                          * Compensate for delayed-ack with the d-ack time.
2032                          */
2033                         counter_u64_add(rack_used_tlpmethod, 1);
2034                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2035                         if (alt_thresh > thresh)
2036                                 thresh = alt_thresh;
2037                 }
2038         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
2039                 /* 2.1 behavior */
2040                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
2041                 if (prsm && (len <= maxseg)) {
2042                         /*
2043                          * Two packets outstanding, thresh should be (2*srtt) +
2044                          * possible inter-packet delay (if any).
2045                          */
2046                         uint32_t inter_gap = 0;
2047                         int idx, nidx;
2048                         
2049                         counter_u64_add(rack_used_tlpmethod, 1);
2050                         idx = rsm->r_rtr_cnt - 1;
2051                         nidx = prsm->r_rtr_cnt - 1;
2052                         if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
2053                                 /* Yes it was sent later (or at the same time) */
2054                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
2055                         }
2056                         thresh += inter_gap;
2057                 } else  if (len <= maxseg) {
2058                         /*
2059                          * Possibly compensate for delayed-ack.
2060                          */
2061                         uint32_t alt_thresh;
2062                         
2063                         counter_u64_add(rack_used_tlpmethod2, 1);
2064                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2065                         if (alt_thresh > thresh)
2066                                 thresh = alt_thresh;
2067                 }
2068         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2069                 /* 2.2 behavior */
2070                 if (len <= maxseg) {
2071                         uint32_t alt_thresh;
2072                         /*
2073                          * Compensate for delayed-ack with the d-ack time.
2074                          */
2075                         counter_u64_add(rack_used_tlpmethod, 1);
2076                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2077                         if (alt_thresh > thresh)
2078                                 thresh = alt_thresh;
2079                 }
2080         }
2081         /* Not above an RTO */
2082         if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2083                 thresh = TICKS_2_MSEC(tp->t_rxtcur);
2084         }
2085         /* Not above a RTO max */
2086         if (thresh > rack_rto_max) {
2087                 thresh = rack_rto_max;
2088         }
2089         /* Apply user supplied min TLP */
2090         if (thresh < rack_tlp_min) {
2091                 thresh = rack_tlp_min;
2092         }
2093         return (thresh);
2094 }
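/*
 * Rough sketch of the TLP threshold math (illustrative values, not
 * defaults): with srtt = 40 ms and rc_tlp_threshold unset the base is
 * 2 * srtt = 80 ms.  When a single MSS-or-smaller segment is
 * outstanding, the delayed-ack compensated alternative is
 *     alt_thresh = srtt + srtt/2 + rack_delayed_ack_time
 *                = 60 ms + rack_delayed_ack_time,
 * and the larger value wins, bounded by the RTO, rack_rto_max and the
 * rack_tlp_min floor.
 */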
2095
2096 static uint32_t
2097 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
2098 {
2099         /*
2100          * We want the rack_rtt which is the
2101          * last rtt we measured. However if that
2102          * does not exist we fallback to the srtt (which
2103          * does not exist we fall back to the srtt (which
2104          * resort we use RACK_INITIAL_RTO if no srtt is
2105          * yet set.
2106          */
2107         if (rack->rc_rack_rtt)
2108                 return(rack->rc_rack_rtt);
2109         else if (tp->t_srtt == 0)
2110                 return(RACK_INITIAL_RTO);
2111         return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT));
2112 }
2113
2114 static struct rack_sendmap *
2115 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2116 {
2117         /*
2118          * Check to see that we don't need to fall into recovery. We will
2119          * need to do so if our oldest transmit is past the time we should
2120          * have had an ack.
2121          */
2122         struct tcp_rack *rack;
2123         struct rack_sendmap *rsm;
2124         int32_t idx;
2125         uint32_t srtt, thresh;
2126
2127         rack = (struct tcp_rack *)tp->t_fb_ptr;
2128         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
2129                 return (NULL);
2130         }
2131         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2132         if (rsm == NULL)
2133                 return (NULL);
2134
2135         if (rsm->r_flags & RACK_ACKED) {
2136                 rsm = rack_find_lowest_rsm(rack);
2137                 if (rsm == NULL)
2138                         return (NULL);
2139         }
2140         idx = rsm->r_rtr_cnt - 1;
2141         srtt = rack_grab_rtt(tp, rack);
2142         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2143         if (tsused < rsm->r_tim_lastsent[idx]) {
2144                 return (NULL);
2145         }
2146         if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2147                 return (NULL);
2148         }
2149         /* Ok if we reach here we are over-due */
2150         rack->r_ctl.rc_rsm_start = rsm->r_start;
2151         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2152         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2153         rack_cong_signal(tp, NULL, CC_NDUPACK);
2154         return (rsm);
2155 }
2156
2157 static uint32_t
2158 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2159 {
2160         int32_t t;
2161         int32_t tt;
2162         uint32_t ret_val;
2163
2164         t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2165         TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2166             rack_persist_min, rack_persist_max);
2167         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2168                 tp->t_rxtshift++;
2169         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2170         ret_val = (uint32_t)tt;
2171         return (ret_val);
2172 }
2173
2174 static uint32_t
2175 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
2176 {
2177         /*
2178          * Start the FR timer; we do this based on getting the first one in
2179          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
2180          * events we need to stop the running timer (if it's running) before
2181          * starting the new one.
2182          */
2183         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
2184         uint32_t srtt_cur;
2185         int32_t idx;
2186         int32_t is_tlp_timer = 0;
2187         struct rack_sendmap *rsm;
2188         
2189         if (rack->t_timers_stopped) {
2190                 /* All timers have been stopped; none are to run */
2191                 return (0);
2192         }
2193         if (rack->rc_in_persist) {
2194                 /* We can't start any timer in persists */
2195                 return (rack_get_persists_timer_val(tp, rack));
2196         }
2197         if ((tp->t_state < TCPS_ESTABLISHED) ||
2198             ((tp->t_flags & TF_SACK_PERMIT) == 0))
2199                 goto activate_rxt;
2200         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2201         if ((rsm == NULL) || sup_rack) {
2202                 /* Nothing on the send map */
2203 activate_rxt:
2204                 time_since_sent = 0;
2205                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2206                 if (rsm) {
2207                         idx = rsm->r_rtr_cnt - 1;
2208                         if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 
2209                                 tstmp_touse = rsm->r_tim_lastsent[idx];
2210                         else 
2211                                 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2212                         if (TSTMP_GT(tstmp_touse, cts))
2213                             time_since_sent = cts - tstmp_touse;
2214                 }
2215                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2216                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2217                         to = TICKS_2_MSEC(tp->t_rxtcur);
2218                         if (to > time_since_sent)
2219                                 to -= time_since_sent;
2220                         else
2221                                 to = rack->r_ctl.rc_min_to;
2222                         if (to == 0)
2223                                 to = 1;
2224                         return (to);
2225                 }
2226                 return (0);
2227         }
2228         if (rsm->r_flags & RACK_ACKED) {
2229                 rsm = rack_find_lowest_rsm(rack);
2230                 if (rsm == NULL) {
2231                         /* No lowest? */
2232                         goto activate_rxt;
2233                 }
2234         }
2235         if (rack->sack_attack_disable) {
2236                 /*
2237                  * We don't want to do
2238                  * any TLP's if you are an attacker.
2239                  * Though if you are doing what
2240                  * is expected you may still have
2241                  * SACK-PASSED marks.
2242                  */
2243                 goto activate_rxt;
2244         }
2245         /* Convert from ms to usecs */
2246         if (rsm->r_flags & RACK_SACK_PASSED) {
2247                 if ((tp->t_flags & TF_SENTFIN) &&
2248                     ((tp->snd_max - tp->snd_una) == 1) &&
2249                     (rsm->r_flags & RACK_HAS_FIN)) {
2250                         /*
2251                          * We don't start a rack timer if all we have is a
2252                          * FIN outstanding.
2253                          */
2254                         goto activate_rxt;
2255                 }
2256                 if ((rack->use_rack_cheat == 0) &&
2257                     (IN_RECOVERY(tp->t_flags)) &&
2258                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
2259                         /* 
2260                          * We are not cheating, we are in recovery and
2261                          * do not yet have enough acks to get our next
2262                          * retransmission out.
2263                          *
2264                          * Note that classified attackers do not
2265                          * get to use the rack-cheat.
2266                          */
2267                         goto activate_tlp;
2268                 }
2269                 srtt = rack_grab_rtt(tp, rack);
2270                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
2271                 idx = rsm->r_rtr_cnt - 1;
2272                 exp = rsm->r_tim_lastsent[idx] + thresh;
2273                 if (SEQ_GEQ(exp, cts)) {
2274                         to = exp - cts;
2275                         if (to < rack->r_ctl.rc_min_to) {
2276                                 to = rack->r_ctl.rc_min_to;
2277                         }
2278                 } else {
2279                         to = rack->r_ctl.rc_min_to;
2280                 }
2281         } else {
2282                 /* Ok we need to do a TLP not RACK */
2283 activate_tlp:
2284                 if ((rack->rc_tlp_in_progress != 0) ||
2285                     (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2286                         /*
2287                          * The previous send was a TLP or a tlp_rtx is in
2288                          * process.
2289                          */
2290                         goto activate_rxt;
2291                 }
2292                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2293                 if (rsm == NULL) {
2294                         /* We found no rsm to TLP with. */
2295                         goto activate_rxt;
2296                 }
2297                 if (rsm->r_flags & RACK_HAS_FIN) {
2298                         /* If it's a FIN we don't do TLP */
2299                         rsm = NULL;
2300                         goto activate_rxt;
2301                 }
2302                 idx = rsm->r_rtr_cnt - 1;
2303                 time_since_sent = 0;
2304                 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 
2305                         tstmp_touse = rsm->r_tim_lastsent[idx];
2306                 else 
2307                         tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time;
2308                 if (TSTMP_GT(tstmp_touse, cts))
2309                     time_since_sent = cts - tstmp_touse;
2310                 is_tlp_timer = 1;
2311                 if (tp->t_srtt) {
2312                         srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2313                         srtt = TICKS_2_MSEC(srtt_cur);
2314                 } else
2315                         srtt = RACK_INITIAL_RTO;
2316                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2317                 if (thresh > time_since_sent)
2318                         to = thresh - time_since_sent;
2319                 else
2320                         to = rack->r_ctl.rc_min_to;
2321                 if (to > TCPTV_REXMTMAX) {
2322                         /*
2323                          * If the TLP time works out to be larger than the max
2324                          * RTO, let's not do TLP; just RTO.
2325                          */
2326                         goto activate_rxt;
2327                 }
2328                 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2329                         /*
2330                          * The tail is no longer the last one I did a probe
2331                          * on
2332                          */
2333                         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2334                         rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2335                 }
2336         }
2337         if (is_tlp_timer == 0) {
2338                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2339         } else {
2340                 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2341                     (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2342                         /*
2343                          * We have exceeded how many times we can retransmit the
2344                          * current TLP; switch to the RTO timer.
2345                          */
2346                         goto activate_rxt;
2347                 } else {
2348                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2349                 }
2350         }
2351         if (to == 0)
2352                 to = 1;
2353         return (to);
2354 }
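/*
 * Informal summary of the selection above (added commentary): in persist
 * only the persist timeout is returned; with nothing usable on the send
 * map (or sup_rack set) we fall back to the RXT timer; an rsm marked
 * RACK_SACK_PASSED normally arms the RACK reorder timer; otherwise we
 * try a TLP, reverting to RXT when a TLP (or TLP retransmit) is already
 * in progress, only a FIN is outstanding, the computed timeout exceeds
 * TCPTV_REXMTMAX, or the TLP resend budget (rack_tlp_max_resend) is
 * exhausted.
 */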
2355
2356 static void
2357 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2358 {
2359         if (rack->rc_in_persist == 0) {
2360                 rack->r_ctl.rc_went_idle_time = cts;
2361                 rack_timer_cancel(tp, rack, cts, __LINE__);
2362                 tp->t_rxtshift = 0;
2363                 rack->rc_in_persist = 1;
2364         }
2365 }
2366
2367 static void
2368 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2369 {
2370         if (rack->rc_inp->inp_in_hpts)  {
2371                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2372                 rack->r_ctl.rc_hpts_flags  = 0;
2373         }
2374         rack->rc_in_persist = 0;
2375         rack->r_ctl.rc_went_idle_time = 0;
2376         tp->t_flags &= ~TF_FORCEDATA;
2377         tp->t_rxtshift = 0;
2378 }
2379
2380 static void
2381 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 
2382       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
2383 {
2384         struct inpcb *inp;
2385         uint32_t delayed_ack = 0;
2386         uint32_t hpts_timeout;
2387         uint8_t stopped;
2388         uint32_t left = 0;
2389
2390         inp = tp->t_inpcb;
2391         if (inp->inp_in_hpts) {
2392                 /* A previous call is already set up */
2393                 return;
2394         }
2395         if ((tp->t_state == TCPS_CLOSED) ||
2396             (tp->t_state == TCPS_LISTEN)) {
2397                 return;
2398         }
2399         stopped = rack->rc_tmr_stopped;
2400         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2401                 left = rack->r_ctl.rc_timer_exp - cts;
2402         }
2403         rack->tlp_timer_up = 0;
2404         rack->r_ctl.rc_timer_exp = 0;
2405         if (rack->rc_inp->inp_in_hpts == 0) {
2406                 rack->r_ctl.rc_hpts_flags = 0;
2407         } 
2408         if (slot) {
2409                 /* We are hptsi too */
2410                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2411         } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2412                 /* 
2413                  * We are still left on the hpts; when the timeout goes off
2414                  * it will be for output.
2415                  */
2416                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
2417                         slot = rack->r_ctl.rc_last_output_to - cts;
2418                 else
2419                         slot = 1;
2420         }
2421         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
2422 #ifdef NETFLIX_EXP_DETECTION
2423         if (rack->sack_attack_disable &&
2424             (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) {
2425                 /*
2426                  * We have a potential attacker on
2427                  * the line. We have possibly some
2428                  * (or no) pacing time set. We want to
2429                  * slow down the processing of sacks by some
2430                  * amount (if it is an attacker). Set the default
2431                  * slot for attackers in place (unless the original
2432                  * interval is longer). It's stored in
2433                  * micro-seconds, so let's convert to msecs.
2434                  */
2435                 slot = USEC_TO_MSEC(tcp_sad_pacing_interval);
2436         }
2437 #endif
2438         if (tp->t_flags & TF_DELACK) {
2439                 delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2440                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2441         }
2442         if (delayed_ack && ((hpts_timeout == 0) ||
2443                             (delayed_ack < hpts_timeout)))
2444                 hpts_timeout = delayed_ack;
2445         else 
2446                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2447         /*
2448          * If no timers are going to run and we will fall off the hptsi
2449          * wheel, we resort to a keep-alive timer if it's configured.
2450          */
2451         if ((hpts_timeout == 0) &&
2452             (slot == 0)) {
2453                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2454                     (tp->t_state <= TCPS_CLOSING)) {
2455                         /*
2456                          * Ok we have no timer (persists, rack, tlp, rxt or
2457                          * del-ack) and we don't have segments being paced. So
2458                          * all that is left is the keepalive timer.
2459                          */
2460                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2461                                 /* Get the established keep-alive time */
2462                                 hpts_timeout = TP_KEEPIDLE(tp);
2463                         } else {
2464                                 /* Get the initial setup keep-alive time */
2465                                 hpts_timeout = TP_KEEPINIT(tp);
2466                         }
2467                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2468                 }
2469         }
2470         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2471             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2472                 /*
2473                  * RACK, TLP, persists and RXT timers all are restartable
2474                  * based on input actions, i.e., we received a packet (ack
2475                  * or sack) and that changes things (rw, snd_una, etc.).
2476                  * Thus we can restart them with a new value. For
2477                  * keep-alive and delayed_ack we keep track of what was left
2478                  * and restart the timer with a smaller value.
2479                  */
2480                 if (left < hpts_timeout)
2481                         hpts_timeout = left;
2482         }
2483         if (hpts_timeout) {
2484                 /*
2485                  * Hack alert: for now we can't time-out over 2,147,483
2486                  * seconds (a bit more than 596 hours), which is probably ok :).
2487                  * Presumably the cap keeps the signed TSTMP_* comparisons valid.
2488                  */
2489                 if (hpts_timeout > 0x7ffffffe)
2490                         hpts_timeout = 0x7ffffffe;
2491                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2492         }
2493         if (slot) {
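                /*
                 * While a pacing slot is armed we allow inbound packets to be
                 * queued to us on the hpts (INP_MBUF_QUEUE_READY) instead of
                 * waking the stack immediately; when a RACK timer is also
                 * pending, even incoming SACKs should not wake us early
                 * (INP_DONT_SACK_QUEUE), mirroring the timer-only path below.
                 */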
2494                 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2495                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) 
2496                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2497                 else 
2498                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2499                 rack->r_ctl.rc_last_output_to = cts + slot;
2500                 if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2501                         if (rack->rc_inp->inp_in_hpts == 0)
2502                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2503                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2504                 } else {
2505                         /*
2506                          * Arrange for the hpts to kick back in after the
2507                          * t-o if the t-o does not cause a send.
2508                          */
2509                         if (rack->rc_inp->inp_in_hpts == 0)
2510                                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2511                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2512                 }
2513         } else if (hpts_timeout) {
2514                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)  {
2515                         /* For a rack timer, don't wake us */
2516                         rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
2517                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
2518                 } else {
2519                         /* All other timers wake us up */
2520                         rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
2521                         inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
2522                 }
2523                 if (rack->rc_inp->inp_in_hpts == 0)
2524                         tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2525                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2526         } else {
2527                 /* No timer starting */
2528 #ifdef INVARIANTS
2529                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2530                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2531                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2532                 }
2533 #endif
2534         }
2535         rack->rc_tmr_stopped = 0;
2536         if (slot)
2537                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2538 }
2539
2540 /*
2541  * RACK Timer, here we simply do logging and housekeeping.
2542  * The normal rack_output() function will call the
2543  * appropriate thing to check if we need to do a RACK retransmit.
2544  * We return 1, saying don't proceed with rack_output, only
2545  * when all timers have been stopped (destroyed PCB?).
2546  */
2547 static int
2548 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2549 {
2550         /*
2551          * This timer simply provides an internal trigger to send out data.
2552          * The check_recovery_mode call will see if there are needed
2553          * retransmissions, if so we will enter fast-recovery. The output
2554          * call may or may not do the same thing depending on sysctl
2555          * settings.
2556          */
2557         struct rack_sendmap *rsm;
2558         int32_t recovery, ll;
2559
2560         if (tp->t_timers->tt_flags & TT_STOPPED) {
2561                 return (1);
2562         }
2563         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2564                 /* It's not time yet */
2565                 return (0);
2566         }
2567         recovery = IN_RECOVERY(tp->t_flags);
2568         counter_u64_add(rack_to_tot, 1);
2569         if (rack->r_state && (rack->r_state != tp->t_state))
2570                 rack_set_state(tp, rack);
2571         rsm = rack_check_recovery_mode(tp, cts);
2572         if (rsm)
2573                 ll = rsm->r_end - rsm->r_start;
2574         else
2575                 ll = 0;
2576         rack_log_to_event(rack, RACK_TO_FRM_RACK, ll);
2577         if (rsm) {
2578                 uint32_t rtt;
2579
2580                 rtt = rack->rc_rack_rtt;
2581                 if (rtt == 0)
2582                         rtt = 1;
2583                 if ((recovery == 0) &&
2584                     (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
2585                         /*
2586                  * The rack-timeout that enters us into recovery
2587                          * will force out one MSS and set us up so that we
2588                          * can do one more send in 2*rtt (transitioning the
2589                          * rack timeout into a rack-tlp).
2590                          */
2591                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2592                         rack_log_to_prr(rack, 3);
2593                 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) &&
2594                            rack->use_rack_cheat) {
2595                         /*
2596                          * When a rack timer goes, if the rack cheat is
2597                          * on, arrange it so we can send a full segment.
2598                          */
2599                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
2600                         rack_log_to_prr(rack, 4);
2601                 }
2602         } else {
2603                 /* This is a case that should happen rarely if ever */
2604                 counter_u64_add(rack_tlp_does_nada, 1);
2605 #ifdef TCP_BLACKBOX
2606                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2607 #endif
2608                 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2609         }
2610         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2611         return (0);
2612 }
2613
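/*
 * Split helper: carve the tail of an existing send-map entry off into
 * nrsm. For example, if rsm covers sequences [1, 11) and start is 6,
 * then afterwards rsm covers [1, 6) and nrsm covers [6, 11); the
 * retransmit count, flags, dup-ack count and per-retransmit send times
 * are copied so RTT and loss accounting keep working on both halves.
 */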
2614 static __inline void
2615 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
2616                struct rack_sendmap *rsm, uint32_t start)
2617 {
2618         int idx;
2619
2620         nrsm->r_start = start;
2621         nrsm->r_end = rsm->r_end;
2622         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2623         nrsm->r_flags = rsm->r_flags;
2624         nrsm->r_dupack = rsm->r_dupack;
2625         nrsm->r_rtr_bytes = 0;
2626         rsm->r_end = nrsm->r_start;
2627         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2628                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2629         }
2630 }
2631
2632 static struct rack_sendmap *
2633 rack_merge_rsm(struct tcp_rack *rack,
2634                struct rack_sendmap *l_rsm,
2635                struct rack_sendmap *r_rsm)
2636 {
2637         /* 
2638          * We are merging two ack'd RSM's,
2639          * the l_rsm is on the left (lower seq
2640          * values) and the r_rsm is on the right
2641          * (higher seq value). The simplest way
2642          * to merge these is to move the right
2643          * one into the left. I don't think there
2644          * is any reason we need to try to find
2645          * the oldest (or last oldest retransmitted).
2646          */
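        /*
         * Illustrative example: merging l_rsm [1, 6) with r_rsm [6, 11)
         * leaves l_rsm covering [1, 11); r_rsm's retransmitted-byte count,
         * dup-ack count and FIN/TLP/collapsed-window flags are folded into
         * l_rsm before r_rsm is pulled from the RB tree and freed.
         */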
2647         struct rack_sendmap *rm;
2648         
2649         l_rsm->r_end = r_rsm->r_end;
2650         if (l_rsm->r_dupack < r_rsm->r_dupack)
2651                 l_rsm->r_dupack = r_rsm->r_dupack;
2652         if (r_rsm->r_rtr_bytes)
2653                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
2654         if (r_rsm->r_in_tmap) {
2655                 /* This really should not happen */
2656                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
2657                 r_rsm->r_in_tmap = 0;
2658         }
2659         /* Now the flags */
2660         if (r_rsm->r_flags & RACK_HAS_FIN)
2661                 l_rsm->r_flags |= RACK_HAS_FIN;
2662         if (r_rsm->r_flags & RACK_TLP)
2663                 l_rsm->r_flags |= RACK_TLP;
2664         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
2665                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
2666         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
2667 #ifdef INVARIANTS
2668         if (rm != r_rsm) {
2669                 panic("removing head in rack:%p rsm:%p rm:%p",
2670                       rack, r_rsm, rm);
2671         }
2672 #endif
2673         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
2674                 /* Transfer the split limit to the map we free */
2675                 r_rsm->r_limit_type = l_rsm->r_limit_type;
2676                 l_rsm->r_limit_type = 0;
2677         }
2678         rack_free(rack, r_rsm);
2679         return(l_rsm);
2680 }
2681
2682 /*
2683  * TLP Timer, here we simply setup what segment we want to
2684  * have the TLP expire on, the normal rack_output() will then
2685  * send it out.
2686  *
2687  * We return 1, saying don't proceed with rack_output, only
2688  * when all timers have been stopped (destroyed PCB?).
2689  */
2690 static int
2691 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2692 {
2693         /*
2694          * Tail Loss Probe.
2695          */
2696         struct rack_sendmap *rsm = NULL;
2697         struct rack_sendmap *insret;
2698         struct socket *so;
2699         uint32_t amm, old_prr_snd = 0;
2700         uint32_t out, avail;
2701         int collapsed_win = 0;
2702
2703         if (tp->t_timers->tt_flags & TT_STOPPED) {
2704                 return (1);
2705         }
2706         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2707                 /* It's not time yet */
2708                 return (0);
2709         }
2710         if (rack_progress_timeout_check(tp)) {
2711                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2712                 return (1);
2713         }
2714         /*
2715          * A TLP timer has expired. We have been idle for 2 rtts. So we now
2716          * need to figure out how to force a full MSS segment out.
2717          */
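        /*
         * Roughly: prefer to probe with new data if any is queued and the
         * window allows it; otherwise fall back to retransmitting the
         * highest-sequence unacked segment (or the oldest one when
         * rack_always_send_oldest is set), splitting off at most one MSS
         * for the probe.
         */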
2718         rack_log_to_event(rack, RACK_TO_FRM_TLP, 0);
2719         counter_u64_add(rack_tlp_tot, 1);
2720         if (rack->r_state && (rack->r_state != tp->t_state))
2721                 rack_set_state(tp, rack);
2722         so = tp->t_inpcb->inp_socket;
2723 #ifdef KERN_TLS
2724         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
2725                 /*
2726                  * For hardware TLS we do *not* want to send
2727                  * new data, lets instead just do a retransmission.
2728                  */
2729                 goto need_retran;
2730         }
2731 #endif
2732         avail = sbavail(&so->so_snd);
2733         out = tp->snd_max - tp->snd_una;
2734         rack->tlp_timer_up = 1;
2735         if (out > tp->snd_wnd) {
2736                 /* special case, we need a retransmission */
2737                 collapsed_win = 1;
2738                 goto need_retran;
2739         }
2740         /*
2741          * If we are in recovery we can jazz out a segment if new data is
2742          * present simply by setting rc_prr_sndcnt to a segment.
2743          */
2744         if ((avail > out) &&
2745             ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2746                 /* New data is available */
2747                 amm = avail - out;
2748                 if (amm > ctf_fixed_maxseg(tp)) {
2749                         amm = ctf_fixed_maxseg(tp);
2750                 } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) {
2751                         /* not enough to fill an MTU and no-delay is off */
2752                         goto need_retran;
2753                 }
2754                 if (IN_RECOVERY(tp->t_flags)) {
2755                         /* Unlikely */
2756                         old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2757                         if (out + amm <= tp->snd_wnd) {
2758                                 rack->r_ctl.rc_prr_sndcnt = amm;
2759                                 rack_log_to_prr(rack, 4);
2760                         } else
2761                                 goto need_retran;
2762                 } else {
2763                         /* Set the send-new override */
2764                         if (out + amm <= tp->snd_wnd)
2765                                 rack->r_ctl.rc_tlp_new_data = amm;
2766                         else
2767                                 goto need_retran;
2768                 }
2769                 rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2770                 rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2771                 rack->r_ctl.rc_tlpsend = NULL;
2772                 counter_u64_add(rack_tlp_newdata, 1);
2773                 goto send;
2774         }
2775 need_retran:
2776         /*
2777          * Ok we need to arrange the last un-acked segment to be re-sent, or
2778          * optionally the first un-acked segment.
2779          */
2780         if (collapsed_win == 0) {
2781                 if (rack_always_send_oldest)
2782                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2783                 else {
2784                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2785                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2786                                 rsm = rack_find_high_nonack(rack, rsm);
2787                         }
2788                 }
2789                 if (rsm == NULL) {
2790                         counter_u64_add(rack_tlp_does_nada, 1);
2791 #ifdef TCP_BLACKBOX
2792                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2793 #endif
2794                         goto out;
2795                 }
2796         } else {
2797                 /* 
2798                  * We must find the last segment
2799                  * that was acceptable to the client.
2800                  */
2801                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
2802                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
2803                                 /* Found one */
2804                                 break;
2805                         }
2806                 }
2807                 if (rsm == NULL) {
2808                         /* None? if so send the first */
2809                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
2810                         if (rsm == NULL) {
2811                                 counter_u64_add(rack_tlp_does_nada, 1);
2812 #ifdef TCP_BLACKBOX
2813                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2814 #endif
2815                                 goto out;
2816                         }
2817                 }
2818         }
2819         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
2820                 /*
2821                  * We need to split this, the last segment, in two.
2822                  */
2823                 struct rack_sendmap *nrsm;
2824
2825
2826                 nrsm = rack_alloc_full_limit(rack);
2827                 if (nrsm == NULL) {
2828                         /*
2829                          * No memory to split, we will just exit and punt
2830                          * off to the RXT timer.
2831                          */
2832                         counter_u64_add(rack_tlp_does_nada, 1);
2833                         goto out;
2834                 }
2835                 rack_clone_rsm(rack, nrsm, rsm,
2836                                (rsm->r_end - ctf_fixed_maxseg(tp)));
2837                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
2838 #ifdef INVARIANTS
2839                 if (insret != NULL) {
2840                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
2841                               nrsm, insret, rack, rsm);
2842                 }
2843 #endif
2844                 if (rsm->r_in_tmap) {
2845                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2846                         nrsm->r_in_tmap = 1;
2847                 }
2848                 rsm->r_flags &= (~RACK_HAS_FIN);
2849                 rsm = nrsm;
2850         }
2851         rack->r_ctl.rc_tlpsend = rsm;
2852         rack->r_ctl.rc_tlp_rtx_out = 1;
2853         if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2854                 rack->r_ctl.rc_tlp_seg_send_cnt++;
2855                 tp->t_rxtshift++;
2856         } else {
2857                 rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2858                 rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2859         }
2860 send:
2861         rack->r_ctl.rc_tlp_send_cnt++;
2862         if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2863                 /*
2864                  * Can't [re]transmit a segment the peer has not responded
2865                  * to after the maximum number of attempts. We need the
2866                  * retransmit timer to take over.
2867                  */
2868         restore:
2869                 rack->r_ctl.rc_tlpsend = NULL;
2870                 if (rsm)
2871                         rsm->r_flags &= ~RACK_TLP;
2872                 rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2873                 rack_log_to_prr(rack, 5);
2874                 counter_u64_add(rack_tlp_retran_fail, 1);
2875                 goto out;
2876         } else if (rsm) {
2877                 rsm->r_flags |= RACK_TLP;
2878         }
2879         if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2880             (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2881                 /*
2882                  * We don't want to send a single segment more than the max
2883                  * either.
2884                  */
2885                 goto restore;
2886         }
2887         rack->r_timer_override = 1;
2888         rack->r_tlp_running = 1;
2889         rack->rc_tlp_in_progress = 1;
2890         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2891         return (0);
2892 out:
2893         rack->tlp_timer_up = 0;
2894         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2895         return (0);
2896 }
2897
2898 /*
2899  * Delayed ack Timer, here we simply need to setup the
2900  * ACK_NOW flag and remove the DELACK flag. From there
2901  * the output routine will send the ack out.
2902  *
2903  * We only return 1, saying don't proceed, if all timers
2904  * are stopped (destroyed PCB?).
2905  */
2906 static int
2907 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2908 {
2909         if (tp->t_timers->tt_flags & TT_STOPPED) {
2910                 return (1);
2911         }
2912         rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0);
2913         tp->t_flags &= ~TF_DELACK;
2914         tp->t_flags |= TF_ACKNOW;
2915         TCPSTAT_INC(tcps_delack);
2916         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2917         return (0);
2918 }
2919
2920 /*
2921  * Persists timer, here we simply need to setup the
2922  * FORCE-DATA flag; the output routine will send
2923  * the one byte probe.
2924  *
2925  * We only return 1, saying don't proceed, if all timers
2926  * are stopped (destroyed PCB?).
2927  */
2928 static int
2929 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2930 {
2931         struct tcptemp *t_template;
2932         struct inpcb *inp;
2933         int32_t retval = 1;
2934
2935         inp = tp->t_inpcb;
2936
2937         if (tp->t_timers->tt_flags & TT_STOPPED) {
2938                 return (1);
2939         }
2940         if (rack->rc_in_persist == 0)
2941                 return (0);
2942         if (rack_progress_timeout_check(tp)) {
2943                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
2944                 return (1);
2945         }
2946         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2947         /*
2948          * Persistence timer into zero window. Force a byte to be output, if
2949          * possible.
2950          */
2951         TCPSTAT_INC(tcps_persisttimeo);
2952         /*
2953          * Hack: if the peer is dead/unreachable, we do not time out if the
2954          * window is closed.  After a full backoff, drop the connection if
2955          * the idle time (no responses to probes) reaches the maximum
2956          * backoff that we would use if retransmitting.
2957          */
2958         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2959             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2960             ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2961                 TCPSTAT_INC(tcps_persistdrop);
2962                 retval = 1;
2963                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2964                 goto out;
2965         }
2966         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2967             tp->snd_una == tp->snd_max)
2968                 rack_exit_persist(tp, rack);
2969         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2970         /*
2971          * If the user has closed the socket then drop a persisting
2972          * connection after a much reduced timeout.
2973          */
2974         if (tp->t_state > TCPS_CLOSE_WAIT &&
2975             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2976                 retval = 1;
2977                 TCPSTAT_INC(tcps_persistdrop);
2978                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2979                 goto out;
2980         }
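        /*
         * The zero-length segment built below uses sequence snd_una - 1 so
         * it falls outside the peer's receive window and therefore forces
         * an ACK in response (the same trick the keep-alive probe uses).
         */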
2981         t_template = tcpip_maketemplate(rack->rc_inp);
2982         if (t_template) {
2983                 tcp_respond(tp, t_template->tt_ipgen,
2984                             &t_template->tt_t, (struct mbuf *)NULL,
2985                             tp->rcv_nxt, tp->snd_una - 1, 0);
2986                 /* This sends an ack */
2987                 if (tp->t_flags & TF_DELACK)
2988                         tp->t_flags &= ~TF_DELACK;
2989                 free(t_template, M_TEMP);
2990         }
2991         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2992                 tp->t_rxtshift++;
2993 out:
2994         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0);
2995         rack_start_hpts_timer(rack, tp, cts,
2996                               0, 0, 0);
2997         return (retval);
2998 }
2999
3000 /*
3001  * If a keepalive goes off, we had no other timers
3002  * happening. We always return 1 here since this
3003  * routine either drops the connection or sends
3004  * out a segment via tcp_respond().
3005  */
3006 static int
3007 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3008 {
3009         struct tcptemp *t_template;
3010         struct inpcb *inp;
3011
3012         if (tp->t_timers->tt_flags & TT_STOPPED) {
3013                 return (1);
3014         }
3015         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
3016         inp = tp->t_inpcb;
3017         rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0);
3018         /*
3019          * Keep-alive timer went off; send something or drop connection if
3020          * idle for too long.
3021          */
3022         TCPSTAT_INC(tcps_keeptimeo);
3023         if (tp->t_state < TCPS_ESTABLISHED)
3024                 goto dropit;
3025         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
3026             tp->t_state <= TCPS_CLOSING) {
3027                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
3028                         goto dropit;
3029                 /*
3030                  * Send a packet designed to force a response if the peer is
3031                  * up and reachable: either an ACK if the connection is
3032                  * still alive, or an RST if the peer has closed the
3033                  * connection due to timeout or reboot. Using sequence
3034                  * number tp->snd_una-1 causes the transmitted zero-length
3035                  * segment to lie outside the receive window; by the
3036                  * protocol spec, this requires the correspondent TCP to
3037                  * respond.
3038                  */
3039                 TCPSTAT_INC(tcps_keepprobe);
3040                 t_template = tcpip_maketemplate(inp);
3041                 if (t_template) {
3042                         tcp_respond(tp, t_template->tt_ipgen,
3043                             &t_template->tt_t, (struct mbuf *)NULL,
3044                             tp->rcv_nxt, tp->snd_una - 1, 0);
3045                         free(t_template, M_TEMP);
3046                 }
3047         }
3048         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
3049         return (1);
3050 dropit:
3051         TCPSTAT_INC(tcps_keepdrops);
3052         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
3053         return (1);
3054 }
3055
3056 /*
3057  * Retransmit helper function: clear up all the ack
3058  * flags and take care of important bookkeeping.
3059  */
3060 static void
3061 rack_remxt_tmr(struct tcpcb *tp)
3062 {
3063         /*
3064          * The retransmit timer went off, all sack'd blocks must be
3065          * un-acked.
3066          */
3067         struct rack_sendmap *rsm, *trsm = NULL;
3068         struct tcp_rack *rack;
3069         int32_t cnt = 0;
3070
3071         rack = (struct tcp_rack *)tp->t_fb_ptr;
3072         rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
3073         rack_log_to_event(rack, RACK_TO_FRM_TMR, 0);
3074         if (rack->r_state && (rack->r_state != tp->t_state))
3075                 rack_set_state(tp, rack);
3076         /*
3077          * Ideally we would like to be able to
3078          * mark SACK-PASS on anything not acked here.
3079          * However, if we do that we would burst out
3080          * all that data 1ms apart. This would be unwise,
3081          * so for now we will just let the normal rxt timer
3082          * and tlp timer take care of it.
3083          */
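        /*
         * Walk the whole send map: clear the ACKED/SACK state on every
         * entry and re-thread any entry that had dropped out of the
         * time-ordered transmit list, so the data appears outstanding
         * again for the retransmit machinery below.
         */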
3084         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3085                 if (rsm->r_flags & RACK_ACKED) {
3086                         cnt++;
3087                         rsm->r_dupack = 0;
3088                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3089                         if (rsm->r_in_tmap == 0) {
3090                                 /* We must re-add it back to the tlist */
3091                                 if (trsm == NULL) {
3092                                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3093                                 } else {
3094                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
3095                                 }
3096                                 rsm->r_in_tmap = 1;
3097                         }
3098                 }
3099                 trsm = rsm;
3100                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
3101         }
3102         /* Clear the count (we just un-acked them) */
3103         rack->r_ctl.rc_sacked = 0;
3104         /* Clear the tlp rtx mark */
3105         rack->r_ctl.rc_tlp_rtx_out = 0;
3106         rack->r_ctl.rc_tlp_seg_send_cnt = 0;
3107         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3108         rack->r_ctl.rc_prr_sndcnt = 0;
3109         rack_log_to_prr(rack, 6);
3110         rack->r_timer_override = 1;
3111 }
3112
3113 /*
3114  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
3115  * we will setup to retransmit the lowest seq number outstanding.
3116  */
3117 static int
3118 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
3119 {
3120         int32_t rexmt;
3121         struct inpcb *inp;
3122         int32_t retval = 0;
3123
3124         inp = tp->t_inpcb;
3125         if (tp->t_timers->tt_flags & TT_STOPPED) {
3126                 return (1);
3127         }
3128         if (rack_progress_timeout_check(tp)) {
3129                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
3130                 return (1);
3131         }
3132         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
3133         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
3134             (tp->snd_una == tp->snd_max)) {
3135                 /* Nothing outstanding .. nothing to do */
3136                 return (0);
3137         }
3138         /*
3139          * Retransmission timer went off.  Message has not been acked within
3140          * retransmit interval.  Back off to a longer retransmit interval
3141          * and retransmit one segment.
3142          */
3143         rack_remxt_tmr(tp);
3144         if ((rack->r_ctl.rc_resend == NULL) ||
3145             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
3146                 /*
3147                  * If the rwnd collapsed on
3148                  * the one we are retransmitting
3149                  * it does not count against the
3150                  * rxt count.
3151                  */
3152                 tp->t_rxtshift++;
3153         }
3154         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
3155                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
3156                 TCPSTAT_INC(tcps_timeoutdrop);
3157                 retval = 1;
3158                 tcp_set_inp_to_drop(rack->rc_inp,
3159                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
3160                 goto out;
3161         }
3162         if (tp->t_state == TCPS_SYN_SENT) {
3163                 /*
3164                  * If the SYN was retransmitted, indicate CWND to be limited
3165                  * to 1 segment in cc_conn_init().
3166                  */
3167                 tp->snd_cwnd = 1;
3168         } else if (tp->t_rxtshift == 1) {
3169                 /*
3170                  * first retransmit; record ssthresh and cwnd so they can be
3171                  * recovered if this turns out to be a "bad" retransmit. A
3172                  * retransmit is considered "bad" if an ACK for this segment
3173                  * is received within RTT/2 interval; the assumption here is
3174                  * that the ACK was already in flight.  See "On Estimating
3175                  * End-to-End Network Path Properties" by Allman and Paxson
3176                  * for more details.
3177                  */
3178                 tp->snd_cwnd_prev = tp->snd_cwnd;
3179                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
3180                 tp->snd_recover_prev = tp->snd_recover;
3181                 if (IN_FASTRECOVERY(tp->t_flags))
3182                         tp->t_flags |= TF_WASFRECOVERY;
3183                 else
3184                         tp->t_flags &= ~TF_WASFRECOVERY;
3185                 if (IN_CONGRECOVERY(tp->t_flags))
3186                         tp->t_flags |= TF_WASCRECOVERY;
3187                 else
3188                         tp->t_flags &= ~TF_WASCRECOVERY;
3189                 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
3190                 tp->t_flags |= TF_PREVVALID;
3191         } else
3192                 tp->t_flags &= ~TF_PREVVALID;
3193         TCPSTAT_INC(tcps_rexmttimeo);
3194         if ((tp->t_state == TCPS_SYN_SENT) ||
3195             (tp->t_state == TCPS_SYN_RECEIVED))
3196                 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
3197         else
3198                 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
3199         TCPT_RANGESET(tp->t_rxtcur, rexmt,
3200            max(MSEC_2_TICKS(rack_rto_min), rexmt),
3201            MSEC_2_TICKS(rack_rto_max));
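        /*
         * Net effect of the clamp above: t_rxtcur ends up as
         * min(max(rexmt, rack_rto_min), rack_rto_max), i.e. the backed-off
         * RTO is kept within the [rack_rto_min, rack_rto_max] bounds
         * (both expressed in milliseconds and converted to ticks).
         */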
3202         /*
3203          * We enter the path for PLMTUD if the connection is established or
3204          * in FIN_WAIT_1 status; the reason for the latter is that if the
3205          * amount of data we send is very small, we could send it in a couple
3206          * of packets and proceed straight to FIN. In that case we won't
3207          * catch the ESTABLISHED state.
3208          */
3209         if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
3210             || (tp->t_state == TCPS_FIN_WAIT_1))) {
3211 #ifdef INET6
3212                 int32_t isipv6;
3213 #endif
3214
3215                 /*
3216                  * Idea here is that each stage of the mtu probe (usually
3217                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
3218                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
3219                  * should take care of that.
3220                  */
3221                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
3222                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
3223                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
3224                     tp->t_rxtshift % 2 == 0)) {
3225                         /*
3226                          * Enter Path MTU Black-hole Detection mechanism: -
3227                          * Disable Path MTU Discovery (IP "DF" bit). -
3228                          * Reduce MTU to lower value than what we negotiated
3229                          * with peer.
3230                          */
3231                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
3232                                 /* Record that we may have found a black hole. */
3233                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
3234                                 /* Keep track of previous MSS. */
3235                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
3236                         }
3237
3238                         /*
3239                          * Reduce the MSS to blackhole value or to the
3240                          * default in an attempt to retransmit.
3241                          */
3242 #ifdef INET6
3243                         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
3244                         if (isipv6 &&
3245                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
3246                                 /* Use the sysctl tuneable blackhole MSS. */
3247                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
3248                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3249                         } else if (isipv6) {
3250                                 /* Use the default MSS. */
3251                                 tp->t_maxseg = V_tcp_v6mssdflt;
3252                                 /*
3253                                  * Disable Path MTU Discovery when we switch
3254                                  * to minmss.
3255                                  */
3256                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3257                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3258                         }
3259 #endif
3260 #if defined(INET6) && defined(INET)
3261                         else
3262 #endif
3263 #ifdef INET
3264                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3265                                 /* Use the sysctl tuneable blackhole MSS. */
3266                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3267                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3268                         } else {
3269                                 /* Use the default MSS. */
3270                                 tp->t_maxseg = V_tcp_mssdflt;
3271                                 /*
3272                                  * Disable Path MTU Discovery when we switch
3273                                  * to minmss.
3274                                  */
3275                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3276                                 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3277                         }
3278 #endif
3279                 } else {
3280                         /*
3281                          * If further retransmissions are still unsuccessful
3282                          * with a lowered MTU, maybe this isn't a blackhole
3283                          * and we restore the previous MSS and blackhole
3284                          * detection flags. The limit '6' is determined by
3285                          * giving each probe stage (1448, 1188, 524) 2
3286                          * chances to recover.
3287                          */
3288                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3289                             (tp->t_rxtshift >= 6)) {
3290                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3291                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3292                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3293                                 TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3294                         }
3295                 }
3296         }
3297         /*
3298          * If we backed off this far, our srtt estimate is probably bogus.
3299          * Clobber it so we'll take the next rtt measurement as our srtt;
3300          * move the current srtt into rttvar to keep the current retransmit
3301          * times until then.
3302          */
3303         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3304 #ifdef INET6
3305                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3306                         in6_losing(tp->t_inpcb);
3307                 else
3308 #endif
3309                         in_losing(tp->t_inpcb);
3310                 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3311                 tp->t_srtt = 0;
3312         }
3313         if (rack_use_sack_filter)
3314                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3315         tp->snd_recover = tp->snd_max;
3316         tp->t_flags |= TF_ACKNOW;
3317         tp->t_rtttime = 0;
3318         rack_cong_signal(tp, NULL, CC_RTO);
3319 out:
3320         return (retval);
3321 }
3322
3323 static int
3324 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3325 {
3326         int32_t ret = 0;
3327         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3328
3329         if (timers == 0) {
3330                 return (0);
3331         }
3332         if (tp->t_state == TCPS_LISTEN) {
3333                 /* no timers on listen sockets */
3334                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3335                         return (0);
3336                 return (1);
3337         }
3338         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3339                 uint32_t left;
3340
3341                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3342                         ret = -1;
3343                         rack_log_to_processing(rack, cts, ret, 0);
3344                         return (0);
3345                 }
3346                 if (hpts_calling == 0) {
3347                         ret = -2;
3348                         rack_log_to_processing(rack, cts, ret, 0);
3349                         return (0);
3350                 }
3351                 /*
3352                  * Ok, our timer went off early and we are not paced; false
3353                  * alarm, go back to sleep.
3354                  */
3355                 ret = -3;
3356                 left = rack->r_ctl.rc_timer_exp - cts;
3357                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3358                 rack_log_to_processing(rack, cts, ret, left);
3359                 rack->rc_last_pto_set = 0;
3360                 return (1);
3361         }
3362         rack->rc_tmr_stopped = 0;
3363         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
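        /*
         * Exactly one expired timer type is handled per call, in the
         * priority order below: delayed-ack, RACK, TLP, RXT, persist,
         * then keep-alive.
         */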
3364         if (timers & PACE_TMR_DELACK) {
3365                 ret = rack_timeout_delack(tp, rack, cts);
3366         } else if (timers & PACE_TMR_RACK) {
3367                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3368                 ret = rack_timeout_rack(tp, rack, cts);
3369         } else if (timers & PACE_TMR_TLP) {
3370                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3371                 ret = rack_timeout_tlp(tp, rack, cts);
3372         } else if (timers & PACE_TMR_RXT) {
3373                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
3374                 ret = rack_timeout_rxt(tp, rack, cts);
3375         } else if (timers & PACE_TMR_PERSIT) {
3376                 ret = rack_timeout_persist(tp, rack, cts);
3377         } else if (timers & PACE_TMR_KEEP) {
3378                 ret = rack_timeout_keepalive(tp, rack, cts);
3379         }
3380         rack_log_to_processing(rack, cts, ret, timers);
3381         return (ret);
3382 }
3383
3384 static void
3385 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3386 {
3387         uint8_t hpts_removed = 0;
3388
3389         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3390             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3391                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3392                 hpts_removed = 1;
3393         }
3394         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3395                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3396                 if (rack->rc_inp->inp_in_hpts &&
3397                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3398                         /*
3399                          * Canceling timers when we have no output being
3400                          * paced. We also must remove ourselves from the
3401                          * hpts.
3402                          */
3403                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3404                         hpts_removed = 1;
3405                 }
3406                 rack_log_to_cancel(rack, hpts_removed, line);
3407                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3408         }
3409 }
3410
3411 static void
3412 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3413 {
3414         return;
3415 }
3416
3417 static int
3418 rack_stopall(struct tcpcb *tp)
3419 {
3420         struct tcp_rack *rack;
3421         rack = (struct tcp_rack *)tp->t_fb_ptr;
3422         rack->t_timers_stopped = 1;
3423         return (0);
3424 }
3425
3426 static void
3427 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3428 {
3429         return;
3430 }
3431
3432 static int
3433 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3434 {
3435         return (0);
3436 }
3437
3438 static void
3439 rack_stop_all_timers(struct tcpcb *tp)
3440 {
3441         struct tcp_rack *rack;
3442
3443         /*
3444          * Assure no timers are running.
3445          */
3446         if (tcp_timer_active(tp, TT_PERSIST)) {
3447                 /* We enter in persists, set the flag appropriately */
3448                 rack = (struct tcp_rack *)tp->t_fb_ptr;
3449                 rack->rc_in_persist = 1;
3450         }
3451         tcp_timer_suspend(tp, TT_PERSIST);
3452         tcp_timer_suspend(tp, TT_REXMT);
3453         tcp_timer_suspend(tp, TT_KEEP);
3454         tcp_timer_suspend(tp, TT_DELACK);
3455 }
3456
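/*
 * Bookkeeping for a (re)transmission of an existing send-map entry:
 * bump the retransmit count (capped at RACK_NUM_OF_RETRANS), stamp the
 * new send time, move the entry to the tail of the time-ordered list,
 * and undo any ACKED/SACK-passed state it may have carried.
 */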
3457 static void
3458 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3459     struct rack_sendmap *rsm, uint32_t ts)
3460 {
3461         int32_t idx;
3462
3463         rsm->r_rtr_cnt++;
3464         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3465         rsm->r_dupack = 0;
3466         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3467                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3468                 rsm->r_flags |= RACK_OVERMAX;
3469         }
3470         if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3471                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3472                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3473         }
3474         idx = rsm->r_rtr_cnt - 1;
3475         rsm->r_tim_lastsent[idx] = ts;
3476         if (rsm->r_flags & RACK_ACKED) {
3477                 /* Probably MTU discovery messing with us */
3478                 rsm->r_flags &= ~RACK_ACKED;
3479                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3480         }
3481         if (rsm->r_in_tmap) {
3482                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3483                 rsm->r_in_tmap = 0;
3484         }
3485         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3486         rsm->r_in_tmap = 1;
3487         if (rsm->r_flags & RACK_SACK_PASSED) {
3488                 /* We have retransmitted due to the SACK pass */
3489                 rsm->r_flags &= ~RACK_SACK_PASSED;
3490                 rsm->r_flags |= RACK_WAS_SACKPASS;
3491         }
3492 }
3493
3494
3495 static uint32_t
3496 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3497     struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
3498 {
3499         /*
3500          * We (re-)transmitted starting at rsm->r_start for some length
3501          * (possibly less than r_end).
3502          */
3503         struct rack_sendmap *nrsm, *insret;
3504         uint32_t c_end;
3505         int32_t len;
3506
3507         len = *lenp;
3508         c_end = rsm->r_start + len;
3509         if (SEQ_GEQ(c_end, rsm->r_end)) {
3510                 /*
3511                  * We retransmitted the whole piece or more than the whole
3512                  * piece, slopping into the next rsm.
3513                  */
3514                 rack_update_rsm(tp, rack, rsm, ts);
3515                 if (c_end == rsm->r_end) {
3516                         *lenp = 0;
3517                         return (0);
3518                 } else {
3519                         int32_t act_len;
3520
3521                         /* Hangs over the end, return what's left */
3522                         act_len = rsm->r_end - rsm->r_start;
3523                         *lenp = (len - act_len);
3524                         return (rsm->r_end);
3525                 }
3526                 /* We don't get out of this block. */
3527         }
3528         /*
3529          * Here we retransmitted less than the whole thing which means we
3530          * have to split this into what was transmitted and what was not.
3531          */
3532         nrsm = rack_alloc_full_limit(rack);
3533         if (nrsm == NULL) {
3534                 /*
3535                  * We can't get memory, so lets not proceed.
3536                  */
3537                 *lenp = 0;
3538                 return (0);
3539         }
3540         /*
3541          * So here we are going to take the original rsm and make it what we
3542          * retransmitted. nrsm will be the tail portion we did not
3543          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3544          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3545          * 1, 6 and the new piece will be 6, 11.
3546          */
3547         rack_clone_rsm(rack, nrsm, rsm, c_end);
3548         nrsm->r_dupack = 0;
3549         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
3550         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3551 #ifdef INVARIANTS
3552         if (insret != NULL) {
3553                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3554                       nrsm, insret, rack, rsm);
3555         }
3556 #endif
3557         if (rsm->r_in_tmap) {
3558                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3559                 nrsm->r_in_tmap = 1;
3560         }
3561         rsm->r_flags &= (~RACK_HAS_FIN);
3562         rack_update_rsm(tp, rack, rsm, ts);
3563         *lenp = 0;
3564         return (0);
3565 }
3566
3567
3568 static void
3569 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3570     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3571     uint8_t pass, struct rack_sendmap *hintrsm)
3572 {
3573         struct tcp_rack *rack;
3574         struct rack_sendmap *rsm, *nrsm, *insret, fe;
3575         register uint32_t snd_max, snd_una;
3576
3577         /*
3578          * Add to the RACK log of packets in flight or retransmitted. If
3579          * there is a TS option we will use the TS echoed, if not we will
3580          * grab a TS.
3581          *
3582          * Retransmissions will increment the count and move the ts to its
3583          * proper place. Note that if options do not include TS's then we
3584          * won't be able to effectively use the ACK for an RTT on a retran.
3585          *
3586          * Notes about r_start and r_end. Let's consider a send starting at
3587          * sequence 1 for 10 bytes. In such an example the r_start would be
3588          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3589          * This means that r_end is actually the first sequence for the next
3590          * slot (11).
3591          *
3592          */
3593         /*
3594          * If err is set what do we do XXXrrs? should we not add the thing?
3595          * -- i.e. return if err != 0 or should we pretend we sent it? --
3596          * i.e. proceed with add ** do this for now.
3597          */
3598         INP_WLOCK_ASSERT(tp->t_inpcb);
3599         if (err)
3600                 /*
3601                  * We don't log errors -- we could but snd_max does not
3602                  * advance in this case either.
3603                  */
3604                 return;
3605
3606         if (th_flags & TH_RST) {
3607                 /*
3608                  * We don't log resets and we return immediately from
3609                  * sending
3610                  */
3611                 return;
3612         }
3613         rack = (struct tcp_rack *)tp->t_fb_ptr;
3614         snd_una = tp->snd_una;
3615         if (SEQ_LEQ((seq_out + len), snd_una)) {
3616                 /* Are we sending an old segment to induce an ack (keep-alive)? */
3617                 return;
3618         }
3619         if (SEQ_LT(seq_out, snd_una)) {
3620                 /* huh? should we panic? */
3621                 uint32_t end;
3622
3623                 end = seq_out + len;
3624                 seq_out = snd_una;
3625                 if (SEQ_GEQ(end, seq_out))
3626                         len = end - seq_out;
3627                 else
3628                         len = 0;
3629         }
3630         snd_max = tp->snd_max;
3631         if (th_flags & (TH_SYN | TH_FIN)) {
3632                 /*
3633                  * The call to rack_log_output is made before bumping
3634                  * snd_max. This means we can record one extra byte on a SYN
3635                  * or FIN if seq_out is adding more on and a FIN is present
3636                  * (and we are not resending).
3637                  */
3638                 if (th_flags & TH_SYN)
3639                         len++;
3640                 if (th_flags & TH_FIN)
3641                         len++;
3642                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
3643                         /*
3644                          * The add/update has not been done for the FIN/SYN
3645                          * yet.
3646                          */
3647                         snd_max = tp->snd_nxt;
3648                 }
3649         }
3650         if (len == 0) {
3651                 /* We don't log zero window probes */
3652                 return;
3653         }
3654         rack->r_ctl.rc_time_last_sent = ts;
3655         if (IN_RECOVERY(tp->t_flags)) {
3656                 rack->r_ctl.rc_prr_out += len;
3657         }
3658         /* First question is it a retransmission or new? */
3659         if (seq_out == snd_max) {
3660                 /* Its new */
3661 again:
3662                 rsm = rack_alloc(rack);
3663                 if (rsm == NULL) {
3664                         /*
3665                          * Hmm out of memory and the tcb got destroyed while
3666                          * we tried to wait.
3667                          */
3668                         return;
3669                 }
3670                 if (th_flags & TH_FIN) {
3671                         rsm->r_flags = RACK_HAS_FIN;
3672                 } else {
3673                         rsm->r_flags = 0;
3674                 }
3675                 rsm->r_tim_lastsent[0] = ts;
3676                 rsm->r_rtr_cnt = 1;
3677                 rsm->r_rtr_bytes = 0;
3678                 if (th_flags & TH_SYN) {
3679                         /* The data space is one beyond snd_una */
3680                         rsm->r_start = seq_out + 1;
3681                         rsm->r_end = rsm->r_start + (len - 1);
3682                 } else {
3683                         /* Normal case */
3684                         rsm->r_start = seq_out;
3685                         rsm->r_end = rsm->r_start + len;
3686                 }
3687                 rsm->r_dupack = 0;
3688                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
3689                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
3690 #ifdef INVARIANTS
3691                 if (insret != NULL) {
3692                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3693                               nrsm, insret, rack, rsm);
3694                 }
3695 #endif
3696                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3697                 rsm->r_in_tmap = 1;
3698                 return;
3699         }
3700         /*
3701          * If we reach here it's a retransmission and we need to find it.
3702          */
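        /*
         * Lookup strategy: try the caller's hint first, then an exact
         * RB-tree match on the starting sequence; if the retransmission
         * begins in the middle of an entry, the entry is split with
         * rack_clone_rsm() so rack_update_entry() can account for the
         * retransmitted portion.
         */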
3703         memset(&fe, 0, sizeof(fe));
3704 more:
3705         if (hintrsm && (hintrsm->r_start == seq_out)) {
3706                 rsm = hintrsm;
3707                 hintrsm = NULL;
3708         } else {
3709                 /* No hints sorry */
3710                 rsm = NULL;
3711         }
3712         if ((rsm) && (rsm->r_start == seq_out)) {
3713                 seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3714                 if (len == 0) {
3715                         return;
3716                 } else {
3717                         goto more;
3718                 }
3719         }
3720         /* Ok, it was not the last pointer; go through it the hard way. */
3721 refind:
3722         fe.r_start = seq_out;
3723         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3724         if (rsm) {
3725                 if (rsm->r_start == seq_out) {
3726                         seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3727                         if (len == 0) {
3728                                 return;
3729                         } else {
3730                                 goto refind;
3731                         }
3732                 }
3733                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3734                         /* Transmitted within this piece */
3735                         /*
3736                          * Ok we must split off the front and then let the
3737                          * update do the rest
3738                          */
3739                         nrsm = rack_alloc_full_limit(rack);
3740                         if (nrsm == NULL) {
3741                                 rack_update_rsm(tp, rack, rsm, ts);
3742                                 return;
3743                         }
3744                         /*
3745                          * copy rsm to nrsm and then trim the front of rsm
3746                          * to not include this part.
3747                          */
3748                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
3749                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
3750 #ifdef INVARIANTS
3751                         if (insret != NULL) {
3752                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
3753                                       nrsm, insret, rack, rsm);
3754                         }
3755 #endif
3756                         if (rsm->r_in_tmap) {
3757                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3758                                 nrsm->r_in_tmap = 1;
3759                         }
3760                         rsm->r_flags &= (~RACK_HAS_FIN);
3761                         seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3762                         if (len == 0) {
3763                                 return;
3764                         } else if (len > 0)
3765                                 goto refind;
3766                 }
3767         }
3768         /*
3769          * Hmm, not found in the map. Did they retransmit both the old data
3770          * and on into the new?
3771          */
3772         if (seq_out == tp->snd_max) {
3773                 goto again;
3774         } else if (SEQ_LT(seq_out, tp->snd_max)) {
3775 #ifdef INVARIANTS
3776                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3777                     seq_out, len, tp->snd_una, tp->snd_max);
3778                 printf("Starting Dump of all rack entries\n");
3779                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
3780                         printf("rsm:%p start:%u end:%u\n",
3781                             rsm, rsm->r_start, rsm->r_end);
3782                 }
3783                 printf("Dump complete\n");
3784                 panic("seq_out not found rack:%p tp:%p",
3785                     rack, tp);
3786 #endif
3787         } else {
3788 #ifdef INVARIANTS
3789                 /*
3790                  * Hmm, beyond snd_max? (only if we are using the new rtt-pack
3791                  * flag)
3792                  */
3793                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3794                     seq_out, len, tp->snd_max, tp);
3795 #endif
3796         }
3797 }
3798
3799 /*
3800  * Record one of the RTT updates from an ack into
3801  * our sample structure.
3802  */
3803 static void
3804 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3805 {
3806         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3807             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3808                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3809         }
3810         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3811             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3812                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3813         }
3814         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3815         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3816         rack->r_ctl.rack_rs.rs_rtt_cnt++;
3817 }
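
/*
 * For example, if a single ack yields three usable RTT samples of 40, 52
 * and 45 ticks, the calls above leave rs_rtt_lowest = 40,
 * rs_rtt_highest = 52, rs_rtt_tot = 137 and rs_rtt_cnt = 3.
 * tcp_rack_xmit_timer_commit() below then feeds the lowest, the highest
 * or the average (137 / 3 = 45) into the srtt machinery, depending on
 * rc_rate_sample_method.
 */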
3818
3819 /*
3820  * Collect new round-trip time estimate
3821  * and update averages and current timeout.
3822  */
3823 static void
3824 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3825 {
3826         int32_t delta;
3827         uint32_t o_srtt, o_var;
3828         int32_t rtt;
3829
3830         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3831                 /* No valid sample */
3832                 return;
3833         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3834                 /* We are to use the lowest RTT seen in a single ack */
3835                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3836         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3837                 /* We are to use the highest RTT seen in a single ack */
3838                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3839         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3840                 /* We are to use the average RTT seen in a single ack */
3841                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3842                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3843         } else {
3844 #ifdef INVARIANTS
3845                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3846 #endif          
3847                 return;
3848         }
3849         if (rtt == 0)
3850                 rtt = 1;
3851         rack_log_rtt_sample(rack, rtt);
3852         o_srtt = tp->t_srtt;
3853         o_var = tp->t_rttvar;
3854         rack = (struct tcp_rack *)tp->t_fb_ptr;
3855         if (tp->t_srtt != 0) {
3856                 /*
3857                  * srtt is stored as fixed point with 5 bits after the
3858                  * binary point (i.e., scaled by 32).  The following magic is
3859                  * equivalent to the smoothing algorithm in rfc793 with an
3860                  * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3861                  * Adjust rtt to origin 0.
3862                  */
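                /*
                 * A worked example, assuming the stock TCP_RTT_SHIFT of 5
                 * and TCP_DELTA_SHIFT of 2: with rtt = 17 ticks and
                 * t_srtt = 256 (a smoothed 8 ticks, scaled by 32):
                 *   delta  = ((17 - 1) << 2) - (256 >> 3) = 64 - 32 = 32
                 *   t_srtt = 256 + 32 = 288, i.e. 9 ticks scaled by 32,
                 * which matches srtt_new = srtt + ((rtt - 1) - srtt) / 8.
                 */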
3863                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3864                     - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3865
3866                 tp->t_srtt += delta;
3867                 if (tp->t_srtt <= 0)
3868                         tp->t_srtt = 1;
3869
3870                 /*
3871                  * We accumulate a smoothed rtt variance (actually, a
3872                  * smoothed mean difference), then set the retransmit timer
3873                  * to smoothed rtt + 4 times the smoothed variance. rttvar
3874                  * is stored as fixed point with 4 bits after the binary
3875                  * point (scaled by 16).  The following is equivalent to
3876                  * rfc793 smoothing with an alpha of .75 (rttvar =
3877                  * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3878                  * wired-in beta.
3879                  */
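                /*
                 * Continuing the example, assuming TCP_RTTVAR_SHIFT is 4:
                 * with |delta| = 32 (an 8 tick error, scaled by 4) and
                 * t_rttvar = 64 (4 ticks, scaled by 16):
                 *   delta    = 32 - (64 >> 2) = 16
                 *   t_rttvar = 64 + 16 = 80, i.e. 5 ticks scaled by 16,
                 * which matches rttvar_new = rttvar * 3/4 + |err| / 4
                 * (3 + 2 = 5 ticks).
                 */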
3880                 if (delta < 0)
3881                         delta = -delta;
3882                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3883                 tp->t_rttvar += delta;
3884                 if (tp->t_rttvar <= 0)
3885                         tp->t_rttvar = 1;
3886                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3887                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3888         } else {
3889                 /*
3890                  * No rtt measurement yet - use the unsmoothed rtt. Set the
3891                  * variance to half the rtt (so our first retransmit happens
3892                  * at 3*rtt).
3893                  */
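                /*
                 * For example, with srtt = rtt and rttvar = rtt / 2 the
                 * usual srtt + 4 * rttvar timeout works out to
                 * rtt + 2 * rtt, i.e. the 3 * rtt first retransmit
                 * mentioned above.
                 */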
3894                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
3895                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3896                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3897         }
3898         TCPSTAT_INC(tcps_rttupdated);
3899         rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3900         tp->t_rttupdated++;
3901 #ifdef STATS
3902         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3903 #endif
3904         tp->t_rxtshift = 0;
3905
3906         /*
3907          * the retransmit should happen at rtt + 4 * rttvar. Because of the
3908          * way we do the smoothing, srtt and rttvar will each average +1/2
3909          * tick of bias.  When we compute the retransmit timer, we want 1/2
3910          * tick of rounding and 1 extra tick because of +-1/2 tick
3911          * uncertainty in the firing of the timer.  The bias will give us
3912          * exactly the 1.5 tick we need.  But, because the bias is
3913          * statistical, we have to test that we don't drop below the minimum
3914          * feasible timer (which is 2 ticks).
3915          */
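        /*
         * With the scaling used above, TCP_REXMTVAL() works out to roughly
         * srtt + 4 * rttvar in ticks; TCPT_RANGESET() then clamps that into
         * [max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)],
         * so a tiny RTT sample cannot drive t_rxtcur below the RTO floor.
         */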
3916         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3917            max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3918         tp->t_softerror = 0;
3919 }
3920
3921 static void
3922 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3923     uint32_t t, uint32_t cts)
3924 {
3925         /*
3926          * For this RSM, we acknowledged the data from a previous
3927          * transmission, not the last one we made. This means we did a false
3928          * retransmit.
3929          */
3930         struct tcp_rack *rack;
3931
3932         if (rsm->r_flags & RACK_HAS_FIN) {
3933                 /*
3934                  * The FIN is often sent multiple times when we
3935                  * have everything outstanding ack'd. We ignore this case
3936                  * since it's over now.
3937                  */
3938                 return;
3939         }
3940         if (rsm->r_flags & RACK_TLP) {
3941                 /*
3942                  * We expect TLP's to have this occur.
3943                  */
3944                 return;
3945         }
3946         rack = (struct tcp_rack *)tp->t_fb_ptr;
3947         /* should we undo cc changes and exit recovery? */
3948         if (IN_RECOVERY(tp->t_flags)) {
3949                 if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3950                         /*
3951                          * Undo what we ratcheted down and exit recovery if
3952                          * possible.
3953                          */
3954                         EXIT_RECOVERY(tp->t_flags);
3955                         tp->snd_recover = tp->snd_una;
3956                         if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3957                                 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3958                         if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3959                                 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3960                 }
3961         }
3962         if (rsm->r_flags & RACK_WAS_SACKPASS) {
3963                 /*
3964                  * We retransmitted based on a sack and the earlier
3965                  * retransmission ack'd it - re-ordering is occurring.
3966                  */
3967                 counter_u64_add(rack_reorder_seen, 1);
3968                 rack->r_ctl.rc_reorder_ts = cts;
3969         }
3970         counter_u64_add(rack_badfr, 1);
3971         counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3972 }
3973
3974
3975 static int
3976 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3977     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3978 {
3979         int32_t i;
3980         uint32_t t;
3981
3982         if (rsm->r_flags & RACK_ACKED)
3983                 /* Already done */
3984                 return (0);
3985
3986
3987         if ((rsm->r_rtr_cnt == 1) ||
3988             ((ack_type == CUM_ACKED) &&
3989             (to->to_flags & TOF_TS) &&
3990             (to->to_tsecr) &&
3991             (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3992             ) {
3993                 /*
3994                  * We will only find a matching timestamp if it's cum-acked.
3995                  * But if there is only one retransmission it's a for-sure match
3996                  * :-)
3997                  */
3998                 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3999                 if ((int)t <= 0)
4000                         t = 1;
4001                 if (!tp->t_rttlow || tp->t_rttlow > t)
4002                         tp->t_rttlow = t;
4003                 if (!rack->r_ctl.rc_rack_min_rtt ||
4004                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4005                         rack->r_ctl.rc_rack_min_rtt = t;
4006                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
4007                                 rack->r_ctl.rc_rack_min_rtt = 1;
4008                         }
4009                 }
4010                 tcp_rack_xmit_timer(rack, t + 1);
4011                 if ((rsm->r_flags & RACK_TLP) &&
4012                     (!IN_RECOVERY(tp->t_flags))) {
4013                         /* Segment was a TLP and our retrans matched */
4014                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
4015                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
4016                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4017                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4018                                 rack_cong_signal(tp, NULL, CC_NDUPACK);
4019                                 /*
4020                                  * When we enter recovery we need to assure
4021                                  * we send one packet.
4022                                  */
4023                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4024                                 rack_log_to_prr(rack, 7);
4025                         } 
4026                 }
4027                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4028                         /* New more recent rack_tmit_time */
4029                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4030                         rack->rc_rack_rtt = t;
4031                 }
4032                 return (1);
4033         }
4034         /* 
4035          * We clear the soft/rxtshift since we got an ack. 
4036          * There is no assurance we will call the commit() function
4037          * so we need to clear these to avoid incorrect handling.
4038          */
4039         tp->t_rxtshift = 0;
4040         tp->t_softerror = 0;
4041         if ((to->to_flags & TOF_TS) &&
4042             (ack_type == CUM_ACKED) &&
4043             (to->to_tsecr) &&
4044             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
4045                 /*
4046                  * Now which timestamp does it match? In this block the ACK
4047                  * must be coming from a previous transmission.
4048                  */
4049                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
4050                         if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
4051                                 t = cts - rsm->r_tim_lastsent[i];
4052                                 if ((int)t <= 0)
4053                                         t = 1;
4054                                 if ((i + 1) < rsm->r_rtr_cnt) {
4055                                         /* Likely */
4056                                         rack_earlier_retran(tp, rsm, t, cts);
4057                                 }
4058                                 if (!tp->t_rttlow || tp->t_rttlow > t)
4059                                         tp->t_rttlow = t;
4060                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4061                                         rack->r_ctl.rc_rack_min_rtt = t;
4062                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
4063                                                 rack->r_ctl.rc_rack_min_rtt = 1;
4064                                         }
4065                                 }
4066                                 /*
4067                                  * Note the following calls to
4068                                  * tcp_rack_xmit_timer() are being commented
4069                                  * out for now. They give us no more accuracy
4070                                  * and often lead to a wrong choice. We have
4071                                  * enough samples that have not been 
4072                                  * retransmitted. I leave the commented out
4073                                  * code in here in case in the future we
4074                                  * decide to add it back (though I can't foresee
4075                                  * doing that). That way we will easily see
4076                                  * where they need to be placed.
4077                                  */
4078                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
4079                                     rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
4080                                         /* New more recent rack_tmit_time */
4081                                         rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
4082                                         rack->rc_rack_rtt = t;
4083                                 }
4084                                 return (1);
4085                         }
4086                 }
4087                 goto ts_not_found;
4088         } else {
4089                 /*
4090                  * Ok, it's a SACK block that we retransmitted, or a Windows
4091                  * machine without timestamps. We can tell nothing from the
4092                  * timestamp since it's either not there or it is the time the peer
4093                  * last received a segment that moved its cum-ack point forward.
4094                  */
4095 ts_not_found:
4096                 i = rsm->r_rtr_cnt - 1;
4097                 t = cts - rsm->r_tim_lastsent[i];
4098                 if ((int)t <= 0)
4099                         t = 1;
4100                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4101                         /*
4102                          * We retransmitted and the ack came back in less
4103                          * than the smallest rtt we have observed. We most
4104                          * likely did an improper retransmit as outlined in
4105                          * 4.2 Step 3 point 2 in the rack-draft.
4106                          */
4107                         i = rsm->r_rtr_cnt - 2;
4108                         t = cts - rsm->r_tim_lastsent[i];
4109                         rack_earlier_retran(tp, rsm, t, cts);
4110                 } else if (rack->r_ctl.rc_rack_min_rtt) {
4111                         /*
4112                          * We retransmitted it and the retransmit did the
4113                          * job.
4114                          */
4115                         if (!rack->r_ctl.rc_rack_min_rtt ||
4116                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
4117                                 rack->r_ctl.rc_rack_min_rtt = t;
4118                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
4119                                         rack->r_ctl.rc_rack_min_rtt = 1;
4120                                 }
4121                         }
4122                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
4123                                 /* New more recent rack_tmit_time */
4124                                 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
4125                                 rack->rc_rack_rtt = t;
4126                         }
4127                         return (1);
4128                 }
4129         }
4130         return (0);
4131 }
4132
4133 /*
4134  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
4135  */
4136 static void
4137 rack_log_sack_passed(struct tcpcb *tp,
4138     struct tcp_rack *rack, struct rack_sendmap *rsm)
4139 {
4140         struct rack_sendmap *nrsm;
4141
4142         nrsm = rsm;
4143         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
4144             rack_head, r_tnext) {
4145                 if (nrsm == rsm) {
4146                         /* Skip the original segment, it is acked */
4147                         continue;
4148                 }
4149                 if (nrsm->r_flags & RACK_ACKED) {
4150                         /* 
4151                          * Skip ack'd segments, though we 
4152                          * should not see these, since tmap
4153                          * should not have ack'd segments.
4154                          */
4155                         continue;
4156                 } 
4157                 if (nrsm->r_flags & RACK_SACK_PASSED) {
4158                         /* 
4159                          * We found one that is already marked
4160                          * passed, we have been here before and
4161                          * so all others below this are marked.
4162                          */
4163                         break;
4164                 }
4165                 nrsm->r_flags |= RACK_SACK_PASSED;
4166                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
4167         }
4168 }
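
/*
 * For example, if the transmit order in rc_tmap is A -> B -> C -> D and D
 * was just sacked, the walk above starts at D, skips D itself and any
 * already-acked entries, and marks C, B and A with RACK_SACK_PASSED.  It
 * stops early when it finds an entry already carrying the flag, since
 * everything sent before that entry was marked on a previous pass.
 */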
4169
4170 static uint32_t
4171 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
4172                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
4173 {
4174         uint32_t start, end, changed = 0;
4175         struct rack_sendmap stack_map;
4176         struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
4177         int32_t used_ref = 1;
4178         int moved = 0;
4179
4180         start = sack->start;
4181         end = sack->end;
4182         rsm = *prsm;
4183         memset(&fe, 0, sizeof(fe));
4184 do_rest_ofb:
4185         if ((rsm == NULL) ||
4186             (SEQ_LT(end, rsm->r_start)) ||
4187             (SEQ_GEQ(start, rsm->r_end)) ||
4188             (SEQ_LT(start, rsm->r_start))) {
4189                 /* 
4190                  * We are not in the right spot,
4191                  * find the correct spot in the tree.
4192                  */
4193                 used_ref = 0;
4194                 fe.r_start = start;
4195                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4196                 moved++;
4197         }
4198         if (rsm == NULL) {
4199                 /* TSNH */
4200                 goto out;
4201         }
4202         /* Ok we have an ACK for some piece of this rsm */
4203         if (rsm->r_start != start) {
4204                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4205                         /**
4206                          * Need to split this in two pieces the before and after,
4207                          * the before remains in the map, the after must be
4208                          * added. In other words we have:
4209                          * rsm        |--------------|
4210                          * sackblk        |------->
4211                          * rsm will become
4212                          *     rsm    |---|
4213                          * and nrsm will be  the sacked piece
4214                          *     nrsm       |----------|
4215                          *
4216                          * But before we start down that path let's
4217                          * see if the sack spans over on top of 
4218                          * the next guy and it is already sacked.
4219                          */
4220                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4221                         if (next && (next->r_flags & RACK_ACKED) &&
4222                             SEQ_GEQ(end, next->r_start)) {
4223                                 /**
4224                                  * So the next one is already acked, and
4225                                  * we can thus by hookery use our stack_map
4226                                  * to reflect the piece being sacked and
4227                                  * then adjust the two tree entries moving
4228                                  * the start and ends around. So we start like:
4229                                  *  rsm     |------------|             (not-acked)
4230                                  *  next                 |-----------| (acked)
4231                                  *  sackblk        |-------->
4232                                  *  We want to end like so:
4233                                  *  rsm     |------|                   (not-acked)
4234                                  *  next           |-----------------| (acked)
4235                                  *  nrsm           |-----|
4236                                  * Where nrsm is a temporary stack piece we
4237                                  * use to update all the gizmos.
4238                                  */
4239                                 /* Copy up our fudge block */
4240                                 nrsm = &stack_map;
4241                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4242                                 /* Now adjust our tree blocks */
4243                                 rsm->r_end = start;
4244                                 next->r_start = start;
4245                                 /* Clear out the dup ack count of the remainder */
4246                                 rsm->r_dupack = 0;
4247                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4248                                 /* Now lets make sure our fudge block is right */
4249                                 nrsm->r_start = start;
4250                                 /* Now lets update all the stats and such */
4251                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4252                                 changed += (nrsm->r_end - nrsm->r_start);
4253                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4254                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
4255                                         counter_u64_add(rack_reorder_seen, 1);
4256                                         rack->r_ctl.rc_reorder_ts = cts;
4257                                 }
4258                                 /* 
4259                                  * Now we want to go up from rsm (the
4260                                  * one left un-acked) to the next one
4261                                  * in the tmap. We do this so when
4262                                  * we walk backwards we include marking
4263                                  * sack-passed on rsm (The one passed in
4264                                  * is skipped since it is generally called
4265                                  * on something sacked before removing it
4266                                  * from the tmap).
4267                                  */
4268                                 if (rsm->r_in_tmap) {
4269                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
4270                                         /*
4271                                          * Now that we have the next
4272                                          * one walk backwards from there.
4273                                          */
4274                                         if (nrsm && nrsm->r_in_tmap)
4275                                                 rack_log_sack_passed(tp, rack, nrsm);
4276                                 }
4277                                 /* Now are we done? */
4278                                 if (SEQ_LT(end, next->r_end) ||
4279                                     (end == next->r_end)) {
4280                                         /* Done with block */
4281                                         goto out;
4282                                 }
4283                                 counter_u64_add(rack_sack_used_next_merge, 1);
4284                                 /* Position for the next block */
4285                                 start = next->r_end;
4286                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
4287                                 if (rsm == NULL)
4288                                         goto out;
4289                         } else {
4290                                 /**
4291                                  * We can't use any hookery here, so we
4292                                  * need to split the map. We enter like
4293                                  * so:
4294                                  *  rsm      |--------|
4295                                  *  sackblk       |----->
4296                                  * We will add the new block nrsm and
4297                                  * that will be the new portion, and then
4298                                  * fall through after resetting rsm. So we
4299                                  * split and look like this:
4300                                  *  rsm      |----|
4301                                  *  sackblk       |----->
4302                                  *  nrsm          |---|
4303                                  * We then fall through resetting
4304                                  * rsm to nrsm, so the next block
4305                                  * picks it up.
4306                                  */
4307                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4308                                 if (nrsm == NULL) {
4309                                         /*
4310                                          * failed XXXrrs what can we do but lose the sack
4311                                          * info?
4312                                          */
4313                                         goto out;
4314                                 }
4315                                 counter_u64_add(rack_sack_splits, 1);
4316                                 rack_clone_rsm(rack, nrsm, rsm, start);
4317                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4318 #ifdef INVARIANTS
4319                                 if (insret != NULL) {
4320                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4321                                               nrsm, insret, rack, rsm);
4322                                 }
4323 #endif
4324                                 if (rsm->r_in_tmap) {
4325                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4326                                         nrsm->r_in_tmap = 1;
4327                                 }
4328                                 rsm->r_flags &= (~RACK_HAS_FIN);
4329                                 /* Position us to point to the new nrsm that starts the sack blk */
4330                                 rsm = nrsm;
4331                         }
4332                 } else {
4333                         /* Already sacked this piece */
4334                         counter_u64_add(rack_sack_skipped_acked, 1);
4335                         moved++;
4336                         if (end == rsm->r_end) {
4337                                 /* Done with block */
4338                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4339                                 goto out;
4340                         } else if (SEQ_LT(end, rsm->r_end)) {
4341                                 /* A partial sack to an already sacked block */
4342                                 moved++;                                
4343                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4344                                 goto out;
4345                         } else {
4346                                 /* 
4347                                  * The end goes beyond this guy;
4348                                  * reposition the start to the
4349                                  * next block.
4350                                  */
4351                                 start = rsm->r_end;
4352                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4353                                 if (rsm == NULL)
4354                                         goto out;
4355                         }
4356                 }
4357         }
4358         if (SEQ_GEQ(end, rsm->r_end)) {
4359                 /**
4360                  * The end of this block is either beyond this guy or right
4361                  * at this guy. I.e.:
4362                  *  rsm ---                 |-----|
4363                  *  end                     |-----|
4364                  *  <or>
4365                  *  end                     |---------|
4366                  */
4367                 if (rsm->r_flags & RACK_TLP)
4368                         rack->r_ctl.rc_tlp_rtx_out = 0;
4369                 if ((rsm->r_flags & RACK_ACKED) == 0) {
4370                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4371                         changed += (rsm->r_end - rsm->r_start);
4372                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4373                         if (rsm->r_in_tmap) /* should be true */
4374                                 rack_log_sack_passed(tp, rack, rsm);
4375                         /* Is reordering occurring? */
4376                         if (rsm->r_flags & RACK_SACK_PASSED) {
4377                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4378                                 counter_u64_add(rack_reorder_seen, 1);
4379                                 rack->r_ctl.rc_reorder_ts = cts;
4380                         }
4381                         rsm->r_flags |= RACK_ACKED;
4382                         rsm->r_flags &= ~RACK_TLP;
4383                         if (rsm->r_in_tmap) {
4384                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4385                                 rsm->r_in_tmap = 0;
4386                         }
4387                 } else {
4388                         counter_u64_add(rack_sack_skipped_acked, 1);
4389                         moved++;
4390                 }
4391                 if (end == rsm->r_end) {
4392                         /* This block only - done, setup for next  */
4393                         goto out;
4394                 }
4395                 /* 
4396                  * There is more not covered by this rsm; move on
4397                  * to the next block in the RB tree.
4398                  */
4399                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4400                 start = rsm->r_end;
4401                 rsm = nrsm;
4402                 if (rsm == NULL)
4403                         goto out;
4404                 goto do_rest_ofb;
4405         }
4406         /**
4407          * The end of this sack block is smaller than
4408          * our rsm i.e.:
4409          *  rsm ---                 |-----|
4410          *  end                     |--|
4411          */
4412         if ((rsm->r_flags & RACK_ACKED) == 0) {
4413                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4414                 if (prev && (prev->r_flags & RACK_ACKED)) {
4415                         /**
4416                          * Goal, we want the right remainder of rsm to shrink
4417                          * in place and span from (rsm->r_start = end) to rsm->r_end.
4418                          * We want to expand prev to go all the way
4419                          * to prev->r_end <- end.
4420                          * so in the tree we have before:
4421                          *   prev     |--------|         (acked)
4422                          *   rsm               |-------| (non-acked)
4423                          *   sackblk           |-|
4424                          * We churn it so we end up with
4425                          *   prev     |----------|       (acked)
4426                          *   rsm                 |-----| (non-acked)
4427                          *   nrsm              |-| (temporary)
4428                          */
4429                         nrsm = &stack_map;
4430                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
4431                         prev->r_end = end;
4432                         rsm->r_start = end;
4433                         /* Now adjust nrsm (the stack copy) to be
4434                          * the small piece that was
4435                          * "sacked".
4436                          */
4437                         nrsm->r_end = end;
4438                         rsm->r_dupack = 0;
4439                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4440                         /* 
4441                          * Now nrsm is our new little piece
4442                          * that is acked (which was merged
4443                          * to prev). Update the rtt and changed
4444                          * based on that. Also check for reordering.
4445                          */
4446                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED);
4447                         changed += (nrsm->r_end - nrsm->r_start);
4448                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
4449                         if (nrsm->r_flags & RACK_SACK_PASSED) {
4450                                 counter_u64_add(rack_reorder_seen, 1);
4451                                 rack->r_ctl.rc_reorder_ts = cts;
4452                         }
4453                         rsm = prev;
4454                         counter_u64_add(rack_sack_used_prev_merge, 1);
4455                 } else {
4456                         /**
4457                          * This is the case where our previous
4458                          * block is not acked either, so we must
4459                          * split the block in two.
4460                          */
4461                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4462                         if (nrsm == NULL) {
4463                                 /* failed rrs what can we do but lose the sack info? */
4464                                 goto out;
4465                         }
4466                         /**
4467                          * In this case nrsm becomes 
4468                          * nrsm->r_start = end;
4469                          * nrsm->r_end = rsm->r_end;
4470                          * which is un-acked.
4471                          * <and>
4472                          * rsm->r_end = nrsm->r_start;
4473                          * i.e. the remaining un-acked
4474                          * piece is left on the left
4475                          * hand side.
4476                          *
4477                          * So we start like this
4478                          * rsm      |----------| (not acked)
4479                          * sackblk  |---|
4480                          * build it so we have
4481                          * rsm      |---|         (acked)
4482                          * nrsm         |------|  (not acked)
4483                          */
4484                         counter_u64_add(rack_sack_splits, 1);
4485                         rack_clone_rsm(rack, nrsm, rsm, end);
4486                         rsm->r_flags &= (~RACK_HAS_FIN);
4487                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
4488 #ifdef INVARIANTS
4489                         if (insret != NULL) {
4490                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
4491                                       nrsm, insret, rack, rsm);
4492                         }
4493 #endif
4494                         if (rsm->r_in_tmap) {
4495                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4496                                 nrsm->r_in_tmap = 1;
4497                         }
4498                         nrsm->r_dupack = 0;
4499                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
4500                         if (rsm->r_flags & RACK_TLP)
4501                                 rack->r_ctl.rc_tlp_rtx_out = 0;
4502                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4503                         changed += (rsm->r_end - rsm->r_start);
4504                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4505                         if (rsm->r_in_tmap) /* should be true */
4506                                 rack_log_sack_passed(tp, rack, rsm);
4507                         /* Is reordering occurring? */
4508                         if (rsm->r_flags & RACK_SACK_PASSED) {
4509                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4510                                 counter_u64_add(rack_reorder_seen, 1);
4511                                 rack->r_ctl.rc_reorder_ts = cts;
4512                         }
4513                         rsm->r_flags |= RACK_ACKED;
4514                         rsm->r_flags &= ~RACK_TLP;
4515                         if (rsm->r_in_tmap) {
4516                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4517                                 rsm->r_in_tmap = 0;
4518                         }
4519                 }
4520         } else if (start != end){
4521                 /*
4522                  * The block was already acked.
4523                  */
4524                 counter_u64_add(rack_sack_skipped_acked, 1);
4525                 moved++;
4526         }
4527 out:
4528         if (rsm && (rsm->r_flags & RACK_ACKED)) {
4529                 /* 
4530                  * Now can we merge where we worked 
4531                  * with either the previous or
4532                  * next block?
4533                  */
4534                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4535                 while (next) {
4536                     if (next->r_flags & RACK_ACKED) {
4537                         /* yep this and next can be merged */
4538                         rsm = rack_merge_rsm(rack, rsm, next);
4539                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4540                     } else
4541                             break;
4542                 }
4543                 /* Now what about the previous? */
4544                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4545                 while (prev) {
4546                     if (prev->r_flags & RACK_ACKED) {
4547                         /* yep the previous and this can be merged */
4548                         rsm = rack_merge_rsm(rack, prev, rsm);
4549                         prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4550                     } else
4551                             break;
4552                 }
4553         }
4554         if (used_ref == 0) {
4555                 counter_u64_add(rack_sack_proc_all, 1);
4556         } else {
4557                 counter_u64_add(rack_sack_proc_short, 1);
4558         }
4559         /* Save off the next one for quick reference. */
4560         if (rsm) 
4561                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4562         else
4563                 nrsm = NULL;
4564         *prsm = rack->r_ctl.rc_sacklast = nrsm;
4565         /* Pass back the moved. */
4566         *moved_two = moved;
4567         return (changed);
4568 }
4569
4570 static void inline 
4571 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4572 {
4573         struct rack_sendmap *tmap;
4574
4575         tmap = NULL;
4576         while (rsm && (rsm->r_flags & RACK_ACKED)) {
4577                 /* It's no longer sacked, mark it so */
4578                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4579 #ifdef INVARIANTS
4580                 if (rsm->r_in_tmap) {
4581                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
4582                               rack, rsm, rsm->r_flags);
4583                 }
4584 #endif
4585                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4586                 /* Rebuild it into our tmap */
4587                 if (tmap == NULL) {
4588                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4589                         tmap = rsm;
4590                 } else {
4591                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4592                         tmap = rsm;
4593                 }
4594                 tmap->r_in_tmap = 1;
4595                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4596         }
4597         /* 
4598          * Now let's possibly clear the sack filter so we start
4599          * recognizing sacks that cover this area.
4600          */
4601         if (rack_use_sack_filter)
4602                 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4603
4604 }
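
/*
 * For example, if segments B and C beyond th_ack had been marked RACK_ACKED
 * by SACKs and the peer then reneges, the loop above clears their sacked
 * state and relinks them into rc_tmap in sequence order: B goes to the head
 * and C is inserted after B, so both are treated as outstanding again.
 */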
4605
4606 static void
4607 rack_do_decay(struct tcp_rack *rack)
4608 {
4609 #ifdef NETFLIX_EXP_DETECTION
4610         struct timeval res;
4611
4612 #define timersub(tvp, uvp, vvp)                                         \
4613         do {                                                            \
4614                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
4615                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
4616                 if ((vvp)->tv_usec < 0) {                               \
4617                         (vvp)->tv_sec--;                                \
4618                         (vvp)->tv_usec += 1000000;                      \
4619                 }                                                       \
4620         } while (0)
4621
4622         timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res);
4623 #undef timersub 
4624
4625         rack->r_ctl.input_pkt++;
4626         if ((rack->rc_in_persist) ||
4627             (res.tv_sec >= 1) ||
4628             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
4629                 /* 
4630                  * Check for decay of non-SAD;
4631                  * we want all SAD detection metrics to
4632                  * decay 1/4 for each second (or more) that has passed.
4633                  */
4634                 uint32_t pkt_delta;
4635
4636                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
4637                 /* Update our saved tracking values */
4638                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
4639                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
4640                 /* Now do we escape without decay? */
4641                 if (rack->rc_in_persist ||
4642                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
4643                     (pkt_delta < tcp_sad_low_pps)){
4644                         /* 
4645                          * We don't decay idle connections 
4646                          * or ones that have a low input pps.
4647                          */
4648                         return;
4649                 }
4650                 /* Decay the counters */
4651                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
4652                                                         tcp_sad_decay_val);
4653                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
4654                                                          tcp_sad_decay_val);
4655                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
4656                                                                tcp_sad_decay_val);
4657                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
4658                                                                 tcp_sad_decay_val);
4659         }
4660 #endif  
4661 }
4662
4663 static void
4664 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4665 {
4666         uint32_t changed, entered_recovery = 0;
4667         struct tcp_rack *rack;
4668         struct rack_sendmap *rsm, *rm;
4669         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4670         register uint32_t th_ack;
4671         int32_t i, j, k, num_sack_blks = 0;
4672         uint32_t cts, acked, ack_point, sack_changed = 0;
4673         int loop_start = 0, moved_two = 0;
4674         
4675         INP_WLOCK_ASSERT(tp->t_inpcb);
4676         if (th->th_flags & TH_RST) {
4677                 /* We don't log resets */
4678                 return;
4679         }
4680         rack = (struct tcp_rack *)tp->t_fb_ptr;
4681         cts = tcp_ts_getticks();
4682         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4683         changed = 0;
4684         th_ack = th->th_ack;
4685         if (rack->sack_attack_disable == 0)
4686                 rack_do_decay(rack);
4687         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
4688                 /* 
4689                  * You only get credit for
4690                  * MSS and greater (and you get extra
4691                  * credit for larger cum-ack moves).
4692                  */
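                /*
                 * For example, a cum-ack that advances 4380 bytes with a
                 * 1460 byte maxseg credits ack_count with 4380 / 1460 = 3.
                 */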
4693                 int ac;
4694
4695                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
4696                 rack->r_ctl.ack_count += ac;
4697                 counter_u64_add(rack_ack_total, ac);
4698         }
4699         if (rack->r_ctl.ack_count > 0xfff00000) {
4700                 /* 
4701                  * Reduce the number to keep us from
4702                  * overflowing a uint32_t.
4703                  */
4704                 rack->r_ctl.ack_count /= 2;
4705                 rack->r_ctl.sack_count /= 2;
4706         }
4707         if (SEQ_GT(th_ack, tp->snd_una)) {
4708                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4709                 tp->t_acktime = ticks;
4710         }
4711         if (rsm && SEQ_GT(th_ack, rsm->r_start))
4712                 changed = th_ack - rsm->r_start;
4713         if (changed) {
4714                 /*
4715                  * The ACK point is advancing to th_ack, we must drop off
4716                  * the packets in the rack log and calculate any eligible
4717                  * RTT's.
4718                  */
4719                 rack->r_wanted_output++;
4720         more:
4721                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4722                 if (rsm == NULL) {
4723                         if ((th_ack - 1) == tp->iss) {
4724                                 /*
4725                                  * For the SYN incoming case we will not
4726                                  * have called tcp_output for the sending of
4727                                  * the SYN, so there will be no map. All
4728                                  * other cases should probably be a panic.
4729                                  */
4730                                 goto proc_sack;
4731                         }
4732                         if (tp->t_flags & TF_SENTFIN) {
4733                                 /* if we sent a FIN we will not have a map */
4734                                 goto proc_sack;
4735                         }
4736 #ifdef INVARIANTS
4737                         panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4738                               tp,
4739                               th, tp->t_state, rack,
4740                               tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4741 #endif
4742                         goto proc_sack;
4743                 }
4744                 if (SEQ_LT(th_ack, rsm->r_start)) {
4745                         /* Huh map is missing this */
4746 #ifdef INVARIANTS
4747                         printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4748                                rsm->r_start,
4749                                th_ack, tp->t_state, rack->r_state);
4750 #endif
4751                         goto proc_sack;
4752                 }
4753                 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4754                 /* Now do we consume the whole thing? */
4755                 if (SEQ_GEQ(th_ack, rsm->r_end)) {
4756                         /* Its all consumed. */
4757                         uint32_t left;
4758
4759                         rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4760                         rsm->r_rtr_bytes = 0;
4761                         if (rsm->r_flags & RACK_TLP)
4762                                 rack->r_ctl.rc_tlp_rtx_out = 0;
4763                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
4764 #ifdef INVARIANTS
4765                         if (rm != rsm) {
4766                                 panic("removing head in rack:%p rsm:%p rm:%p",
4767                                       rack, rsm, rm);
4768                         }
4769 #endif
4770                         if (rsm->r_in_tmap) {
4771                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4772                                 rsm->r_in_tmap = 0;
4773                         }
4774                         if (rsm->r_flags & RACK_ACKED) {
4775                                 /*
4776                                  * It was acked on the scoreboard -- remove
4777                                  * it from total
4778                                  */
4779                                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4780                         } else if (rsm->r_flags & RACK_SACK_PASSED) {
4781                                 /*
4782                                  * There are segments ACKED on the
4783                                  * scoreboard further up. We are seeing
4784                                  * reordering.
4785                                  */
4786                                 rsm->r_flags &= ~RACK_SACK_PASSED;
4787                                 counter_u64_add(rack_reorder_seen, 1);
4788                                 rsm->r_flags |= RACK_ACKED;
4789                                 rack->r_ctl.rc_reorder_ts = cts;
4790                         }
4791                         left = th_ack - rsm->r_end;
4792                         if (rsm->r_rtr_cnt > 1) {
4793                                 /*
4794                                  * Technically we should make r_rtr_cnt be
4795                                  * monotonically increasing and just mod it to
4796                                  * the timestamp it is replacing... that way
4797                                  * we would have the last 3 retransmits. Now
4798                                  * rc_loss_count will be wrong if we
4799                                  * retransmit something more than 2 times in
4800                                  * recovery :(
4801                                  */
4802                                 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4803                         }
4804                         /* Free back to zone */
4805                         rack_free(rack, rsm);
4806                         if (left) {
4807                                 goto more;
4808                         }
4809                         goto proc_sack;
4810                 }
4811                 if (rsm->r_flags & RACK_ACKED) {
4812                         /*
4813                          * It was acked on the scoreboard -- remove it from
4814                          * total for the part being cum-acked.
4815                          */
4816                         rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4817                 }
4818                 /* 
4819                  * Clear the dup ack count for
4820                  * the piece that remains.
4821                  */
4822                 rsm->r_dupack = 0;
4823                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
4824                 if (rsm->r_rtr_bytes) {
4825                         /* 
4826                          * It was retransmitted; adjust the
4827                          * sack holes for what was acked.
4828                          */
4829                         int ack_am;
4830
4831                         ack_am = (th_ack - rsm->r_start);
4832                         if (ack_am >= rsm->r_rtr_bytes) {
4833                                 rack->r_ctl.rc_holes_rxt -= ack_am;
4834                                 rsm->r_rtr_bytes -= ack_am;
4835                         }
4836                 }
4837                 /* Update where the piece starts */
4838                 rsm->r_start = th_ack;
4839         }
4840 proc_sack:
4841         /* Check for reneging */
4842         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
4843         if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4844                 /*
4845                  * The peer has moved snd_una up to
4846                  * the edge of this send, i.e. one
4847                  * that it had previously acked. The only
4848                  * way that can be true is if the peer threw
4849                  * away data (space issues) that it had
4850                  * previously sacked (else it would have
4851                  * given us snd_una up to rsm->r_end).
4852                  * We need to undo the acked markings here.
4853                  *
4854                  * Note we have to look to make sure th_ack is
4855                  * our rsm->r_start in case we get an old ack
4856                  * where th_ack is behind snd_una.
4857                  */
4858                 rack_peer_reneges(rack, rsm, th->th_ack);
4859         }
4860         if ((to->to_flags & TOF_SACK) == 0) {
4861                 /* We are done, nothing left */
4862                 goto out;
4863         }
4864         /* Sack block processing */
4865         if (SEQ_GT(th_ack, tp->snd_una))
4866                 ack_point = th_ack;
4867         else
4868                 ack_point = tp->snd_una;
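             /*
              * Pull each SACK option block out, convert it to host order,
              * and keep it only if it lies entirely above ack_point and no
              * higher than snd_max. Blocks at or below th_ack are D-SACKs
              * and (when NETFLIX_STATS is compiled in) are only counted.
              */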
4869         for (i = 0; i < to->to_nsacks; i++) {
4870                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
4871                       &sack, sizeof(sack));
4872                 sack.start = ntohl(sack.start);
4873                 sack.end = ntohl(sack.end);
4874                 if (SEQ_GT(sack.end, sack.start) &&
4875                     SEQ_GT(sack.start, ack_point) &&
4876                     SEQ_LT(sack.start, tp->snd_max) &&
4877                     SEQ_GT(sack.end, ack_point) &&
4878                     SEQ_LEQ(sack.end, tp->snd_max)) {
4879                         sack_blocks[num_sack_blks] = sack;
4880                         num_sack_blks++;
4881 #ifdef NETFLIX_STATS
4882                 } else if (SEQ_LEQ(sack.start, th_ack) &&
4883                            SEQ_LEQ(sack.end, th_ack)) {
4884                         /*
4885                          * It's a D-SACK block.
4886                          */
4887                         tcp_record_dsack(sack.start, sack.end);
4888 #endif
4889                 }
4890
4891         }
4892         /*
4893          * Sort the SACK blocks so we can update the rack scoreboard with
4894          * just one pass.
4895          */
4896         if (rack_use_sack_filter) {
4897                 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
4898                                                  num_sack_blks, th->th_ack);
4899                 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
4900         }
4901         if (num_sack_blks == 0)  {
4902                 /* Nothing to sack (DSACKs?) */
4903                 goto out_with_totals;
4904         }
4905         if (num_sack_blks < 2) {
4906                 /* Only one, we don't need to sort */
4907                 goto do_sack_work;
4908         }
4909         /* Sort the sacks */
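             /*
              * A simple O(n^2) exchange sort by ascending block end is
              * plenty here; the number of blocks per ACK is tiny.
              */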
4910         for (i = 0; i < num_sack_blks; i++) {
4911                 for (j = i + 1; j < num_sack_blks; j++) {
4912                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4913                                 sack = sack_blocks[i];
4914                                 sack_blocks[i] = sack_blocks[j];
4915                                 sack_blocks[j] = sack;
4916                         }
4917                 }
4918         }
4919         /*
4920          * Now are any of the sack block ends the same (yes some
4921          * implementations send these)?
4922          */
4923 again:
4924         if (num_sack_blks == 0)
4925                 goto out_with_totals;
4926         if (num_sack_blks > 1) {
4927                 for (i = 0; i < num_sack_blks; i++) {
4928                         for (j = i + 1; j < num_sack_blks; j++) {
4929                                 if (sack_blocks[i].end == sack_blocks[j].end) {
4930                                         /*
4931                                          * Ok these two have the same end; we
4932                                          * want the smaller start (the block
4933                                          * that covers more), throw away the
4934                                          * other, and start again.
4935                                          */
4936                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4937                                                 /*
4938                                                  * The second block covers
4939                                                  * more area, use its start.
4940                                                  */
4941                                                 sack_blocks[i].start = sack_blocks[j].start;
4942                                         }
4943                                         /*
4944                                          * Now collapse out the dup-sack and
4945                                          * lower the count
4946                                          */
4947                                         for (k = (j + 1); k < num_sack_blks; k++) {
4948                                                 sack_blocks[j].start = sack_blocks[k].start;
4949                                                 sack_blocks[j].end = sack_blocks[k].end;
4950                                                 j++;
4951                                         }
4952                                         num_sack_blks--;
4953                                         goto again;
4954                                 }
4955                         }
4956                 }
4957         }
4958 do_sack_work:
4959         /* 
4960          * First let's look to see if
4961          * we have retransmitted and
4962          * can use the next transmit.
4963          */
4964         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4965         if (rsm &&
4966             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
4967             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
4968                 /*
4969                  * We probably did the FR and the next
4970                  * SACK coming in continues it as we would expect.
4971                  */
4972                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
4973                 if (acked) {
4974                         rack->r_wanted_output++;
4975                         changed += acked;
4976                         sack_changed += acked;
4977                 }
4978                 if (num_sack_blks == 1) {
4979                         /*
4980                          * This is what we would expect from
4981                          * a normal implementation to happen
4982                          * after we have retransmitted the FR,
4983                          * i.e. the sack-filter pushes down
4984                          * to 1 block and the next to be retransmitted
4985                          * is the sequence in the sack block (as more
4986                          * are acked). Count this as ACK'd data to boost
4987                          * the chances of recovering from any false positives.
4988                          */
4989                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
4990                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
4991                         counter_u64_add(rack_express_sack, 1);
4992                         if (rack->r_ctl.ack_count > 0xfff00000) {
4993                                 /* 
4994                                  * reduce the number to keep us under 
4995                                  * a uint32_t.
4996                                  */
4997                                 rack->r_ctl.ack_count /= 2;
4998                                 rack->r_ctl.sack_count /= 2;
4999                         }
5000                         goto out_with_totals;
5001                 } else {
5002                         /*
5003                          * Start the loop through the
5004                          * rest of blocks, past the first block.
5005                          */
5006                         moved_two = 0;
5007                         loop_start = 1;
5008                 }
5009         }
5010         /* It's a sack of some sort */
5011         rack->r_ctl.sack_count++;
5012         if (rack->r_ctl.sack_count > 0xfff00000) {
5013                 /* 
5014                  * reduce the number to keep us under 
5015                  * a uint32_t.
5016                  */
5017                 rack->r_ctl.ack_count /= 2;
5018                 rack->r_ctl.sack_count /= 2;
5019         }
5020         counter_u64_add(rack_sack_total, 1);
5021         if (rack->sack_attack_disable) {
5022                 /* An attacker disablement is in place */
5023                 if (num_sack_blks > 1) {
5024                         rack->r_ctl.sack_count += (num_sack_blks - 1);
5025                         rack->r_ctl.sack_moved_extra++;
5026                         counter_u64_add(rack_move_some, 1);
5027                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
5028                                 rack->r_ctl.sack_moved_extra /= 2;
5029                                 rack->r_ctl.sack_noextra_move /= 2;
5030                         }
5031                 }
5032                 goto out;
5033         }
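             /*
              * Walk the remaining blocks, resuming from the last send-map
              * entry we updated, and run each through rack_proc_sack_blk().
              * The moved_two count returned tells us how much extra map
              * movement the block required; it feeds the attack-detection
              * counters below.
              */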
5034         rsm = rack->r_ctl.rc_sacklast;
5035         for (i = loop_start; i < num_sack_blks; i++) {
5036                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
5037                 if (acked) {
5038                         rack->r_wanted_output++;
5039                         changed += acked;
5040                         sack_changed += acked;
5041                 }
5042                 if (moved_two) {
5043                         /*
5044                          * If we did not get a SACK for at least an MSS and
5045                          * had to move at all, or if we moved more than our
5046                          * threshold, it counts against the "extra" move.
5047                          */
5048                         rack->r_ctl.sack_moved_extra += moved_two;
5049                         counter_u64_add(rack_move_some, 1);
5050                 } else {
5051                         /*
5052                          * else we did not have to move
5053                          * any more than we would expect.
5054                          */
5055                         rack->r_ctl.sack_noextra_move++;
5056                         counter_u64_add(rack_move_none, 1);
5057                 }
5058                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
5059                         /*
5060                          * If the SACK was not a full MSS then
5061                          * we add to sack_count the number of
5062                          * MSS's (or possibly more than
5063                          * an MSS if it's a TSO send) we had to skip by.
5064                          */
5065                         rack->r_ctl.sack_count += moved_two;
5066                         counter_u64_add(rack_sack_total, moved_two);
5067                 }
5068                 /*
5069                  * Now we need to setup for the next
5070                  * round. First we make sure we won't
5071                  * exceed the size of our uint32_t on
5072                  * the various counts, and then clear out
5073                  * moved_two.
5074                  */
5075                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
5076                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
5077                         rack->r_ctl.sack_moved_extra /= 2;
5078                         rack->r_ctl.sack_noextra_move /= 2;
5079                 }
5080                 if (rack->r_ctl.sack_count > 0xfff00000) {
5081                         rack->r_ctl.ack_count /= 2;
5082                         rack->r_ctl.sack_count /= 2;
5083                 }
5084                 moved_two = 0;
5085         }
5086 out_with_totals:
5087         if (num_sack_blks > 1) {
5088                 /* 
5089                  * You get an extra stroke if 
5090                  * you have more than one sack-blk; this
5091                  * could be where we are skipping forward
5092                  * and the sack-filter is still working, or
5093                  * it could be an attacker constantly
5094                  * moving us.
5095                  */
5096                 rack->r_ctl.sack_moved_extra++;
5097                 counter_u64_add(rack_move_some, 1);
5098         }
5099 out:
5100 #ifdef NETFLIX_EXP_DETECTION
5101         if ((rack->do_detection || tcp_force_detection) &&
5102             tcp_sack_to_ack_thresh &&
5103             tcp_sack_to_move_thresh &&
5104             ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
5105                 /* 
5106                  * We have thresholds set to find
5107                  * possible attackers and disable sack.
5108                  * Check them.
5109                  */
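                     /*
                      * Both ratios are scaled by 1000: ackratio compares
                      * sack_count to ack_count, and moveratio is the share
                      * of map walks that needed "extra" moves. When both
                      * exceed their sysctl thresholds we disable SACK
                      * processing and clamp cwnd to the flight size; the
                      * else branch below reverses that if it starts to
                      * look like a false positive.
                      */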
5110                 uint64_t ackratio, moveratio, movetotal;
5111
5112                 /* Log detecting */
5113                 rack_log_sad(rack, 1);
5114                 ackratio = (uint64_t)(rack->r_ctl.sack_count);
5115                 ackratio *= (uint64_t)(1000);
5116                 if (rack->r_ctl.ack_count)
5117                         ackratio /= (uint64_t)(rack->r_ctl.ack_count);
5118                 else {
5119                         /* We really should not hit here */
5120                         ackratio = 1000;
5121                 }
5122                 if ((rack->sack_attack_disable  == 0) &&
5123                     (ackratio > rack_highest_sack_thresh_seen))
5124                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
5125                 movetotal = rack->r_ctl.sack_moved_extra;
5126                 movetotal += rack->r_ctl.sack_noextra_move;
5127                 moveratio = rack->r_ctl.sack_moved_extra;
5128                 moveratio *= (uint64_t)1000;
5129                 if (movetotal)
5130                         moveratio /= movetotal;
5131                 else {
5132                         /* No moves, that's pretty good */
5133                         moveratio = 0;
5134                 }
5135                 if ((rack->sack_attack_disable == 0) &&
5136                     (moveratio > rack_highest_move_thresh_seen))
5137                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
5138                 if (rack->sack_attack_disable == 0) { 
5139                         if ((ackratio > tcp_sack_to_ack_thresh) &&
5140                             (moveratio > tcp_sack_to_move_thresh)) {
5141                                 /* Disable sack processing */
5142                                 rack->sack_attack_disable = 1;
5143                                 if (rack->r_rep_attack == 0) {
5144                                         rack->r_rep_attack = 1;
5145                                         counter_u64_add(rack_sack_attacks_detected, 1);
5146                                 }
5147                                 if (tcp_attack_on_turns_on_logging) {
5148                                         /* 
5149                                          * Turn on logging, used for debugging
5150                                          * false positives.
5151                                          */
5152                                         rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
5153                                 }
5154                                 /* Clamp the cwnd at flight size */
5155                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
5156                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
5157                                 rack_log_sad(rack, 2);
5158                         }
5159                 } else {
5160                         /* We are sack-disabled; check for false positives */
5161                         if ((ackratio <= tcp_restoral_thresh) ||
5162                             (rack->r_ctl.rc_num_maps_alloced  < tcp_map_minimum)) {
5163                                 rack->sack_attack_disable  = 0;
5164                                 rack_log_sad(rack, 3);
5165                                 /* Restart counting */
5166                                 rack->r_ctl.sack_count = 0;
5167                                 rack->r_ctl.sack_moved_extra = 0;
5168                                 rack->r_ctl.sack_noextra_move = 1;
5169                                 rack->r_ctl.ack_count = max(1,
5170                                       (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp)));
5171                                 
5172                                 if (rack->r_rep_reverse == 0) {
5173                                         rack->r_rep_reverse = 1;
5174                                         counter_u64_add(rack_sack_attacks_reversed, 1);
5175                                 }
5176                                 /* Restore the cwnd */
5177                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
5178                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
5179                         }
5180                 }
5181         }
5182 #endif
5183         if (changed) {
5184                 /* Something changed; cancel the rack timer */
5185                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5186         }
5187         if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
5188                 /*
5189                  * Ok we have a high probability that we need to go into
5190                  * recovery since we have data sack'd.
5191                  */
5192                 struct rack_sendmap *rsm;
5193                 uint32_t tsused;
5194
5195                 tsused = tcp_ts_getticks();
5196                 rsm = tcp_rack_output(tp, rack, tsused);
5197                 if (rsm) {
5198                         /* Enter recovery */
5199                         rack->r_ctl.rc_rsm_start = rsm->r_start;
5200                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
5201                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
5202                         entered_recovery = 1;
5203                         rack_cong_signal(tp, NULL, CC_NDUPACK);
5204                         /*
5205                          * When we enter recovery we need to ensure we send
5206                          * one packet.
5207                          */
5208                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5209                         rack_log_to_prr(rack, 8);
5210                         rack->r_timer_override = 1;
5211                 }
5212         }
5213         if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
5214                 /* Deal with changed and PRR here (in recovery only) */
5215                 uint32_t pipe, snd_una;
5216
5217                 rack->r_ctl.rc_prr_delivered += changed;
5218                 /* Compute prr_sndcnt */
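                     /*
                      * Proportional Rate Reduction (in the spirit of
                      * RFC 6937): pipe below estimates the data still in
                      * flight (outstanding - SACKed + retransmitted
                      * holes). While pipe > ssthresh we may send roughly
                      * (prr_delivered * ssthresh / rc_prr_recovery_fs)
                      * - prr_out bytes, pacing retransmissions against
                      * delivered data; once pipe falls to or below
                      * ssthresh we switch to a limited-transmit style
                      * bound so cwnd converges toward ssthresh.
                      */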
5219                 if (SEQ_GT(tp->snd_una, th_ack)) {
5220                         snd_una = tp->snd_una;
5221                 } else {
5222                         snd_una = th_ack;
5223                 }
5224                 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
5225                 if (pipe > tp->snd_ssthresh) {
5226                         long sndcnt;
5227
5228                         sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
5229                         if (rack->r_ctl.rc_prr_recovery_fs > 0)
5230                                 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
5231                         else {
5232                                 rack->r_ctl.rc_prr_sndcnt = 0;
5233                                 rack_log_to_prr(rack, 9);
5234                                 sndcnt = 0;
5235                         }
5236                         sndcnt++;
5237                         if (sndcnt > (long)rack->r_ctl.rc_prr_out)
5238                                 sndcnt -= rack->r_ctl.rc_prr_out;
5239                         else
5240                                 sndcnt = 0;
5241                         rack->r_ctl.rc_prr_sndcnt = sndcnt;
5242                         rack_log_to_prr(rack, 10);
5243                 } else {
5244                         uint32_t limit;
5245
5246                         if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
5247                                 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
5248                         else
5249                                 limit = 0;
5250                         if (changed > limit)
5251                                 limit = changed;
5252                         limit += ctf_fixed_maxseg(tp);
5253                         if (tp->snd_ssthresh > pipe) {
5254                                 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
5255                                 rack_log_to_prr(rack, 11);
5256                         } else {
5257                                 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
5258                                 rack_log_to_prr(rack, 12);
5259                         }
5260                 }
5261                 if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) {
5262                         rack->r_timer_override = 1;
5263                 }
5264         }
5265 }
5266
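     /*
      * A duplicate ack (same ack as snd_una with an unchanged window)
      * strikes the oldest outstanding send-map entry; once its dup-ack
      * count reaches DUP_ACK_THRESHOLD we ask for output so the
      * retransmit logic gets a chance to run.
      */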
5267 static void
5268 rack_strike_dupack(struct tcp_rack *rack)
5269 {
5270         struct rack_sendmap *rsm;
5271
5272         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5273         if (rsm && (rsm->r_dupack < 0xff)) {
5274                 rsm->r_dupack++;
5275                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
5276                         rack->r_wanted_output = 1;
5277                         rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
5278                 } else {
5279                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
5280                 }
5281         }
5282 }
5283
5284 /*
5285  * Return value of 1, we do not need to call rack_process_data().
5286  * return value of 0, rack_process_data can be called.
5287  * For ret_val if its 0 the TCP is locked, if its non-zero
5288  * its unlocked and probably unsafe to touch the TCB.
5289  */
5290 static int
5291 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5292     struct tcpcb *tp, struct tcpopt *to,
5293     uint32_t tiwin, int32_t tlen,
5294     int32_t * ofia, int32_t thflags, int32_t * ret_val)
5295 {
5296         int32_t ourfinisacked = 0;
5297         int32_t nsegs, acked_amount;
5298         int32_t acked;
5299         struct mbuf *mfree;
5300         struct tcp_rack *rack;
5301         int32_t recovery = 0;
5302
5303         rack = (struct tcp_rack *)tp->t_fb_ptr;
5304         if (SEQ_GT(th->th_ack, tp->snd_max)) {
5305                 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
5306                 rack->r_wanted_output++;
5307                 return (1);
5308         }
5309         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
5310                 if (rack->rc_in_persist)
5311                         tp->t_rxtshift = 0;
5312                 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
5313                         rack_strike_dupack(rack);
5314                 rack_log_ack(tp, to, th);
5315         }
5316         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5317                 /*
5318                  * Old ack, behind (or duplicate to) the last one rcv'd.
5319                  * Note: Should mark that reordering is occurring! We should also
5320                  * look for sack blocks arriving, e.g. ack 1, 4-4 then ack 1,
5321                  * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
5322                  * retran and> ack 3
5323                  */
5324                 return (0);
5325         }
5326         /*
5327          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
5328          * something we sent.
5329          */
5330         if (tp->t_flags & TF_NEEDSYN) {
5331                 /*
5332                  * T/TCP: Connection was half-synchronized, and our SYN has
5333                  * been ACK'd (so connection is now fully synchronized).  Go
5334                  * to non-starred state, increment snd_una for ACK of SYN,
5335                  * and check if we can do window scaling.
5336                  */
5337                 tp->t_flags &= ~TF_NEEDSYN;
5338                 tp->snd_una++;
5339                 /* Do window scaling? */
5340                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5341                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5342                         tp->rcv_scale = tp->request_r_scale;
5343                         /* Send window already scaled. */
5344                 }
5345         }
5346         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5347         INP_WLOCK_ASSERT(tp->t_inpcb);
5348
5349         acked = BYTES_THIS_ACK(tp, th);
5350         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5351         TCPSTAT_ADD(tcps_rcvackbyte, acked);
5352
5353         /*
5354          * If we just performed our first retransmit, and the ACK arrives
5355          * within our recovery window, then it was a mistake to do the
5356          * retransmit in the first place.  Recover our original cwnd and
5357          * ssthresh, and proceed to transmit where we left off.
5358          */
5359         if (tp->t_flags & TF_PREVVALID) {
5360                 tp->t_flags &= ~TF_PREVVALID;
5361                 if (tp->t_rxtshift == 1 &&
5362                     (int)(ticks - tp->t_badrxtwin) < 0)
5363                         rack_cong_signal(tp, th, CC_RTO_ERR);
5364         }
5365         /*
5366          * If we have a timestamp reply, update smoothed round trip time. If
5367          * no timestamp is present but transmit timer is running and timed
5368          * sequence number was acked, update smoothed round trip time. Since
5369          * we now have an rtt measurement, cancel the timer backoff (cf.,
5370          * Phil Karn's retransmit alg.). Recompute the initial retransmit
5371          * timer.
5372          *
5373          * Some boxes send broken timestamp replies during the SYN+ACK
5374          * phase, ignore timestamps of 0 or we could calculate a huge RTT
5375          * and blow up the retransmit timer.
5376          */
5377         /*
5378          * If all outstanding data is acked, stop retransmit timer and
5379          * remember to restart (more output or persist). If there is more
5380          * data to be acked, restart retransmit timer, using current
5381          * (possibly backed-off) value.
5382          */
5383         if (th->th_ack == tp->snd_max) {
5384                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5385                 rack->r_wanted_output++;
5386         }
5387         if (acked == 0) {
5388                 if (ofia)
5389                         *ofia = ourfinisacked;
5390                 return (0);
5391         }
5392         if (rack->r_ctl.rc_early_recovery) {
5393                 if (IN_RECOVERY(tp->t_flags)) {
5394                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5395                             (SEQ_LT(th->th_ack, tp->snd_max))) {
5396                                 tcp_rack_partialack(tp, th);
5397                         } else {
5398                                 rack_post_recovery(tp, th);
5399                                 recovery = 1;
5400                         }
5401                 }
5402         }
5403         /*
5404          * Let the congestion control algorithm update congestion control
5405          * related information. This typically means increasing the
5406          * congestion window.
5407          */
5408         rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
5409         SOCKBUF_LOCK(&so->so_snd);
5410         acked_amount = min(acked, (int)sbavail(&so->so_snd));
5411         tp->snd_wnd -= acked_amount;
5412         mfree = sbcut_locked(&so->so_snd, acked_amount);
5413         if ((sbused(&so->so_snd) == 0) &&
5414             (acked > acked_amount) &&
5415             (tp->t_state >= TCPS_FIN_WAIT_1)) {
5416                 ourfinisacked = 1;
5417         }
5418         /* NB: sowwakeup_locked() does an implicit unlock. */
5419         sowwakeup_locked(so);
5420         m_freem(mfree);
5421         if (rack->r_ctl.rc_early_recovery == 0) {
5422                 if (IN_RECOVERY(tp->t_flags)) {
5423                         if (SEQ_LT(th->th_ack, tp->snd_recover) &&
5424                             (SEQ_LT(th->th_ack, tp->snd_max))) {
5425                                 tcp_rack_partialack(tp, th);
5426                         } else {
5427                                 rack_post_recovery(tp, th);
5428                         }
5429                 }
5430         }
5431         tp->snd_una = th->th_ack;
5432         if (SEQ_GT(tp->snd_una, tp->snd_recover))
5433                 tp->snd_recover = tp->snd_una;
5434
5435         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
5436                 tp->snd_nxt = tp->snd_una;
5437         }
5438         if (tp->snd_una == tp->snd_max) {
5439                 /* Nothing left outstanding */
5440                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5441                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
5442                         tp->t_acktime = 0;
5443                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5444                 /* Set need output so persist might get set */
5445                 rack->r_wanted_output++;
5446                 if (rack_use_sack_filter)
5447                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5448                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
5449                     (sbavail(&so->so_snd) == 0) &&
5450                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
5451                         /* 
5452                          * The socket was gone and the
5453                          * peer sent data, time to
5454                          * reset the connection.
5455                          */
5456                         *ret_val = 1;
5457                         tp = tcp_close(tp);
5458                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
5459                         return (1);
5460                 }
5461         }
5462         if (ofia)
5463                 *ofia = ourfinisacked;
5464         return (0);
5465 }
5466
5467 static void
5468 rack_collapsed_window(struct tcp_rack *rack)
5469 {
5470         /*
5471          * Now we must walk the
5472          * send map and divide the
5473          * ones left stranded. These
5474          * guys can't cause us to abort
5475          * the connection and are really
5476          * "unsent". However, if a buggy
5477          * client actually did keep some
5478          * of the data, i.e. collapsed the win,
5479          * refused to ack, and then opened
5480          * the win and acked that data, we
5481          * would get into an ack war, so the
5482          * simpler method of just pretending
5483          * we did not send those segments
5484          * won't work.
5485          */
5486         struct rack_sendmap *rsm, *nrsm, fe, *insret;
5487         tcp_seq max_seq;
5488         uint32_t maxseg;
5489
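             /*
              * max_seq is the highest sequence the peer's current window
              * will still accept; every send-map entry at or beyond it is
              * stranded and gets marked collapsed below.
              */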
5490         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
5491         maxseg = ctf_fixed_maxseg(rack->rc_tp);
5492         memset(&fe, 0, sizeof(fe));
5493         fe.r_start = max_seq;
5494         /* Find the first seq past or at maxseq */
5495         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
5496         if (rsm == NULL) {
5497                 /* Nothing to do, strange */
5498                 rack->rc_has_collapsed = 0;
5499                 return;
5500         }
5501         /* 
5502          * Now do we need to split at
5503          * the collapse point?
5504          */
5505         if (SEQ_GT(max_seq, rsm->r_start)) {
5506                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
5507                 if (nrsm == NULL) {
5508                         /* We can't get a rsm, mark all? */
5509                         nrsm = rsm;
5510                         goto no_split;
5511                 }
5512                 /* Clone it */
5513                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
5514                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
5515 #ifdef INVARIANTS
5516                 if (insret != NULL) {
5517                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
5518                               nrsm, insret, rack, rsm);
5519                 }
5520 #endif
5521                 if (rsm->r_in_tmap) {
5522                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
5523                         nrsm->r_in_tmap = 1;
5524                 }
5525                 /* 
5526                  * Use the new RSM as the
5527                  * collapsed starting point.
5528                  */
5529                 rsm = nrsm;
5530         }
5531 no_split:
5532         counter_u64_add(rack_collapsed_win, 1);
5533         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
5534                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
5535                 rack->rc_has_collapsed = 1;
5536         }
5537 }
5538
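     /*
      * The peer re-opened its window: walk the send map from the highest
      * sequence backwards, clearing RACK_RWND_COLLAPSED until we hit the
      * first entry that was never marked, then note we are un-collapsed.
      */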
5539 static void
5540 rack_un_collapse_window(struct tcp_rack *rack)
5541 {
5542         struct rack_sendmap *rsm;
5543
5544         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
5545                 if (rsm->r_flags & RACK_RWND_COLLAPSED)
5546                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
5547                 else
5548                         break;
5549         }
5550         rack->rc_has_collapsed = 0;
5551 }
5552
5553 /*
5554  * Return value of 1, the TCB is unlocked and most
5555  * likely gone, return value of 0, the TCP is still
5556  * locked.
5557  */
5558 static int
5559 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
5560     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
5561     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5562 {
5563         /*
5564          * Update window information. Don't look at window if no ACK: TAC's
5565          * send garbage on first SYN.
5566          */
5567         int32_t nsegs;
5568         int32_t tfo_syn;
5569         struct tcp_rack *rack;
5570
5571         rack = (struct tcp_rack *)tp->t_fb_ptr;
5572         INP_WLOCK_ASSERT(tp->t_inpcb);
5573         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5574         if ((thflags & TH_ACK) &&
5575             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
5576             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
5577             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
5578                 /* keep track of pure window updates */
5579                 if (tlen == 0 &&
5580                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
5581                         TCPSTAT_INC(tcps_rcvwinupd);
5582                 tp->snd_wnd = tiwin;
5583                 tp->snd_wl1 = th->th_seq;
5584                 tp->snd_wl2 = th->th_ack;
5585                 if (tp->snd_wnd > tp->max_sndwnd)
5586                         tp->max_sndwnd = tp->snd_wnd;
5587                 rack->r_wanted_output++;
5588         } else if (thflags & TH_ACK) {
5589                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
5590                         tp->snd_wnd = tiwin;
5591                         tp->snd_wl1 = th->th_seq;
5592                         tp->snd_wl2 = th->th_ack;
5593                 }
5594         }
5595         if (tp->snd_wnd < ctf_outstanding(tp))
5596                 /* The peer collapsed the window */
5597                 rack_collapsed_window(rack);
5598         else if (rack->rc_has_collapsed)
5599                 rack_un_collapse_window(rack);
5600         /* Was persist timer active and now we have window space? */
5601         if ((rack->rc_in_persist != 0) &&
5602             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
5603                                 rack->r_ctl.rc_pace_min_segs))) {
5604                 rack_exit_persist(tp, rack);
5605                 tp->snd_nxt = tp->snd_max;
5606                 /* Make sure we output to start the timer */
5607                 rack->r_wanted_output++;
5608         }
5609         /* Do we enter persists? */
5610         if ((rack->rc_in_persist == 0) &&
5611             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
5612             TCPS_HAVEESTABLISHED(tp->t_state) &&
5613             (tp->snd_max == tp->snd_una) &&
5614             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
5615             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
5616                 /*
5617                  * Here the rwnd is less than
5618                  * the pacing size, we are established,
5619                  * nothing is outstanding, and there is
5620                  * data to send. Enter persists.
5621                  */
5622                 tp->snd_nxt = tp->snd_una;
5623                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
5624         }
5625         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
5626                 m_freem(m);
5627                 return (0);
5628         }
5629         /*
5630          * Process segments with URG.
5631          */
5632         if ((thflags & TH_URG) && th->th_urp &&
5633             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5634                 /*
5635                  * This is a kludge, but if we receive and accept random
5636                  * urgent pointers, we'll crash in soreceive.  It's hard to
5637                  * imagine someone actually wanting to send this much urgent
5638                  * data.
5639                  */
5640                 SOCKBUF_LOCK(&so->so_rcv);
5641                 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
5642                         th->th_urp = 0; /* XXX */
5643                         thflags &= ~TH_URG;     /* XXX */
5644                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
5645                         goto dodata;    /* XXX */
5646                 }
5647                 /*
5648                  * If this segment advances the known urgent pointer, then
5649                  * mark the data stream.  This should not happen in
5650                  * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
5651                  * FIN has been received from the remote side. In these
5652                  * states we ignore the URG.
5653                  *
5654                  * According to RFC961 (Assigned Protocols), the urgent
5655                  * pointer points to the last octet of urgent data.  We
5656                  * continue, however, to consider it to indicate the first
5657                  * octet of data past the urgent section as the original
5658                  * spec states (in one of two places).
5659                  */
5660                 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
5661                         tp->rcv_up = th->th_seq + th->th_urp;
5662                         so->so_oobmark = sbavail(&so->so_rcv) +
5663                             (tp->rcv_up - tp->rcv_nxt) - 1;
5664                         if (so->so_oobmark == 0)
5665                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
5666                         sohasoutofband(so);
5667                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
5668                 }
5669                 SOCKBUF_UNLOCK(&so->so_rcv);
5670                 /*
5671                  * Remove out of band data so doesn't get presented to user.
5672                  * This can happen independent of advancing the URG pointer,
5673                  * but if two URG's are pending at once, some out-of-band
5674                  * data may creep in... ick.
5675                  */
5676                 if (th->th_urp <= (uint32_t) tlen &&
5677                     !(so->so_options & SO_OOBINLINE)) {
5678                         /* hdr drop is delayed */
5679                         tcp_pulloutofband(so, th, m, drop_hdrlen);
5680                 }
5681         } else {
5682                 /*
5683                  * If no out of band data is expected, pull receive urgent
5684                  * pointer along with the receive window.
5685                  */
5686                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
5687                         tp->rcv_up = tp->rcv_nxt;
5688         }
5689 dodata:                         /* XXX */
5690         INP_WLOCK_ASSERT(tp->t_inpcb);
5691
5692         /*
5693          * Process the segment text, merging it into the TCP sequencing
5694          * queue, and arranging for acknowledgment of receipt if necessary.
5695          * This process logically involves adjusting tp->rcv_wnd as data is
5696          * presented to the user (this happens in tcp_usrreq.c, case
5697          * PRU_RCVD).  If a FIN has already been received on this connection
5698          * then we just ignore the text.
5699          */
5700         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
5701                    IS_FASTOPEN(tp->t_flags));
5702         if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
5703             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5704                 tcp_seq save_start = th->th_seq;
5705                 tcp_seq save_rnxt  = tp->rcv_nxt;
5706                 int     save_tlen  = tlen;
5707
5708                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5709                 /*
5710                  * Insert segment which includes th into TCP reassembly
5711                  * queue with control block tp.  Set thflags to whether
5712                  * reassembly now includes a segment with FIN.  This handles
5713                  * the common case inline (segment is the next to be
5714                  * received on an established connection, and the queue is
5715                  * empty), avoiding linkage into and removal from the queue
5716                  * and repetition of various conversions. Set DELACK for
5717                  * segments received in order, but ack immediately when
5718                  * segments are out of order (so fast retransmit can work).
5719                  */
5720                 if (th->th_seq == tp->rcv_nxt &&
5721                     SEGQ_EMPTY(tp) &&
5722                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
5723                     tfo_syn)) {
5724 #ifdef NETFLIX_SB_LIMITS
5725                         u_int mcnt, appended;
5726
5727                         if (so->so_rcv.sb_shlim) {
5728                                 mcnt = m_memcnt(m);
5729                                 appended = 0;
5730                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5731                                     CFO_NOSLEEP, NULL) == false) {
5732                                         counter_u64_add(tcp_sb_shlim_fails, 1);
5733                                         m_freem(m);
5734                                         return (0);
5735                                 }
5736                         }
5737 #endif
5738                         if (DELAY_ACK(tp, tlen) || tfo_syn) {
5739                                 rack_timer_cancel(tp, rack,
5740                                     rack->r_ctl.rc_rcvtime, __LINE__);
5741                                 tp->t_flags |= TF_DELACK;
5742                         } else {
5743                                 rack->r_wanted_output++;
5744                                 tp->t_flags |= TF_ACKNOW;
5745                         }
5746                         tp->rcv_nxt += tlen;
5747                         thflags = th->th_flags & TH_FIN;
5748                         TCPSTAT_ADD(tcps_rcvpack, nsegs);
5749                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
5750                         SOCKBUF_LOCK(&so->so_rcv);
5751                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5752                                 m_freem(m);
5753                         } else
5754 #ifdef NETFLIX_SB_LIMITS
5755                                 appended =
5756 #endif
5757                                         sbappendstream_locked(&so->so_rcv, m, 0);
5758                         /* NB: sorwakeup_locked() does an implicit unlock. */
5759                         sorwakeup_locked(so);
5760 #ifdef NETFLIX_SB_LIMITS
5761                         if (so->so_rcv.sb_shlim && appended != mcnt)
5762                                 counter_fo_release(so->so_rcv.sb_shlim,
5763                                     mcnt - appended);
5764 #endif
5765                 } else {
5766                         /*
5767                          * XXX: Due to the header drop above "th" is
5768                          * theoretically invalid by now.  Fortunately
5769                          * m_adj() doesn't actually frees any mbufs when
5770                          * m_adj() doesn't actually free any mbufs when
5771                          */
5772                         tcp_seq temp = save_start;
5773                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
5774                         tp->t_flags |= TF_ACKNOW;
5775                 }
5776                 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
5777                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
5778                                 /*
5779                                  * DSACK actually handled in the fastpath
5780                                  * above.
5781                                  */
5782                                 tcp_update_sack_list(tp, save_start,
5783                                     save_start + save_tlen);
5784                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
5785                                 if ((tp->rcv_numsacks >= 1) &&
5786                                     (tp->sackblks[0].end == save_start)) {
5787                                         /*
5788                                          * Partial overlap, recorded at todrop
5789                                          * above.
5790                                          */
5791                                         tcp_update_sack_list(tp,
5792                                             tp->sackblks[0].start,
5793                                             tp->sackblks[0].end);
5794                                 } else {
5795                                         tcp_update_dsack_list(tp, save_start,
5796                                             save_start + save_tlen);
5797                                 }
5798                         } else if (tlen >= save_tlen) {
5799                                 /* Update of sackblks. */
5800                                 tcp_update_dsack_list(tp, save_start,
5801                                     save_start + save_tlen);
5802                         } else if (tlen > 0) {
5803                                 tcp_update_dsack_list(tp, save_start,
5804                                     save_start + tlen);
5805                         }
5806                 }
5807         } else {
5808                 m_freem(m);
5809                 thflags &= ~TH_FIN;
5810         }
5811
5812         /*
5813          * If FIN is received ACK the FIN and let the user know that the
5814          * connection is closing.
5815          */
5816         if (thflags & TH_FIN) {
5817                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5818                         socantrcvmore(so);
5819                         /*
5820                          * If connection is half-synchronized (ie NEEDSYN
5821                          * flag on) then delay ACK, so it may be piggybacked
5822                          * when SYN is sent. Otherwise, since we received a
5823                          * FIN then no more input can be expected, send ACK
5824                          * now.
5825                          */
5826                         if (tp->t_flags & TF_NEEDSYN) {
5827                                 rack_timer_cancel(tp, rack,
5828                                     rack->r_ctl.rc_rcvtime, __LINE__);
5829                                 tp->t_flags |= TF_DELACK;
5830                         } else {
5831                                 tp->t_flags |= TF_ACKNOW;
5832                         }
5833                         tp->rcv_nxt++;
5834                 }
5835                 switch (tp->t_state) {
5836
5837                         /*
5838                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
5839                          * CLOSE_WAIT state.
5840                          */
5841                 case TCPS_SYN_RECEIVED:
5842                         tp->t_starttime = ticks;
5843                         /* FALLTHROUGH */
5844                 case TCPS_ESTABLISHED:
5845                         rack_timer_cancel(tp, rack,
5846                             rack->r_ctl.rc_rcvtime, __LINE__);
5847                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
5848                         break;
5849
5850                         /*
5851                          * If still in FIN_WAIT_1 STATE FIN has not been
5852                          * acked so enter the CLOSING state.
5853                          */
5854                 case TCPS_FIN_WAIT_1:
5855                         rack_timer_cancel(tp, rack,
5856                             rack->r_ctl.rc_rcvtime, __LINE__);
5857                         tcp_state_change(tp, TCPS_CLOSING);
5858                         break;
5859
5860                         /*
5861                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
5862                          * starting the time-wait timer, turning off the
5863                          * other standard timers.
5864                          */
5865                 case TCPS_FIN_WAIT_2:
5866                         rack_timer_cancel(tp, rack,
5867                             rack->r_ctl.rc_rcvtime, __LINE__);
5868                         tcp_twstart(tp);
5869                         return (1);
5870                 }
5871         }
5872         /*
5873          * Return any desired output.
5874          */
5875         if ((tp->t_flags & TF_ACKNOW) ||
5876             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
5877                 rack->r_wanted_output++;
5878         }
5879         INP_WLOCK_ASSERT(tp->t_inpcb);
5880         return (0);
5881 }
5882
5883 /*
5884  * Here nothing is really faster, it's just that we
5885  * have broken out the fast-data path separately, just like
5886  * the fast-ack path.
5887  */
5888 static int
5889 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
5890     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5891     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
5892 {
5893         int32_t nsegs;
5894         int32_t newsize = 0;    /* automatic sockbuf scaling */
5895         struct tcp_rack *rack;
5896 #ifdef NETFLIX_SB_LIMITS
5897         u_int mcnt, appended;
5898 #endif
5899 #ifdef TCPDEBUG
5900         /*
5901          * The size of tcp_saveipgen must be the size of the max ip header,
5902          * now IPv6.
5903          */
5904         u_char tcp_saveipgen[IP6_HDR_LEN];
5905         struct tcphdr tcp_savetcp;
5906         short ostate = 0;
5907
5908 #endif
5909         /*
5910          * If last ACK falls within this segment's sequence numbers, record
5911          * the timestamp. NOTE that the test is modified according to the
5912          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5913          */
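             /*
              * Fast-path preconditions: the segment must be the next
              * in-sequence data, we must not be retransmitting, the
              * advertised window must not have changed, no SYN/FIN
              * processing may be pending, the timestamp (if present) must
              * not be old, the ack must simply duplicate snd_una, and the
              * data must fit in the receive buffer.
              */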
5914         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
5915                 return (0);
5916         }
5917         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5918                 return (0);
5919         }
5920         if (tiwin && tiwin != tp->snd_wnd) {
5921                 return (0);
5922         }
5923         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5924                 return (0);
5925         }
5926         if (__predict_false((to->to_flags & TOF_TS) &&
5927             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5928                 return (0);
5929         }
5930         if (__predict_false((th->th_ack != tp->snd_una))) {
5931                 return (0);
5932         }
5933         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5934                 return (0);
5935         }
5936         if ((to->to_flags & TOF_TS) != 0 &&
5937             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5938                 tp->ts_recent_age = tcp_ts_getticks();
5939                 tp->ts_recent = to->to_tsval;
5940         }
5941         rack = (struct tcp_rack *)tp->t_fb_ptr;
5942         /*
5943          * This is a pure, in-sequence data packet with nothing on the
5944          * reassembly queue and we have enough buffer space to take it.
5945          */
5946         nsegs = max(1, m->m_pkthdr.lro_nsegs);
5947
5948 #ifdef NETFLIX_SB_LIMITS
5949         if (so->so_rcv.sb_shlim) {
5950                 mcnt = m_memcnt(m);
5951                 appended = 0;
5952                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
5953                     CFO_NOSLEEP, NULL) == false) {
5954                         counter_u64_add(tcp_sb_shlim_fails, 1);
5955                         m_freem(m);
5956                         return (1);
5957                 }
5958         }
5959 #endif
5960         /* Clean receiver SACK report if present */
5961         if (tp->rcv_numsacks)
5962                 tcp_clean_sackreport(tp);
5963         TCPSTAT_INC(tcps_preddat);
5964         tp->rcv_nxt += tlen;
5965         /*
5966          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5967          */
5968         tp->snd_wl1 = th->th_seq;
5969         /*
5970          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5971          */
5972         tp->rcv_up = tp->rcv_nxt;
5973         TCPSTAT_ADD(tcps_rcvpack, nsegs);
5974         TCPSTAT_ADD(tcps_rcvbyte, tlen);
5975 #ifdef TCPDEBUG
5976         if (so->so_options & SO_DEBUG)
5977                 tcp_trace(TA_INPUT, ostate, tp,
5978                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
5979 #endif
5980         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5981
5982         /* Add data to socket buffer. */
5983         SOCKBUF_LOCK(&so->so_rcv);
5984         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5985                 m_freem(m);
5986         } else {
5987                 /*
5988                  * Set new socket buffer size. Give up when limit is
5989                  * reached.
5990                  */
5991                 if (newsize)
5992                         if (!sbreserve_locked(&so->so_rcv,
5993                             newsize, so, NULL))
5994                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5995                 m_adj(m, drop_hdrlen);  /* delayed header drop */
5996 #ifdef NETFLIX_SB_LIMITS
5997                 appended =
5998 #endif
5999                         sbappendstream_locked(&so->so_rcv, m, 0);
6000                 ctf_calc_rwin(so, tp);
6001         }
6002         /* NB: sorwakeup_locked() does an implicit unlock. */
6003         sorwakeup_locked(so);
6004 #ifdef NETFLIX_SB_LIMITS
6005         if (so->so_rcv.sb_shlim && mcnt != appended)
6006                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
6007 #endif
6008         if (DELAY_ACK(tp, tlen)) {
6009                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6010                 tp->t_flags |= TF_DELACK;
6011         } else {
6012                 tp->t_flags |= TF_ACKNOW;
6013                 rack->r_wanted_output++;
6014         }
6015         if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
6016                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
6017         return (1);
6018 }
6019
6020 /*
6021  * This subfunction is used to try to highly optimize the
6022  * fast path. We again allow window updates that are
6023  * in sequence to remain in the fast-path. We also add
6024  * in the __predict's to attempt to help the compiler.
6025  * Note that if we return a 0, then we can *not* process
6026  * it and the caller should push the packet into the
6027  * slow-path.
6028  */
6029 static int
6030 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6031     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6032     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts, uint8_t iptos)
6033 {
6034         int32_t acked;
6035         int32_t nsegs;
6036
6037 #ifdef TCPDEBUG
6038         /*
6039          * The size of tcp_saveipgen must be the size of the max ip header,
6040          * now IPv6.
6041          */
6042         u_char tcp_saveipgen[IP6_HDR_LEN];
6043         struct tcphdr tcp_savetcp;
6044         short ostate = 0;
6045
6046 #endif
6047         struct tcp_rack *rack;
6048
6049         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
6050                 /* Old ack, behind (or duplicate to) the last one rcv'd */
6051                 return (0);
6052         }
6053         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
6054                 /* Above what we have sent? */
6055                 return (0);
6056         }
6057         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
6058                 /* We are retransmitting */
6059                 return (0);
6060         }
6061         if (__predict_false(tiwin == 0)) {
6062                 /* zero window */
6063                 return (0);
6064         }
6065         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
6066                 /* We need a SYN or a FIN, unlikely.. */
6067                 return (0);
6068         }
6069         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
6070                 /* Timestamp is behind .. old ack with seq wrap? */
6071                 return (0);
6072         }
6073         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
6074                 /* Still recovering */
6075                 return (0);
6076         }
6077         rack = (struct tcp_rack *)tp->t_fb_ptr;
6078         if (rack->r_ctl.rc_sacked) {
6079                 /* We have sack holes on our scoreboard */
6080                 return (0);
6081         }
6082         /* Ok if we reach here, we can process a fast-ack */
6083         nsegs = max(1, m->m_pkthdr.lro_nsegs);
6084         rack_log_ack(tp, to, th);
6085         /*
6086          * We made progress, clear the TLP
6087          * retransmit-out flag so we can
6088          * start a TLP again.
6089          */
6090         rack->r_ctl.rc_tlp_rtx_out = 0;
6091         /* Did the window get updated? */
6092         if (tiwin != tp->snd_wnd) {
6093                 tp->snd_wnd = tiwin;
6094                 tp->snd_wl1 = th->th_seq;
6095                 if (tp->snd_wnd > tp->max_sndwnd)
6096                         tp->max_sndwnd = tp->snd_wnd;
6097         }
6098         /* Do we exit persists? */
6099         if ((rack->rc_in_persist != 0) &&
6100             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
6101                                rack->r_ctl.rc_pace_min_segs))) {
6102                 rack_exit_persist(tp, rack);
6103         }
6104         /* Do we enter persists? */
6105         if ((rack->rc_in_persist == 0) &&
6106             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
6107             TCPS_HAVEESTABLISHED(tp->t_state) &&
6108             (tp->snd_max == tp->snd_una) &&
6109             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
6110             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
6111                 /*
6112                  * Here the rwnd is less than
6113                  * the pacing size, we are established,
6114                  * nothing is outstanding, and there is
6115                  * data to send. Enter persists.
6116                  */
6117                 tp->snd_nxt = tp->snd_una;
6118                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
6119         }
6120         /*
6121          * If last ACK falls within this segment's sequence numbers, record
6122          * the timestamp. NOTE that the test is modified according to the
6123          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
6124          */
6125         if ((to->to_flags & TOF_TS) != 0 &&
6126             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
6127                 tp->ts_recent_age = tcp_ts_getticks();
6128                 tp->ts_recent = to->to_tsval;
6129         }
6130         /*
6131          * This is a pure ack for outstanding data.
6132          */
6133         TCPSTAT_INC(tcps_predack);
6134
6135         /*
6136          * "bad retransmit" recovery.
6137          */
6138         if (tp->t_flags & TF_PREVVALID) {
6139                 tp->t_flags &= ~TF_PREVVALID;
6140                 if (tp->t_rxtshift == 1 &&
6141                     (int)(ticks - tp->t_badrxtwin) < 0)
6142                         rack_cong_signal(tp, th, CC_RTO_ERR);
6143         }
6144         /*
6145          * Recalculate the transmit timer / rtt.
6146          *
6147          * Some boxes send broken timestamp replies during the SYN+ACK
6148          * phase, ignore timestamps of 0 or we could calculate a huge RTT
6149          * and blow up the retransmit timer.
6150          */
6151         acked = BYTES_THIS_ACK(tp, th);
6152
6153 #ifdef TCP_HHOOK
6154         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
6155         hhook_run_tcp_est_in(tp, th, to);
6156 #endif
6157
6158         TCPSTAT_ADD(tcps_rcvackpack, nsegs);
6159         TCPSTAT_ADD(tcps_rcvackbyte, acked);
6160         sbdrop(&so->so_snd, acked);
6161         /*
6162          * Let the congestion control algorithm update congestion control
6163          * related information. This typically means increasing the
6164          * congestion window.
6165          */
6166         rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
6167
6168         tp->snd_una = th->th_ack;
6169         if (tp->snd_wnd < ctf_outstanding(tp)) {
6170                 /* The peer collapsed the window */
6171                 rack_collapsed_window(rack);
6172         } else if (rack->rc_has_collapsed)
6173                 rack_un_collapse_window(rack);
6174
6175         /*
6176          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
6177          */
6178         tp->snd_wl2 = th->th_ack;
6179         tp->t_dupacks = 0;
6180         m_freem(m);
6181         /* ND6_HINT(tp);         *//* Some progress has been made. */
6182
6183         /*
6184          * If all outstanding data are acked, stop retransmit timer,
6185          * otherwise restart timer using current (possibly backed-off)
6186          * value. If process is waiting for space, wakeup/selwakeup/signal.
6187          * If data are ready to send, let tcp_output decide between more
6188          * output or persist.
6189          */
6190 #ifdef TCPDEBUG
6191         if (so->so_options & SO_DEBUG)
6192                 tcp_trace(TA_INPUT, ostate, tp,
6193                     (void *)tcp_saveipgen,
6194                     &tcp_savetcp, 0);
6195 #endif
6196         if (tp->snd_una == tp->snd_max) {
6197                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
6198                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
6199                         tp->t_acktime = 0;
6200                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6201         }
6202         /* Wake up the socket if we have room to write more */
6203         sowwakeup(so);
6204         if (sbavail(&so->so_snd)) {
6205                 rack->r_wanted_output++;
6206         }
6207         return (1);
6208 }
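/*
 * Editor's sketch (not part of rack.c): rack_fastack() above only takes
 * the optimized path when the ACK is strictly new, nothing is being
 * retransmitted, the advertised window is non-zero, no SYN/FIN is still
 * pending, the echoed timestamp did not go backwards, and we are neither
 * in recovery nor holding SACKed data.  The standalone predicate below
 * restates those gates over a hypothetical snapshot structure; all field
 * names are assumptions, and tsval == 0 is used as a stand-in for "no
 * timestamp option".
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe comparisons, in the spirit of the SEQ_ and TSTMP_ macros. */
#define SEQ_GT_(a, b)	((int32_t)((a) - (b)) > 0)
#define SEQ_LEQ_(a, b)	((int32_t)((a) - (b)) <= 0)

/* Hypothetical snapshot of the state the fast-ACK path examines. */
struct ack_snapshot {
	uint32_t th_ack;	/* ACK number carried by the segment */
	uint32_t snd_una;	/* oldest unacknowledged sequence number */
	uint32_t snd_max;	/* highest sequence number sent so far */
	uint32_t snd_nxt;	/* next sequence number to send */
	uint32_t tiwin;		/* advertised receive window */
	uint32_t tsval;		/* peer timestamp value, 0 if absent */
	uint32_t ts_recent;	/* most recently recorded peer timestamp */
	bool	 need_syn_fin;	/* a SYN or FIN still needs to be sent */
	bool	 in_recovery;	/* loss recovery is in progress */
	bool	 sacked_data;	/* the scoreboard holds SACKed data */
};

/* Return true when a pure ACK may be handled on the fast path. */
static bool
fastack_ok(const struct ack_snapshot *s)
{
	if (SEQ_LEQ_(s->th_ack, s->snd_una))	/* old or duplicate ACK */
		return (false);
	if (SEQ_GT_(s->th_ack, s->snd_max))	/* ACKs data we never sent */
		return (false);
	if (s->snd_nxt != s->snd_max)		/* we are retransmitting */
		return (false);
	if (s->tiwin == 0)			/* zero window */
		return (false);
	if (s->need_syn_fin)			/* handshake/teardown pending */
		return (false);
	if (s->tsval != 0 && SEQ_GT_(s->ts_recent, s->tsval))
		return (false);			/* timestamp went backwards */
	if (s->in_recovery || s->sacked_data)	/* loss: use the slow path */
		return (false);
	return (true);
}

int
main(void)
{
	struct ack_snapshot s = {
		.th_ack = 2000, .snd_una = 1000, .snd_max = 2000,
		.snd_nxt = 2000, .tiwin = 65535, .tsval = 500,
		.ts_recent = 400,
	};

	printf("fast path ok: %d\n", fastack_ok(&s));
	return (0);
}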
6209
6210 /*
6211  * Return value of 1, the TCB is unlocked and most
6212  * likely gone, return value of 0, the TCP is still
6213  * locked.
6214  */
6215 static int
6216 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
6217     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6218     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t tos)
6219 {
6220         int32_t ret_val = 0;
6221         int32_t todrop;
6222         int32_t ourfinisacked = 0;
6223         struct tcp_rack *rack;
6224
6225         ctf_calc_rwin(so, tp);
6226         /*
6227          * If the state is SYN_SENT: if seg contains an ACK, but not for our
6228          * SYN, drop the input; if seg contains a RST, then drop the
6229          * connection; if seg does not contain SYN, then drop it. Otherwise
6230          * this is an acceptable SYN segment: initialize tp->rcv_nxt and
6231          * tp->irs; if seg contains an ACK then advance tp->snd_una; if seg
6232          * contains an ECE and ECN support is enabled, the stream is ECN
6233          * capable. If SYN has been ACKed, change to ESTABLISHED, else
6234          * SYN_RCVD state; arrange for segment to be ACKed (eventually);
6235          * continue processing rest of data/controls, beginning with URG.
6236          */
6237         if ((thflags & TH_ACK) &&
6238             (SEQ_LEQ(th->th_ack, tp->iss) ||
6239             SEQ_GT(th->th_ack, tp->snd_max))) {
6240                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6241                 return (1);
6242         }
6243         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
6244                 TCP_PROBE5(connect__refused, NULL, tp,
6245                     mtod(m, const char *), tp, th);
6246                 tp = tcp_drop(tp, ECONNREFUSED);
6247                 ctf_do_drop(m, tp);
6248                 return (1);
6249         }
6250         if (thflags & TH_RST) {
6251                 ctf_do_drop(m, tp);
6252                 return (1);
6253         }
6254         if (!(thflags & TH_SYN)) {
6255                 ctf_do_drop(m, tp);
6256                 return (1);
6257         }
6258         tp->irs = th->th_seq;
6259         tcp_rcvseqinit(tp);
6260         rack = (struct tcp_rack *)tp->t_fb_ptr;
6261         if (thflags & TH_ACK) {
6262                 int tfo_partial = 0;
6263                 
6264                 TCPSTAT_INC(tcps_connects);
6265                 soisconnected(so);
6266 #ifdef MAC
6267                 mac_socketpeer_set_from_mbuf(m, so);
6268 #endif
6269                 /* Do window scaling on this connection? */
6270                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6271                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6272                         tp->rcv_scale = tp->request_r_scale;
6273                 }
6274                 tp->rcv_adv += min(tp->rcv_wnd,
6275                     TCP_MAXWIN << tp->rcv_scale);
6276                 /*
6277                  * If not all the data that was sent in the TFO SYN
6278                  * has been acked, resend the remainder right away.
6279                  */
6280                 if (IS_FASTOPEN(tp->t_flags) &&
6281                     (tp->snd_una != tp->snd_max)) {
6282                         tp->snd_nxt = th->th_ack;
6283                         tfo_partial = 1;
6284                 }
6285                 /*
6286                  * If there's data, delay the ACK; if there's also a FIN,
6287                  * ACKNOW will be turned on later.
6288                  */
6289                 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
6290                         rack_timer_cancel(tp, rack,
6291                                           rack->r_ctl.rc_rcvtime, __LINE__);
6292                         tp->t_flags |= TF_DELACK;
6293                 } else {
6294                         rack->r_wanted_output++;
6295                         tp->t_flags |= TF_ACKNOW;
6296                 }
6297
6298                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
6299                     V_tcp_do_ecn) {
6300                         tp->t_flags2 |= TF2_ECN_PERMIT;
6301                         TCPSTAT_INC(tcps_ecn_shs);
6302                 }
6303                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
6304                         /* 
6305                          * We advance snd_una for the 
6306                          * fast open case. If th_ack is
6307                          * acknowledging data beyond 
6308                          * snd_una we can't just call
6309                          * ack-processing since the 
6310                          * data stream in our send-map
6311                          * will start at snd_una + 1 (one
6312                          * beyond the SYN). If it's just
6313                          * equal we don't need to do that
6314                          * and there is no send_map.
6315                          */
6316                         tp->snd_una++;
6317                 }
6318                 /*
6319                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
6320                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
6321                  */
6322                 tp->t_starttime = ticks;
6323                 if (tp->t_flags & TF_NEEDFIN) {
6324                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
6325                         tp->t_flags &= ~TF_NEEDFIN;
6326                         thflags &= ~TH_SYN;
6327                 } else {
6328                         tcp_state_change(tp, TCPS_ESTABLISHED);
6329                         TCP_PROBE5(connect__established, NULL, tp,
6330                             mtod(m, const char *), tp, th);
6331                         cc_conn_init(tp);
6332                 }
6333         } else {
6334                 /*
6335                  * Received initial SYN in SYN-SENT[*] state => simultaneous
6336                  * open.  If segment contains CC option and there is a
6337                  * cached CC, apply TAO test. If it succeeds, connection is *
6338                  * half-synchronized. Otherwise, do 3-way handshake:
6339                  * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
6340                  * there was no CC option, clear cached CC value.
6341                  */
6342                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
6343                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
6344         }
6345         INP_WLOCK_ASSERT(tp->t_inpcb);
6346         /*
6347          * Advance th->th_seq to correspond to first data byte. If data,
6348          * trim to stay within window, dropping FIN if necessary.
6349          */
6350         th->th_seq++;
6351         if (tlen > tp->rcv_wnd) {
6352                 todrop = tlen - tp->rcv_wnd;
6353                 m_adj(m, -todrop);
6354                 tlen = tp->rcv_wnd;
6355                 thflags &= ~TH_FIN;
6356                 TCPSTAT_INC(tcps_rcvpackafterwin);
6357                 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
6358         }
6359         tp->snd_wl1 = th->th_seq - 1;
6360         tp->rcv_up = th->th_seq;
6361         /*
6362          * Client side of transaction: already sent SYN and data. If the
6363          * remote host used T/TCP to validate the SYN, our data will be
6364          * ACK'd; if so, enter normal data segment processing in the middle
6365          * of step 5, ack processing. Otherwise, goto step 6.
6366          */
6367         if (thflags & TH_ACK) {
6368                 /* For syn-sent we need to possibly update the rtt */
6369                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6370                         uint32_t t;
6371
6372                         t = tcp_ts_getticks() - to->to_tsecr;
6373                         if (!tp->t_rttlow || tp->t_rttlow > t)
6374                                 tp->t_rttlow = t;
6375                         tcp_rack_xmit_timer(rack, t + 1);
6376                         tcp_rack_xmit_timer_commit(rack, tp);
6377                 } 
6378                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
6379                         return (ret_val);
6380                 /* We may have changed to FIN_WAIT_1 above */
6381                 if (tp->t_state == TCPS_FIN_WAIT_1) {
6382                         /*
6383                          * In FIN_WAIT_1 STATE in addition to the processing
6384                          * for the ESTABLISHED state if our FIN is now
6385                          * acknowledged then enter FIN_WAIT_2.
6386                          */
6387                         if (ourfinisacked) {
6388                                 /*
6389                                  * If we can't receive any more data, then
6390                                  * closing user can proceed. Starting the
6391                                  * timer is contrary to the specification,
6392                                  * but if we don't get a FIN we'll hang
6393                                  * forever.
6394                                  *
6395                                  * XXXjl: we should release the tp also, and
6396                                  * use a compressed state.
6397                                  */
6398                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6399                                         soisdisconnected(so);
6400                                         tcp_timer_activate(tp, TT_2MSL,
6401                                             (tcp_fast_finwait2_recycle ?
6402                                             tcp_finwait2_timeout :
6403                                             TP_MAXIDLE(tp)));
6404                                 }
6405                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6406                         }
6407                 }
6408         }
6409         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6410            tiwin, thflags, nxt_pkt));
6411 }
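/*
 * Editor's sketch (not part of rack.c): in the SYN-SENT path above the
 * first RTT estimate comes from the echoed timestamp: the current tick
 * count minus to->to_tsecr, with the lowest sample remembered and one
 * tick added before it is handed to the RACK timer machinery.  The
 * standalone sketch below shows only that arithmetic; the names are
 * assumptions and the smoothing done by tcp_rack_xmit_timer() is not
 * reproduced.
 */
#include <stdint.h>
#include <stdio.h>

/*
 * Derive an RTT sample, in ticks, from an echoed TCP timestamp and track
 * the lowest sample seen so far (stand-ins for tcp_ts_getticks() and
 * tp->t_rttlow).
 */
static uint32_t
rtt_from_ts_echo(uint32_t now_ticks, uint32_t tsecr, uint32_t *rttlow)
{
	uint32_t t;

	t = now_ticks - tsecr;		/* ticks elapsed since we stamped it */
	if (*rttlow == 0 || *rttlow > t)
		*rttlow = t;		/* remember the smallest RTT observed */
	return (t + 1);			/* never feed a zero sample onward */
}

int
main(void)
{
	uint32_t rttlow = 0;

	printf("sample=%u low=%u\n", rtt_from_ts_echo(1050, 1000, &rttlow), rttlow);
	printf("sample=%u low=%u\n", rtt_from_ts_echo(1120, 1090, &rttlow), rttlow);
	return (0);
}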
6412
6413 /*
6414  * Return value of 1, the TCB is unlocked and most
6415  * likely gone, return value of 0, the TCP is still
6416  * locked.
6417  */
6418 static int
6419 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
6420     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6421     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6422 {
6423         struct tcp_rack *rack;
6424         int32_t ret_val = 0;
6425         int32_t ourfinisacked = 0;
6426
6427         ctf_calc_rwin(so, tp);
6428         if ((thflags & TH_ACK) &&
6429             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
6430             SEQ_GT(th->th_ack, tp->snd_max))) {
6431                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6432                 return (1);
6433         }
6434         rack = (struct tcp_rack *)tp->t_fb_ptr;
6435         if (IS_FASTOPEN(tp->t_flags)) {
6436                 /*
6437                  * When a TFO connection is in SYN_RECEIVED, the
6438                  * only valid packets are the initial SYN, a
6439                  * retransmit/copy of the initial SYN (possibly with
6440                  * a subset of the original data), a valid ACK, a
6441                  * FIN, or a RST.
6442                  */
6443                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
6444                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6445                         return (1);
6446                 } else if (thflags & TH_SYN) {
6447                         /* non-initial SYN is ignored */
6448                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
6449                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
6450                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
6451                                 ctf_do_drop(m, NULL);
6452                                 return (0);
6453                         }
6454                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
6455                         ctf_do_drop(m, NULL);
6456                         return (0);
6457                 }
6458         }
6459         if ((thflags & TH_RST) ||
6460             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6461                 return (ctf_process_rst(m, th, so, tp));
6462         /*
6463          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6464          * it's less than ts_recent, drop it.
6465          */
6466         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6467             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6468                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6469                         return (ret_val);
6470         }
6471         /*
6472          * In the SYN-RECEIVED state, validate that the packet belongs to
6473          * this connection before trimming the data to fit the receive
6474          * window.  Check the sequence number versus IRS since we know the
6475          * sequence numbers haven't wrapped.  This is a partial fix for the
6476          * "LAND" DoS attack.
6477          */
6478         if (SEQ_LT(th->th_seq, tp->irs)) {
6479                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6480                 return (1);
6481         }
6482         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6483                 return (ret_val);
6484         }
6485         /*
6486          * If last ACK falls within this segment's sequence numbers, record
6487          * its timestamp. NOTE: 1) That the test incorporates suggestions
6488          * from the latest proposal of the tcplw@cray.com list (Braden
6489          * 1993/04/26). 2) That updating only on newer timestamps interferes
6490          * with our earlier PAWS tests, so this check should be solely
6491          * predicated on the sequence space of this segment. 3) That we
6492          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6493          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6494          * SEG.Len. This modified check allows us to overcome RFC1323's
6495          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6496          * p.869. In such cases, we can still calculate the RTT correctly
6497          * when RCV.NXT == Last.ACK.Sent.
6498          */
6499         if ((to->to_flags & TOF_TS) != 0 &&
6500             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6501             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6502             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6503                 tp->ts_recent_age = tcp_ts_getticks();
6504                 tp->ts_recent = to->to_tsval;
6505         }
6506         tp->snd_wnd = tiwin;
6507         /*
6508          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6509          * is on (half-synchronized state), then queue data for later
6510          * processing; else drop segment and return.
6511          */
6512         if ((thflags & TH_ACK) == 0) {
6513                 if (IS_FASTOPEN(tp->t_flags)) {
6514                         cc_conn_init(tp);
6515                 }
6516                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6517                     tiwin, thflags, nxt_pkt));
6518         }
6519         TCPSTAT_INC(tcps_connects);
6520         soisconnected(so);
6521         /* Do window scaling? */
6522         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
6523             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
6524                 tp->rcv_scale = tp->request_r_scale;
6525         }
6526         /*
6527          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
6528          * FIN-WAIT-1
6529          */
6530         tp->t_starttime = ticks;
6531         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
6532                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
6533                 tp->t_tfo_pending = NULL;
6534
6535                 /*
6536                  * Account for the ACK of our SYN prior to
6537                  * regular ACK processing below.
6538                  */ 
6539                 tp->snd_una++;
6540         }
6541         if (tp->t_flags & TF_NEEDFIN) {
6542                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
6543                 tp->t_flags &= ~TF_NEEDFIN;
6544         } else {
6545                 tcp_state_change(tp, TCPS_ESTABLISHED);
6546                 TCP_PROBE5(accept__established, NULL, tp,
6547                     mtod(m, const char *), tp, th);
6548                 /*
6549                  * TFO connections call cc_conn_init() during SYN
6550                  * processing.  Calling it again here for such connections
6551                  * is not harmless as it would undo the snd_cwnd reduction
6552                  * that occurs when a TFO SYN|ACK is retransmitted.
6553                  */
6554                 if (!IS_FASTOPEN(tp->t_flags))
6555                         cc_conn_init(tp);
6556         }
6557         /*
6558          * If segment contains data or ACK, will call tcp_reass() later; if
6559          * not, do so now to pass queued data to user.
6560          */
6561         if (tlen == 0 && (thflags & TH_FIN) == 0)
6562                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
6563                     (struct mbuf *)0);
6564         tp->snd_wl1 = th->th_seq - 1;
6565         /* For syn-recv we need to possibly update the rtt */
6566         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
6567                 uint32_t t;
6568
6569                 t = tcp_ts_getticks() - to->to_tsecr;
6570                 if (!tp->t_rttlow || tp->t_rttlow > t)
6571                         tp->t_rttlow = t;
6572                 tcp_rack_xmit_timer(rack, t + 1);
6573                 tcp_rack_xmit_timer_commit(rack, tp);
6574         } 
6575         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6576                 return (ret_val);
6577         }
6578         if (tp->t_state == TCPS_FIN_WAIT_1) {
6579                 /* We could have went to FIN_WAIT_1 (or EST) above */
6580                 /* We could have gone to FIN_WAIT_1 (or EST) above */
6581                  * In FIN_WAIT_1 STATE in addition to the processing for the
6582                  * ESTABLISHED state if our FIN is now acknowledged then
6583                  * enter FIN_WAIT_2.
6584                  */
6585                 if (ourfinisacked) {
6586                         /*
6587                          * If we can't receive any more data, then closing
6588                          * user can proceed. Starting the timer is contrary
6589                          * to the specification, but if we don't get a FIN
6590                          * we'll hang forever.
6591                          *
6592                          * XXXjl: we should release the tp also, and use a
6593                          * compressed state.
6594                          */
6595                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6596                                 soisdisconnected(so);
6597                                 tcp_timer_activate(tp, TT_2MSL,
6598                                     (tcp_fast_finwait2_recycle ?
6599                                     tcp_finwait2_timeout :
6600                                     TP_MAXIDLE(tp)));
6601                         }
6602                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
6603                 }
6604         }
6605         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6606             tiwin, thflags, nxt_pkt));
6607 }
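/*
 * Editor's sketch (not part of rack.c): the long comment above describes
 * when ts_recent may be taken from a segment: the last ACK we sent must
 * lie within [SEG.SEQ, SEG.SEQ + SEG.LEN], with a SYN or FIN counting as
 * one extra octet.  The standalone check below restates that rule with
 * assumed parameter names; the comparison helper mirrors the wraparound
 * behaviour of the SEQ_LEQ() macro.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a <= b" over the 32-bit sequence space. */
static bool
seq_leq(uint32_t a, uint32_t b)
{
	return ((int32_t)(a - b) <= 0);
}

/*
 * May this segment update ts_recent?  True when last_ack_sent falls in
 * [seg_seq, seg_seq + seg_len (+1 for SYN or FIN)].
 */
static bool
can_update_ts_recent(uint32_t last_ack_sent, uint32_t seg_seq,
    uint32_t seg_len, bool syn_or_fin)
{
	uint32_t seg_end = seg_seq + seg_len + (syn_or_fin ? 1 : 0);

	return (seq_leq(seg_seq, last_ack_sent) &&
	    seq_leq(last_ack_sent, seg_end));
}

int
main(void)
{
	/* last ACK sent (1500) lies inside a segment covering 1000..2448. */
	printf("%d\n", can_update_ts_recent(1500, 1000, 1448, false));
	/* last ACK sent (3000) lies beyond it, so ts_recent is left alone. */
	printf("%d\n", can_update_ts_recent(3000, 1000, 1448, false));
	return (0);
}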
6608
6609 /*
6610  * Return value of 1, the TCB is unlocked and most
6611  * likely gone, return value of 0, the TCP is still
6612  * locked.
6613  */
6614 static int
6615 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
6616     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6617     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6618 {
6619         int32_t ret_val = 0;
6620
6621         /*
6622          * Header prediction: check for the two common cases of a
6623          * uni-directional data xfer.  If the packet has no control flags,
6624          * is in-sequence, the window didn't change and we're not
6625          * retransmitting, it's a candidate.  If the length is zero and the
6626          * ack moved forward, we're the sender side of the xfer.  Just free
6627          * the data acked & wake any higher level process that was blocked
6628          * waiting for space.  If the length is non-zero and the ack didn't
6629          * move, we're the receiver side.  If we're getting packets in-order
6630          * (the reassembly queue is empty), add the data to the socket
6631          * buffer and note that we need a delayed ack. Make sure that the
6632          * hidden state-flags are also off. Since we check for
6633          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
6634          */
6635         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
6636             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
6637             __predict_true(SEGQ_EMPTY(tp)) &&
6638             __predict_true(th->th_seq == tp->rcv_nxt)) {
6639                 struct tcp_rack *rack;
6640
6641                 rack = (struct tcp_rack *)tp->t_fb_ptr;
6642                 if (tlen == 0) {
6643                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
6644                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime, iptos)) {
6645                                 return (0);
6646                         }
6647                 } else {
6648                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
6649                             tiwin, nxt_pkt, iptos)) {
6650                                 return (0);
6651                         }
6652                 }
6653         }
6654         ctf_calc_rwin(so, tp);
6655
6656         if ((thflags & TH_RST) ||
6657             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6658                 return (ctf_process_rst(m, th, so, tp));
6659
6660         /*
6661          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6662          * synchronized state.
6663          */
6664         if (thflags & TH_SYN) {
6665                 ctf_challenge_ack(m, th, tp, &ret_val);
6666                 return (ret_val);
6667         }
6668         /*
6669          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6670          * it's less than ts_recent, drop it.
6671          */
6672         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6673             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6674                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6675                         return (ret_val);
6676         }
6677         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6678                 return (ret_val);
6679         }
6680         /*
6681          * If last ACK falls within this segment's sequence numbers, record
6682          * its timestamp. NOTE: 1) That the test incorporates suggestions
6683          * from the latest proposal of the tcplw@cray.com list (Braden
6684          * 1993/04/26). 2) That updating only on newer timestamps interferes
6685          * with our earlier PAWS tests, so this check should be solely
6686          * predicated on the sequence space of this segment. 3) That we
6687          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6688          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6689          * SEG.Len. This modified check allows us to overcome RFC1323's
6690          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6691          * p.869. In such cases, we can still calculate the RTT correctly
6692          * when RCV.NXT == Last.ACK.Sent.
6693          */
6694         if ((to->to_flags & TOF_TS) != 0 &&
6695             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6696             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6697             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6698                 tp->ts_recent_age = tcp_ts_getticks();
6699                 tp->ts_recent = to->to_tsval;
6700         }
6701         /*
6702          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6703          * is on (half-synchronized state), then queue data for later
6704          * processing; else drop segment and return.
6705          */
6706         if ((thflags & TH_ACK) == 0) {
6707                 if (tp->t_flags & TF_NEEDSYN) {
6708
6709                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6710                             tiwin, thflags, nxt_pkt));
6711
6712                 } else if (tp->t_flags & TF_ACKNOW) {
6713                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6714                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6715                         return (ret_val);
6716                 } else {
6717                         ctf_do_drop(m, NULL);
6718                         return (0);
6719                 }
6720         }
6721         /*
6722          * Ack processing.
6723          */
6724         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6725                 return (ret_val);
6726         }
6727         if (sbavail(&so->so_snd)) {
6728                 if (rack_progress_timeout_check(tp)) {
6729                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6730                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6731                         return (1);
6732                 }
6733         }
6734         /* State changes only happen in rack_process_data() */
6735         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6736             tiwin, thflags, nxt_pkt));
6737 }
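/*
 * Editor's sketch (not part of rack.c): the header-prediction test in
 * rack_do_established() above hands a segment to one of the two fast
 * paths only when it carries no SACK blocks, only the ACK flag is set,
 * the reassembly queue is empty and the segment starts exactly at
 * rcv_nxt; zero payload selects the pure-ACK path, anything else the
 * new-data path.  The standalone classifier below restates that dispatch
 * over a hypothetical view structure; the flag values are the standard
 * TCP bits, but every name here is an assumption.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum fastpath { FP_NONE, FP_PURE_ACK, FP_NEW_DATA };

#define FLAG_FIN	0x01
#define FLAG_SYN	0x02
#define FLAG_RST	0x04
#define FLAG_ACK	0x10
#define FLAG_URG	0x20
#define FLAG_MASK	(FLAG_SYN | FLAG_FIN | FLAG_RST | FLAG_URG | FLAG_ACK)

/* Hypothetical view of the fields the header-prediction test inspects. */
struct seg_view {
	bool	 has_sack;	/* segment carries SACK option blocks */
	uint16_t flags;		/* TCP flags from the header */
	bool	 reass_empty;	/* our reassembly queue is empty */
	uint32_t seq;		/* segment sequence number */
	uint32_t rcv_nxt;	/* next sequence number we expect */
	uint32_t tlen;		/* payload length */
};

/* Pick a fast path, or none, for a segment arriving in ESTABLISHED. */
static enum fastpath
classify_segment(const struct seg_view *s)
{
	if (s->has_sack)			/* SACK needs the slow path */
		return (FP_NONE);
	if ((s->flags & FLAG_MASK) != FLAG_ACK)	/* only a bare ACK qualifies */
		return (FP_NONE);
	if (!s->reass_empty)			/* out-of-order data pending */
		return (FP_NONE);
	if (s->seq != s->rcv_nxt)		/* not the next expected byte */
		return (FP_NONE);
	return (s->tlen == 0 ? FP_PURE_ACK : FP_NEW_DATA);
}

int
main(void)
{
	struct seg_view v = {
		.flags = FLAG_ACK, .reass_empty = true,
		.seq = 1000, .rcv_nxt = 1000, .tlen = 0,
	};

	printf("class=%d\n", classify_segment(&v));	/* pure ACK */
	v.tlen = 1448;
	printf("class=%d\n", classify_segment(&v));	/* new data */
	return (0);
}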
6738
6739 /*
6740  * Return value of 1, the TCB is unlocked and most
6741  * likely gone, return value of 0, the TCP is still
6742  * locked.
6743  */
6744 static int
6745 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
6746     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6747     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6748 {
6749         int32_t ret_val = 0;
6750
6751         ctf_calc_rwin(so, tp);
6752         if ((thflags & TH_RST) ||
6753             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6754                 return (ctf_process_rst(m, th, so, tp));
6755         /*
6756          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6757          * synchronized state.
6758          */
6759         if (thflags & TH_SYN) {
6760                 ctf_challenge_ack(m, th, tp, &ret_val);
6761                 return (ret_val);
6762         }
6763         /*
6764          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6765          * it's less than ts_recent, drop it.
6766          */
6767         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6768             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6769                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6770                         return (ret_val);
6771         }
6772         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6773                 return (ret_val);
6774         }
6775         /*
6776          * If last ACK falls within this segment's sequence numbers, record
6777          * its timestamp. NOTE: 1) That the test incorporates suggestions
6778          * from the latest proposal of the tcplw@cray.com list (Braden
6779          * 1993/04/26). 2) That updating only on newer timestamps interferes
6780          * with our earlier PAWS tests, so this check should be solely
6781          * predicated on the sequence space of this segment. 3) That we
6782          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6783          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6784          * SEG.Len. This modified check allows us to overcome RFC1323's
6785          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6786          * p.869. In such cases, we can still calculate the RTT correctly
6787          * when RCV.NXT == Last.ACK.Sent.
6788          */
6789         if ((to->to_flags & TOF_TS) != 0 &&
6790             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6791             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6792             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6793                 tp->ts_recent_age = tcp_ts_getticks();
6794                 tp->ts_recent = to->to_tsval;
6795         }
6796         /*
6797          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6798          * is on (half-synchronized state), then queue data for later
6799          * processing; else drop segment and return.
6800          */
6801         if ((thflags & TH_ACK) == 0) {
6802                 if (tp->t_flags & TF_NEEDSYN) {
6803                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6804                             tiwin, thflags, nxt_pkt));
6805
6806                 } else if (tp->t_flags & TF_ACKNOW) {
6807                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6808                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6809                         return (ret_val);
6810                 } else {
6811                         ctf_do_drop(m, NULL);
6812                         return (0);
6813                 }
6814         }
6815         /*
6816          * Ack processing.
6817          */
6818         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
6819                 return (ret_val);
6820         }
6821         if (sbavail(&so->so_snd)) {
6822                 if (rack_progress_timeout_check(tp)) {
6823                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6824                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6825                         return (1);
6826                 }
6827         }
6828         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6829             tiwin, thflags, nxt_pkt));
6830 }
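/*
 * Editor's sketch (not part of rack.c): each of the state handlers above
 * runs the RFC 1323 PAWS test early: a segment that carries a timestamp
 * older than ts_recent is treated as a stale duplicate and is a
 * candidate for dropping (ctf_ts_check() additionally forgives the case
 * where ts_recent itself has gone stale).  The standalone helper below
 * shows only the basic comparison; the names are assumptions and the
 * staleness exception is deliberately left out.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "a < b" over the 32-bit timestamp space. */
static bool
tstmp_lt(uint32_t a, uint32_t b)
{
	return ((int32_t)(a - b) < 0);
}

/*
 * Basic PAWS rejection test: a timestamped segment whose TSval is older
 * than the recorded ts_recent is suspect.
 */
static bool
paws_reject(bool has_ts, uint32_t tsval, uint32_t ts_recent)
{
	return (has_ts && ts_recent != 0 && tstmp_lt(tsval, ts_recent));
}

int
main(void)
{
	printf("%d\n", paws_reject(true, 100, 200));	/* old TSval: reject */
	printf("%d\n", paws_reject(true, 300, 200));	/* newer TSval: accept */
	return (0);
}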
6831
6832 static int
6833 rack_check_data_after_close(struct mbuf *m, 
6834     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
6835 {
6836         struct tcp_rack *rack;
6837
6838         rack = (struct tcp_rack *)tp->t_fb_ptr;
6839         if (rack->rc_allow_data_af_clo == 0) {
6840         close_now:
6841                 tp = tcp_close(tp);
6842                 TCPSTAT_INC(tcps_rcvafterclose);
6843                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
6844                 return (1);
6845         }
6846         if (sbavail(&so->so_snd) == 0)
6847                 goto close_now;
6848         /* Ok we allow data that is ignored and a followup reset */
6849         tp->rcv_nxt = th->th_seq + *tlen;
6850         tp->t_flags2 |= TF2_DROP_AF_DATA;
6851         rack->r_wanted_output = 1;
6852         *tlen = 0;
6853         return (0);
6854 }
6855
6856 /*
6857  * Return value of 1, the TCB is unlocked and most
6858  * likely gone, return value of 0, the TCP is still
6859  * locked.
6860  */
6861 static int
6862 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
6863     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6864     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6865 {
6866         int32_t ret_val = 0;
6867         int32_t ourfinisacked = 0;
6868
6869         ctf_calc_rwin(so, tp);
6870
6871         if ((thflags & TH_RST) ||
6872             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6873                 return (ctf_process_rst(m, th, so, tp));
6874         /*
6875          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6876          * synchronized state.
6877          */
6878         if (thflags & TH_SYN) {
6879                 ctf_challenge_ack(m, th, tp, &ret_val);
6880                 return (ret_val);
6881         }
6882         /*
6883          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6884          * it's less than ts_recent, drop it.
6885          */
6886         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6887             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6888                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
6889                         return (ret_val);
6890         }
6891         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6892                 return (ret_val);
6893         }
6894         /*
6895          * If new data are received on a connection after the user processes
6896          * are gone, then RST the other end.
6897          */
6898         if ((so->so_state & SS_NOFDREF) && tlen) {
6899                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
6900                         return (1);
6901         }
6902         /*
6903          * If last ACK falls within this segment's sequence numbers, record
6904          * its timestamp. NOTE: 1) That the test incorporates suggestions
6905          * from the latest proposal of the tcplw@cray.com list (Braden
6906          * 1993/04/26). 2) That updating only on newer timestamps interferes
6907          * with our earlier PAWS tests, so this check should be solely
6908          * predicated on the sequence space of this segment. 3) That we
6909          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6910          * SEG.Len. This modified check allows us to overcome RFC1323's
6911          * SEG.Len, This modified check allows us to overcome RFC1323's
6912          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6913          * p.869. In such cases, we can still calculate the RTT correctly
6914          * when RCV.NXT == Last.ACK.Sent.
6915          */
6916         if ((to->to_flags & TOF_TS) != 0 &&
6917             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6918             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6919             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6920                 tp->ts_recent_age = tcp_ts_getticks();
6921                 tp->ts_recent = to->to_tsval;
6922         }
6923         /*
6924          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6925          * is on (half-synchronized state), then queue data for later
6926          * processing; else drop segment and return.
6927          */
6928         if ((thflags & TH_ACK) == 0) {
6929                 if (tp->t_flags & TF_NEEDSYN) {
6930                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6931                             tiwin, thflags, nxt_pkt));
6932                 } else if (tp->t_flags & TF_ACKNOW) {
6933                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6934                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
6935                         return (ret_val);
6936                 } else {
6937                         ctf_do_drop(m, NULL);
6938                         return (0);
6939                 }
6940         }
6941         /*
6942          * Ack processing.
6943          */
6944         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6945                 return (ret_val);
6946         }
6947         if (ourfinisacked) {
6948                 /*
6949                  * If we can't receive any more data, then closing user can
6950                  * proceed. Starting the timer is contrary to the
6951                  * specification, but if we don't get a FIN we'll hang
6952                  * forever.
6953                  *
6954                  * XXXjl: we should release the tp also, and use a
6955                  * compressed state.
6956                  */
6957                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
6958                         soisdisconnected(so);
6959                         tcp_timer_activate(tp, TT_2MSL,
6960                             (tcp_fast_finwait2_recycle ?
6961                             tcp_finwait2_timeout :
6962                             TP_MAXIDLE(tp)));
6963                 }
6964                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
6965         }
6966         if (sbavail(&so->so_snd)) {
6967                 if (rack_progress_timeout_check(tp)) {
6968                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6969                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6970                         return (1);
6971                 }
6972         }
6973         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6974             tiwin, thflags, nxt_pkt));
6975 }
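/*
 * Editor's sketch (not part of rack.c): once our FIN is acknowledged in
 * FIN_WAIT_1 above, the connection moves to FIN_WAIT_2 and, if the
 * receive side has already been shut down, a timer is armed so the
 * connection cannot wait forever for the peer's FIN; the timeout is the
 * short finwait2 recycle value when that knob is enabled, otherwise the
 * connection's idle maximum.  The standalone sketch below restates that
 * choice with assumed names and example values.
 */
#include <stdint.h>
#include <stdio.h>

/*
 * Choose how long to linger in FIN_WAIT_2 waiting for the peer's FIN
 * (stand-ins for tcp_fast_finwait2_recycle, tcp_finwait2_timeout and
 * TP_MAXIDLE()).
 */
static uint32_t
finwait2_timeout_ticks(int fast_recycle, uint32_t finwait2_timeout,
    uint32_t max_idle)
{
	return (fast_recycle ? finwait2_timeout : max_idle);
}

int
main(void)
{
	/* Assumed example values, in ticks. */
	printf("%u\n", finwait2_timeout_ticks(1, 60 * 1000, 600 * 1000));
	printf("%u\n", finwait2_timeout_ticks(0, 60 * 1000, 600 * 1000));
	return (0);
}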
6976
6977 /*
6978  * Return value of 1, the TCB is unlocked and most
6979  * likely gone, return value of 0, the TCP is still
6980  * locked.
6981  */
6982 static int
6983 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
6984     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6985     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
6986 {
6987         int32_t ret_val = 0;
6988         int32_t ourfinisacked = 0;
6989
6990         ctf_calc_rwin(so, tp);
6991
6992         if ((thflags & TH_RST) ||
6993             (tp->t_fin_is_rst && (thflags & TH_FIN)))
6994                 return (ctf_process_rst(m, th, so, tp));
6995         /*
6996          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6997          * synchronized state.
6998          */
6999         if (thflags & TH_SYN) {
7000                 ctf_challenge_ack(m, th, tp, &ret_val);
7001                 return (ret_val);
7002         }
7003         /*
7004          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7005          * it's less than ts_recent, drop it.
7006          */
7007         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7008             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7009                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7010                         return (ret_val);
7011         }
7012         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7013                 return (ret_val);
7014         }
7015         /*
7016          * If new data are received on a connection after the user processes
7017          * are gone, then RST the other end.
7018          */
7019         if ((so->so_state & SS_NOFDREF) && tlen) {
7020                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7021                         return (1);
7022         }
7023         /*
7024          * If last ACK falls within this segment's sequence numbers, record
7025          * its timestamp. NOTE: 1) That the test incorporates suggestions
7026          * from the latest proposal of the tcplw@cray.com list (Braden
7027          * 1993/04/26). 2) That updating only on newer timestamps interferes
7028          * with our earlier PAWS tests, so this check should be solely
7029          * predicated on the sequence space of this segment. 3) That we
7030          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7031          * SEG.Len. This modified check allows us to overcome RFC1323's
7032          * SEG.Len, This modified check allows us to overcome RFC1323's
7033          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7034          * p.869. In such cases, we can still calculate the RTT correctly
7035          * when RCV.NXT == Last.ACK.Sent.
7036          */
7037         if ((to->to_flags & TOF_TS) != 0 &&
7038             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7039             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7040             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7041                 tp->ts_recent_age = tcp_ts_getticks();
7042                 tp->ts_recent = to->to_tsval;
7043         }
7044         /*
7045          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7046          * is on (half-synchronized state), then queue data for later
7047          * processing; else drop segment and return.
7048          */
7049         if ((thflags & TH_ACK) == 0) {
7050                 if (tp->t_flags & TF_NEEDSYN) {
7051                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7052                             tiwin, thflags, nxt_pkt));
7053                 } else if (tp->t_flags & TF_ACKNOW) {
7054                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7055                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7056                         return (ret_val);
7057                 } else {
7058                         ctf_do_drop(m, NULL);
7059                         return (0);
7060                 }
7061         }
7062         /*
7063          * Ack processing.
7064          */
7065         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7066                 return (ret_val);
7067         }
7068         if (ourfinisacked) {
7069                 tcp_twstart(tp);
7070                 m_freem(m);
7071                 return (1);
7072         }
7073         if (sbavail(&so->so_snd)) {
7074                 if (rack_progress_timeout_check(tp)) {
7075                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7076                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7077                         return (1);
7078                 }
7079         }
7080         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7081             tiwin, thflags, nxt_pkt));
7082 }
7083
7084 /*
7085  * Return value of 1, the TCB is unlocked and most
7086  * likely gone, return value of 0, the TCP is still
7087  * locked.
7088  */
7089 static int
7090 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
7091     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7092     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7093 {
7094         int32_t ret_val = 0;
7095         int32_t ourfinisacked = 0;
7096
7097         ctf_calc_rwin(so, tp);
7098
7099         if ((thflags & TH_RST) ||
7100             (tp->t_fin_is_rst && (thflags & TH_FIN)))
7101                 return (ctf_process_rst(m, th, so, tp));
7102         /*
7103          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7104          * synchronized state.
7105          */
7106         if (thflags & TH_SYN) {
7107                 ctf_challenge_ack(m, th, tp, &ret_val);
7108                 return (ret_val);
7109         }
7110         /*
7111          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7112          * it's less than ts_recent, drop it.
7113          */
7114         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7115             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7116                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7117                         return (ret_val);
7118         }
7119         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7120                 return (ret_val);
7121         }
7122         /*
7123          * If new data are received on a connection after the user processes
7124          * are gone, then RST the other end.
7125          */
7126         if ((so->so_state & SS_NOFDREF) && tlen) {
7127                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7128                         return (1);
7129         }
7130         /*
7131          * If last ACK falls within this segment's sequence numbers, record
7132          * its timestamp. NOTE: 1) That the test incorporates suggestions
7133          * from the latest proposal of the tcplw@cray.com list (Braden
7134          * 1993/04/26). 2) That updating only on newer timestamps interferes
7135          * with our earlier PAWS tests, so this check should be solely
7136          * predicated on the sequence space of this segment. 3) That we
7137          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7138          * SEG.Len. This modified check allows us to overcome RFC1323's
7139          * SEG.Len, This modified check allows us to overcome RFC1323's
7140          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7141          * p.869. In such cases, we can still calculate the RTT correctly
7142          * when RCV.NXT == Last.ACK.Sent.
7143          */
7144         if ((to->to_flags & TOF_TS) != 0 &&
7145             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7146             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7147             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7148                 tp->ts_recent_age = tcp_ts_getticks();
7149                 tp->ts_recent = to->to_tsval;
7150         }
7151         /*
7152          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7153          * is on (half-synchronized state), then queue data for later
7154          * processing; else drop segment and return.
7155          */
7156         if ((thflags & TH_ACK) == 0) {
7157                 if (tp->t_flags & TF_NEEDSYN) {
7158                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7159                             tiwin, thflags, nxt_pkt));
7160                 } else if (tp->t_flags & TF_ACKNOW) {
7161                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7162                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7163                         return (ret_val);
7164                 } else {
7165                         ctf_do_drop(m, NULL);
7166                         return (0);
7167                 }
7168         }
7169         /*
7170          * case TCPS_LAST_ACK: Ack processing.
7171          */
7172         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7173                 return (ret_val);
7174         }
7175         if (ourfinisacked) {
7176                 tp = tcp_close(tp);
7177                 ctf_do_drop(m, tp);
7178                 return (1);
7179         }
7180         if (sbavail(&so->so_snd)) {
7181                 if (rack_progress_timeout_check(tp)) {
7182                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7183                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7184                         return (1);
7185                 }
7186         }
7187         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7188             tiwin, thflags, nxt_pkt));
7189 }
7190
7191
7192 /*
7193  * Return value of 1, the TCB is unlocked and most
7194  * likely gone, return value of 0, the TCP is still
7195  * locked.
7196  */
7197 static int
7198 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
7199     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
7200     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
7201 {
7202         int32_t ret_val = 0;
7203         int32_t ourfinisacked = 0;
7204
7205         ctf_calc_rwin(so, tp);
7206
7207         /* Reset receive buffer auto scaling when not in bulk receive mode. */
7208         if ((thflags & TH_RST) ||
7209             (tp->t_fin_is_rst && (thflags & TH_FIN)))
7210                 return (ctf_process_rst(m, th, so, tp));
7211         /*
7212          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
7213          * synchronized state.
7214          */
7215         if (thflags & TH_SYN) {
7216                 ctf_challenge_ack(m, th, tp, &ret_val);
7217                 return (ret_val);
7218         }
7219         /*
7220          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
7221          * it's less than ts_recent, drop it.
7222          */
7223         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
7224             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
7225                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
7226                         return (ret_val);
7227         }
7228         if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
7229                 return (ret_val);
7230         }
7231         /*
7232          * If new data are received on a connection after the user processes
7233          * are gone, then RST the other end.
7234          */
7235         if ((so->so_state & SS_NOFDREF) &&
7236             tlen) {
7237                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
7238                         return (1);
7239         }
7240         /*
7241          * If last ACK falls within this segment's sequence numbers, record
7242          * its timestamp. NOTE: 1) That the test incorporates suggestions
7243          * from the latest proposal of the tcplw@cray.com list (Braden
7244          * 1993/04/26). 2) That updating only on newer timestamps interferes
7245          * with our earlier PAWS tests, so this check should be solely
7246          * predicated on the sequence space of this segment. 3) That we
7247          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
7248          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
7249          * SEG.Len, This modified check allows us to overcome RFC1323's
7250          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
7251          * p.869. In such cases, we can still calculate the RTT correctly
7252          * when RCV.NXT == Last.ACK.Sent.
7253          */
7254         if ((to->to_flags & TOF_TS) != 0 &&
7255             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
7256             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
7257             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
7258                 tp->ts_recent_age = tcp_ts_getticks();
7259                 tp->ts_recent = to->to_tsval;
7260         }
7261         /*
7262          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
7263          * is on (half-synchronized state), then queue data for later
7264          * processing; else drop segment and return.
7265          */
7266         if ((thflags & TH_ACK) == 0) {
7267                 if (tp->t_flags & TF_NEEDSYN) {
7268                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7269                             tiwin, thflags, nxt_pkt));
7270                 } else if (tp->t_flags & TF_ACKNOW) {
7271                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
7272                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
7273                         return (ret_val);
7274                 } else {
7275                         ctf_do_drop(m, NULL);
7276                         return (0);
7277                 }
7278         }
7279         /*
7280          * Ack processing.
7281          */
7282         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
7283                 return (ret_val);
7284         }
7285         if (sbavail(&so->so_snd)) {
7286                 if (rack_progress_timeout_check(tp)) {
7287                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
7288                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7289                         return (1);
7290                 }
7291         }
7292         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
7293             tiwin, thflags, nxt_pkt));
7294 }
7295
7296
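/*
 * Reset the RTT sample accumulator in r_ctl.rack_rs: mark it empty and
 * zero the RTT count and total so a fresh measurement round can begin.
 */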
7297 static void inline
7298 rack_clear_rate_sample(struct tcp_rack *rack)
7299 {
7300         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
7301         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
7302         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
7303 }
7304
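/*
 * Establish the minimum and maximum pacing segment sizes. The minimum is
 * normally the fixed maximum segment size (or the hardware TLS record size
 * when SB_TLS_IFNET is in use); the maximum is derived from the
 * rc_pace_max_segs setting and clamped to PACE_MAX_IP_BYTES.
 */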
7305 static void
7306 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack)
7307 {
7308         uint32_t tls_seg = 0;
7309
7310 #ifdef KERN_TLS
7311         if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) {
7312                 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd);
7313                 rack->r_ctl.rc_pace_min_segs = tls_seg;
7314         } else 
7315 #endif
7316                 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
7317         rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs;
7318         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES)
7319                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
7320 #ifdef KERN_TLS
7321         if (tls_seg != 0) {
7322                 if (rack_hw_tls_max_seg > 1) {
7323                         rack->r_ctl.rc_pace_max_segs /= tls_seg;
7324                         if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs)
7325                                 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
7326                 } else {
7327                         rack->r_ctl.rc_pace_max_segs = 1;
7328                 }
7329                 if (rack->r_ctl.rc_pace_max_segs == 0)
7330                         rack->r_ctl.rc_pace_max_segs = 1;
7331                 rack->r_ctl.rc_pace_max_segs *= tls_seg;
7332         }
7333 #endif
7334         rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2);
7335 }
7336
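/*
 * Allocate and initialize the per-connection rack state hung off of
 * tp->t_fb_ptr. If data is already outstanding, a sendmap entry covering
 * snd_una..snd_max is created so RACK can track it. Returns 0 on success,
 * or ENOMEM if the state (or the initial sendmap entry) cannot be
 * allocated.
 */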
7337 static int
7338 rack_init(struct tcpcb *tp)
7339 {
7340         struct tcp_rack *rack = NULL;
7341         struct rack_sendmap *insret;
7342
7343         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
7344         if (tp->t_fb_ptr == NULL) {
7345                 /*
7346                  * We need to allocate memory but can't. We hold the INP
7347                  * and INP_INFO locks and they are recursive (this happens
7348                  * during setup), so a scheme to drop the locks and retry
7349                  * fails.
7350                  */
7351                 return (ENOMEM);
7352         }
7353         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
7354
7355         rack = (struct tcp_rack *)tp->t_fb_ptr;
7356         RB_INIT(&rack->r_ctl.rc_mtree);
7357         TAILQ_INIT(&rack->r_ctl.rc_free);
7358         TAILQ_INIT(&rack->r_ctl.rc_tmap);
7359         rack->rc_tp = tp;
7360         if (tp->t_inpcb) {
7361                 rack->rc_inp = tp->t_inpcb;
7362         }
7363         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
7364         /* Probably not needed but let's be sure */
7365         rack_clear_rate_sample(rack);
7366         rack->r_cpu = 0;
7367         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
7368         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
7369         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
7370         rack->rc_pace_reduce = rack_slot_reduction;
7371         if (use_rack_cheat)
7372                 rack->use_rack_cheat = 1;
7373         if (V_tcp_delack_enabled)
7374                 tp->t_delayed_ack = 1;
7375         else
7376                 tp->t_delayed_ack = 0;
7377         rack->rc_pace_max_segs = rack_hptsi_segments;
7378         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
7379         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
7380         rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
7381         rack->r_enforce_min_pace = rack_min_pace_time;
7382         rack->r_ctl.rc_prop_rate = rack_proportional_rate;
7383         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
7384         rack->r_ctl.rc_early_recovery = rack_early_recovery;
7385         rack->rc_always_pace = rack_pace_every_seg;
7386         rack_set_pace_segments(tp, rack);
7387         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
7388         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
7389         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
7390         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
7391         rack->r_ctl.rc_min_to = rack_min_to;
7392         rack->rack_per_of_gp = rack_per_of_gp;
7393         microuptime(&rack->r_ctl.rc_last_ack);
7394         rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack;
7395         rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks();
7396         /* Do we force detection on? */
7397 #ifdef NETFLIX_EXP_DETECTION
7398         if (tcp_force_detection)
7399                 rack->do_detection = 1;
7400         else
7401 #endif
7402                 rack->do_detection = 0;
7403         if (tp->snd_una != tp->snd_max) {
7404                 /* Create a send map for the current outstanding data */
7405                 struct rack_sendmap *rsm;
7406
7407                 rsm = rack_alloc(rack);
7408                 if (rsm == NULL) {
7409                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7410                         tp->t_fb_ptr = NULL;
7411                         return (ENOMEM);
7412                 }
7413                 rsm->r_flags = RACK_OVERMAX;
7414                 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time;
7415                 rsm->r_rtr_cnt = 1;
7416                 rsm->r_rtr_bytes = 0;
7417                 rsm->r_start = tp->snd_una;
7418                 rsm->r_end = tp->snd_max;
7419                 rsm->r_dupack = 0;
7420                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7421 #ifdef INVARIANTS
7422                 if (insret != NULL) {
7423                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
7424                               insret, rack, rsm);
7425                 }
7426 #endif
7427                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7428                 rsm->r_in_tmap = 1;
7429         }
7430         rack_stop_all_timers(tp);
7431         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7432         return (0);
7433 }
7434
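/*
 * Decide whether this connection may be handed off to the rack stack:
 * closed/listen sockets are fine, SYN states return EAGAIN (we cannot
 * tell yet), and established connections must have SACK negotiated
 * unless rack_sack_not_required overrides that.
 */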
7435 static int
7436 rack_handoff_ok(struct tcpcb *tp)
7437 {
7438         if ((tp->t_state == TCPS_CLOSED) ||
7439             (tp->t_state == TCPS_LISTEN)) {
7440                 /* Sure, no problem, though it may not stick */
7441                 return (0);
7442         }
7443         if ((tp->t_state == TCPS_SYN_SENT) ||
7444             (tp->t_state == TCPS_SYN_RECEIVED)) {
7445                 /*
7446                  * We really don't know; you have to get to ESTAB or
7447                  * beyond to tell.
7448                  */
7449                 return (EAGAIN);
7450         }
7451         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required) {
7452                 return (0);
7453         }
7454         /*
7455          * If we reach here we don't do SACK on this connection so we can
7456          * never do rack.
7457          */
7458         return (EINVAL);
7459 }
7460
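/*
 * Tear down the rack state: clear the mbuf-queueing inp flags, free every
 * sendmap entry on the RB tree and on the free list, and return the
 * per-connection state to its UMA zone. snd_nxt is resynced to snd_max
 * on the way out.
 */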
7461 static void
7462 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
7463 {
7464         if (tp->t_fb_ptr) {
7465                 struct tcp_rack *rack;
7466                 struct rack_sendmap *rsm, *nrsm, *rm;
7467                 if (tp->t_inpcb) {
7468                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
7469                         tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
7470                 }
7471                 rack = (struct tcp_rack *)tp->t_fb_ptr;
7472 #ifdef TCP_BLACKBOX
7473                 tcp_log_flowend(tp);
7474 #endif
7475                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
7476                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7477 #ifdef INVARIANTS
7478                         if (rm != rsm) {
7479                                 panic("At fini, rack:%p rsm:%p rm:%p",
7480                                       rack, rsm, rm);
7481                         }
7482 #endif
7483                         uma_zfree(rack_zone, rsm);
7484                 }
7485                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7486                 while (rsm) {
7487                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
7488                         uma_zfree(rack_zone, rsm);
7489                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
7490                 }
7491                 rack->rc_free_cnt = 0;
7492                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
7493                 tp->t_fb_ptr = NULL;
7494         }
7495         /* Make sure snd_nxt is correctly set */
7496         tp->snd_nxt = tp->snd_max;
7497 }
7498
7499
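/*
 * Sync rack's cached state with the TCP state machine and select the
 * matching r_substate handler for incoming segments.
 */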
7500 static void
7501 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
7502 {
7503         switch (tp->t_state) {
7504         case TCPS_SYN_SENT:
7505                 rack->r_state = TCPS_SYN_SENT;
7506                 rack->r_substate = rack_do_syn_sent;
7507                 break;
7508         case TCPS_SYN_RECEIVED:
7509                 rack->r_state = TCPS_SYN_RECEIVED;
7510                 rack->r_substate = rack_do_syn_recv;
7511                 break;
7512         case TCPS_ESTABLISHED:
7513                 rack_set_pace_segments(tp, rack);
7514                 rack->r_state = TCPS_ESTABLISHED;
7515                 rack->r_substate = rack_do_established;
7516                 break;
7517         case TCPS_CLOSE_WAIT:
7518                 rack->r_state = TCPS_CLOSE_WAIT;
7519                 rack->r_substate = rack_do_close_wait;
7520                 break;
7521         case TCPS_FIN_WAIT_1:
7522                 rack->r_state = TCPS_FIN_WAIT_1;
7523                 rack->r_substate = rack_do_fin_wait_1;
7524                 break;
7525         case TCPS_CLOSING:
7526                 rack->r_state = TCPS_CLOSING;
7527                 rack->r_substate = rack_do_closing;
7528                 break;
7529         case TCPS_LAST_ACK:
7530                 rack->r_state = TCPS_LAST_ACK;
7531                 rack->r_substate = rack_do_lastack;
7532                 break;
7533         case TCPS_FIN_WAIT_2:
7534                 rack->r_state = TCPS_FIN_WAIT_2;
7535                 rack->r_substate = rack_do_fin_wait_2;
7536                 break;
7537         case TCPS_LISTEN:
7538         case TCPS_CLOSED:
7539         case TCPS_TIME_WAIT:
7540         default:
7541                 break;
7542         };
7543 }
7544
7545
7546 static void
7547 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
7548 {
7549         /*
7550          * We received an ack, and then did not
7551          * call send or were bounced out because the
7552          * hpts was running. Now a timer is up as well;
7553          * is it the right timer?
7554          */
7555         struct rack_sendmap *rsm;
7556         int tmr_up;
7557         
7558         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7559         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
7560                 return;
7561         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7562         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
7563             (tmr_up == PACE_TMR_RXT)) {
7564                 /* Should be an RXT */
7565                 return;
7566         }
7567         if (rsm == NULL) {
7568                 /* Nothing outstanding? */
7569                 if (tp->t_flags & TF_DELACK) {
7570                         if (tmr_up == PACE_TMR_DELACK)
7571                                 /* We are supposed to have delayed ack up and we do */
7572                                 return;
7573                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
7574                         /*
7575                          * If we hit ENOBUFS then we would expect the possibility
7576                          * of nothing outstanding and the RXT up (and the hptsi timer).
7577                          */
7578                         return;
7579                 } else if (((V_tcp_always_keepalive ||
7580                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7581                             (tp->t_state <= TCPS_CLOSING)) &&
7582                            (tmr_up == PACE_TMR_KEEP) &&
7583                            (tp->snd_max == tp->snd_una)) {
7584                         /* We should have keep alive up and we do */
7585                         return;
7586                 }
7587         }
7588         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
7589                    ((tmr_up == PACE_TMR_TLP) ||
7590                     (tmr_up == PACE_TMR_RACK) ||
7591                     (tmr_up == PACE_TMR_RXT))) {
7592                 /*
7593                  * Either a Rack, TLP or RXT is fine if we
7594                  * have outstanding data.
7595                  */
7596                 return;
7597         } else if (tmr_up == PACE_TMR_DELACK) {
7598                 /*
7599                  * If the delayed ack was going to go off
7600                  * before the rtx/tlp/rack timer was going to
7601                  * expire, then that would be the timer in control.
7602                  * Note we don't check the time here, trusting that
7603                  * the code is correct.
7604                  */
7605                 return;
7606         }
7607         /*
7608          * OK, the timer originally started is not what we want now.
7609          * We will force the hpts to be stopped, if any, and restart
7610          * with the slot set to what was in the saved slot.
7611          */
7612         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
7613         rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7614 }
7615
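/*
 * Core input path for the rack stack. The caller holds the inp write
 * lock; a return value of 0 means the lock is still held, while a
 * non-zero return means the tcb was unlocked (and possibly freed) by
 * the state handler.
 */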
7616 static int
7617 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
7618     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
7619     int32_t nxt_pkt, struct timeval *tv)
7620 {
7621         int32_t thflags, retval, did_out = 0;
7622         int32_t way_out = 0;
7623         uint32_t cts;
7624         uint32_t tiwin;
7625         struct tcpopt to;
7626         struct tcp_rack *rack;
7627         struct rack_sendmap *rsm;
7628         int32_t prev_state = 0;
7629
7630         if (m->m_flags & M_TSTMP_LRO) {
7631                 tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7632                 tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7633         }
7634         cts = tcp_tv_to_mssectick(tv);
7635         rack = (struct tcp_rack *)tp->t_fb_ptr;
7636
7637         kern_prefetch(rack, &prev_state);
7638         prev_state = 0;
7639         thflags = th->th_flags;
7640
7641         NET_EPOCH_ASSERT();
7642         INP_WLOCK_ASSERT(tp->t_inpcb);
7643         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
7644             __func__));
7645         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
7646             __func__));
7647         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
7648                 union tcp_log_stackspecific log;
7649                 struct timeval tv;
7650
7651                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
7652                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
7653                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
7654                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
7655                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
7656                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
7657                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
7658                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
7659                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
7660                     tlen, &log, true, &tv);
7661         }
7662         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
7663                 way_out = 4;
7664                 retval = 0;
7665                 goto done_with_input;
7666         }
7667         /*
7668          * If a segment with the ACK-bit set arrives in the SYN-SENT state
7669          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
7670          */
7671         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
7672             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
7673                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
7674                 return(1);
7675         }
7676         /*
7677          * Segment received on connection. Reset idle time and keep-alive
7678          * timer. XXX: This should be done after segment validation to
7679          * ignore broken/spoofed segs.
7680          */
7681         if  (tp->t_idle_reduce &&
7682              (tp->snd_max == tp->snd_una) &&
7683              ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
7684                 counter_u64_add(rack_input_idle_reduces, 1);
7685                 rack_cc_after_idle(tp);
7686         }
7687         tp->t_rcvtime = ticks;
7688
7689         /*
7690          * Unscale the window into a 32-bit value. For the SYN_SENT state
7691          * the scale is zero.
7692          */
7693         tiwin = th->th_win << tp->snd_scale;
7694 #ifdef STATS
7695         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
7696 #endif
7697         if (tiwin > rack->r_ctl.rc_high_rwnd)
7698                 rack->r_ctl.rc_high_rwnd = tiwin;
7699         /*
7700          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
7701          * this to occur after we've validated the segment.
7702          */
7703         if (tp->t_flags2 & TF2_ECN_PERMIT) {
7704                 if (thflags & TH_CWR)
7705                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
7706                 switch (iptos & IPTOS_ECN_MASK) {
7707                 case IPTOS_ECN_CE:
7708                         tp->t_flags2 |= TF2_ECN_SND_ECE;
7709                         TCPSTAT_INC(tcps_ecn_ce);
7710                         break;
7711                 case IPTOS_ECN_ECT0:
7712                         TCPSTAT_INC(tcps_ecn_ect0);
7713                         break;
7714                 case IPTOS_ECN_ECT1:
7715                         TCPSTAT_INC(tcps_ecn_ect1);
7716                         break;
7717                 }
7718
7719                 /* Process a packet differently from RFC3168. */
7720                 cc_ecnpkt_handler(tp, th, iptos);
7721
7722                 /* Congestion experienced. */
7723                 if (thflags & TH_ECE) {
7724                         rack_cong_signal(tp, th, CC_ECN);
7725                 }
7726         }
7727         /*
7728          * Parse options on any incoming segment.
7729          */
7730         tcp_dooptions(&to, (u_char *)(th + 1),
7731             (th->th_off << 2) - sizeof(struct tcphdr),
7732             (thflags & TH_SYN) ? TO_SYN : 0);
7733
7734         /*
7735          * If echoed timestamp is later than the current time, fall back to
7736          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
7737          * were used when this connection was established.
7738          */
7739         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
7740                 to.to_tsecr -= tp->ts_offset;
7741                 if (TSTMP_GT(to.to_tsecr, cts))
7742                         to.to_tsecr = 0;
7743         }
7744         /*
7745          * If it's the first time in, we need to take care of options and
7746          * verify we can do SACK for rack!
7747          */
7748         if (rack->r_state == 0) {
7749                 /* Should be init'd by rack_init() */
7750                 KASSERT(rack->rc_inp != NULL,
7751                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
7752                 if (rack->rc_inp == NULL) {
7753                         rack->rc_inp = tp->t_inpcb;
7754                 }
7755
7756                 /*
7757                  * Process options only when we get SYN/ACK back. The SYN
7758                  * case for incoming connections is handled in tcp_syncache.
7759                  * According to RFC1323 the window field in a SYN (i.e., a
7760                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
7761                  * this is traditional behavior, may need to be cleaned up.
7762                  */
7763                 rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
7764                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
7765                         if ((to.to_flags & TOF_SCALE) &&
7766                             (tp->t_flags & TF_REQ_SCALE)) {
7767                                 tp->t_flags |= TF_RCVD_SCALE;
7768                                 tp->snd_scale = to.to_wscale;
7769                         }
7770                         /*
7771                          * Initial send window.  It will be updated with the
7772                          * next incoming segment to the scaled value.
7773                          */
7774                         tp->snd_wnd = th->th_win;
7775                         if (to.to_flags & TOF_TS) {
7776                                 tp->t_flags |= TF_RCVD_TSTMP;
7777                                 tp->ts_recent = to.to_tsval;
7778                                 tp->ts_recent_age = cts;
7779                         }
7780                         if (to.to_flags & TOF_MSS)
7781                                 tcp_mss(tp, to.to_mss);
7782                         if ((tp->t_flags & TF_SACK_PERMIT) &&
7783                             (to.to_flags & TOF_SACKPERM) == 0)
7784                                 tp->t_flags &= ~TF_SACK_PERMIT;
7785                         if (IS_FASTOPEN(tp->t_flags)) {
7786                                 if (to.to_flags & TOF_FASTOPEN) {
7787                                         uint16_t mss;
7788
7789                                         if (to.to_flags & TOF_MSS)
7790                                                 mss = to.to_mss;
7791                                         else
7792                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
7793                                                         mss = TCP6_MSS;
7794                                                 else
7795                                                         mss = TCP_MSS;
7796                                         tcp_fastopen_update_cache(tp, mss,
7797                                             to.to_tfo_len, to.to_tfo_cookie);
7798                                 } else
7799                                         tcp_fastopen_disable_path(tp);
7800                         }
7801                 }
7802                 /*
7803                  * At this point we are at the initial call. Here we decide
7804                  * if we are doing RACK or not. We do this by seeing if
7805                  * TF_SACK_PERMIT is set; if not, rack is *not* possible and
7806                  * we switch to the default code.
7807                  */
7808                 if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
7809                         tcp_switch_back_to_default(tp);
7810                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
7811                             tlen, iptos);
7812                         return (1);
7813                 }
7814                 /* Set the flag */
7815                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
7816                 tcp_set_hpts(tp->t_inpcb);
7817                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
7818         }
7819         /*
7820          * This is the one exception case where we set the rack state
7821          * always. All other times (timers etc) we must have a rack-state
7822          * set (so we assure we have done the checks above for SACK).
7823          */
7824         memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval));
7825         rack->r_ctl.rc_rcvtime = cts;
7826         if (rack->r_state != tp->t_state)
7827                 rack_set_state(tp, rack);
7828         if (SEQ_GT(th->th_ack, tp->snd_una) &&
7829             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
7830                 kern_prefetch(rsm, &prev_state);
7831         prev_state = rack->r_state;
7832         rack->r_ctl.rc_tlp_send_cnt = 0;
7833         rack_clear_rate_sample(rack);
7834         retval = (*rack->r_substate) (m, th, so,
7835             tp, &to, drop_hdrlen,
7836             tlen, tiwin, thflags, nxt_pkt, iptos);
7837 #ifdef INVARIANTS
7838         if ((retval == 0) &&
7839             (tp->t_inpcb == NULL)) {
7840                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
7841                     retval, tp, prev_state);
7842         }
7843 #endif
7844         if (retval == 0) {
7845                 /*
7846                  * If retval is 1 the tcb is unlocked and most likely the tp
7847                  * is gone.
7848                  */
7849                 INP_WLOCK_ASSERT(tp->t_inpcb);
7850                 if (rack->set_pacing_done_a_iw == 0) {
7851                         /* How much has been acked? */
7852                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
7853                                 /* We have enough to set in the pacing segment size */
7854                                 rack->set_pacing_done_a_iw = 1;
7855                                 rack_set_pace_segments(tp, rack);
7856                         }
7857                 }
7858                 tcp_rack_xmit_timer_commit(rack, tp);
7859                 if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) {
7860                         if (rack->r_wanted_output != 0) {
7861                                 did_out = 1;
7862                                 (void)tp->t_fb->tfb_tcp_output(tp);
7863                         }
7864                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
7865                 }
7866                 if ((nxt_pkt == 0) &&
7867                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
7868                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
7869                      (tp->t_flags & TF_DELACK) ||
7870                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
7871                       (tp->t_state <= TCPS_CLOSING)))) {
7872                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
7873                         if ((tp->snd_max == tp->snd_una) &&
7874                             ((tp->t_flags & TF_DELACK) == 0) &&
7875                             (rack->rc_inp->inp_in_hpts) &&
7876                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
7877                                 /* keep alive not needed; we are already in the hpts for output */
7878                                 ;
7879                         } else {
7880                                 if (rack->rc_inp->inp_in_hpts) {
7881                                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7882                                         counter_u64_add(rack_per_timer_hole, 1);
7883                                 }
7884                                 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
7885                         }
7886                         way_out = 1;
7887                 } else if (nxt_pkt == 0) {
7888                         /* Do we have the correct timer running? */
7889                         rack_timer_audit(tp, rack, &so->so_snd);
7890                         way_out = 2;
7891                 }
7892         done_with_input:
7893                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
7894                 if (did_out)
7895                         rack->r_wanted_output = 0;
7896 #ifdef INVARIANTS
7897                 if (tp->t_inpcb == NULL) {
7898                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
7899                               did_out,
7900                               retval, tp, prev_state);
7901                 }
7902 #endif
7903         }
7904         return (retval);
7905 }
7906
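/*
 * External entry point from tcp_input: drain any queued (LRO) segments
 * first, recover the packet timestamp, and run the segment through
 * rack_do_segment_nounlock(), dropping the inp lock when it is still
 * held on return.
 */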
7907 void
7908 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
7909     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
7910 {
7911         struct timeval tv;
7912
7913         /* First let's see if we have old packets */
7914         if (tp->t_in_pkt) {
7915                 if (ctf_do_queued_segments(so, tp, 1)) {
7916                         m_freem(m);
7917                         return;
7918                 }
7919         }
7920         if (m->m_flags & M_TSTMP_LRO) {
7921                 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
7922                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
7923         } else {
7924                 /* Should not happen; should we KASSERT instead? */
7925                 tcp_get_usecs(&tv);
7926         }
7927         if (rack_do_segment_nounlock(m, th, so, tp,
7928                                     drop_hdrlen, tlen, iptos, 0, &tv) == 0)
7929                 INP_WUNLOCK(tp->t_inpcb);
7930 }
7931
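/*
 * Return the next sendmap entry that should be retransmitted, or NULL if
 * nothing is eligible yet. An entry must be unacked, must have been passed
 * over by a SACK, and at least the rack threshold of time must have elapsed
 * since it was last sent; when SACK evidence is being discounted for a
 * declared attacker, only the dup-ack threshold triggers a retransmit.
 */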
7932 struct rack_sendmap *
7933 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
7934 {
7935         struct rack_sendmap *rsm = NULL;
7936         int32_t idx;
7937         uint32_t srtt = 0, thresh = 0, ts_low = 0;
7938
7939         /* Return the next guy to be re-transmitted */
7940         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
7941                 return (NULL);
7942         }
7943         if (tp->t_flags & TF_SENTFIN) {
7944                 /* retran the end FIN? */
7945                 return (NULL);
7946         }
7947         /* OK, let's look at this one */
7948         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7949         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
7950                 goto check_it;
7951         }
7952         rsm = rack_find_lowest_rsm(rack);
7953         if (rsm == NULL) {
7954                 return (NULL);
7955         }
7956 check_it:
7957         if (rsm->r_flags & RACK_ACKED) {
7958                 return (NULL);
7959         }
7960         if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
7961                 /* It's not yet ready */
7962                 return (NULL);
7963         }
7964         srtt = rack_grab_rtt(tp, rack);
7965         idx = rsm->r_rtr_cnt - 1;
7966         ts_low = rsm->r_tim_lastsent[idx];
7967         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
7968         if ((tsused == ts_low) ||
7969             (TSTMP_LT(tsused, ts_low))) {
7970                 /* No time since sending */
7971                 return (NULL);
7972         }
7973         if ((tsused - ts_low) < thresh) {
7974                 /* It has not been long enough yet */
7975                 return (NULL);
7976         }
7977         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
7978             ((rsm->r_flags & RACK_SACK_PASSED) &&
7979              (rack->sack_attack_disable == 0))) {
7980                 /*
7981                  * We have passed the dup-ack threshold <or>
7982                  * a SACK has indicated this is missing.
7983                  * Note that if you are a declared attacker
7984                  * it is only the dup-ack threshold that
7985                  * will cause retransmits.
7986                  */
7987                 /* log retransmit reason */
7988                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
7989                 return (rsm);
7990         }
7991         return (NULL);
7992 }
7993
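/*
 * Compute the pacing delay, in milliseconds, for sending 'len' bytes.
 * When goodput-based pacing is not in use (rack_per_of_gp is zero or
 * rc_always_pace is off) an optimistic cwnd/srtt drain time is used;
 * otherwise the recent goodput history is averaged, raised according to
 * rack_per_of_gp, and used as the bandwidth estimate. A configured
 * minimum pace time is enforced if the result would otherwise be zero.
 */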
7994 static int32_t
7995 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len)
7996 {
7997         int32_t slot = 0;
7998
7999         if ((rack->rack_per_of_gp == 0) ||
8000             (rack->rc_always_pace == 0)) {
8001                 /*
8002                  * We use the most optimistic possible cwnd/srtt for
8003                  * sending calculations. This will make our
8004                  * calculation anticipate getting more through
8005                  * quicker than is possible. But that's OK; we don't want
8006                  * the peer to have a gap in data sending.
8007                  */
8008                 uint32_t srtt, cwnd, tr_perms = 0;
8009                 
8010 old_method:
8011                 if (rack->r_ctl.rc_rack_min_rtt)
8012                         srtt = rack->r_ctl.rc_rack_min_rtt;
8013                 else
8014                         srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8015                 if (rack->r_ctl.rc_rack_largest_cwnd)
8016                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8017                 else
8018                         cwnd = tp->snd_cwnd;
8019                 tr_perms = cwnd / srtt;
8020                 if (tr_perms == 0) {
8021                         tr_perms = ctf_fixed_maxseg(tp);
8022                 }
8023                 /*
8024                  * Calculate how long this will take to drain. If
8025                  * the calculation comes out to zero, that's OK; we
8026                  * will use send_a_lot to possibly spin around for
8027                  * more, increasing tot_len_this_send to the point
8028                  * that it's going to require a pace, or we hit the
8029                  * cwnd, in which case we are just waiting for
8030                  * an ACK.
8031                  */
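                /*
                 * Worked example (illustrative numbers only): with
                 * cwnd = 100000 bytes and srtt = 50 ms, tr_perms is
                 * 2000 bytes/ms, so len = 10000 bytes paces out over
                 * a 5 ms slot before any reduction below.
                 */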
8032                 slot = len / tr_perms;
8033                 /* Now do we reduce the time so we don't run dry? */
8034                 if (slot && rack->rc_pace_reduce) {
8035                         int32_t reduce;
8036                         
8037                         reduce = (slot / rack->rc_pace_reduce);
8038                         if (reduce < slot) {
8039                                 slot -= reduce;
8040                         } else
8041                                 slot = 0;
8042                 }
8043         } else {
8044                 int cnt;
8045                 uint64_t bw_est, bw_raise, res, lentim;
8046
8047                 bw_est = 0;
8048                 for (cnt=0; cnt<RACK_GP_HIST; cnt++) {
8049                         if ((rack->r_ctl.rc_gp_hist_filled == 0) &&
8050                             (rack->r_ctl.rc_gp_history[cnt] == 0))
8051                                 break;
8052                         bw_est += rack->r_ctl.rc_gp_history[cnt];
8053                 }
8054                 if (bw_est == 0) {
8055                         /* 
8056                          * No way yet to make a b/w estimate 
8057                          * (no goodput est yet).
8058                          */
8059                         goto old_method;
8060                 }
8061                 /* Convert to bytes per second */
8062                 bw_est *= MSEC_IN_SECOND;
8063                 /*
8064                  * Now ratchet it up by our percentage. Note
8065                  * that the minimum you can do is 1 which would
8066                  * get you 101% of the average last N goodput estimates.
8067                  * The max you can do is 256 which would yield you
8068                  * 356% of the last N goodput estimates.
8069                  */
8070                 bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp;
8071                 bw_est += bw_raise;
8072                 /* average by the number we added */
8073                 bw_est /= cnt;
8074                 /* Now calculate a rate based on this b/w */
8075                 lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND;
8076                 res = lentim / bw_est;
8077                 slot = (uint32_t)res;
8078         }
8079         if (rack->r_enforce_min_pace &&
8080             (slot == 0)) {
8081                 /* We are enforcing a minimum pace time of 1ms */
8082                 slot = rack->r_enforce_min_pace;
8083         }
8084         if (slot) 
8085                 counter_u64_add(rack_calc_nonzero, 1);
8086         else
8087                 counter_u64_add(rack_calc_zero, 1);
8088         return (slot);
8089 }
8090
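/*
 * rack's output routine (tfb_tcp_output). Decides what should go out next:
 * a retransmit-timer resend, a tail loss probe, a SACK/PRR driven
 * retransmission from the sendmap, or new data from the socket buffer,
 * and paces the result out through the hpts.
 */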
8091 static int
8092 rack_output(struct tcpcb *tp)
8093 {
8094         struct socket *so;
8095         uint32_t recwin, sendwin;
8096         uint32_t sb_offset;
8097         int32_t len, flags, error = 0;
8098         struct mbuf *m;
8099         struct mbuf *mb;
8100         uint32_t if_hw_tsomaxsegcount = 0;
8101         uint32_t if_hw_tsomaxsegsize = 0;
8102         int32_t maxseg;
8103         long tot_len_this_send = 0;
8104         struct ip *ip = NULL;
8105 #ifdef TCPDEBUG
8106         struct ipovly *ipov = NULL;
8107 #endif
8108         struct udphdr *udp = NULL;
8109         struct tcp_rack *rack;
8110         struct tcphdr *th;
8111         uint8_t pass = 0;
8112         uint8_t wanted_cookie = 0;
8113         u_char opt[TCP_MAXOLEN];
8114         unsigned ipoptlen, optlen, hdrlen, ulen=0;
8115         uint32_t rack_seq;
8116
8117 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8118         unsigned ipsec_optlen = 0;
8119
8120 #endif
8121         int32_t idle, sendalot;
8122         int32_t sub_from_prr = 0;
8123         volatile int32_t sack_rxmit;
8124         struct rack_sendmap *rsm = NULL;
8125         int32_t tso, mtu;
8126         struct tcpopt to;
8127         int32_t slot = 0;
8128         int32_t sup_rack = 0;
8129         uint32_t cts;
8130         uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0;
8131         int32_t do_a_prefetch;
8132         int32_t prefetch_rsm = 0;
8133         int force_tso = 0;
8134         int32_t orig_len;
8135         int32_t prefetch_so_done = 0;
8136         struct tcp_log_buffer *lgb = NULL;
8137         struct inpcb *inp;
8138         struct sockbuf *sb;
8139 #ifdef INET6
8140         struct ip6_hdr *ip6 = NULL;
8141         int32_t isipv6;
8142 #endif
8143         uint8_t filled_all = 0;
8144         bool hw_tls = false;
8145
8146         /* setup and take the cache hits here */
8147         rack = (struct tcp_rack *)tp->t_fb_ptr;
8148         inp = rack->rc_inp;
8149         so = inp->inp_socket;
8150         sb = &so->so_snd;
8151         kern_prefetch(sb, &do_a_prefetch);
8152         do_a_prefetch = 1;
8153
8154 #ifdef KERN_TLS
8155         hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
8156 #endif
8157         
8158         INP_WLOCK_ASSERT(inp);
8159 #ifdef TCP_OFFLOAD
8160         if (tp->t_flags & TF_TOE)
8161                 return (tcp_offload_output(tp));
8162 #endif
8163         maxseg = ctf_fixed_maxseg(tp);
8164         /*
8165          * For TFO connections in SYN_RECEIVED, only allow the initial
8166          * SYN|ACK and those sent by the retransmit timer.
8167          */
8168         if (IS_FASTOPEN(tp->t_flags) &&
8169             (tp->t_state == TCPS_SYN_RECEIVED) &&
8170             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
8171             (rack->r_ctl.rc_resend == NULL))         /* not a retransmit */
8172                 return (0);
8173 #ifdef INET6
8174         if (rack->r_state) {
8175                 /* Use the cache line loaded if possible */
8176                 isipv6 = rack->r_is_v6;
8177         } else {
8178                 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
8179         }
8180 #endif
8181         cts = tcp_ts_getticks();
8182         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
8183             inp->inp_in_hpts) {
8184                 /*
8185                  * We are on the hpts for some timer but not hptsi output.
8186                  * Remove from the hpts unconditionally.
8187                  */
8188                 rack_timer_cancel(tp, rack, cts, __LINE__);
8189         }
8190         /* Mark that we have called rack_output(). */
8191         if ((rack->r_timer_override) ||
8192             (tp->t_flags & TF_FORCEDATA) ||
8193             (tp->t_state < TCPS_ESTABLISHED)) {
8194                 if (tp->t_inpcb->inp_in_hpts)
8195                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
8196         } else if (tp->t_inpcb->inp_in_hpts) {
8197                 /*
8198                  * On the hpts you can't pass even if ACKNOW is on; we will
8199                  * send when the hpts fires.
8200                  */
8201                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
8202                 return (0);
8203         }
8204         hpts_calling = inp->inp_hpts_calls;
8205         inp->inp_hpts_calls = 0;
8206         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8207                 if (rack_process_timers(tp, rack, cts, hpts_calling)) {
8208                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
8209                         return (0);
8210                 }
8211         }
8212         rack->r_wanted_output = 0;
8213         rack->r_timer_override = 0;
8214         /*
8215          * For TFO connections in SYN_SENT or SYN_RECEIVED,
8216          * only allow the initial SYN or SYN|ACK and those sent
8217          * by the retransmit timer.
8218          */
8219         if (IS_FASTOPEN(tp->t_flags) &&
8220             ((tp->t_state == TCPS_SYN_RECEIVED) ||
8221              (tp->t_state == TCPS_SYN_SENT)) &&
8222             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
8223             (tp->t_rxtshift == 0))              /* not a retransmit */
8224                 return (0);
8225         /*
8226          * Determine length of data that should be transmitted, and flags
8227          * that will be used. If there is some data or critical controls
8228          * (SYN, RST) to send, then transmit; otherwise, investigate
8229          * further.
8230          */
8231         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
8232         if (tp->t_idle_reduce) {
8233                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
8234                         rack_cc_after_idle(tp);
8235         }
8236         tp->t_flags &= ~TF_LASTIDLE;
8237         if (idle) {
8238                 if (tp->t_flags & TF_MORETOCOME) {
8239                         tp->t_flags |= TF_LASTIDLE;
8240                         idle = 0;
8241                 }
8242         }
8243 again:
8244         /*
8245          * If we've recently taken a timeout, snd_max will be greater than
8246          * snd_nxt.  There may be SACK information that allows us to avoid
8247          * resending already delivered data.  Adjust snd_nxt accordingly.
8248          */
8249         sendalot = 0;
8250         cts = tcp_ts_getticks();
8251         tso = 0;
8252         mtu = 0;
8253         sb_offset = tp->snd_max - tp->snd_una;
8254         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
8255
8256         flags = tcp_outflags[tp->t_state];
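        /*
         * Top up the per-connection cache of free sendmap entries so later
         * allocations in this pass cannot fail; if an allocation fails here,
         * bail out to just_return_nolock (retrying in a millisecond when
         * called from the hpts).
         */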
8257         while (rack->rc_free_cnt < rack_free_cache) {
8258                 rsm = rack_alloc(rack);
8259                 if (rsm == NULL) {
8260                         if (inp->inp_hpts_calls)
8261                                 /* Retry in a ms */
8262                                 slot = 1;
8263                         goto just_return_nolock;
8264                 }
8265                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
8266                 rack->rc_free_cnt++;
8267                 rsm = NULL;
8268         }
8269         if (inp->inp_hpts_calls)
8270                 inp->inp_hpts_calls = 0;
8271         sack_rxmit = 0;
8272         len = 0;
8273         rsm = NULL;
8274         if (flags & TH_RST) {
8275                 SOCKBUF_LOCK(sb);
8276                 goto send;
8277         }
8278         if (rack->r_ctl.rc_tlpsend) {
8279                 /* Tail loss probe */
8280                 long cwin;
8281                 long tlen;
8282
8283                 doing_tlp = 1;
8284                 /*
8285                  * Check if we can do a TLP with a RACK'd packet;
8286                  * this can happen if we are not doing the rack
8287                  * cheat and we skipped to a TLP and it
8288                  * went off.
8289                  */
8290                 rsm = tcp_rack_output(tp, rack, cts);
8291                 if (rsm == NULL)
8292                         rsm = rack->r_ctl.rc_tlpsend;
8293                 rack->r_ctl.rc_tlpsend = NULL;
8294                 sack_rxmit = 1;
8295                 tlen = rsm->r_end - rsm->r_start;
8296                 if (tlen > ctf_fixed_maxseg(tp))
8297                         tlen = ctf_fixed_maxseg(tp);
8298                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8299                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8300                     __func__, __LINE__,
8301                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8302                 sb_offset = rsm->r_start - tp->snd_una;
8303                 cwin = min(tp->snd_wnd, tlen);
8304                 len = cwin;
8305         } else if (rack->r_ctl.rc_resend) {
8306                 /* Retransmit timer */
8307                 rsm = rack->r_ctl.rc_resend;
8308                 rack->r_ctl.rc_resend = NULL;
8309                 len = rsm->r_end - rsm->r_start;
8310                 sack_rxmit = 1;
8311                 sendalot = 0;
8312                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8313                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8314                     __func__, __LINE__,
8315                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8316                 sb_offset = rsm->r_start - tp->snd_una;
8317                 if (len >= ctf_fixed_maxseg(tp)) {
8318                         len = ctf_fixed_maxseg(tp);
8319                 }
8320         } else if ((rack->rc_in_persist == 0) &&
8321             ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
8322                 int maxseg;
8323
8324                 maxseg = ctf_fixed_maxseg(tp);
8325                 if ((!IN_RECOVERY(tp->t_flags)) &&
8326                     ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
8327                         /* Enter recovery if not induced by a time-out */
8328                         rack->r_ctl.rc_rsm_start = rsm->r_start;
8329                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8330                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8331                         rack_cong_signal(tp, NULL, CC_NDUPACK);
8332                         /*
8333                          * When we enter recovery we need to assure we send
8334                          * one packet.
8335                          */
8336                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
8337                         rack_log_to_prr(rack, 13);
8338                 }
8339 #ifdef INVARIANTS
8340                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
8341                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
8342                             tp, rack, rsm, rsm->r_start, tp->snd_una);
8343                 }
8344 #endif
8345                 len = rsm->r_end - rsm->r_start;
8346                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
8347                     ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
8348                     __func__, __LINE__,
8349                     rsm->r_start, tp->snd_una, tp, rack, rsm));
8350                 sb_offset = rsm->r_start - tp->snd_una;
8351                 /* Can we send it within the PRR boundary? */
8352                 if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) {
8353                         /* It does not fit */
8354                         if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) &&
8355                             (rack->r_ctl.rc_prr_sndcnt < maxseg)) {
8356                                 /*
8357                                  * prr is less than a segment and we
8358                                  * have more acks due in besides
8359                                  * what we need to resend. Let's not send,
8360                                  * to avoid sending small pieces of
8361                                  * what we need to retransmit.
8362                                  */
8363                                 len = 0;
8364                                 goto just_return_nolock;
8365                         }
8366                         len = rack->r_ctl.rc_prr_sndcnt;
8367                 }
8368                 sendalot = 0;
8369                 if (len >= maxseg) {
8370                         len = maxseg;
8371                 }
8372                 if (len > 0) {
8373                         sub_from_prr = 1;
8374                         sack_rxmit = 1;
8375                         TCPSTAT_INC(tcps_sack_rexmits);
8376                         TCPSTAT_ADD(tcps_sack_rexmit_bytes,
8377                             min(len, ctf_fixed_maxseg(tp)));
8378                         counter_u64_add(rack_rtm_prr_retran, 1);
8379                 }
8380         }
8381         /*
8382          * Enforce a connection sendmap count limit if set,
8383          * as long as we are not retransmitting.
8384          */
8385         if ((rsm == NULL) &&
8386             (rack->do_detection == 0) &&
8387             (V_tcp_map_entries_limit > 0) &&
8388             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
8389                 counter_u64_add(rack_to_alloc_limited, 1);
8390                 if (!rack->alloc_limit_reported) {
8391                         rack->alloc_limit_reported = 1;
8392                         counter_u64_add(rack_alloc_limited_conns, 1);
8393                 }
8394                 goto just_return_nolock;
8395         }
8396         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
8397                 /* we are retransmitting the fin */
8398                 len--;
8399                 if (len) {
8400                         /*
8401                          * When retransmitting data do *not* include the
8402                          * FIN. This could happen from a TLP probe.
8403                          */
8404                         flags &= ~TH_FIN;
8405                 }
8406         }
8407 #ifdef INVARIANTS
8408         /* For debugging */
8409         rack->r_ctl.rc_rsm_at_retran = rsm;
8410 #endif
8411         /*
8412          * Get standard flags, and add SYN or FIN if requested by 'hidden'
8413          * state flags.
8414          */
8415         if (tp->t_flags & TF_NEEDFIN)
8416                 flags |= TH_FIN;
8417         if (tp->t_flags & TF_NEEDSYN)
8418                 flags |= TH_SYN;
8419         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
8420                 void *end_rsm;
8421                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
8422                 if (end_rsm)
8423                         kern_prefetch(end_rsm, &prefetch_rsm);
8424                 prefetch_rsm = 1;
8425         }
8426         SOCKBUF_LOCK(sb);
8427         /*
8428          * If in persist timeout with window of 0, send 1 byte. Otherwise,
8429          * if window is small but nonzero and time TF_SENTFIN expired, we
8430          * will send what we can and go to transmit state.
8431          */
8432         if (tp->t_flags & TF_FORCEDATA) {
8433                 if (sendwin == 0) {
8434                         /*
8435                          * If we still have some data to send, then clear
8436                          * the FIN bit.  Usually this would happen below
8437                          * when it realizes that we aren't sending all the
8438                          * data.  However, if we have exactly 1 byte of
8439                          * unsent data, then it won't clear the FIN bit
8440                          * below, and if we are in persist state, we wind up
8441                          * sending the packet without recording that we sent
8442                          * the FIN bit.
8443                          *
8444                          * We can't just blindly clear the FIN bit, because
8445                          * if we don't have any more data to send then the
8446                          * probe will be the FIN itself.
8447                          */
8448                         if (sb_offset < sbused(sb))
8449                                 flags &= ~TH_FIN;
8450                         sendwin = 1;
8451                 } else {
8452                         if ((rack->rc_in_persist != 0) &&
8453                             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
8454                                                rack->r_ctl.rc_pace_min_segs)))
8455                                 rack_exit_persist(tp, rack);
8456                         /*
8457                          * If we are dropping persist mode then we need to
8458                          * correct snd_nxt/snd_max and sb_offset.
8459                          */
8460                         tp->snd_nxt = tp->snd_max;
8461                         sb_offset = tp->snd_nxt - tp->snd_una;
8462                 }
8463         }
8464         /*
8465          * If snd_nxt == snd_max and we have transmitted a FIN, the
8466          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
8467          * negative length.  This can also occur when TCP opens up its
8468          * congestion window while receiving additional duplicate acks after
8469          * fast-retransmit because TCP will reset snd_nxt to snd_max after
8470          * the fast-retransmit.
8471          *
8472          * In the normal retransmit-FIN-only case, however, snd_nxt will be
8473          * set to snd_una, the sb_offset will be 0, and the length may wind
8474          * up 0.
8475          *
8476          * If sack_rxmit is true we are retransmitting from the scoreboard
8477          * in which case len is already set.
8478          */
8479         if (sack_rxmit == 0) {
8480                 uint32_t avail;
8481
8482                 avail = sbavail(sb);
8483                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
8484                         sb_offset = tp->snd_nxt - tp->snd_una;
8485                 else
8486                         sb_offset = 0;
8487                 if (IN_RECOVERY(tp->t_flags) == 0) {
8488                         if (rack->r_ctl.rc_tlp_new_data) {
8489                                 /* TLP is forcing out new data */
8490                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
8491                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
8492                                 }
8493                                 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
8494                                         len = tp->snd_wnd;
8495                                 else
8496                                         len = rack->r_ctl.rc_tlp_new_data;
8497                                 rack->r_ctl.rc_tlp_new_data = 0;
8498                                 new_data_tlp = doing_tlp = 1;
8499                         } else {
8500                                 if (sendwin > avail) {
8501                                         /* use the available */
8502                                         if (avail > sb_offset) {
8503                                                 len = (int32_t)(avail - sb_offset);
8504                                         } else {
8505                                                 len = 0;
8506                                         }
8507                                 } else {
8508                                         if (sendwin > sb_offset) {
8509                                                 len = (int32_t)(sendwin - sb_offset);
8510                                         } else {
8511                                                 len = 0;
8512                                         }
8513                                 }
8514                         }
8515                 } else {
8516                         uint32_t outstanding;
8517
8518                         /*
8519                          * We are inside of a SACK recovery episode and are
8520                          * sending new data, having retransmitted all the
8521                          * data possible so far in the scoreboard.
8522                          */
8523                         outstanding = tp->snd_max - tp->snd_una;
8524                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
8525                                 if (tp->snd_wnd > outstanding) {
8526                                         len = tp->snd_wnd - outstanding;
8527                                         /* Check to see if we have the data */
8528                                         if (((sb_offset + len) > avail) &&
8529                                             (avail > sb_offset))
8530                                                 len = avail - sb_offset;
8531                                         else
8532                                                 len = 0;
8533                                 } else
8534                                         len = 0;
8535                         } else if (avail > sb_offset)
8536                                 len = avail - sb_offset;
8537                         else
8538                                 len = 0;
8539                         if (len > 0) {
8540                                 if (len > rack->r_ctl.rc_prr_sndcnt)
8541                                         len = rack->r_ctl.rc_prr_sndcnt;
8542                                 if (len > 0) {
8543                                         sub_from_prr = 1;
8544                                         counter_u64_add(rack_rtm_prr_newdata, 1);
8545                                 }
8546                         }
8547                         if (len > ctf_fixed_maxseg(tp)) {
8548                                 /*
8549                  * We should never send more than one MSS when
8550                  * retransmitting or sending new data in prr
8551                  * mode unless the override flag is on.  Most
8552                  * likely the PRR algorithm will not let us
8553                  * send very much anyway :-)
8554                                  */
8555                                 if (rack->r_ctl.rc_prr_sendalot == 0)
8556                                         len = ctf_fixed_maxseg(tp);
8557                         } else if (len < ctf_fixed_maxseg(tp)) {
8558                                 /*
8559                          * Do we send any? The idea here is that if the
8560                          * send empties the socket buffer we want to
8561                          * do it.  If it does not, then let's just wait
8562                          * for our prr_sndcnt to get bigger.
8563                                  */
8564                                 long leftinsb;
8565
8566                                 leftinsb = sbavail(sb) - sb_offset;
8567                                 if (leftinsb > len) {
8568                                         /* This send does not empty the sb */
8569                                         len = 0;
8570                                 }
8571                         }
8572                 }
8573         }
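             /*
              * A worked example of the in-recovery branch above, using
              * hypothetical numbers: with an MSS of 1460, snd_wnd = 20 * MSS,
              * outstanding = 10 * MSS, rc_prr_sndcnt = 3 * MSS and 8 * MSS of
              * unsent data in the socket buffer, len starts at the 8 * MSS of
              * available data, is then capped at rc_prr_sndcnt (3 * MSS) and
              * finally, with rc_prr_sendalot off, trimmed to one
              * ctf_fixed_maxseg(tp), so a single MSS of new data is released
              * per pass while PRR is metering the send.
              */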
8574         if (prefetch_so_done == 0) {
8575                 kern_prefetch(so, &prefetch_so_done);
8576                 prefetch_so_done = 1;
8577         }
8578         /*
8579          * Lop off SYN bit if it has already been sent.  However, if this is
8580          * SYN-SENT state and if segment contains data and if we don't know
8581          * that foreign host supports TAO, suppress sending segment.
8582          */
8583         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
8584             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
8585                 if (tp->t_state != TCPS_SYN_RECEIVED)
8586                         flags &= ~TH_SYN;
8587                 /*
8588                  * When sending additional segments following a TFO SYN|ACK,
8589                  * do not include the SYN bit.
8590                  */
8591                 if (IS_FASTOPEN(tp->t_flags) &&
8592                     (tp->t_state == TCPS_SYN_RECEIVED))
8593                         flags &= ~TH_SYN;
8594                 sb_offset--, len++;
8595         }
8596         /*
8597          * Be careful not to send data and/or FIN on SYN segments. This
8598          * measure is needed to prevent interoperability problems with not
8599          * fully conformant TCP implementations.
8600          */
8601         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
8602                 len = 0;
8603                 flags &= ~TH_FIN;
8604         }
8605         /*
8606          * On TFO sockets, ensure no data is sent in the following cases:
8607          *
8608          *  - When retransmitting SYN|ACK on a passively-created socket
8609          *
8610          *  - When retransmitting SYN on an actively created socket
8611          *
8612          *  - When sending a zero-length cookie (cookie request) on an
8613          *    actively created socket
8614          *
8615          *  - When the socket is in the CLOSED state (RST is being sent)
8616          */
8617         if (IS_FASTOPEN(tp->t_flags) &&
8618             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
8619              ((tp->t_state == TCPS_SYN_SENT) &&
8620               (tp->t_tfo_client_cookie_len == 0)) ||
8621              (flags & TH_RST))) {
8622                 sack_rxmit = 0;
8623                 len = 0;
8624         }
8625         /* Without fast-open there should never be data sent on a SYN */
8626         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
8627                 len = 0;
8628         orig_len = len;
8629         if (len <= 0) {
8630                 /*
8631                  * If FIN has been sent but not acked, but we haven't been
8632                  * called to retransmit, len will be < 0.  Otherwise, window
8633                  * shrank after we sent into it.  If window shrank to 0,
8634                  * cancel pending retransmit, pull snd_nxt back to (closed)
8635                  * window, and set the persist timer if it isn't already
8636                  * going.  If the window didn't close completely, just wait
8637                  * for an ACK.
8638                  *
8639                  * We also do a general check here to ensure that we will
8640                  * set the persist timer when we have data to send, but a
8641                  * 0-byte window. This makes sure the persist timer is set
8642                  * even if the packet hits one of the "goto send" lines
8643                  * below.
8644                  */
8645                 len = 0;
8646                 if ((tp->snd_wnd == 0) &&
8647                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8648                     (tp->snd_una == tp->snd_max) &&
8649                     (sb_offset < (int)sbavail(sb))) {
8650                         tp->snd_nxt = tp->snd_una;
8651                         rack_enter_persist(tp, rack, cts);
8652                 }
8653         } else if ((rsm == NULL) &&
8654                    ((doing_tlp == 0) || (new_data_tlp == 1)) &&
8655                    (len < rack->r_ctl.rc_pace_max_segs)) {
8656                 /* 
8657                  * We are not sending a full segment for
8658                  * some reason. Should we not send anything (think
8659                  * sws or persists)?
8660                  */
8661                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8662                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
8663                     (len < (int)(sbavail(sb) - sb_offset))) {
8664                         /*
8665                          * Here the rwnd is less than the pacing size,
8666                          * this is not a retransmit, we are established,
8667                          * and the send is not the last data in the
8668                          * socket buffer, so we send nothing and may
8669                          * enter persists.
8670                          */
8671                         len = 0;
8672                         if (tp->snd_max == tp->snd_una) {
8673                                 /* 
8674                                  * Nothing is outstanding, so we
8675                                  * can go into persists.
8676                                  */
8677                                 rack_enter_persist(tp, rack, cts);
8678                                 tp->snd_nxt = tp->snd_una;
8679                         }
8680                 } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) &&
8681                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8682                            (len < (int)(sbavail(sb) - sb_offset)) &&
8683                            (len < rack->r_ctl.rc_pace_min_segs)) {
8684                         /*
8685                          * Here we are not retransmitting, and
8686                          * the cwnd is not so small that we could
8687                          * not send at least a min size (rxt timer
8688                          * not having gone off), we have 2 segments or
8689                          * more already in flight, it is not the tail end
8690                          * of the socket buffer, and the cwnd is blocking
8691                          * us from sending out a minimum pacing segment size.
8692                          * Let's not send anything.
8693                          */
8694                         len = 0;
8695                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
8696                             min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
8697                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) &&
8698                            (len < (int)(sbavail(sb) - sb_offset)) &&
8699                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
8700                         /* 
8701                          * Here we have a send window but we have
8702                          * filled it up and we can't send another pacing segment.
8703                          * We also have in flight more than 2 segments
8704                          * and we are not completing the sb, i.e. we allow
8705                          * the last bytes of the sb to go out even if
8706                          * it is not a full pacing segment.
8707                          */
8708                         len = 0;
8709                 }
8710         }
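             /*
              * To put numbers on the checks above: assuming, for example,
              * rc_high_rwnd = 64 kB and rc_pace_min_segs = 2 * 1460, the
              * silly-window threshold is min(32768, 2920) = 2920 bytes, so a
              * peer window (or remaining window space) smaller than two
              * segments causes us to hold back a send that is smaller than
              * the pacing size and would not drain the socket buffer, and to
              * enter persists once nothing is left outstanding.
              */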
8711         /* len will be >= 0 after this point. */
8712         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
8713         tcp_sndbuf_autoscale(tp, so, sendwin);
8714         /*
8715          * Decide if we can use TCP Segmentation Offloading (if supported by
8716          * hardware).
8717          *
8718          * TSO may only be used if we are in a pure bulk sending state.  The
8719          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
8720          * options prevent using TSO.  With TSO the TCP header is the same
8721          * (except for the sequence number) for all generated packets.  This
8722          * makes it impossible to transmit any options which vary per
8723          * generated segment or packet.
8724          *
8725          * IPv4 handling has a clear separation of ip options and ip header
8726          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
8727          * the right thing below to provide length of just ip options and thus
8728          * checking for ipoptlen is enough to decide if ip options are present.
8729          */
8730
8731 #ifdef INET6
8732         if (isipv6)
8733                 ipoptlen = ip6_optlen(tp->t_inpcb);
8734         else
8735 #endif
8736                 if (tp->t_inpcb->inp_options)
8737                         ipoptlen = tp->t_inpcb->inp_options->m_len -
8738                             offsetof(struct ipoption, ipopt_list);
8739                 else
8740                         ipoptlen = 0;
8741 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8742         /*
8743          * Pre-calculate here as we save another lookup into the darknesses
8744          * of IPsec that way and can actually decide if TSO is ok.
8745          */
8746 #ifdef INET6
8747         if (isipv6 && IPSEC_ENABLED(ipv6))
8748                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
8749 #ifdef INET
8750         else
8751 #endif
8752 #endif                          /* INET6 */
8753 #ifdef INET
8754         if (IPSEC_ENABLED(ipv4))
8755                 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
8756 #endif                          /* INET */
8757 #endif
8758
8759 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8760         ipoptlen += ipsec_optlen;
8761 #endif
8762         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) &&
8763             (tp->t_port == 0) &&
8764             ((tp->t_flags & TF_SIGNATURE) == 0) &&
8765             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
8766             ipoptlen == 0)
8767                 tso = 1;
8768         {
8769                 uint32_t outstanding;
8770
8771                 outstanding = tp->snd_max - tp->snd_una;
8772                 if (tp->t_flags & TF_SENTFIN) {
8773                         /*
8774                          * If we sent a fin, snd_max is 1 higher than
8775                          * snd_una
8776                          */
8777                         outstanding--;
8778                 }
8779                 if (sack_rxmit) {
8780                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
8781                                 flags &= ~TH_FIN;
8782                 } else {
8783                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
8784                             sbused(sb)))
8785                                 flags &= ~TH_FIN;
8786                 }
8787         }
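             /*
              * Net effect of the block above: TH_FIN survives only when this
              * transmission (or the rsm being retransmitted from the
              * scoreboard) covers the last byte queued in the socket buffer;
              * otherwise the FIN is stripped and will ride on a later
              * segment.
              */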
8788         recwin = sbspace(&so->so_rcv);
8789
8790         /*
8791          * Sender silly window avoidance.  We transmit when len is non-zero
8792          * and any of the following conditions holds:
8793          * - We have a full segment (or more with TSO)
8794          * - This is the last buffer in a write()/send() and we are either idle
8795          *   or running NODELAY
8796          * - We've timed out (e.g. persist timer)
8797          * - We have more than 1/2 the maximum send window's worth of data (the
8798          *   receiver may be limiting the window size)
8799          * - We need to retransmit
8799          */
8800         if (len) {
8801                 if (len >= ctf_fixed_maxseg(tp)) {
8802                         pass = 1;
8803                         goto send;
8804                 }
8805                 /*
8806                  * NOTE! on localhost connections an 'ack' from the remote
8807                  * end may occur synchronously with the output and cause us
8808                  * to flush a buffer queued with moretocome.  XXX
8809                  *
8810                  */
8811                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
8812                     (idle || (tp->t_flags & TF_NODELAY)) &&
8813                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
8814                     (tp->t_flags & TF_NOPUSH) == 0) {
8815                         pass = 2;
8816                         goto send;
8817                 }
8818                 if (tp->t_flags & TF_FORCEDATA) {       /* typ. timeout case */
8819                         pass = 3;
8820                         goto send;
8821                 }
8822                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
8823                         goto send;
8824                 }
8825                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
8826                         pass = 4;
8827                         goto send;
8828                 }
8829                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
8830                         pass = 5;
8831                         goto send;
8832                 }
8833                 if (sack_rxmit) {
8834                         pass = 6;
8835                         goto send;
8836                 }
8837         }
8838         /*
8839          * Sending of standalone window updates.
8840          *
8841          * Window updates are important when we close our window due to a
8842          * full socket buffer and are opening it again after the application
8843          * reads data from it.  Once the window has opened again and the
8844          * remote end starts to send again the ACK clock takes over and
8845          * provides the most current window information.
8846          *
8847          * We must avoid the silly window syndrome, where every read from
8848          * the receive buffer, no matter how small, causes a window update
8849          * to be sent.  We also should avoid sending a flurry of window
8850          * updates when the socket buffer had queued a lot of data and the
8851          * application is doing small reads.
8852          *
8853          * Prevent a flurry of pointless window updates by only sending an
8854          * update when we can increase the advertised window by more than
8855          * 1/4th of the socket buffer capacity.  When the buffer is getting
8856          * full or is very small be more aggressive and send an update
8857          * whenever we can increase by two mss sized segments. In all other
8858          * situations the ACK's to new incoming data will carry further
8859          * window increases.
8860          *
8861          * Don't send an independent window update if a delayed ACK is
8862          * pending (it will get piggy-backed on it) or the remote side
8863          * already has done a half-close and won't send more data.  Skip
8864          * this if the connection is in T/TCP half-open state.
8865          */
8866         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
8867             !(tp->t_flags & TF_DELACK) &&
8868             !TCPS_HAVERCVDFIN(tp->t_state)) {
8869                 /*
8870                  * "adv" is the amount we could increase the window, taking
8871                  * into account that we are limited by TCP_MAXWIN <<
8872                  * tp->rcv_scale.
8873                  */
8874                 int32_t adv;
8875                 int oldwin;
8876
8877                 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
8878                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
8879                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
8880                         adv -= oldwin;
8881                 } else
8882                         oldwin = 0;
8883
8884                 /*
8885                  * If the new window size ends up being the same as the old
8886                  * size when it is scaled, then don't force a window update.
8887                  */
8888                 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
8889                         goto dontupdate;
8890
8891                 if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) &&
8892                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
8893                     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
8894                      so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) {
8895                         pass = 7;
8896                         goto send;
8897                 }
8898                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
8899                         goto send;
8900         }
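             /*
              * For example, with so_rcv.sb_hiwat = 64 kB and an MSS of 1460
              * the checks above only fire a standalone window update once
              * the window can grow by at least 2 * MSS (2920 bytes) and
              * either the increase is >= 16 kB (hiwat / 4) or the remaining
              * advertised space has dropped to <= 8 kB (hiwat / 8); smaller
              * increases are left to ride on ACKs for incoming data.
              */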
8901 dontupdate:
8902
8903         /*
8904          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
8905          * is also a catch-all for the retransmit timer timeout case.
8906          */
8907         if (tp->t_flags & TF_ACKNOW) {
8908                 pass = 8;
8909                 goto send;
8910         }
8911         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
8912                 pass = 9;
8913                 goto send;
8914         }
8915         if (SEQ_GT(tp->snd_up, tp->snd_una)) {
8916                 pass = 10;
8917                 goto send;
8918         }
8919         /*
8920          * If our state indicates that FIN should be sent and we have not
8921          * yet done so, then we need to send.
8922          */
8923         if ((flags & TH_FIN) &&
8924             (tp->snd_nxt == tp->snd_una)) {
8925                 pass = 11;
8926                 goto send;
8927         }
8928         /*
8929          * No reason to send a segment, just return.
8930          */
8931 just_return:
8932         SOCKBUF_UNLOCK(sb);
8933 just_return_nolock:
8934         if (tot_len_this_send == 0)
8935                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
8936         if (slot) {
8937                 /* set the rack tcb into the slot N */
8938                 counter_u64_add(rack_paced_segments, 1);
8939         } else if (tot_len_this_send) {
8940                 counter_u64_add(rack_unpaced_segments, 1);
8941         }
8942         /* Check if we need to go into persists or not */
8943         if ((rack->rc_in_persist == 0) &&
8944             (tp->snd_max == tp->snd_una) &&
8945             TCPS_HAVEESTABLISHED(tp->t_state) &&
8946             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
8947             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) &&
8948             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) {
8949                 /* Yes lets make sure to move to persist before timer-start */
8950                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
8951         }
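             /*
              * Entering persist here, before rack_start_hpts_timer() below,
              * presumably ensures the timer that gets armed is the persist
              * timer rather than a pacing or retransmit slot, since the
              * timer code keys off rc_in_persist.
              */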
8952         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
8953         rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
8954         tp->t_flags &= ~TF_FORCEDATA;
8955         return (0);
8956
8957 send:
8958         if ((flags & TH_FIN) &&
8959             sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8960                 /* 
8961                  * We do not transmit a FIN
8962                  * with data outstanding. We
8963                  * need to make it so all data
8964                  * is acked first.
8965                  */
8966                 flags &= ~TH_FIN;
8967         }
8968         if (doing_tlp == 0) {
8969                 /*
8970                  * Data is not a TLP, and it is not the rxt firing. If it is the
8971                  * rxt firing, we want to leave the tlp_in_progress flag on
8972                  * so we don't send another TLP. It has to be a rack timer
8973                  * or normal send (response to acked data) to clear the tlp
8974                  * in progress flag.
8975                  */
8976                 rack->rc_tlp_in_progress = 0;
8977         }
8978         SOCKBUF_LOCK_ASSERT(sb);
8979         if (len > 0) {
8980                 if (len >= ctf_fixed_maxseg(tp))
8981                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
8982                 else
8983                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
8984         }
8985         /*
8986          * Before ESTABLISHED, force sending of initial options unless TCP
8987          * set not to do any options. NOTE: we assume that the IP/TCP header
8988          * plus TCP options always fit in a single mbuf, leaving room for a
8989          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
8990          * + optlen <= MCLBYTES
8991          */
8992         optlen = 0;
8993 #ifdef INET6
8994         if (isipv6)
8995                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
8996         else
8997 #endif
8998                 hdrlen = sizeof(struct tcpiphdr);
8999
9000         /*
9001          * Compute options for segment. We only have to care about SYN and
9002          * established connection segments.  Options for SYN-ACK segments
9003          * are handled in TCP syncache.
9004          */
9005         to.to_flags = 0;
9006         if ((tp->t_flags & TF_NOOPT) == 0) {
9007                 /* Maximum segment size. */
9008                 if (flags & TH_SYN) {
9009                         tp->snd_nxt = tp->iss;
9010                         to.to_mss = tcp_mssopt(&inp->inp_inc);
9011 #ifdef NETFLIX_TCPOUDP
9012                         if (tp->t_port)
9013                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
9014 #endif
9015                         to.to_flags |= TOF_MSS;
9016
9017                         /*
9018                          * On SYN or SYN|ACK transmits on TFO connections,
9019                          * only include the TFO option if it is not a
9020                          * retransmit, as the presence of the TFO option may
9021                          * have caused the original SYN or SYN|ACK to have
9022                          * been dropped by a middlebox.
9023                          */
9024                         if (IS_FASTOPEN(tp->t_flags) &&
9025                             (tp->t_rxtshift == 0)) {
9026                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
9027                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
9028                                         to.to_tfo_cookie =
9029                                             (u_int8_t *)&tp->t_tfo_cookie.server;
9030                                         to.to_flags |= TOF_FASTOPEN;
9031                                         wanted_cookie = 1;
9032                                 } else if (tp->t_state == TCPS_SYN_SENT) {
9033                                         to.to_tfo_len =
9034                                             tp->t_tfo_client_cookie_len;
9035                                         to.to_tfo_cookie =
9036                                             tp->t_tfo_cookie.client;
9037                                         to.to_flags |= TOF_FASTOPEN;
9038                                         wanted_cookie = 1;
9039                                         /*
9040                                          * If we wind up having more data to
9041                                          * send with the SYN than can fit in
9042                                          * one segment, don't send any more
9043                                          * until the SYN|ACK comes back from
9044                                          * the other end.
9045                                          */
9046                                         sendalot = 0;
9047                                 }
9048                         }
9049                 }
9050                 /* Window scaling. */
9051                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
9052                         to.to_wscale = tp->request_r_scale;
9053                         to.to_flags |= TOF_SCALE;
9054                 }
9055                 /* Timestamps. */
9056                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
9057                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
9058                         to.to_tsval = cts + tp->ts_offset;
9059                         to.to_tsecr = tp->ts_recent;
9060                         to.to_flags |= TOF_TS;
9061                 }
9062                 /* Set receive buffer autosizing timestamp. */
9063                 if (tp->rfbuf_ts == 0 &&
9064                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
9065                         tp->rfbuf_ts = tcp_ts_getticks();
9066                 /* Selective ACK's. */
9067                 if (flags & TH_SYN)
9068                         to.to_flags |= TOF_SACKPERM;
9069                 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9070                     tp->rcv_numsacks > 0) {
9071                         to.to_flags |= TOF_SACK;
9072                         to.to_nsacks = tp->rcv_numsacks;
9073                         to.to_sacks = (u_char *)tp->sackblks;
9074                 }
9075 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9076                 /* TCP-MD5 (RFC2385). */
9077                 if (tp->t_flags & TF_SIGNATURE)
9078                         to.to_flags |= TOF_SIGNATURE;
9079 #endif                          /* TCP_SIGNATURE */
9080
9081                 /* Processing the options. */
9082                 hdrlen += optlen = tcp_addoptions(&to, opt);
9083                 /*
9084                  * If we wanted a TFO option to be added, but it was unable
9085                  * to fit, ensure no data is sent.
9086                  */
9087                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
9088                     !(to.to_flags & TOF_FASTOPEN))
9089                         len = 0;
9090         }
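             /*
              * tcp_addoptions() encodes the options gathered in 'to' into
              * opt[] and returns their padded length, which is folded into
              * hdrlen so the header mbuf and checksum math below account
              * for them.  If a requested TFO cookie could not be encoded,
              * len was zeroed above so no data is sent with this segment.
              */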
9091 #ifdef NETFLIX_TCPOUDP
9092         if (tp->t_port) {
9093                 if (V_tcp_udp_tunneling_port == 0) {
9094                         /* The port was removed?? */
9095                         SOCKBUF_UNLOCK(&so->so_snd);
9096                         return (EHOSTUNREACH);
9097                 }
9098                 hdrlen += sizeof(struct udphdr);
9099         }
9100 #endif
9101 #ifdef INET6
9102         if (isipv6)
9103                 ipoptlen = ip6_optlen(tp->t_inpcb);
9104         else
9105 #endif
9106         if (tp->t_inpcb->inp_options)
9107                 ipoptlen = tp->t_inpcb->inp_options->m_len -
9108                     offsetof(struct ipoption, ipopt_list);
9109         else
9110                 ipoptlen = 0;
9111 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
9112         ipoptlen += ipsec_optlen;
9113 #endif
9114
9115 #ifdef KERN_TLS
9116         /* Force TSO so that TLS offload can get the MSS. */
9117         if (sb->sb_flags & SB_TLS_IFNET) {
9118                 force_tso = 1;
9119         }
9120 #endif
9121         /*
9122          * Adjust data length if insertion of options will bump the packet
9123          * length beyond the t_maxseg length. Clear the FIN bit because we
9124          * cut off the tail of the segment.
9125          */
9126         if (len + optlen + ipoptlen > tp->t_maxseg) {
9127                 if (tso) {
9128                         uint32_t if_hw_tsomax;
9129                         uint32_t moff;
9130                         int32_t max_len;
9131
9132                         /* extract TSO information */
9133                         if_hw_tsomax = tp->t_tsomax;
9134                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
9135                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
9136                         KASSERT(ipoptlen == 0,
9137                             ("%s: TSO can't do IP options", __func__));
9138
9139                         /*
9140                          * Check if we should limit by maximum payload
9141                          * length:
9142                          */
9143                         if (if_hw_tsomax != 0) {
9144                                 /* compute maximum TSO length */
9145                                 max_len = (if_hw_tsomax - hdrlen -
9146                                     max_linkhdr);
9147                                 if (max_len <= 0) {
9148                                         len = 0;
9149                                 } else if (len > max_len) {
9150                                         sendalot = 1;
9151                                         len = max_len;
9152                                 }
9153                         }
9154                         /*
9155                          * Prevent the last segment from being fractional
9156                          * unless the send sockbuf can be emptied:
9157                          */
9158                         max_len = (tp->t_maxseg - optlen);
9159                         if (((sb_offset + len) < sbavail(sb)) &&
9160                             (hw_tls == 0)) {
9161                                 moff = len % (u_int)max_len;
9162                                 if (moff != 0) {
9163                                         len -= moff;
9164                                         sendalot = 1;
9165                                 }
9166                         }                               
9167                         /*
9168                          * In case there are too many small fragments don't
9169                          * use TSO:
9170                          */
9171                         if (len <= maxseg) {
9172                                 len = max_len;
9173                                 sendalot = 1;
9174                                 tso = 0;
9175                         }
9176                         /*
9177                          * Send the FIN in a separate segment after the bulk
9178                          * sending is done. We don't trust the TSO
9179                          * implementations to clear the FIN flag on all but
9180                          * the last segment.
9181                          */
9182                         if (tp->t_flags & TF_NEEDFIN)
9183                                 sendalot = 1;
9184
9185                 } else {
9186                         if (optlen + ipoptlen >= tp->t_maxseg) {
9187                                 /*
9188                                  * Since we don't have enough space to put
9189                                  * the IP header chain and the TCP header in
9190                                  * one packet as required by RFC 7112, don't
9191                                  * send it. Also ensure that at least one
9192                                  * byte of the payload can be put into the
9193                                  * TCP segment.
9194                                  */
9195                                 SOCKBUF_UNLOCK(&so->so_snd);
9196                                 error = EMSGSIZE;
9197                                 sack_rxmit = 0;
9198                                 goto out;
9199                         }
9200                         len = tp->t_maxseg - optlen - ipoptlen;
9201                         sendalot = 1;
9202                 }
9203         } else
9204                 tso = 0;
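             /*
              * Rough TSO sizing example with assumed values: if the
              * interface reports if_hw_tsomax = 65535 and
              * hdrlen + max_linkhdr = 70, a 100000 byte len is first cut to
              * 65465 (sendalot is set so the rest follows in another pass),
              * and is then rounded down to a multiple of
              * (t_maxseg - optlen) unless this send would empty the socket
              * buffer (or hardware TLS is in use), so the NIC only hands
              * out full-sized segments plus at most one trailing partial.
              */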
9205         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
9206             ("%s: len > IP_MAXPACKET", __func__));
9207 #ifdef DIAGNOSTIC
9208 #ifdef INET6
9209         if (max_linkhdr + hdrlen > MCLBYTES)
9210 #else
9211         if (max_linkhdr + hdrlen > MHLEN)
9212 #endif
9213                 panic("tcphdr too big");
9214 #endif
9215
9216         /*
9217          * This KASSERT is here to catch edge cases at a well defined place.
9218          * Before, those had triggered (random) panic conditions further
9219          * down.
9220          */
9221         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
9222         if ((len == 0) &&
9223             (flags & TH_FIN) &&
9224             (sbused(sb))) {
9225                 /*
9226                  * We have outstanding data, don't send a FIN by itself!
9227                  */
9228                 goto just_return;
9229         }
9230         /*
9231          * Grab a header mbuf, attaching a copy of data to be transmitted,
9232          * and initialize the header from the template for sends on this
9233          * connection.
9234          */
9235         if (len) {
9236                 uint32_t max_val;
9237                 uint32_t moff;
9238
9239                 if (rack->rc_pace_max_segs)
9240                         max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp);
9241                 else
9242                         max_val = len;
9243                 if (rack->r_ctl.rc_pace_max_segs < max_val)
9244                         max_val = rack->r_ctl.rc_pace_max_segs;
9245                 /*
9246                  * We allow a limit on sending with hptsi.
9247                  */
9248                 if (len > max_val) {
9249                         len = max_val;
9250                 }
9251 #ifdef INET6
9252                 if (MHLEN < hdrlen + max_linkhdr)
9253                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
9254                 else
9255 #endif
9256                         m = m_gethdr(M_NOWAIT, MT_DATA);
9257
9258                 if (m == NULL) {
9259                         SOCKBUF_UNLOCK(sb);
9260                         error = ENOBUFS;
9261                         sack_rxmit = 0;
9262                         goto out;
9263                 }
9264                 m->m_data += max_linkhdr;
9265                 m->m_len = hdrlen;
9266
9267                 /*
9268                  * Start the m_copy functions from the closest mbuf to the
9269                  * sb_offset in the socket buffer chain.
9270                  */
9271                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
9272                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
9273                         m_copydata(mb, moff, (int)len,
9274                             mtod(m, caddr_t)+hdrlen);
9275                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9276                                 sbsndptr_adv(sb, mb, len);
9277                         m->m_len += len;
9278                 } else {
9279                         struct sockbuf *msb;
9280
9281                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9282                                 msb = NULL;
9283                         else
9284                                 msb = sb;
9285                         m->m_next = tcp_m_copym(
9286 #ifdef NETFLIX_COPY_ARGS
9287                                 tp,
9288 #endif
9289                                 mb, moff, &len,
9290                             if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 
9291                             ((rsm == NULL) ? hw_tls : 0)
9292 #ifdef NETFLIX_COPY_ARGS
9293                                 , &filled_all
9294 #endif
9295                                 );
9296                         if (len <= (tp->t_maxseg - optlen)) {
9297                                 /* 
9298                                  * Must have run out of mbufs for the copy;
9299                                  * shorten it to no longer need tso.  Let's
9300                                  * not put on sendalot since we are low on
9301                                  * mbufs.
9302                                  */
9303                                 tso = 0;
9304                         }
9305                         if (m->m_next == NULL) {
9306                                 SOCKBUF_UNLOCK(sb);
9307                                 (void)m_free(m);
9308                                 error = ENOBUFS;
9309                                 sack_rxmit = 0;
9310                                 goto out;
9311                         }
9312                 }
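                     /*
                      * Payloads that fit in the header mbuf were copied in
                      * directly with m_copydata() above; anything larger is
                      * chained on via tcp_m_copym(), which builds a chain
                      * referencing the socket-buffer data where possible
                      * (honoring the TSO segment limits) rather than
                      * copying the bytes.
                      */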
9313                 if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
9314                         TCPSTAT_INC(tcps_sndprobe);
9315 #ifdef STATS
9316                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
9317                                 stats_voi_update_abs_u32(tp->t_stats,
9318                                     VOI_TCP_RETXPB, len);
9319                         else
9320                                 stats_voi_update_abs_u64(tp->t_stats,
9321                                     VOI_TCP_TXPB, len);
9322 #endif
9323                 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
9324                         if (rsm && (rsm->r_flags & RACK_TLP)) {
9325                                 /*
9326                                  * TLP should not count in retran count, but
9327                                  * in its own bin
9328                                  */
9329                                 counter_u64_add(rack_tlp_retran, 1);
9330                                 counter_u64_add(rack_tlp_retran_bytes, len);
9331                         } else {
9332                                 tp->t_sndrexmitpack++;
9333                                 TCPSTAT_INC(tcps_sndrexmitpack);
9334                                 TCPSTAT_ADD(tcps_sndrexmitbyte, len);
9335                         }
9336 #ifdef STATS
9337                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
9338                             len);
9339 #endif
9340                 } else {
9341                         TCPSTAT_INC(tcps_sndpack);
9342                         TCPSTAT_ADD(tcps_sndbyte, len);
9343 #ifdef STATS
9344                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
9345                             len);
9346 #endif
9347                 }
9348                 /*
9349                  * If we're sending everything we've got, set PUSH. (This
9350                  * will keep happy those implementations which only give
9351                  * data to the user when a buffer fills or a PUSH comes in.)
9352                  */
9353                 if (sb_offset + len == sbused(sb) &&
9354                     sbused(sb) &&
9355                     !(flags & TH_SYN))
9356                         flags |= TH_PUSH;
9357
9358                 /*
9359                  * Are we doing pacing?  If so we must calculate the slot.
9360                  * We only do hptsi in ESTABLISHED state, with no RESET
9361                  * being sent, and when we have data to send.
9362                  */
9363                 if (((tp->t_state == TCPS_ESTABLISHED) ||
9364                     (tp->t_state == TCPS_CLOSE_WAIT) ||
9365                     ((tp->t_state == TCPS_FIN_WAIT_1) &&
9366                     ((tp->t_flags & TF_SENTFIN) == 0) &&
9367                     ((flags & TH_FIN) == 0))) &&
9368                     ((flags & TH_RST) == 0)) {
9369                         /* Get our pacing rate */
9370                         tot_len_this_send += len;
9371                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send);
9372                 }
9373                 SOCKBUF_UNLOCK(sb);
9374         } else {
9375                 SOCKBUF_UNLOCK(sb);
9376                 if (tp->t_flags & TF_ACKNOW)
9377                         TCPSTAT_INC(tcps_sndacks);
9378                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
9379                         TCPSTAT_INC(tcps_sndctrl);
9380                 else if (SEQ_GT(tp->snd_up, tp->snd_una))
9381                         TCPSTAT_INC(tcps_sndurg);
9382                 else
9383                         TCPSTAT_INC(tcps_sndwinup);
9384
9385                 m = m_gethdr(M_NOWAIT, MT_DATA);
9386                 if (m == NULL) {
9387                         error = ENOBUFS;
9388                         sack_rxmit = 0;
9389                         goto out;
9390                 }
9391 #ifdef INET6
9392                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
9393                     MHLEN >= hdrlen) {
9394                         M_ALIGN(m, hdrlen);
9395                 } else
9396 #endif
9397                         m->m_data += max_linkhdr;
9398                 m->m_len = hdrlen;
9399         }
9400         SOCKBUF_UNLOCK_ASSERT(sb);
9401         m->m_pkthdr.rcvif = (struct ifnet *)0;
9402 #ifdef MAC
9403         mac_inpcb_create_mbuf(inp, m);
9404 #endif
9405 #ifdef INET6
9406         if (isipv6) {
9407                 ip6 = mtod(m, struct ip6_hdr *);
9408 #ifdef NETFLIX_TCPOUDP
9409                 if (tp->t_port) {
9410                         udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
9411                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9412                         udp->uh_dport = tp->t_port;
9413                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
9414                         udp->uh_ulen = htons(ulen);
9415                         th = (struct tcphdr *)(udp + 1);
9416                 } else
9417 #endif
9418                         th = (struct tcphdr *)(ip6 + 1);
9419                 tcpip_fillheaders(inp,
9420 #ifdef NETFLIX_TCPOUDP
9421                                   tp->t_port,
9422 #endif
9423                                   ip6, th);
9424         } else
9425 #endif                          /* INET6 */
9426         {
9427                 ip = mtod(m, struct ip *);
9428 #ifdef TCPDEBUG
9429                 ipov = (struct ipovly *)ip;
9430 #endif
9431 #ifdef NETFLIX_TCPOUDP
9432                 if (tp->t_port) {
9433                         udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
9434                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
9435                         udp->uh_dport = tp->t_port;
9436                         ulen = hdrlen + len - sizeof(struct ip);
9437                         udp->uh_ulen = htons(ulen);
9438                         th = (struct tcphdr *)(udp + 1);
9439                 } else
9440 #endif
9441                         th = (struct tcphdr *)(ip + 1);
9442                 tcpip_fillheaders(inp,
9443 #ifdef NETFLIX_TCPOUDP
9444                                   tp->t_port,
9445 #endif
9446                                   ip, th);
9447         }
9448         /*
9449          * Fill in fields, remembering maximum advertised window for use in
9450          * delaying messages about window sizes. If resending a FIN, be sure
9451          * not to use a new sequence number.
9452          */
9453         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
9454             tp->snd_nxt == tp->snd_max)
9455                 tp->snd_nxt--;
9456         /*
9457          * If we are starting a connection, send ECN setup SYN packet. If we
9458          * are on a retransmit, we may resend those bits a number of times
9459          * as per RFC 3168.
9460          */
9461         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
9462                 if (tp->t_rxtshift >= 1) {
9463                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
9464                                 flags |= TH_ECE | TH_CWR;
9465                 } else
9466                         flags |= TH_ECE | TH_CWR;
9467         }
9468         if (tp->t_state == TCPS_ESTABLISHED &&
9469             (tp->t_flags2 & TF2_ECN_PERMIT)) {
9470                 /*
9471                  * If the peer has ECN, mark data packets with ECN capable
9472                  * transmission (ECT). Ignore pure ack packets,
9473                  * retransmissions and window probes.
9474                  */
9475                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
9476                     !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
9477 #ifdef INET6
9478                         if (isipv6)
9479                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
9480                         else
9481 #endif
9482                                 ip->ip_tos |= IPTOS_ECN_ECT0;
9483                         TCPSTAT_INC(tcps_ecn_ect0);
9484                 }
9485                 /*
9486                  * Reply with proper ECN notifications.
9487                  */
9488                 if (tp->t_flags2 & TF2_ECN_SND_CWR) {
9489                         flags |= TH_CWR;
9490                         tp->t_flags2 &= ~TF2_ECN_SND_CWR;
9491                 }
9492                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
9493                         flags |= TH_ECE;
9494         }
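             /*
              * In short (per RFC 3168): only new data segments are marked
              * ECT(0); pure ACKs, retransmissions and 1-byte window probes
              * go out Not-ECT.  CWR is set on the first segment after we
              * have reacted to the peer's ECE, and ECE keeps being set for
              * as long as we still need to signal congestion experienced
              * back to the peer.
              */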
9495         /*
9496          * If we are doing retransmissions, then snd_nxt will not reflect
9497          * the first unsent octet.  For ACK only packets, we do not want the
9498          * sequence number of the retransmitted packet, we want the sequence
9499          * number of the next unsent octet.  So, if there is no data (and no
9500          * SYN or FIN), use snd_max instead of snd_nxt when filling in
9501          * ti_seq.  But if we are in persist state, snd_max might reflect
9502          * one byte beyond the right edge of the window, so use snd_nxt in
9503          * that case, since we know we aren't doing a retransmission.
9504          * (retransmit and persist are mutually exclusive...)
9505          */
9506         if (sack_rxmit == 0) {
9507                 if (len || (flags & (TH_SYN | TH_FIN)) ||
9508                     rack->rc_in_persist) {
9509                         th->th_seq = htonl(tp->snd_nxt);
9510                         rack_seq = tp->snd_nxt;
9511                 } else if (flags & TH_RST) {
9512                         /*
9513                          * For a Reset send the last cum ack in sequence
9514                          * (this like any other choice may still generate a
9515                          * challenge ack, if a ack-update packet is in
9516                          * flight).
9517                          */
9518                         th->th_seq = htonl(tp->snd_una);
9519                         rack_seq = tp->snd_una;
9520                 } else {
9521                         th->th_seq = htonl(tp->snd_max);
9522                         rack_seq = tp->snd_max;
9523                 }
9524         } else {
9525                 th->th_seq = htonl(rsm->r_start);
9526                 rack_seq = rsm->r_start;
9527         }
9528         th->th_ack = htonl(tp->rcv_nxt);
9529         if (optlen) {
9530                 bcopy(opt, th + 1, optlen);
9531                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
9532         }
9533         th->th_flags = flags;
9534         /*
9535          * Calculate receive window.  Don't shrink window, but avoid silly
9536          * window syndrome.
9537          * If a RST segment is sent, advertise a window of zero.
9538          */
9539         if (flags & TH_RST) {
9540                 recwin = 0;
9541         } else {
9542                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
9543                     recwin < (long)ctf_fixed_maxseg(tp))
9544                         recwin = 0;
9545                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
9546                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
9547                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
9548                 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
9549                         recwin = (long)TCP_MAXWIN << tp->rcv_scale;
9550         }
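             /*
              * Receive-window clamping summary: an RST advertises zero; a
              * window smaller than both one MSS and a quarter of the
              * receive buffer is rounded down to zero (receiver-side silly
              * window avoidance); the window is never pulled back below
              * what was already advertised (rcv_adv - rcv_nxt); and it is
              * capped at TCP_MAXWIN << rcv_scale.
              */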
9551
9552         /*
9553          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
9554          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
9555          * handled in syncache.
9556          */
9557         if (flags & TH_SYN)
9558                 th->th_win = htons((u_short)
9559                     (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
9560         else
9561                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
9562         /*
9563          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
9564          * window.  This may cause the remote transmitter to stall.  This
9565          * flag tells soreceive() to disable delayed acknowledgements when
9566          * draining the buffer.  This can occur if the receiver is
9567          * attempting to read more data than can be buffered prior to
9568          * transmitting on the connection.
9569          */
9570         if (th->th_win == 0) {
9571                 tp->t_sndzerowin++;
9572                 tp->t_flags |= TF_RXWIN0SENT;
9573         } else
9574                 tp->t_flags &= ~TF_RXWIN0SENT;
9575         if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
9576                 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
9577                 th->th_flags |= TH_URG;
9578         } else
9579                 /*
9580                  * If no urgent pointer to send, then we pull the urgent
9581                  * pointer to the left edge of the send window so that it
9582                  * doesn't drift into the send window on sequence number
9583                  * wraparound.
9584                  */
9585                 tp->snd_up = tp->snd_una;       /* drag it along */
9586
9587 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
9588         if (to.to_flags & TOF_SIGNATURE) {
9589                 /*
9590                  * Calculate MD5 signature and put it into the place
9591                  * determined before.
9592                  * NOTE: since TCP options buffer doesn't point into
9593                  * mbuf's data, calculate offset and use it.
9594                  */
9595                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
9596                     (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
9597                         /*
9598                          * Do not send segment if the calculation of MD5
9599                          * digest has failed.
9600                          */
9601                         goto out;
9602                 }
9603         }
9604 #endif
9605
9606         /*
9607          * Put TCP length in extended header, and then checksum extended
9608          * header and data.
9609          */
9610         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
9611 #ifdef INET6
9612         if (isipv6) {
9613                 /*
9614                  * ip6_plen does not need to be filled in now; it will
9615                  * be filled in by ip6_output.
9616                  */
9617                 if (tp->t_port) {
9618                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
9619                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9620                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
9621                         th->th_sum = htons(0);
9622                         UDPSTAT_INC(udps_opackets);
9623                 } else {
9624                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
9625                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9626                         th->th_sum = in6_cksum_pseudo(ip6,
9627                             sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
9628                             0);
9629                 }
9630         }
9631 #endif
9632 #if defined(INET6) && defined(INET)
9633         else
9634 #endif
9635 #ifdef INET
9636         {
9637                 if (tp->t_port) {
9638                         m->m_pkthdr.csum_flags = CSUM_UDP;
9639                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
9640                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
9641                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
9642                         th->th_sum = htons(0);
9643                         UDPSTAT_INC(udps_opackets);
9644                 } else {
9645                         m->m_pkthdr.csum_flags = CSUM_TCP;
9646                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
9647                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
9648                             ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
9649                             IPPROTO_TCP + len + optlen));
9650                 }
9651                 /* IP version must be set here for ipv4/ipv6 checking later */
9652                 KASSERT(ip->ip_v == IPVERSION,
9653                     ("%s: IP version incorrect: %d", __func__, ip->ip_v));
9654         }
9655 #endif
9656         /*
9657          * Enable TSO and specify the size of the segments. The TCP pseudo
9658          * header checksum is always provided. XXX: Fixme: This is currently
9659          * not the case for IPv6.
9660          */
9661         if (tso || force_tso) {
9662                 KASSERT(force_tso || len > tp->t_maxseg - optlen,
9663                     ("%s: len <= tso_segsz", __func__));
9664                 m->m_pkthdr.csum_flags |= CSUM_TSO;
9665                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
9666         }
9667         KASSERT(len + hdrlen == m_length(m, NULL),
9668             ("%s: mbuf chain different than expected: %d + %u != %u",
9669             __func__, len, hdrlen, m_length(m, NULL)));
9670
9671 #ifdef TCP_HHOOK
9672         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
9673         hhook_run_tcp_est_out(tp, th, &to, len, tso);
9674 #endif
9675 #ifdef TCPDEBUG
9676         /*
9677          * Trace.
9678          */
9679         if (so->so_options & SO_DEBUG) {
9680                 u_short save = 0;
9681
9682 #ifdef INET6
9683                 if (!isipv6)
9684 #endif
9685                 {
9686                         save = ipov->ih_len;
9687                         ipov->ih_len = htons(m->m_pkthdr.len    /* - hdrlen +
9688                               * (th->th_off << 2) */ );
9689                 }
9690                 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
9691 #ifdef INET6
9692                 if (!isipv6)
9693 #endif
9694                         ipov->ih_len = save;
9695         }
9696 #endif                          /* TCPDEBUG */
9697
9698         /* We're getting ready to send; log now. */
9699         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
9700                 union tcp_log_stackspecific log;
9701                 struct timeval tv;
9702
9703                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
9704                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
9705                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
9706                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
9707                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
9708                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
9709                 log.u_bbr.flex4 = orig_len;
9710                 if (filled_all)
9711                         log.u_bbr.flex5 = 0x80000000;
9712                 else
9713                         log.u_bbr.flex5 = 0;
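                     /* flex8 marks whether this send is a retransmission (rsm or SACK rexmit). */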
9714                 if (rsm || sack_rxmit) {
9715                         log.u_bbr.flex8 = 1;
9716                 } else {
9717                         log.u_bbr.flex8 = 0;
9718                 }
9719                 log.u_bbr.pkts_out = tp->t_maxseg;
9720                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
9721                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9722                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
9723                     len, &log, false, NULL, NULL, 0, &tv);
9724         } else
9725                 lgb = NULL;
9726
9727         /*
9728          * Fill in IP length and desired time to live and send to IP level.
9729          * There should be a better way to handle ttl and tos; we could keep
9730          * them in the template, but need a way to checksum without them.
9731          */
9732         /*
9733          * m->m_pkthdr.len should have been set before checksum calculation,
9734          * because in6_cksum() needs it.
9735          */
9736 #ifdef INET6
9737         if (isipv6) {
9738                 /*
9739                  * We set the hop limit separately for every segment, since
9740                  * the user might want to change the value via setsockopt.
9741                  * Also, the desired default hop limit might be changed via
9742                  * Neighbor Discovery.
9743                  */
9744                 ip6->ip6_hlim = in6_selecthlim(inp, NULL);
9745
9746                 /*
9747                  * Set the packet size here for the benefit of DTrace
9748                  * probes. ip6_output() will set it properly; it's supposed
9749                  * to include the option header lengths as well.
9750                  */
9751                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
9752
9753                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
9754                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9755                 else
9756                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9757
9758                 if (tp->t_state == TCPS_SYN_SENT)
9759                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
9760
9761                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
9762                 /* TODO: IPv6 IP6TOS_ECT bit on */
9763                 error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
9764                     &inp->inp_route6,
9765                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
9766                     NULL, NULL, inp);
9767
9768                 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
9769                         mtu = inp->inp_route6.ro_rt->rt_mtu;
9770         }
9771 #endif                          /* INET6 */
9772 #if defined(INET) && defined(INET6)
9773         else
9774 #endif
9775 #ifdef INET
9776         {
9777                 ip->ip_len = htons(m->m_pkthdr.len);
9778 #ifdef INET6
9779                 if (inp->inp_vflag & INP_IPV6PROTO)
9780                         ip->ip_ttl = in6_selecthlim(inp, NULL);
9781 #endif                          /* INET6 */
9782                 /*
9783                  * If we do path MTU discovery, then we set DF on every
9784                  * packet. This might not be the best thing to do according
9785                  * to RFC3390 Section 2. However, the tcp hostcache mitigates
9786                  * the problem so it affects only the first tcp connection
9787                  * with a host.
9788                  *
9789                  * NB: Don't set DF on small MTU/MSS to have a safe
9790                  * fallback.
9791                  */
9792                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
9793                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
9794                         if (tp->t_port == 0 || len < V_tcp_minmss) {
9795                                 ip->ip_off |= htons(IP_DF);
9796                         }
9797                 } else {
9798                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
9799                 }
9800
9801                 if (tp->t_state == TCPS_SYN_SENT)
9802                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
9803
9804                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
9805
9806                 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
9807                     ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
9808                     inp);
9809                 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
9810                         mtu = inp->inp_route.ro_rt->rt_mtu;
9811         }
9812 #endif                          /* INET */
9813
9814 out:
9815         if (lgb) {
9816                 lgb->tlb_errno = error;
9817                 lgb = NULL;
9818         }
9819         /*
9820          * In transmit state, time the transmission and arrange for the
9821          * retransmit.  In persist state, just set snd_max.
9822          */
9823         if (error == 0) {
9824                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
9825                     (tp->t_flags & TF_SACK_PERMIT) &&
9826                     tp->rcv_numsacks > 0)
9827                         tcp_clean_dsack_blocks(tp);
9828                 if (len == 0)
9829                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
9830                 else if (len == 1) {
9831                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
9832                 } else if (len > 1) {
9833                         int idx;
9834
9835                         idx = (len / ctf_fixed_maxseg(tp)) + 3;
9836                         if (idx >= TCP_MSS_ACCT_ATIMER)
9837                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
9838                         else
9839                                 counter_u64_add(rack_out_size[idx], 1);
9840                 }
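                     /* Classify this hardware-TLS send for stats and logging. */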
9841                 if (hw_tls && len > 0) {
9842                         if (filled_all) {
9843                                 counter_u64_add(rack_tls_filled, 1);
9844                                 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1);
9845                         } else {
9846                                 if (rsm) {
9847                                         counter_u64_add(rack_tls_rxt, 1);
9848                                         rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1);
9849                                 } else if (doing_tlp) {
9850                                         counter_u64_add(rack_tls_tlp, 1);
9851                                         rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1);
9852                                 } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) {
9853                                         counter_u64_add(rack_tls_app, 1);
9854                                         rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1);
9855                                 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) {
9856                                         counter_u64_add(rack_tls_cwnd, 1);
9857                                         rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1);
9858                                 } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) {
9859                                         counter_u64_add(rack_tls_rwnd, 1);
9860                                         rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1);
9861                                 } else {
9862                                         rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1);
9863                                         counter_u64_add(rack_tls_other, 1);
9864                                 }
9865                         }
9866                 }
9867         }
9868         if (sub_from_prr && (error == 0)) {
9869                 if (rack->r_ctl.rc_prr_sndcnt >= len)
9870                         rack->r_ctl.rc_prr_sndcnt -= len;
9871                 else
9872                         rack->r_ctl.rc_prr_sndcnt = 0;
9873         }
9874         sub_from_prr = 0;
9875         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
9876             pass, rsm);
9877         if ((error == 0) &&
9878             (len > 0) &&
9879             (tp->snd_una == tp->snd_max))
9880                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
9881         if ((tp->t_flags & TF_FORCEDATA) == 0 ||
9882             (rack->rc_in_persist == 0)) {
9883                 tcp_seq startseq = tp->snd_nxt;
9884
9885                 /*
9886                  * Advance snd_nxt over sequence space of this segment.
9887                  */
9888                 if (error)
9889                         /* We don't log or do anything with errors */
9890                         goto nomore;
9891
9892                 if (flags & (TH_SYN | TH_FIN)) {
9893                         if (flags & TH_SYN)
9894                                 tp->snd_nxt++;
9895                         if (flags & TH_FIN) {
9896                                 tp->snd_nxt++;
9897                                 tp->t_flags |= TF_SENTFIN;
9898                         }
9899                 }
9900                 /* In the ENOBUFS case we do *not* update snd_max */
9901                 if (sack_rxmit)
9902                         goto nomore;
9903
9904                 tp->snd_nxt += len;
9905                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
9906                         if (tp->snd_una == tp->snd_max) {
9907                                 /*
9908                                  * Update the time we just added data since
9909                                  * none was outstanding.
9910                                  */
9911                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9912                                 tp->t_acktime = ticks;
9913                         }
9914                         tp->snd_max = tp->snd_nxt;
9915                         /*
9916                          * Time this transmission if not a retransmission and
9917                          * not currently timing anything.
9918                          * This is only relevant in case of switching back to
9919                          * the base stack.
9920                          */
9921                         if (tp->t_rtttime == 0) {
9922                                 tp->t_rtttime = ticks;
9923                                 tp->t_rtseq = startseq;
9924                                 TCPSTAT_INC(tcps_segstimed);
9925                         }
9926 #ifdef STATS
9927                         if (!(tp->t_flags & TF_GPUTINPROG) && len) {
9928                                 tp->t_flags |= TF_GPUTINPROG;
9929                                 tp->gput_seq = startseq;
9930                                 tp->gput_ack = startseq +
9931                                     ulmin(sbavail(sb) - sb_offset, sendwin);
9932                                 tp->gput_ts = tcp_ts_getticks();
9933                         }
9934 #endif
9935                 }
9936         } else {
9937                 /*
9938                  * Persist case, update snd_max but since we are in persist
9939                  * mode (no window) we do not update snd_nxt.
9940                  */
9941                 int32_t xlen = len;
9942
9943                 if (error)
9944                         goto nomore;
9945
9946                 if (flags & TH_SYN)
9947                         ++xlen;
9948                 if (flags & TH_FIN) {
9949                         ++xlen;
9950                         tp->t_flags |= TF_SENTFIN;
9951                 }
9952                 /* In the ENOBUFS case we do *not* update snd_max */
9953                 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
9954                         if (tp->snd_una == tp->snd_max) {
9955                                 /*
9956                                  * Update the time we just added data since
9957                                  * none was outstanding.
9958                                  */
9959                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9960                                 tp->t_acktime = ticks;
9961                         }
9962                         tp->snd_max = tp->snd_nxt + len;
9963                 }
9964         }
9965 nomore:
9966         if (error) {
9967                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
9968                 /*
9969                  * Failures do not advance the seq counter above. For the
9970                  * case of ENOBUFS we will fall out and retry in 1ms with
9971                  * the hpts. Everything else will just have to retransmit
9972                  * with the timer.
9973                  *
9974                  * In any case, we do not want to loop around for another
9975                  * send without a good reason.
9976                  */
9977                 sendalot = 0;
9978                 switch (error) {
9979                 case EPERM:
9980                         tp->t_flags &= ~TF_FORCEDATA;
9981                         tp->t_softerror = error;
9982                         return (error);
9983                 case ENOBUFS:
9984                         if (slot == 0) {
9985                                 /*
9986                                  * Pace us right away so that we retry in a
9987                                  * short time.
9988                                  */
9989                                 slot = 1 + rack->rc_enobuf;
9990                                 if (rack->rc_enobuf < 255)
9991                                         rack->rc_enobuf++;
9992                                 if (slot > (rack->rc_rack_rtt / 2)) {
9993                                         slot = rack->rc_rack_rtt / 2;
9994                                 }
9995                                 if (slot < 10)
9996                                         slot = 10;
9997                         }
9998                         counter_u64_add(rack_saw_enobuf, 1);
9999                         error = 0;
10000                         goto enobufs;
10001                 case EMSGSIZE:
10002                         /*
10003                          * For some reason the interface we used initially
10004                          * to send segments changed to another one, or it
10005                          * lowered its MTU. If TSO was active we either got
10006                          * an interface without TSO capabilities or TSO was
10007                          * turned off. If we obtained the mtu from ip_output()
10008                          * then update it and try again.
10009                          */
10010                         if (tso)
10011                                 tp->t_flags &= ~TF_TSO;
10012                         if (mtu != 0) {
10013                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
10014                                 goto again;
10015                         }
10016                         slot = 10;
10017                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10018                         tp->t_flags &= ~TF_FORCEDATA;
10019                         return (error);
10020                 case ENETUNREACH:
10021                         counter_u64_add(rack_saw_enetunreach, 1);
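                              /* FALLTHROUGH */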
10022                 case EHOSTDOWN:
10023                 case EHOSTUNREACH:
10024                 case ENETDOWN:
10025                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
10026                                 tp->t_softerror = error;
10027                         }
10028                         /* FALLTHROUGH */
10029                 default:
10030                         slot = 10;
10031                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10032                         tp->t_flags &= ~TF_FORCEDATA;
10033                         return (error);
10034                 }
10035         } else {
10036                 rack->rc_enobuf = 0;
10037         }
10038         TCPSTAT_INC(tcps_sndtotal);
10039
10040         /*
10041          * Data sent (as far as we can tell). If this advertises a larger
10042          * window than any other segment, then remember the size of the
10043          * advertised window. Any pending ACK has now been sent.
10044          */
10045         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
10046                 tp->rcv_adv = tp->rcv_nxt + recwin;
10047         tp->last_ack_sent = tp->rcv_nxt;
10048         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
10049 enobufs:
10050         rack->r_tlp_running = 0;
10051         if (flags & TH_RST) {
10052                 /*
10053                  * We don't send again after sending a RST. 
10054                  */
10055                 slot = 0;
10056                 sendalot = 0;
10057         }
10058         if (rsm && (slot == 0)) {
10059                 /* 
10060                  * This is possibly a dup-ack retransmission, so
10061                  * let's assure we have at least the minimum rack
10062                  * timeout; if it is a rack resend then the rack
10063                  * timeout will also be set to this value.
10064                  */
10065                 slot = rack->r_ctl.rc_min_to;
10066         }
10067         if (slot) {
10068                 /* set the rack tcb into the slot N */
10069                 counter_u64_add(rack_paced_segments, 1);
10070         } else if (sendalot) {
10071                 if (len)
10072                         counter_u64_add(rack_unpaced_segments, 1);
10073                 sack_rxmit = 0;
10074                 tp->t_flags &= ~TF_FORCEDATA;
10075                 goto again;
10076         } else if (len) {
10077                 counter_u64_add(rack_unpaced_segments, 1);
10078         }
10079         tp->t_flags &= ~TF_FORCEDATA;
10080         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
10081         return (error);
10082 }
10083
10084 /*
10085  * rack_ctloutput() must drop the inpcb lock before performing copyin on
10086  * socket option arguments.  When it re-acquires the lock after the copy, it
10087  * has to revalidate that the connection is still valid for the socket
10088  * option.
10089  */
10090 static int
10091 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
10092     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
10093 {
10094         int32_t error = 0, optval;
10095
10096         switch (sopt->sopt_name) {
10097         case TCP_RACK_PROP_RATE:
10098         case TCP_RACK_PROP:
10099         case TCP_RACK_TLP_REDUCE:
10100         case TCP_RACK_EARLY_RECOV:
10101         case TCP_RACK_PACE_ALWAYS:
10102         case TCP_DELACK:
10103         case TCP_RACK_PACE_REDUCE:
10104         case TCP_RACK_PACE_MAX_SEG:
10105         case TCP_RACK_PRR_SENDALOT:
10106         case TCP_RACK_MIN_TO:
10107         case TCP_RACK_EARLY_SEG:
10108         case TCP_RACK_REORD_THRESH:
10109         case TCP_RACK_REORD_FADE:
10110         case TCP_RACK_TLP_THRESH:
10111         case TCP_RACK_PKT_DELAY:
10112         case TCP_RACK_TLP_USE:
10113         case TCP_RACK_TLP_INC_VAR:
10114         case TCP_RACK_IDLE_REDUCE_HIGH:
10115         case TCP_RACK_MIN_PACE:
10116         case TCP_RACK_GP_INCREASE:
10117         case TCP_BBR_RACK_RTT_USE:
10118         case TCP_BBR_USE_RACK_CHEAT:
10119         case TCP_RACK_DO_DETECTION:
10120         case TCP_DATA_AFTER_CLOSE:
10121                 break;
10122         default:
10123                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10124                 break;
10125         }
10126         INP_WUNLOCK(inp);
10127         error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
10128         if (error)
10129                 return (error);
10130         INP_WLOCK(inp);
10131         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
10132                 INP_WUNLOCK(inp);
10133                 return (ECONNRESET);
10134         }
10135         tp = intotcpcb(inp);
10136         rack = (struct tcp_rack *)tp->t_fb_ptr;
10137         switch (sopt->sopt_name) {
10138         case TCP_RACK_DO_DETECTION:
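                      /* Enable or disable RACK's detection logic (bool) */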
10139                 RACK_OPTS_INC(tcp_rack_do_detection);
10140                 if (optval == 0)
10141                         rack->do_detection = 0;
10142                 else
10143                         rack->do_detection = 1;
10144                 break;
10145         case TCP_RACK_PROP_RATE:
10146                 if ((optval <= 0) || (optval >= 100)) {
10147                         error = EINVAL;
10148                         break;
10149                 }
10150                 RACK_OPTS_INC(tcp_rack_prop_rate);
10151                 rack->r_ctl.rc_prop_rate = optval;
10152                 break;
10153         case TCP_RACK_TLP_USE:
10154                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
10155                         error = EINVAL;
10156                         break;
10157                 }
10158                 RACK_OPTS_INC(tcp_tlp_use);
10159                 rack->rack_tlp_threshold_use = optval;
10160                 break;
10161         case TCP_RACK_PROP:
10162                 /* RACK proportional rate reduction (bool) */
10163                 RACK_OPTS_INC(tcp_rack_prop);
10164                 rack->r_ctl.rc_prop_reduce = optval;
10165                 break;
10166         case TCP_RACK_TLP_REDUCE:
10167                 /* RACK TLP cwnd reduction (bool) */
10168                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
10169                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
10170                 break;
10171         case TCP_RACK_EARLY_RECOV:
10172                 /* Should recovery happen early (bool) */
10173                 RACK_OPTS_INC(tcp_rack_early_recov);
10174                 rack->r_ctl.rc_early_recovery = optval;
10175                 break;
10176         case TCP_RACK_PACE_ALWAYS:
10177                 /* Use the always pace method (bool)  */
10178                 RACK_OPTS_INC(tcp_rack_pace_always);
10179                 if (optval > 0)
10180                         rack->rc_always_pace = 1;
10181                 else
10182                         rack->rc_always_pace = 0;
10183                 break;
10184         case TCP_RACK_PACE_REDUCE:
10185                 /* RACK Hptsi reduction factor (divisor) */
10186                 RACK_OPTS_INC(tcp_rack_pace_reduce);
10187                 if (optval)
10188                         /* Must be non-zero */
10189                         rack->rc_pace_reduce = optval;
10190                 else
10191                         error = EINVAL;
10192                 break;
10193         case TCP_RACK_PACE_MAX_SEG:
10194                 /* Max segments in a pace */
10195                 RACK_OPTS_INC(tcp_rack_max_seg);
10196                 rack->rc_pace_max_segs = optval;
10197                 rack_set_pace_segments(tp, rack);
10198                 break;
10199         case TCP_RACK_PRR_SENDALOT:
10200                 /* Allow PRR to send more than one seg */
10201                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
10202                 rack->r_ctl.rc_prr_sendalot = optval;
10203                 break;
10204         case TCP_RACK_MIN_TO:
10205                 /* Minimum time between rack t-o's in ms */
10206                 RACK_OPTS_INC(tcp_rack_min_to);
10207                 rack->r_ctl.rc_min_to = optval;
10208                 break;
10209         case TCP_RACK_EARLY_SEG:
10210                 /* If early recovery max segments */
10211                 RACK_OPTS_INC(tcp_rack_early_seg);
10212                 rack->r_ctl.rc_early_recovery_segs = optval;
10213                 break;
10214         case TCP_RACK_REORD_THRESH:
10215                 /* RACK reorder threshold (shift amount) */
10216                 RACK_OPTS_INC(tcp_rack_reord_thresh);
10217                 if ((optval > 0) && (optval < 31))
10218                         rack->r_ctl.rc_reorder_shift = optval;
10219                 else
10220                         error = EINVAL;
10221                 break;
10222         case TCP_RACK_REORD_FADE:
10223                 /* Does reordering fade after ms time */
10224                 RACK_OPTS_INC(tcp_rack_reord_fade);
10225                 rack->r_ctl.rc_reorder_fade = optval;
10226                 break;
10227         case TCP_RACK_TLP_THRESH:
10228                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
10229                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
10230                 if (optval)
10231                         rack->r_ctl.rc_tlp_threshold = optval;
10232                 else
10233                         error = EINVAL;
10234                 break;
10235         case TCP_BBR_USE_RACK_CHEAT:
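                      /* Do we use the rack cheat for rxt (bool) */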
10236                 RACK_OPTS_INC(tcp_rack_cheat);
10237                 if (optval)
10238                         rack->use_rack_cheat = 1;
10239                 else
10240                         rack->use_rack_cheat = 0;
10241                 break;
10242         case TCP_RACK_PKT_DELAY:
10243                 /* RACK added ms i.e. rack-rtt + reord + N */
10244                 RACK_OPTS_INC(tcp_rack_pkt_delay);
10245                 rack->r_ctl.rc_pkt_delay = optval;
10246                 break;
10247         case TCP_RACK_TLP_INC_VAR:
10248                 /* Does TLP include rtt variance in t-o */
10249                 error = EINVAL;
10250                 break;
10251         case TCP_RACK_IDLE_REDUCE_HIGH:
10252                 error = EINVAL;
10253                 break;
10254         case TCP_DELACK:
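                      /* Enable/disable delayed ACKs (bool) */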
10255                 if (optval == 0)
10256                         tp->t_delayed_ack = 0;
10257                 else
10258                         tp->t_delayed_ack = 1;
10259                 if (tp->t_flags & TF_DELACK) {
10260                         tp->t_flags &= ~TF_DELACK;
10261                         tp->t_flags |= TF_ACKNOW;
10262                         rack_output(tp);
10263                 }
10264                 break;
10265         case TCP_RACK_MIN_PACE:
10266                 RACK_OPTS_INC(tcp_rack_min_pace);
10267                 if (optval > 3)
10268                         rack->r_enforce_min_pace = 3;
10269                 else
10270                         rack->r_enforce_min_pace = optval;
10271                 break;
10272         case TCP_RACK_GP_INCREASE:
10273                 if ((optval >= 0) &&
10274                     (optval <= 256)) 
10275                         rack->rack_per_of_gp = optval;
10276                 else
10277                         error = EINVAL;
10278
10279                 break;
10280         case TCP_BBR_RACK_RTT_USE:
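                      /* Which RTT sample feeds the rate estimate: high, low or average */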
10281                 if ((optval != USE_RTT_HIGH) &&
10282                     (optval != USE_RTT_LOW) &&
10283                     (optval != USE_RTT_AVG))
10284                         error = EINVAL;
10285                 else
10286                         rack->r_ctl.rc_rate_sample_method = optval;
10287                 break;
10288         case TCP_DATA_AFTER_CLOSE:
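                      /* Allow data to be handled after the connection is closed (bool) */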
10289                 if (optval)
10290                         rack->rc_allow_data_af_clo = 1;
10291                 else
10292                         rack->rc_allow_data_af_clo = 0;
10293                 break;
10294         default:
10295                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10296                 break;
10297         }
10298 #ifdef NETFLIX_STATS
10299         tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
10300 #endif
10301         INP_WUNLOCK(inp);
10302         return (error);
10303 }
10304
10305 static int
10306 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
10307     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
10308 {
10309         int32_t error, optval;
10310
10311         /*
10312          * Because all our options are either boolean or an int, we can just
10313          * pull everything into optval and then unlock and copy. If we ever
10314          * add an option that is not an int, then this will have quite an
10315          * impact on this routine.
10316          */
10317         error = 0;
10318         switch (sopt->sopt_name) {
10319         case TCP_RACK_DO_DETECTION:
10320                 optval = rack->do_detection;
10321                 break;
10322
10323         case TCP_RACK_PROP_RATE:
10324                 optval = rack->r_ctl.rc_prop_rate;
10325                 break;
10326         case TCP_RACK_PROP:
10327                 /* RACK proportional rate reduction (bool) */
10328                 optval = rack->r_ctl.rc_prop_reduce;
10329                 break;
10330         case TCP_RACK_TLP_REDUCE:
10331                 /* RACK TLP cwnd reduction (bool) */
10332                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
10333                 break;
10334         case TCP_RACK_EARLY_RECOV:
10335                 /* Should recovery happen early (bool) */
10336                 optval = rack->r_ctl.rc_early_recovery;
10337                 break;
10338         case TCP_RACK_PACE_REDUCE:
10339                 /* RACK Hptsi reduction factor (divisor) */
10340                 optval = rack->rc_pace_reduce;
10341                 break;
10342         case TCP_RACK_PACE_MAX_SEG:
10343                 /* Max segments in a pace */
10344                 optval = rack->rc_pace_max_segs;
10345                 break;
10346         case TCP_RACK_PACE_ALWAYS:
10347                 /* Use the always pace method */
10348                 optval = rack->rc_always_pace;
10349                 break;
10350         case TCP_RACK_PRR_SENDALOT:
10351                 /* Allow PRR to send more than one seg */
10352                 optval = rack->r_ctl.rc_prr_sendalot;
10353                 break;
10354         case TCP_RACK_MIN_TO:
10355                 /* Minimum time between rack t-o's in ms */
10356                 optval = rack->r_ctl.rc_min_to;
10357                 break;
10358         case TCP_RACK_EARLY_SEG:
10359                 /* If early recovery max segments */
10360                 optval = rack->r_ctl.rc_early_recovery_segs;
10361                 break;
10362         case TCP_RACK_REORD_THRESH:
10363                 /* RACK reorder threshold (shift amount) */
10364                 optval = rack->r_ctl.rc_reorder_shift;
10365                 break;
10366         case TCP_RACK_REORD_FADE:
10367                 /* Does reordering fade after ms time */
10368                 optval = rack->r_ctl.rc_reorder_fade;
10369                 break;
10370         case TCP_BBR_USE_RACK_CHEAT:
10371                 /* Do we use the rack cheat for rxt */
10372                 optval = rack->use_rack_cheat;
10373                 break;
10374         case TCP_RACK_TLP_THRESH:
10375                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
10376                 optval = rack->r_ctl.rc_tlp_threshold;
10377                 break;
10378         case TCP_RACK_PKT_DELAY:
10379                 /* RACK added ms i.e. rack-rtt + reord + N */
10380                 optval = rack->r_ctl.rc_pkt_delay;
10381                 break;
10382         case TCP_RACK_TLP_USE:
10383                 optval = rack->rack_tlp_threshold_use;
10384                 break;
10385         case TCP_RACK_TLP_INC_VAR:
10386                 /* Does TLP include rtt variance in t-o */
10387                 error = EINVAL;
10388                 break;
10389         case TCP_RACK_IDLE_REDUCE_HIGH:
10390                 error = EINVAL;
10391                 break;
10392         case TCP_RACK_MIN_PACE:
10393                 optval = rack->r_enforce_min_pace;
10394                 break;
10395         case TCP_RACK_GP_INCREASE:
10396                 optval = rack->rack_per_of_gp;
10397                 break;
10398         case TCP_BBR_RACK_RTT_USE:
10399                 optval = rack->r_ctl.rc_rate_sample_method;
10400                 break;
10401         case TCP_DELACK:
10402                 optval = tp->t_delayed_ack;
10403                 break;
10404         case TCP_DATA_AFTER_CLOSE:
10405                 optval = rack->rc_allow_data_af_clo;
10406                 break;
10407         default:
10408                 return (tcp_default_ctloutput(so, sopt, inp, tp));
10409                 break;
10410         }
10411         INP_WUNLOCK(inp);
10412         if (error == 0) {
10413                 error = sooptcopyout(sopt, &optval, sizeof optval);
10414         }
10415         return (error);
10416 }
10417
10418 static int
10419 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
10420 {
10421         int32_t error = EINVAL;
10422         struct tcp_rack *rack;
10423
10424         rack = (struct tcp_rack *)tp->t_fb_ptr;
10425         if (rack == NULL) {
10426                 /* Huh? */
10427                 goto out;
10428         }
10429         if (sopt->sopt_dir == SOPT_SET) {
10430                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
10431         } else if (sopt->sopt_dir == SOPT_GET) {
10432                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
10433         }
10434 out:
10435         INP_WUNLOCK(inp);
10436         return (error);
10437 }
10438
10439
10440 static struct tcp_function_block __tcp_rack = {
10441         .tfb_tcp_block_name = __XSTRING(STACKNAME),
10442         .tfb_tcp_output = rack_output,
10443         .tfb_do_queued_segments = ctf_do_queued_segments,
10444         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
10445         .tfb_tcp_do_segment = rack_do_segment,
10446         .tfb_tcp_ctloutput = rack_ctloutput,
10447         .tfb_tcp_fb_init = rack_init,
10448         .tfb_tcp_fb_fini = rack_fini,
10449         .tfb_tcp_timer_stop_all = rack_stopall,
10450         .tfb_tcp_timer_activate = rack_timer_activate,
10451         .tfb_tcp_timer_active = rack_timer_active,
10452         .tfb_tcp_timer_stop = rack_timer_stop,
10453         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
10454         .tfb_tcp_handoff_ok = rack_handoff_ok
10455 };
10456
10457 static const char *rack_stack_names[] = {
10458         __XSTRING(STACKNAME),
10459 #ifdef STACKALIAS
10460         __XSTRING(STACKALIAS),
10461 #endif
10462 };
10463
10464 static int
10465 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
10466 {
10467         memset(mem, 0, size);
10468         return (0);
10469 }
10470
10471 static void
10472 rack_dtor(void *mem, int32_t size, void *arg)
10473 {
10474
10475 }
10476
10477 static bool rack_mod_inited = false;
10478
10479 static int
10480 tcp_addrack(module_t mod, int32_t type, void *data)
10481 {
10482         int32_t err = 0;
10483         int num_stacks;
10484
10485         switch (type) {
10486         case MOD_LOAD:
10487                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
10488                     sizeof(struct rack_sendmap),
10489                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
10490
10491                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
10492                     sizeof(struct tcp_rack),
10493                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
10494
10495                 sysctl_ctx_init(&rack_sysctl_ctx);
10496                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
10497                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
10498                     OID_AUTO,
10499 #ifdef STACKALIAS
10500                     __XSTRING(STACKALIAS),
10501 #else
10502                     __XSTRING(STACKNAME),
10503 #endif
10504                     CTLFLAG_RW, 0,
10505                     "");
10506                 if (rack_sysctl_root == NULL) {
10507                         printf("Failed to add sysctl node\n");
10508                         err = EFAULT;
10509                         goto free_uma;
10510                 }
10511                 rack_init_sysctls();
10512                 num_stacks = nitems(rack_stack_names);
10513                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
10514                     rack_stack_names, &num_stacks);
10515                 if (err) {
10516                         printf("Failed to register %s stack name for "
10517                             "%s module\n", rack_stack_names[num_stacks],
10518                             __XSTRING(MODNAME));
10519                         sysctl_ctx_free(&rack_sysctl_ctx);
10520 free_uma:
10521                         uma_zdestroy(rack_zone);
10522                         uma_zdestroy(rack_pcb_zone);
10523                         rack_counter_destroy();
10524                         printf("Failed to register rack module -- err:%d\n", err);
10525                         return (err);
10526                 }
10527                 tcp_lro_reg_mbufq();
10528                 rack_mod_inited = true;
10529                 break;
10530         case MOD_QUIESCE:
10531                 err = deregister_tcp_functions(&__tcp_rack, true, false);
10532                 break;
10533         case MOD_UNLOAD:
10534                 err = deregister_tcp_functions(&__tcp_rack, false, true);
10535                 if (err == EBUSY)
10536                         break;
10537                 if (rack_mod_inited) {
10538                         uma_zdestroy(rack_zone);
10539                         uma_zdestroy(rack_pcb_zone);
10540                         sysctl_ctx_free(&rack_sysctl_ctx);
10541                         rack_counter_destroy();
10542                         rack_mod_inited = false;
10543                 }
10544                 tcp_lro_dereg_mbufq();
10545                 err = 0;
10546                 break;
10547         default:
10548                 return (EOPNOTSUPP);
10549         }
10550         return (err);
10551 }
10552
10553 static moduledata_t tcp_rack = {
10554         .name = __XSTRING(MODNAME),
10555         .evhand = tcp_addrack,
10556         .priv = 0
10557 };
10558
10559 MODULE_VERSION(MODNAME, 1);
10560 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
10561 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);