1 /*-
2  * Copyright (c) 2016-2020 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 #include "opt_ratelimit.h"
35 #include "opt_kern_tls.h"
36 #include <sys/param.h>
37 #include <sys/arb.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
40 #ifdef TCP_HHOOK
41 #include <sys/hhook.h>
42 #endif
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/mbuf.h>
48 #include <sys/proc.h>           /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #ifdef STATS
54 #include <sys/qmath.h>
55 #include <sys/tree.h>
56 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
57 #else
58 #include <sys/tree.h>
59 #endif
60 #include <sys/refcount.h>
61 #include <sys/queue.h>
62 #include <sys/tim_filter.h>
63 #include <sys/smp.h>
64 #include <sys/kthread.h>
65 #include <sys/kern_prefetch.h>
66 #include <sys/protosw.h>
67 #ifdef TCP_ACCOUNTING
68 #include <sys/sched.h>
69 #include <machine/cpu.h>
70 #endif
71 #include <vm/uma.h>
72
73 #include <net/route.h>
74 #include <net/route/nhop.h>
75 #include <net/vnet.h>
76
77 #define TCPSTATES               /* for logging */
78
79 #include <netinet/in.h>
80 #include <netinet/in_kdtrace.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/ip.h>
83 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
84 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
85 #include <netinet/ip_var.h>
86 #include <netinet/ip6.h>
87 #include <netinet6/in6_pcb.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet/tcp.h>
90 #define TCPOUTFLAGS
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_log_buf.h>
93 #include <netinet/tcp_seq.h>
94 #include <netinet/tcp_timer.h>
95 #include <netinet/tcp_var.h>
96 #include <netinet/tcp_hpts.h>
97 #include <netinet/tcp_ratelimit.h>
98 #include <netinet/tcp_accounting.h>
99 #include <netinet/tcpip.h>
100 #include <netinet/cc/cc.h>
101 #include <netinet/cc/cc_newreno.h>
102 #include <netinet/tcp_fastopen.h>
103 #include <netinet/tcp_lro.h>
104 #ifdef NETFLIX_SHARED_CWND
105 #include <netinet/tcp_shared_cwnd.h>
106 #endif
107 #ifdef TCPDEBUG
108 #include <netinet/tcp_debug.h>
109 #endif                          /* TCPDEBUG */
110 #ifdef TCP_OFFLOAD
111 #include <netinet/tcp_offload.h>
112 #endif
113 #ifdef INET6
114 #include <netinet6/tcp6_var.h>
115 #endif
116
117 #include <netipsec/ipsec_support.h>
118
119 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
120 #include <netipsec/ipsec.h>
121 #include <netipsec/ipsec6.h>
122 #endif                          /* IPSEC */
123
124 #include <netinet/udp.h>
125 #include <netinet/udp_var.h>
126 #include <machine/in_cksum.h>
127
128 #ifdef MAC
129 #include <security/mac/mac_framework.h>
130 #endif
131 #include "sack_filter.h"
132 #include "tcp_rack.h"
133 #include "rack_bbr_common.h"
134
135 uma_zone_t rack_zone;
136 uma_zone_t rack_pcb_zone;
137
138 #ifndef TICKS2SBT
139 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
140 #endif
141
142 VNET_DECLARE(uint32_t, newreno_beta);
143 VNET_DECLARE(uint32_t, newreno_beta_ecn);
144 #define V_newreno_beta VNET(newreno_beta)
145 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
146
147
148 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
149 MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
150
151 struct sysctl_ctx_list rack_sysctl_ctx;
152 struct sysctl_oid *rack_sysctl_root;
153
154 #define CUM_ACKED 1
155 #define SACKED 2
156
157 /*
158  * The RACK module incorporates a number of
159  * TCP ideas that have been put out into the IETF
160  * over the last few years:
161  * - Matt Mathis's Rate Halving which slowly drops
162  *    the congestion window so that the ack clock can
163  *    be maintained during a recovery.
164  * - Yuchung Cheng's RACK TCP (for which it's named) that
165  *    will stop us using the number of dup acks and instead
166  *    use time as the gauge of when we retransmit.
167  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
168  *    of Dukkipati et al.
169  * RACK depends on SACK, so if an endpoint arrives that
170  * cannot do SACK the state machine below will shuttle the
171  * connection back to using the "default" TCP stack that is
172  * in FreeBSD.
173  *
174  * To implement RACK the original TCP stack was first decomposed
175  * into a functional state machine with individual states
176  * for each of the possible TCP connection states. The do_segment
177  * function's role is to mandate that the connection supports SACK
178  * initially and then assure that the RACK state matches the connection
179  * state before calling the state's do_segment function. Each
180  * state is simplified due to the fact that the original do_segment
181  * has been decomposed and we *know* what state we are in (no
182  * switches on the state) and all tests for SACK are gone. This
183  * greatly simplifies what each state does.
184  *
185  * TCP output is also over-written with a new version since it
186  * must maintain the new rack scoreboard.
187  *
188  */
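/*
 * In rough outline: rack_do_segment() mandates SACK support and then
 * calls the per-state handler declared below (e.g. rack_do_established()
 * or rack_do_fin_wait_1()) that matches the current connection state.
 */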
189 static int32_t rack_tlp_thresh = 1;
190 static int32_t rack_tlp_limit = 2;      /* No more than 2 TLPs without new data */
191 static int32_t rack_tlp_use_greater = 1;
192 static int32_t rack_reorder_thresh = 2;
193 static int32_t rack_reorder_fade = 60000000;    /* 0 - never fade, def 60,000,000
194                                                  * - 60 seconds */
195 static uint8_t rack_req_measurements = 1;
196 /* Attack threshold detections */
197 static uint32_t rack_highest_sack_thresh_seen = 0;
198 static uint32_t rack_highest_move_thresh_seen = 0;
199 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
200 static int32_t rack_hw_pace_extra_slots = 2;    /* 2 extra MSS time betweens */
201 static int32_t rack_hw_rate_caps = 1; /* 1; */
202 static int32_t rack_hw_rate_min = 0; /* 1500000;*/
203 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
204 static int32_t rack_hw_up_only = 1;
205 static int32_t rack_stats_gets_ms_rtt = 1;
206 static int32_t rack_prr_addbackmax = 2;
207 static int32_t rack_do_hystart = 0;
208
209 static int32_t rack_pkt_delay = 1000;
210 static int32_t rack_send_a_lot_in_prr = 1;
211 static int32_t rack_min_to = 1000;      /* Minimum timeout in microseconds */
212 static int32_t rack_verbose_logging = 0;
213 static int32_t rack_ignore_data_after_close = 1;
214 static int32_t rack_enable_shared_cwnd = 1;
215 static int32_t rack_use_cmp_acks = 1;
216 static int32_t rack_use_fsb = 1;
217 static int32_t rack_use_rfo = 1;
218 static int32_t rack_use_rsm_rfo = 1;
219 static int32_t rack_max_abc_post_recovery = 2;
220 static int32_t rack_client_low_buf = 0;
221 static int32_t rack_dsack_std_based = 0x3;      /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
222 #ifdef TCP_ACCOUNTING
223 static int32_t rack_tcp_accounting = 0;
224 #endif
225 static int32_t rack_limits_scwnd = 1;
226 static int32_t rack_enable_mqueue_for_nonpaced = 0;
227 static int32_t rack_disable_prr = 0;
228 static int32_t use_rack_rr = 1;
229 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
230 static int32_t rack_persist_min = 250000;       /* 250usec */
231 static int32_t rack_persist_max = 2000000;      /* 2 Second in usec's */
232 static int32_t rack_sack_not_required = 1;      /* set to one to allow non-sack to use rack */
233 static int32_t rack_default_init_window = 0;    /* Use system default */
234 static int32_t rack_limit_time_with_srtt = 0;
235 static int32_t rack_autosndbuf_inc = 20;        /* In percentage form */
236 static int32_t rack_enobuf_hw_boost_mult = 2;   /* How many times the hw rate we boost slot using time_between */
237 static int32_t rack_enobuf_hw_max = 12000;      /* 12 ms in usecs */
238 static int32_t rack_enobuf_hw_min = 10000;      /* 10 ms in usecs */
239 static int32_t rack_hw_rwnd_factor = 2;         /* How many max_segs the rwnd must be before we hold off sending */
240 /*
241  * Currently regular TCP has an rto_min of 30ms;
242  * the backoff goes 12 times, so that ends up
243  * being a total of 122.850 seconds before a
244  * connection is killed.
245  */
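/*
 * (Worked out, assuming the RTO doubles on each of the 12 backoffs:
 *  30 ms * (2^12 - 1) = 30 ms * 4095 = 122.850 seconds.)
 */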
246 static uint32_t rack_def_data_window = 20;
247 static uint32_t rack_goal_bdp = 2;
248 static uint32_t rack_min_srtts = 1;
249 static uint32_t rack_min_measure_usec = 0;
250 static int32_t rack_tlp_min = 10000;    /* 10ms */
251 static int32_t rack_rto_min = 30000;    /* 30,000 usec same as main freebsd */
252 static int32_t rack_rto_max = 4000000;  /* 4 seconds in usec's */
253 static const int32_t rack_free_cache = 2;
254 static int32_t rack_hptsi_segments = 40;
255 static int32_t rack_rate_sample_method = USE_RTT_LOW;
256 static int32_t rack_pace_every_seg = 0;
257 static int32_t rack_delayed_ack_time = 40000;   /* 40ms in usecs */
258 static int32_t rack_slot_reduction = 4;
259 static int32_t rack_wma_divisor = 8;            /* For WMA calculation */
260 static int32_t rack_cwnd_block_ends_measure = 0;
261 static int32_t rack_rwnd_block_ends_measure = 0;
262 static int32_t rack_def_profile = 0;
263
264 static int32_t rack_lower_cwnd_at_tlp = 0;
265 static int32_t rack_limited_retran = 0;
266 static int32_t rack_always_send_oldest = 0;
267 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
268
269 static uint16_t rack_per_of_gp_ss = 250;        /* 250 % slow-start */
270 static uint16_t rack_per_of_gp_ca = 200;        /* 200 % congestion-avoidance */
271 static uint16_t rack_per_of_gp_rec = 200;       /* 200 % of bw */
272
273 /* Probertt */
274 static uint16_t rack_per_of_gp_probertt = 60;   /* 60% of bw */
275 static uint16_t rack_per_of_gp_lowthresh = 40;  /* 40% is bottom */
276 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
277 static uint16_t rack_atexit_prtt_hbp = 130;     /* Clamp to 130% on exit prtt if highly buffered path */
278 static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */
279
280 static uint32_t rack_max_drain_wait = 2;        /* How many gp srtt's before we give up draining */
281 static uint32_t rack_must_drain = 1;            /* How many GP srtt's we *must* wait */
282 static uint32_t rack_probertt_use_min_rtt_entry = 1;    /* Use the min to calculate the goal else gp_srtt */
283 static uint32_t rack_probertt_use_min_rtt_exit = 0;
284 static uint32_t rack_probe_rtt_sets_cwnd = 0;
285 static uint32_t rack_probe_rtt_safety_val = 2000000;    /* No more than 2 sec in probe-rtt */
286 static uint32_t rack_time_between_probertt = 9600000;   /* 9.6 sec in usecs */
287 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;       /* How many srtt periods does probe-rtt last top fraction */
288 static uint32_t rack_probertt_gpsrtt_cnt_div = 0;       /* How many srtt periods does probe-rtt last bottom fraction */
289 static uint32_t rack_min_probertt_hold = 40000;         /* Equal to delayed ack time */
290 static uint32_t rack_probertt_filter_life = 10000000;
291 static uint32_t rack_probertt_lower_within = 10;
292 static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds)  to count as a lowering */
293 static int32_t rack_pace_one_seg = 0;           /* Shall we pace for less than 1.4Meg 1MSS at a time */
294 static int32_t rack_probertt_clear_is = 1;
295 static int32_t rack_max_drain_hbp = 1;          /* Extra drain times gpsrtt for highly buffered paths */
296 static int32_t rack_hbp_thresh = 3;             /* what is the divisor max_rtt/min_rtt to decide a hbp */
297
298 /* Part of pacing */
299 static int32_t rack_max_per_above = 30;         /* When we go to increment stop if above 100+this% */
300
301 /* Timely information */
302 /* Combining these two gives the range of 'no change' for b/w */
303 /* i.e. the up/down provide the upper and lower bound */
304 static int32_t rack_gp_per_bw_mul_up = 2;       /* 2% */
305 static int32_t rack_gp_per_bw_mul_down = 4;     /* 4% */
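/*
 * e.g. with the defaults above (up = 2, down = 4), a measured b/w within
 * -4% .. +2% of the previous b/w is treated as "no change".
 */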
306 static int32_t rack_gp_rtt_maxmul = 3;          /* 3 x maxmin */
307 static int32_t rack_gp_rtt_minmul = 1;          /* minrtt + (minrtt/mindiv) is lower rtt */
308 static int32_t rack_gp_rtt_mindiv = 4;          /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
309 static int32_t rack_gp_decrease_per = 20;       /* 20% decrease in multiplier */
310 static int32_t rack_gp_increase_per = 2;        /* 2% increase in multiplier */
311 static int32_t rack_per_lower_bound = 50;       /* Don't allow to drop below this multiplier */
312 static int32_t rack_per_upper_bound_ss = 0;     /* Don't allow SS to grow above this */
313 static int32_t rack_per_upper_bound_ca = 0;     /* Don't allow CA to grow above this */
314 static int32_t rack_do_dyn_mul = 0;             /* Are the rack gp multipliers dynamic */
315 static int32_t rack_gp_no_rec_chg = 1;          /* Prohibit recovery from reducing its multiplier */
316 static int32_t rack_timely_dec_clear = 6;       /* Do we clear decrement count at a value (6)? */
317 static int32_t rack_timely_max_push_rise = 3;   /* One round of pushing */
318 static int32_t rack_timely_max_push_drop = 3;   /* Three rounds of pushing */
319 static int32_t rack_timely_min_segs = 4;        /* 4 segment minimum */
320 static int32_t rack_use_max_for_nobackoff = 0;
321 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
322 static int32_t rack_timely_no_stopping = 0;
323 static int32_t rack_down_raise_thresh = 100;
324 static int32_t rack_req_segs = 1;
325 static uint64_t rack_bw_rate_cap = 0;
326
327 /* Weird delayed ack mode */
328 static int32_t rack_use_imac_dack = 0;
329 /* Rack specific counters */
330 counter_u64_t rack_badfr;
331 counter_u64_t rack_badfr_bytes;
332 counter_u64_t rack_rtm_prr_retran;
333 counter_u64_t rack_rtm_prr_newdata;
334 counter_u64_t rack_timestamp_mismatch;
335 counter_u64_t rack_reorder_seen;
336 counter_u64_t rack_paced_segments;
337 counter_u64_t rack_unpaced_segments;
338 counter_u64_t rack_calc_zero;
339 counter_u64_t rack_calc_nonzero;
340 counter_u64_t rack_saw_enobuf;
341 counter_u64_t rack_saw_enobuf_hw;
342 counter_u64_t rack_saw_enetunreach;
343 counter_u64_t rack_per_timer_hole;
344 counter_u64_t rack_large_ackcmp;
345 counter_u64_t rack_small_ackcmp;
346 #ifdef INVARIANTS
347 counter_u64_t rack_adjust_map_bw;
348 #endif
349 /* Tail loss probe counters */
350 counter_u64_t rack_tlp_tot;
351 counter_u64_t rack_tlp_newdata;
352 counter_u64_t rack_tlp_retran;
353 counter_u64_t rack_tlp_retran_bytes;
354 counter_u64_t rack_tlp_retran_fail;
355 counter_u64_t rack_to_tot;
356 counter_u64_t rack_to_arm_rack;
357 counter_u64_t rack_to_arm_tlp;
358 counter_u64_t rack_hot_alloc;
359 counter_u64_t rack_to_alloc;
360 counter_u64_t rack_to_alloc_hard;
361 counter_u64_t rack_to_alloc_emerg;
362 counter_u64_t rack_to_alloc_limited;
363 counter_u64_t rack_alloc_limited_conns;
364 counter_u64_t rack_split_limited;
365
366 #define MAX_NUM_OF_CNTS 13
367 counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
368 counter_u64_t rack_multi_single_eq;
369 counter_u64_t rack_proc_non_comp_ack;
370
371 counter_u64_t rack_fto_send;
372 counter_u64_t rack_fto_rsm_send;
373 counter_u64_t rack_nfto_resend;
374 counter_u64_t rack_non_fto_send;
375 counter_u64_t rack_extended_rfo;
376
377 counter_u64_t rack_sack_proc_all;
378 counter_u64_t rack_sack_proc_short;
379 counter_u64_t rack_sack_proc_restart;
380 counter_u64_t rack_sack_attacks_detected;
381 counter_u64_t rack_sack_attacks_reversed;
382 counter_u64_t rack_sack_used_next_merge;
383 counter_u64_t rack_sack_splits;
384 counter_u64_t rack_sack_used_prev_merge;
385 counter_u64_t rack_sack_skipped_acked;
386 counter_u64_t rack_ack_total;
387 counter_u64_t rack_express_sack;
388 counter_u64_t rack_sack_total;
389 counter_u64_t rack_move_none;
390 counter_u64_t rack_move_some;
391
392 counter_u64_t rack_used_tlpmethod;
393 counter_u64_t rack_used_tlpmethod2;
394 counter_u64_t rack_enter_tlp_calc;
395 counter_u64_t rack_input_idle_reduces;
396 counter_u64_t rack_collapsed_win;
397 counter_u64_t rack_tlp_does_nada;
398 counter_u64_t rack_try_scwnd;
399 counter_u64_t rack_hw_pace_init_fail;
400 counter_u64_t rack_hw_pace_lost;
401 counter_u64_t rack_sbsndptr_right;
402 counter_u64_t rack_sbsndptr_wrong;
403
404 /* Temp CPU counters */
405 counter_u64_t rack_find_high;
406
407 counter_u64_t rack_progress_drops;
408 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
409 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
410
411
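/*
 * RACK_REXMTVAL() computes the retransmit timeout as srtt + 4 * rttvar,
 * floored at rack_rto_min.  RACK_TCPT_RANGESET() adds "slop" to a value
 * and clamps the result into [tvmin, tvmax]; for example (illustrative
 * only) RACK_TCPT_RANGESET(to, RACK_REXMTVAL(tp), rack_rto_min,
 * rack_rto_max, 0) bounds the RTO between rack_rto_min and rack_rto_max.
 */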
412 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
413
414 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {  \
415         (tv) = (value) + slop;   \
416         if ((u_long)(tv) < (u_long)(tvmin)) \
417                 (tv) = (tvmin); \
418         if ((u_long)(tv) > (u_long)(tvmax)) \
419                 (tv) = (tvmax); \
420 } while (0)
421
422 static void
423 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
424
425 static int
426 rack_process_ack(struct mbuf *m, struct tcphdr *th,
427     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
428     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
429 static int
430 rack_process_data(struct mbuf *m, struct tcphdr *th,
431     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
432     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
433 static void
434 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
435    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
436 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
437 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
438     uint8_t limit_type);
439 static struct rack_sendmap *
440 rack_check_recovery_mode(struct tcpcb *tp,
441     uint32_t tsused);
442 static void
443 rack_cong_signal(struct tcpcb *tp,
444                  uint32_t type, uint32_t ack);
445 static void rack_counter_destroy(void);
446 static int
447 rack_ctloutput(struct socket *so, struct sockopt *sopt,
448     struct inpcb *inp, struct tcpcb *tp);
449 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
450 static void
451 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
452 static void
453 rack_do_segment(struct mbuf *m, struct tcphdr *th,
454     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
455     uint8_t iptos);
456 static void rack_dtor(void *mem, int32_t size, void *arg);
457 static void
458 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
459     uint32_t flex1, uint32_t flex2,
460     uint32_t flex3, uint32_t flex4,
461     uint32_t flex5, uint32_t flex6,
462     uint16_t flex7, uint8_t mod);
463
464 static void
465 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
466    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
467    struct rack_sendmap *rsm, uint8_t quality);
468 static struct rack_sendmap *
469 rack_find_high_nonack(struct tcp_rack *rack,
470     struct rack_sendmap *rsm);
471 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
472 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
473 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
474 static int
475 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
476     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
477 static void
478 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
479                             tcp_seq th_ack, int line, uint8_t quality);
480 static uint32_t
481 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
482 static int32_t rack_handoff_ok(struct tcpcb *tp);
483 static int32_t rack_init(struct tcpcb *tp);
484 static void rack_init_sysctls(void);
485 static void
486 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
487     struct tcphdr *th, int entered_rec, int dup_ack_struck);
488 static void
489 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
490     uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
491     struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
492
493 static void
494 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
495     struct rack_sendmap *rsm);
496 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
497 static int32_t rack_output(struct tcpcb *tp);
498
499 static uint32_t
500 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
501     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
502     uint32_t cts, int *moved_two);
503 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
504 static void rack_remxt_tmr(struct tcpcb *tp);
505 static int
506 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
507     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
508 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
509 static int32_t rack_stopall(struct tcpcb *tp);
510 static void
511 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
512     uint32_t delta);
513 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
514 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
515 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
516 static uint32_t
517 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
518     struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
519 static void
520 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
521     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
522 static int
523 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
524     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
525 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
526 static int
527 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
528     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
529     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
530 static int
531 rack_do_closing(struct mbuf *m, struct tcphdr *th,
532     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
533     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
534 static int
535 rack_do_established(struct mbuf *m, struct tcphdr *th,
536     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
537     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
538 static int
539 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
540     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
541     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
542 static int
543 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
544     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
545     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
546 static int
547 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
548     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
549     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
550 static int
551 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
552     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
553     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
554 static int
555 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
556     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
557     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
558 static int
559 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
560     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
561     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
562 struct rack_sendmap *
563 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
564     uint32_t tsused);
565 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
566     uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
567 static void
568      tcp_rack_partialack(struct tcpcb *tp);
569 static int
570 rack_set_profile(struct tcp_rack *rack, int prof);
571 static void
572 rack_apply_deferred_options(struct tcp_rack *rack);
573
574 int32_t rack_clear_counter=0;
575
576 static void
577 rack_set_cc_pacing(struct tcp_rack *rack)
578 {
579         struct sockopt sopt;
580         struct cc_newreno_opts opt;
581         struct newreno old, *ptr;
582         struct tcpcb *tp;
583         int error;
584
585         if (rack->rc_pacing_cc_set)
586                 return;
587
588         tp = rack->rc_tp;
589         if (tp->cc_algo == NULL) {
590                 /* Tcb is leaving */
591                 printf("No cc algorithm?\n");
592                 return;
593         }
594         rack->rc_pacing_cc_set = 1;
595         if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
596                 /* Not new-reno we can't play games with beta! */
597                 goto out;
598         }
599         ptr = ((struct newreno *)tp->ccv->cc_data);
600         if (CC_ALGO(tp)->ctl_output == NULL)  {
601                 /* Huh, why does new_reno no longer have a set function? */
602                 printf("no ctl_output for algo:%s\n", tp->cc_algo->name);
603                 goto out;
604         }
605         if (ptr == NULL) {
606                 /* Just the default values */
607                 old.beta = V_newreno_beta;
608                 old.beta_ecn = V_newreno_beta_ecn;
609                 old.newreno_flags = 0;
610         } else {
611                 old.beta = ptr->beta;
612                 old.beta_ecn = ptr->beta_ecn;
613                 old.newreno_flags = ptr->newreno_flags;
614         }
615         sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
616         sopt.sopt_dir = SOPT_SET;
617         opt.name = CC_NEWRENO_BETA;
618         opt.val = rack->r_ctl.rc_saved_beta.beta;
619         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
620         if (error)  {
621                 printf("Error returned by ctl_output %d\n", error);
622                 goto out;
623         }
624         /*
625          * Hack alert: we need to set our newreno_flags
626          * so that ABE behavior is also applied.
627          */
628         ((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
629         opt.name = CC_NEWRENO_BETA_ECN;
630         opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
631         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
632         if (error) {
633                 printf("Error returned by ctl_output %d\n", error);
634                 goto out;
635         }
636         /* Save off the original values for restoral */
637         memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
638 out:
639         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
640                 union tcp_log_stackspecific log;
641                 struct timeval tv;
642
643                 ptr = ((struct newreno *)tp->ccv->cc_data);
644                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
645                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
646                 if (ptr) {
647                         log.u_bbr.flex1 = ptr->beta;
648                         log.u_bbr.flex2 = ptr->beta_ecn;
649                         log.u_bbr.flex3 = ptr->newreno_flags;
650                 }
651                 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
652                 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
653                 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
654                 log.u_bbr.flex7 = rack->gp_ready;
655                 log.u_bbr.flex7 <<= 1;
656                 log.u_bbr.flex7 |= rack->use_fixed_rate;
657                 log.u_bbr.flex7 <<= 1;
658                 log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
659                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
660                 log.u_bbr.flex8 = 3;
661                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
662                                0, &log, false, NULL, NULL, 0, &tv);
663         }
664 }
665
666 static void
667 rack_undo_cc_pacing(struct tcp_rack *rack)
668 {
669         struct newreno old, *ptr;
670         struct tcpcb *tp;
671
672         if (rack->rc_pacing_cc_set == 0)
673                 return;
674         tp = rack->rc_tp;
675         rack->rc_pacing_cc_set = 0;
676         if (tp->cc_algo == NULL)
677                 /* Tcb is leaving */
678                 return;
679         if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
680                 /* Not new-reno nothing to do! */
681                 return;
682         }
683         ptr = ((struct newreno *)tp->ccv->cc_data);
684         if (ptr == NULL) {
685                 /*
686                  * This happens at rack_fini() if the
687                  * cc module gets freed on us. In that
688                  * case we lose our "new" settings but
689                  * that's ok, since the tcb is going away anyway.
690                  */
691                 return;
692         }
693         /* Grab out our set values */
694         memcpy(&old, ptr, sizeof(struct newreno));
695         /* Copy back in the original values */
696         memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
697         /* Now save back the values we had set in (for when pacing is restored) */
698         memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
699         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
700                 union tcp_log_stackspecific log;
701                 struct timeval tv;
702
703                 ptr = ((struct newreno *)tp->ccv->cc_data);
704                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
705                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
706                 log.u_bbr.flex1 = ptr->beta;
707                 log.u_bbr.flex2 = ptr->beta_ecn;
708                 log.u_bbr.flex3 = ptr->newreno_flags;
709                 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
710                 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
711                 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
712                 log.u_bbr.flex7 = rack->gp_ready;
713                 log.u_bbr.flex7 <<= 1;
714                 log.u_bbr.flex7 |= rack->use_fixed_rate;
715                 log.u_bbr.flex7 <<= 1;
716                 log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
717                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
718                 log.u_bbr.flex8 = 4;
719                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
720                                0, &log, false, NULL, NULL, 0, &tv);
721         }
722 }
723
724 #ifdef NETFLIX_PEAKRATE
725 static inline void
726 rack_update_peakrate_thr(struct tcpcb *tp)
727 {
728         /* Keep in mind that t_maxpeakrate is in B/s. */
729         uint64_t peak;
730         peak = uqmax((tp->t_maxseg * 2),
731                      (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
732         tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
733 }
734 #endif
735
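/*
 * Sysctl handler: writing a value of 1 zeroes all of the RACK counters
 * defined above; a read simply returns rack_clear_counter.
 */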
736 static int
737 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
738 {
739         uint32_t stat;
740         int32_t error;
741         int i;
742
743         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
744         if (error || req->newptr == NULL)
745                 return error;
746
747         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
748         if (error)
749                 return (error);
750         if (stat == 1) {
751 #ifdef INVARIANTS
752                 printf("Clearing RACK counters\n");
753 #endif
754                 counter_u64_zero(rack_badfr);
755                 counter_u64_zero(rack_badfr_bytes);
756                 counter_u64_zero(rack_rtm_prr_retran);
757                 counter_u64_zero(rack_rtm_prr_newdata);
758                 counter_u64_zero(rack_timestamp_mismatch);
759                 counter_u64_zero(rack_reorder_seen);
760                 counter_u64_zero(rack_tlp_tot);
761                 counter_u64_zero(rack_tlp_newdata);
762                 counter_u64_zero(rack_tlp_retran);
763                 counter_u64_zero(rack_tlp_retran_bytes);
764                 counter_u64_zero(rack_tlp_retran_fail);
765                 counter_u64_zero(rack_to_tot);
766                 counter_u64_zero(rack_to_arm_rack);
767                 counter_u64_zero(rack_to_arm_tlp);
768                 counter_u64_zero(rack_paced_segments);
769                 counter_u64_zero(rack_calc_zero);
770                 counter_u64_zero(rack_calc_nonzero);
771                 counter_u64_zero(rack_unpaced_segments);
772                 counter_u64_zero(rack_saw_enobuf);
773                 counter_u64_zero(rack_saw_enobuf_hw);
774                 counter_u64_zero(rack_saw_enetunreach);
775                 counter_u64_zero(rack_per_timer_hole);
776                 counter_u64_zero(rack_large_ackcmp);
777                 counter_u64_zero(rack_small_ackcmp);
778 #ifdef INVARIANTS
779                 counter_u64_zero(rack_adjust_map_bw);
780 #endif
781                 counter_u64_zero(rack_to_alloc_hard);
782                 counter_u64_zero(rack_to_alloc_emerg);
783                 counter_u64_zero(rack_sack_proc_all);
784                 counter_u64_zero(rack_fto_send);
785                 counter_u64_zero(rack_fto_rsm_send);
786                 counter_u64_zero(rack_extended_rfo);
787                 counter_u64_zero(rack_hw_pace_init_fail);
788                 counter_u64_zero(rack_hw_pace_lost);
789                 counter_u64_zero(rack_sbsndptr_wrong);
790                 counter_u64_zero(rack_sbsndptr_right);
791                 counter_u64_zero(rack_non_fto_send);
792                 counter_u64_zero(rack_nfto_resend);
793                 counter_u64_zero(rack_sack_proc_short);
794                 counter_u64_zero(rack_sack_proc_restart);
795                 counter_u64_zero(rack_to_alloc);
796                 counter_u64_zero(rack_to_alloc_limited);
797                 counter_u64_zero(rack_alloc_limited_conns);
798                 counter_u64_zero(rack_split_limited);
799                 for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
800                         counter_u64_zero(rack_proc_comp_ack[i]);
801                 }
802                 counter_u64_zero(rack_multi_single_eq);
803                 counter_u64_zero(rack_proc_non_comp_ack);
804                 counter_u64_zero(rack_find_high);
805                 counter_u64_zero(rack_sack_attacks_detected);
806                 counter_u64_zero(rack_sack_attacks_reversed);
807                 counter_u64_zero(rack_sack_used_next_merge);
808                 counter_u64_zero(rack_sack_used_prev_merge);
809                 counter_u64_zero(rack_sack_splits);
810                 counter_u64_zero(rack_sack_skipped_acked);
811                 counter_u64_zero(rack_ack_total);
812                 counter_u64_zero(rack_express_sack);
813                 counter_u64_zero(rack_sack_total);
814                 counter_u64_zero(rack_move_none);
815                 counter_u64_zero(rack_move_some);
816                 counter_u64_zero(rack_used_tlpmethod);
817                 counter_u64_zero(rack_used_tlpmethod2);
818                 counter_u64_zero(rack_enter_tlp_calc);
819                 counter_u64_zero(rack_progress_drops);
820                 counter_u64_zero(rack_tlp_does_nada);
821                 counter_u64_zero(rack_try_scwnd);
822                 counter_u64_zero(rack_collapsed_win);
823         }
824         rack_clear_counter = 0;
825         return (0);
826 }
827
828 static void
829 rack_init_sysctls(void)
830 {
831         int i;
832         struct sysctl_oid *rack_counters;
833         struct sysctl_oid *rack_attack;
834         struct sysctl_oid *rack_pacing;
835         struct sysctl_oid *rack_timely;
836         struct sysctl_oid *rack_timers;
837         struct sysctl_oid *rack_tlp;
838         struct sysctl_oid *rack_misc;
839         struct sysctl_oid *rack_features;
840         struct sysctl_oid *rack_measure;
841         struct sysctl_oid *rack_probertt;
842         struct sysctl_oid *rack_hw_pacing;
843
844         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
845             SYSCTL_CHILDREN(rack_sysctl_root),
846             OID_AUTO,
847             "sack_attack",
848             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
849             "Rack Sack Attack Counters and Controls");
850         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
851             SYSCTL_CHILDREN(rack_sysctl_root),
852             OID_AUTO,
853             "stats",
854             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
855             "Rack Counters");
856         SYSCTL_ADD_S32(&rack_sysctl_ctx,
857             SYSCTL_CHILDREN(rack_sysctl_root),
858             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
859             &rack_rate_sample_method , USE_RTT_LOW,
860             "What method should we use for rate sampling 0=high, 1=low ");
861         /* Probe rtt related controls */
862         rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
863             SYSCTL_CHILDREN(rack_sysctl_root),
864             OID_AUTO,
865             "probertt",
866             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
867             "ProbeRTT related Controls");
868         SYSCTL_ADD_U16(&rack_sysctl_ctx,
869             SYSCTL_CHILDREN(rack_probertt),
870             OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
871             &rack_atexit_prtt_hbp, 130,
872             "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
873         SYSCTL_ADD_U16(&rack_sysctl_ctx,
874             SYSCTL_CHILDREN(rack_probertt),
875             OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
876             &rack_atexit_prtt, 130,
877             "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
878         SYSCTL_ADD_U16(&rack_sysctl_ctx,
879             SYSCTL_CHILDREN(rack_probertt),
880             OID_AUTO, "gp_per_mul", CTLFLAG_RW,
881             &rack_per_of_gp_probertt, 60,
882             "What percentage of goodput do we pace at in probertt");
883         SYSCTL_ADD_U16(&rack_sysctl_ctx,
884             SYSCTL_CHILDREN(rack_probertt),
885             OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
886             &rack_per_of_gp_probertt_reduce, 10,
887             "What percentage of goodput do we reduce every gp_srtt");
888         SYSCTL_ADD_U16(&rack_sysctl_ctx,
889             SYSCTL_CHILDREN(rack_probertt),
890             OID_AUTO, "gp_per_low", CTLFLAG_RW,
891             &rack_per_of_gp_lowthresh, 40,
892             "What percentage of goodput do we allow the multiplier to fall to");
893         SYSCTL_ADD_U32(&rack_sysctl_ctx,
894             SYSCTL_CHILDREN(rack_probertt),
895             OID_AUTO, "time_between", CTLFLAG_RW,
896             & rack_time_between_probertt, 96000000,
897             "How many useconds between the lowest rtt falling must past before we enter probertt");
898         SYSCTL_ADD_U32(&rack_sysctl_ctx,
899             SYSCTL_CHILDREN(rack_probertt),
900             OID_AUTO, "safety", CTLFLAG_RW,
901             &rack_probe_rtt_safety_val, 2000000,
902             "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
903         SYSCTL_ADD_U32(&rack_sysctl_ctx,
904             SYSCTL_CHILDREN(rack_probertt),
905             OID_AUTO, "sets_cwnd", CTLFLAG_RW,
906             &rack_probe_rtt_sets_cwnd, 0,
907             "Do we set the cwnd too (if always_lower is on)");
908         SYSCTL_ADD_U32(&rack_sysctl_ctx,
909             SYSCTL_CHILDREN(rack_probertt),
910             OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
911             &rack_max_drain_wait, 2,
912             "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
913         SYSCTL_ADD_U32(&rack_sysctl_ctx,
914             SYSCTL_CHILDREN(rack_probertt),
915             OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
916             &rack_must_drain, 1,
917             "We must drain this many gp_srtt's waiting for flight to reach goal");
918         SYSCTL_ADD_U32(&rack_sysctl_ctx,
919             SYSCTL_CHILDREN(rack_probertt),
920             OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
921             &rack_probertt_use_min_rtt_entry, 1,
922             "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
923         SYSCTL_ADD_U32(&rack_sysctl_ctx,
924             SYSCTL_CHILDREN(rack_probertt),
925             OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
926             &rack_probertt_use_min_rtt_exit, 0,
927             "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
928         SYSCTL_ADD_U32(&rack_sysctl_ctx,
929             SYSCTL_CHILDREN(rack_probertt),
930             OID_AUTO, "length_div", CTLFLAG_RW,
931             &rack_probertt_gpsrtt_cnt_div, 0,
932             "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
933         SYSCTL_ADD_U32(&rack_sysctl_ctx,
934             SYSCTL_CHILDREN(rack_probertt),
935             OID_AUTO, "length_mul", CTLFLAG_RW,
936             &rack_probertt_gpsrtt_cnt_mul, 0,
937             "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
938         SYSCTL_ADD_U32(&rack_sysctl_ctx,
939             SYSCTL_CHILDREN(rack_probertt),
940             OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
941             &rack_min_probertt_hold, 200000,
942             "What is the minimum time we hold probertt at target");
943         SYSCTL_ADD_U32(&rack_sysctl_ctx,
944             SYSCTL_CHILDREN(rack_probertt),
945             OID_AUTO, "filter_life", CTLFLAG_RW,
946             &rack_probertt_filter_life, 10000000,
947             "What is the time for the filters life in useconds");
948         SYSCTL_ADD_U32(&rack_sysctl_ctx,
949             SYSCTL_CHILDREN(rack_probertt),
950             OID_AUTO, "lower_within", CTLFLAG_RW,
951             &rack_probertt_lower_within, 10,
952             "If the rtt goes lower within this percentage of the time, go into probe-rtt");
953         SYSCTL_ADD_U32(&rack_sysctl_ctx,
954             SYSCTL_CHILDREN(rack_probertt),
955             OID_AUTO, "must_move", CTLFLAG_RW,
956             &rack_min_rtt_movement, 250,
957             "How much is the minimum movement in rtt to count as a drop for probertt purposes");
958         SYSCTL_ADD_U32(&rack_sysctl_ctx,
959             SYSCTL_CHILDREN(rack_probertt),
960             OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
961             &rack_probertt_clear_is, 1,
962             "Do we clear I/S counts on exiting probe-rtt");
963         SYSCTL_ADD_S32(&rack_sysctl_ctx,
964             SYSCTL_CHILDREN(rack_probertt),
965             OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
966             &rack_max_drain_hbp, 1,
967             "How many extra drain gpsrtt's do we get in highly buffered paths");
968         SYSCTL_ADD_S32(&rack_sysctl_ctx,
969             SYSCTL_CHILDREN(rack_probertt),
970             OID_AUTO, "hbp_threshold", CTLFLAG_RW,
971             &rack_hbp_thresh, 3,
972             "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
973         /* Pacing related sysctls */
974         rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
975             SYSCTL_CHILDREN(rack_sysctl_root),
976             OID_AUTO,
977             "pacing",
978             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
979             "Pacing related Controls");
980         SYSCTL_ADD_S32(&rack_sysctl_ctx,
981             SYSCTL_CHILDREN(rack_pacing),
982             OID_AUTO, "max_pace_over", CTLFLAG_RW,
983             &rack_max_per_above, 30,
984             "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
985         SYSCTL_ADD_S32(&rack_sysctl_ctx,
986             SYSCTL_CHILDREN(rack_pacing),
987             OID_AUTO, "pace_to_one", CTLFLAG_RW,
988             &rack_pace_one_seg, 0,
989             "Do we allow low b/w pacing of 1MSS instead of two");
990         SYSCTL_ADD_S32(&rack_sysctl_ctx,
991             SYSCTL_CHILDREN(rack_pacing),
992             OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
993             &rack_limit_time_with_srtt, 0,
994             "Do we limit pacing time based on srtt");
995         SYSCTL_ADD_S32(&rack_sysctl_ctx,
996             SYSCTL_CHILDREN(rack_pacing),
997             OID_AUTO, "init_win", CTLFLAG_RW,
998             &rack_default_init_window, 0,
999             "Do we have a rack initial window 0 = system default");
1000         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1001             SYSCTL_CHILDREN(rack_pacing),
1002             OID_AUTO, "gp_per_ss", CTLFLAG_RW,
1003             &rack_per_of_gp_ss, 250,
1004             "If non zero, what percentage of goodput to pace at in slow start");
1005         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1006             SYSCTL_CHILDREN(rack_pacing),
1007             OID_AUTO, "gp_per_ca", CTLFLAG_RW,
1008             &rack_per_of_gp_ca, 150,
1009             "If non zero, what percentage of goodput to pace at in congestion avoidance");
1010         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1011             SYSCTL_CHILDREN(rack_pacing),
1012             OID_AUTO, "gp_per_rec", CTLFLAG_RW,
1013             &rack_per_of_gp_rec, 200,
1014             "If non zero, what percentage of goodput to pace at in recovery");
1015         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1016             SYSCTL_CHILDREN(rack_pacing),
1017             OID_AUTO, "pace_max_seg", CTLFLAG_RW,
1018             &rack_hptsi_segments, 40,
1019             "What size is the max for TSO segments in pacing and burst mitigation");
1020         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1021             SYSCTL_CHILDREN(rack_pacing),
1022             OID_AUTO, "burst_reduces", CTLFLAG_RW,
1023             &rack_slot_reduction, 4,
1024             "When doing only burst mitigation what is the reduce divisor");
1025         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1026             SYSCTL_CHILDREN(rack_sysctl_root),
1027             OID_AUTO, "use_pacing", CTLFLAG_RW,
1028             &rack_pace_every_seg, 0,
1029             "If set we use pacing, if clear we use only the original burst mitigation");
1030         SYSCTL_ADD_U64(&rack_sysctl_ctx,
1031             SYSCTL_CHILDREN(rack_pacing),
1032             OID_AUTO, "rate_cap", CTLFLAG_RW,
1033             &rack_bw_rate_cap, 0,
1034             "If set we apply this value to the absolute rate cap used by pacing");
1035         SYSCTL_ADD_U8(&rack_sysctl_ctx,
1036             SYSCTL_CHILDREN(rack_sysctl_root),
1037             OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
1038             &rack_req_measurements, 1,
1039             "If doing dynamic pacing, how many measurements must be in before we start pacing?");
1040         /* Hardware pacing */
1041         rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1042             SYSCTL_CHILDREN(rack_sysctl_root),
1043             OID_AUTO,
1044             "hdwr_pacing",
1045             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1046             "Pacing related Controls");
1047         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1048             SYSCTL_CHILDREN(rack_hw_pacing),
1049             OID_AUTO, "rwnd_factor", CTLFLAG_RW,
1050             &rack_hw_rwnd_factor, 2,
1051             "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
1052         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1053             SYSCTL_CHILDREN(rack_hw_pacing),
1054             OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
1055             &rack_enobuf_hw_boost_mult, 2,
1056             "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
1057         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1058             SYSCTL_CHILDREN(rack_hw_pacing),
1059             OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
1060             &rack_enobuf_hw_max, 2,
1061             "What is the max boost the pacing time if we see a ENOBUFS?");
1062         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1063             SYSCTL_CHILDREN(rack_hw_pacing),
1064             OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
1065             &rack_enobuf_hw_min, 2,
1066             "What is the min boost the pacing time if we see a ENOBUFS?");
1067         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1068             SYSCTL_CHILDREN(rack_hw_pacing),
1069             OID_AUTO, "enable", CTLFLAG_RW,
1070             &rack_enable_hw_pacing, 0,
1071             "Should RACK attempt to use hw pacing?");
1072         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1073             SYSCTL_CHILDREN(rack_hw_pacing),
1074             OID_AUTO, "rate_cap", CTLFLAG_RW,
1075             &rack_hw_rate_caps, 1,
1076             "Does the highest hardware pacing rate cap the rate we will send at??");
1077         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1078             SYSCTL_CHILDREN(rack_hw_pacing),
1079             OID_AUTO, "rate_min", CTLFLAG_RW,
1080             &rack_hw_rate_min, 0,
1081             "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
1082         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1083             SYSCTL_CHILDREN(rack_hw_pacing),
1084             OID_AUTO, "rate_to_low", CTLFLAG_RW,
1085             &rack_hw_rate_to_low, 0,
1086             "If we fall below this rate, dis-engage hw pacing?");
1087         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1088             SYSCTL_CHILDREN(rack_hw_pacing),
1089             OID_AUTO, "up_only", CTLFLAG_RW,
1090             &rack_hw_up_only, 1,
1091             "Do we allow hw pacing to lower the rate selected?");
1092         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1093             SYSCTL_CHILDREN(rack_hw_pacing),
1094             OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
1095             &rack_hw_pace_extra_slots, 2,
1096             "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
1097         rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1098             SYSCTL_CHILDREN(rack_sysctl_root),
1099             OID_AUTO,
1100             "timely",
1101             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1102             "Rack Timely RTT Controls");
1103         /* Timely based GP dynamics */
1104         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1105             SYSCTL_CHILDREN(rack_timely),
1106             OID_AUTO, "upper", CTLFLAG_RW,
1107             &rack_gp_per_bw_mul_up, 2,
1108             "Rack timely upper range for equal b/w (in percentage)");
1109         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1110             SYSCTL_CHILDREN(rack_timely),
1111             OID_AUTO, "lower", CTLFLAG_RW,
1112             &rack_gp_per_bw_mul_down, 4,
1113             "Rack timely lower range for equal b/w (in percentage)");
1114         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1115             SYSCTL_CHILDREN(rack_timely),
1116             OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
1117             &rack_gp_rtt_maxmul, 3,
1118             "Rack timely multipler of lowest rtt for rtt_max");
1119         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1120             SYSCTL_CHILDREN(rack_timely),
1121             OID_AUTO, "rtt_min_div", CTLFLAG_RW,
1122             &rack_gp_rtt_mindiv, 4,
1123             "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
1124         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1125             SYSCTL_CHILDREN(rack_timely),
1126             OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
1127             &rack_gp_rtt_minmul, 1,
1128             "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
1129         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1130             SYSCTL_CHILDREN(rack_timely),
1131             OID_AUTO, "decrease", CTLFLAG_RW,
1132             &rack_gp_decrease_per, 20,
1133             "Rack timely decrease percentage of our GP multiplication factor");
1134         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1135             SYSCTL_CHILDREN(rack_timely),
1136             OID_AUTO, "increase", CTLFLAG_RW,
1137             &rack_gp_increase_per, 2,
1138             "Rack timely increase perentage of our GP multiplication factor");
1139         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1140             SYSCTL_CHILDREN(rack_timely),
1141             OID_AUTO, "lowerbound", CTLFLAG_RW,
1142             &rack_per_lower_bound, 50,
1143             "Rack timely lowest percentage we allow GP multiplier to fall to");
1144         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1145             SYSCTL_CHILDREN(rack_timely),
1146             OID_AUTO, "upperboundss", CTLFLAG_RW,
1147             &rack_per_upper_bound_ss, 0,
1148             "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
1149         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1150             SYSCTL_CHILDREN(rack_timely),
1151             OID_AUTO, "upperboundca", CTLFLAG_RW,
1152             &rack_per_upper_bound_ca, 0,
1153             "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
1154         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1155             SYSCTL_CHILDREN(rack_timely),
1156             OID_AUTO, "dynamicgp", CTLFLAG_RW,
1157             &rack_do_dyn_mul, 0,
1158             "Rack timely do we enable dynmaic timely goodput by default");
1159         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1160             SYSCTL_CHILDREN(rack_timely),
1161             OID_AUTO, "no_rec_red", CTLFLAG_RW,
1162             &rack_gp_no_rec_chg, 1,
1163             "Rack timely do we prohibit the recovery multiplier from being lowered");
1164         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1165             SYSCTL_CHILDREN(rack_timely),
1166             OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
1167             &rack_timely_dec_clear, 6,
1168             "Rack timely what threshold do we count to before another boost during b/w decent");
1169         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1170             SYSCTL_CHILDREN(rack_timely),
1171             OID_AUTO, "max_push_rise", CTLFLAG_RW,
1172             &rack_timely_max_push_rise, 3,
1173             "Rack timely how many times do we push up with b/w increase");
1174         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1175             SYSCTL_CHILDREN(rack_timely),
1176             OID_AUTO, "max_push_drop", CTLFLAG_RW,
1177             &rack_timely_max_push_drop, 3,
1178             "Rack timely how many times do we push back on b/w descent");
1179         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1180             SYSCTL_CHILDREN(rack_timely),
1181             OID_AUTO, "min_segs", CTLFLAG_RW,
1182             &rack_timely_min_segs, 4,
1183             "Rack timely when setting the cwnd what is the min num segments");
1184         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1185             SYSCTL_CHILDREN(rack_timely),
1186             OID_AUTO, "noback_max", CTLFLAG_RW,
1187             &rack_use_max_for_nobackoff, 0,
1188             "Rack timely when deciding whether to back off on a loss, do we use under max rtt else min");
1189         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1190             SYSCTL_CHILDREN(rack_timely),
1191             OID_AUTO, "interim_timely_only", CTLFLAG_RW,
1192             &rack_timely_int_timely_only, 0,
1193             "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
1194         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1195             SYSCTL_CHILDREN(rack_timely),
1196             OID_AUTO, "nonstop", CTLFLAG_RW,
1197             &rack_timely_no_stopping, 0,
1198             "Rack timely don't stop increase");
1199         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1200             SYSCTL_CHILDREN(rack_timely),
1201             OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
1202             &rack_down_raise_thresh, 100,
1203             "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
1204         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1205             SYSCTL_CHILDREN(rack_timely),
1206             OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
1207             &rack_req_segs, 1,
1208             "Bottom dragging if fewer than this many segments are outstanding and there is room");
1209
1210         /* TLP and Rack related parameters */
1211         rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1212             SYSCTL_CHILDREN(rack_sysctl_root),
1213             OID_AUTO,
1214             "tlp",
1215             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1216             "TLP and Rack related Controls");
1217         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1218             SYSCTL_CHILDREN(rack_tlp),
1219             OID_AUTO, "use_rrr", CTLFLAG_RW,
1220             &use_rack_rr, 1,
1221             "Do we use Rack Rapid Recovery");
1222         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1223             SYSCTL_CHILDREN(rack_tlp),
1224             OID_AUTO, "post_rec_labc", CTLFLAG_RW,
1225             &rack_max_abc_post_recovery, 2,
1226             "Since we do early recovery, do we override the l_abc to a value, if so what?");
1227         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1228             SYSCTL_CHILDREN(rack_tlp),
1229             OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
1230             &rack_non_rxt_use_cr, 0,
1231             "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
1232         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1233             SYSCTL_CHILDREN(rack_tlp),
1234             OID_AUTO, "tlpmethod", CTLFLAG_RW,
1235             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
1236             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
1237         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1238             SYSCTL_CHILDREN(rack_tlp),
1239             OID_AUTO, "limit", CTLFLAG_RW,
1240             &rack_tlp_limit, 2,
1241             "How many TLP's can be sent without sending new data");
1242         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1243             SYSCTL_CHILDREN(rack_tlp),
1244             OID_AUTO, "use_greater", CTLFLAG_RW,
1245             &rack_tlp_use_greater, 1,
1246             "Should we use the rack_rtt time if it's greater than srtt");
1247         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1248             SYSCTL_CHILDREN(rack_tlp),
1249             OID_AUTO, "tlpminto", CTLFLAG_RW,
1250             &rack_tlp_min, 10000,
1251             "TLP minimum timeout per the specification (in microseconds)");
1252         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1253             SYSCTL_CHILDREN(rack_tlp),
1254             OID_AUTO, "send_oldest", CTLFLAG_RW,
1255             &rack_always_send_oldest, 0,
1256             "Should we always send the oldest TLP and RACK-TLP");
1257         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1258             SYSCTL_CHILDREN(rack_tlp),
1259             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
1260             &rack_limited_retran, 0,
1261             "How many times can a rack timeout drive out sends");
1262         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1263             SYSCTL_CHILDREN(rack_tlp),
1264             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
1265             &rack_lower_cwnd_at_tlp, 0,
1266             "When a TLP completes a retran should we enter recovery");
1267         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1268             SYSCTL_CHILDREN(rack_tlp),
1269             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1270             &rack_reorder_thresh, 2,
1271             "What factor for rack will be added when seeing reordering (shift right)");
1272         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1273             SYSCTL_CHILDREN(rack_tlp),
1274             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1275             &rack_tlp_thresh, 1,
1276             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1277         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1278             SYSCTL_CHILDREN(rack_tlp),
1279             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1280             &rack_reorder_fade, 60000000,
1281             "Does reorder detection fade, if so how many microseconds (0 means never)");
1282         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1283             SYSCTL_CHILDREN(rack_tlp),
1284             OID_AUTO, "pktdelay", CTLFLAG_RW,
1285             &rack_pkt_delay, 1000,
1286             "Extra RACK time (in microseconds) besides reordering thresh");
1287
1288         /* Timer related controls */
1289         rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1290             SYSCTL_CHILDREN(rack_sysctl_root),
1291             OID_AUTO,
1292             "timers",
1293             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1294             "Timer related controls");
1295         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1296             SYSCTL_CHILDREN(rack_timers),
1297             OID_AUTO, "persmin", CTLFLAG_RW,
1298             &rack_persist_min, 250000,
1299             "What is the minimum time in microseconds between persists");
1300         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1301             SYSCTL_CHILDREN(rack_timers),
1302             OID_AUTO, "persmax", CTLFLAG_RW,
1303             &rack_persist_max, 2000000,
1304             "What is the largest delay in microseconds between persists");
1305         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1306             SYSCTL_CHILDREN(rack_timers),
1307             OID_AUTO, "delayed_ack", CTLFLAG_RW,
1308             &rack_delayed_ack_time, 40000,
1309             "Delayed ack time (40ms in microseconds)");
1310         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1311             SYSCTL_CHILDREN(rack_timers),
1312             OID_AUTO, "minrto", CTLFLAG_RW,
1313             &rack_rto_min, 30000,
1314             "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
1315         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1316             SYSCTL_CHILDREN(rack_timers),
1317             OID_AUTO, "maxrto", CTLFLAG_RW,
1318             &rack_rto_max, 4000000,
1319             "Maximum RTO in microseconds -- should be at least as large as min_rto");
1320         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1321             SYSCTL_CHILDREN(rack_timers),
1322             OID_AUTO, "minto", CTLFLAG_RW,
1323             &rack_min_to, 1000,
1324             "Minimum rack timeout in microseconds");
1325         /* Measure controls */
1326         rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1327             SYSCTL_CHILDREN(rack_sysctl_root),
1328             OID_AUTO,
1329             "measure",
1330             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1331             "Measure related controls");
1332         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1333             SYSCTL_CHILDREN(rack_measure),
1334             OID_AUTO, "wma_divisor", CTLFLAG_RW,
1335             &rack_wma_divisor, 8,
1336             "When doing b/w calculation what is the divisor for the WMA");
1337         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1338             SYSCTL_CHILDREN(rack_measure),
1339             OID_AUTO, "end_cwnd", CTLFLAG_RW,
1340             &rack_cwnd_block_ends_measure, 0,
1341             "Does a cwnd just-return end the measurement window (app limited)");
1342         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1343             SYSCTL_CHILDREN(rack_measure),
1344             OID_AUTO, "end_rwnd", CTLFLAG_RW,
1345             &rack_rwnd_block_ends_measure, 0,
1346             "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1347         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1348             SYSCTL_CHILDREN(rack_measure),
1349             OID_AUTO, "min_target", CTLFLAG_RW,
1350             &rack_def_data_window, 20,
1351             "What is the minimum target window (in mss) for a GP measurement");
1352         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1353             SYSCTL_CHILDREN(rack_measure),
1354             OID_AUTO, "goal_bdp", CTLFLAG_RW,
1355             &rack_goal_bdp, 2,
1356             "What is the goal BDP to measure");
1357         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1358             SYSCTL_CHILDREN(rack_measure),
1359             OID_AUTO, "min_srtts", CTLFLAG_RW,
1360             &rack_min_srtts, 1,
1361             "What is the minimum number of SRTTs required for a GP measurement");
1362         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1363             SYSCTL_CHILDREN(rack_measure),
1364             OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1365             &rack_min_measure_usec, 0,
1366             "What is the minimum time for a measurement (if 0, this is off)");
1367         /* Features */
1368         rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1369             SYSCTL_CHILDREN(rack_sysctl_root),
1370             OID_AUTO,
1371             "features",
1372             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1373             "Feature controls");
1374         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1375             SYSCTL_CHILDREN(rack_features),
1376             OID_AUTO, "cmpack", CTLFLAG_RW,
1377             &rack_use_cmp_acks, 1,
1378             "Should RACK have LRO send compressed acks");
1379         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1380             SYSCTL_CHILDREN(rack_features),
1381             OID_AUTO, "fsb", CTLFLAG_RW,
1382             &rack_use_fsb, 1,
1383             "Should RACK use the fast send block?");
1384         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1385             SYSCTL_CHILDREN(rack_features),
1386             OID_AUTO, "rfo", CTLFLAG_RW,
1387             &rack_use_rfo, 1,
1388             "Should RACK use rack_fast_output()?");
1389         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1390             SYSCTL_CHILDREN(rack_features),
1391             OID_AUTO, "rsmrfo", CTLFLAG_RW,
1392             &rack_use_rsm_rfo, 1,
1393             "Should RACK use rack_fast_rsm_output()?");
1394         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1395             SYSCTL_CHILDREN(rack_features),
1396             OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1397             &rack_enable_mqueue_for_nonpaced, 0,
1398             "Should RACK use mbuf queuing for non-paced connections");
1399         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1400             SYSCTL_CHILDREN(rack_features),
1401             OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1402             &rack_do_hystart, 0,
1403             "Should RACK enable HyStart++ on connections?");
1404         /* Misc rack controls */
1405         rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1406             SYSCTL_CHILDREN(rack_sysctl_root),
1407             OID_AUTO,
1408             "misc",
1409             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1410             "Misc related controls");
1411 #ifdef TCP_ACCOUNTING
1412         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1413             SYSCTL_CHILDREN(rack_misc),
1414             OID_AUTO, "tcp_acct", CTLFLAG_RW,
1415             &rack_tcp_accounting, 0,
1416             "Should we turn on TCP accounting for all rack sessions?");
1417 #endif
1418         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1419             SYSCTL_CHILDREN(rack_misc),
1420             OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1421             &rack_dsack_std_based, 3,
1422             "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1423         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1424             SYSCTL_CHILDREN(rack_misc),
1425             OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1426             &rack_prr_addbackmax, 2,
1427             "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1428         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1429             SYSCTL_CHILDREN(rack_misc),
1430             OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1431             &rack_stats_gets_ms_rtt, 1,
1432             "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1433         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1434             SYSCTL_CHILDREN(rack_misc),
1435             OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1436             &rack_client_low_buf, 0,
1437             "Client low buffer level (below this we are more aggressive in DGP exiting recovery) (0 = off)?");
1438         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1439             SYSCTL_CHILDREN(rack_misc),
1440             OID_AUTO, "defprofile", CTLFLAG_RW,
1441             &rack_def_profile, 0,
1442             "Should RACK use a default profile (0=no, num == profile num)?");
1443         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1444             SYSCTL_CHILDREN(rack_misc),
1445             OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1446             &rack_enable_shared_cwnd, 1,
1447             "Should RACK try to use the shared cwnd on connections where allowed");
1448         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1449             SYSCTL_CHILDREN(rack_misc),
1450             OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1451             &rack_limits_scwnd, 1,
1452             "Should RACK place low end time limits on the shared cwnd feature");
1453         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1454             SYSCTL_CHILDREN(rack_misc),
1455             OID_AUTO, "iMac_dack", CTLFLAG_RW,
1456             &rack_use_imac_dack, 0,
1457             "Should RACK try to emulate iMac delayed ack");
1458         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1459             SYSCTL_CHILDREN(rack_misc),
1460             OID_AUTO, "no_prr", CTLFLAG_RW,
1461             &rack_disable_prr, 0,
1462             "Should RACK not use prr and only pace (must have pacing on)");
1463         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1464             SYSCTL_CHILDREN(rack_misc),
1465             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1466             &rack_verbose_logging, 0,
1467             "Should RACK black box logging be verbose");
1468         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1469             SYSCTL_CHILDREN(rack_misc),
1470             OID_AUTO, "data_after_close", CTLFLAG_RW,
1471             &rack_ignore_data_after_close, 1,
1472             "Do we hold off sending a RST until all pending data is ack'd");
1473         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1474             SYSCTL_CHILDREN(rack_misc),
1475             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1476             &rack_sack_not_required, 1,
1477             "Do we allow rack to run on connections not supporting SACK");
1478         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1479             SYSCTL_CHILDREN(rack_misc),
1480             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1481             &rack_send_a_lot_in_prr, 1,
1482             "Send a lot in prr");
1483         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1484             SYSCTL_CHILDREN(rack_misc),
1485             OID_AUTO, "autoscale", CTLFLAG_RW,
1486             &rack_autosndbuf_inc, 20,
1487             "What percentage should rack scale up its snd buffer by?");
1488         /* Sack Attacker detection stuff */
1489         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1490             SYSCTL_CHILDREN(rack_attack),
1491             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
1492             &rack_highest_sack_thresh_seen, 0,
1493             "Highest sack to ack ratio seen");
1494         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1495             SYSCTL_CHILDREN(rack_attack),
1496             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
1497             &rack_highest_move_thresh_seen, 0,
1498             "Highest move to non-move ratio seen");
1499         rack_ack_total = counter_u64_alloc(M_WAITOK);
1500         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1501             SYSCTL_CHILDREN(rack_attack),
1502             OID_AUTO, "acktotal", CTLFLAG_RD,
1503             &rack_ack_total,
1504             "Total number of Ack's");
1505         rack_express_sack = counter_u64_alloc(M_WAITOK);
1506         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1507             SYSCTL_CHILDREN(rack_attack),
1508             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
1509             &rack_express_sack,
1510             "Total number of express SACKs");
1511         rack_sack_total = counter_u64_alloc(M_WAITOK);
1512         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1513             SYSCTL_CHILDREN(rack_attack),
1514             OID_AUTO, "sacktotal", CTLFLAG_RD,
1515             &rack_sack_total,
1516             "Total number of SACKs");
1517         rack_move_none = counter_u64_alloc(M_WAITOK);
1518         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1519             SYSCTL_CHILDREN(rack_attack),
1520             OID_AUTO, "move_none", CTLFLAG_RD,
1521             &rack_move_none,
1522             "Total number of SACK index reuse of positions under threshold");
1523         rack_move_some = counter_u64_alloc(M_WAITOK);
1524         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1525             SYSCTL_CHILDREN(rack_attack),
1526             OID_AUTO, "move_some", CTLFLAG_RD,
1527             &rack_move_some,
1528             "Total number of SACK index reuse of positions over threshold");
1529         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
1530         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1531             SYSCTL_CHILDREN(rack_attack),
1532             OID_AUTO, "attacks", CTLFLAG_RD,
1533             &rack_sack_attacks_detected,
1534             "Total number of SACK attackers that had sack disabled");
1535         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
1536         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1537             SYSCTL_CHILDREN(rack_attack),
1538             OID_AUTO, "reversed", CTLFLAG_RD,
1539             &rack_sack_attacks_reversed,
1540             "Total number of SACK attackers that were later determined false positive");
1541         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
1542         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1543             SYSCTL_CHILDREN(rack_attack),
1544             OID_AUTO, "nextmerge", CTLFLAG_RD,
1545             &rack_sack_used_next_merge,
1546             "Total number of times we used the next merge");
1547         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
1548         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1549             SYSCTL_CHILDREN(rack_attack),
1550             OID_AUTO, "prevmerge", CTLFLAG_RD,
1551             &rack_sack_used_prev_merge,
1552             "Total number of times we used the prev merge");
1553         /* Counters */
1554         rack_fto_send = counter_u64_alloc(M_WAITOK);
1555         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1556             SYSCTL_CHILDREN(rack_counters),
1557             OID_AUTO, "fto_send", CTLFLAG_RD,
1558             &rack_fto_send, "Total number of rack_fast_output sends");
1559         rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1560         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1561             SYSCTL_CHILDREN(rack_counters),
1562             OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1563             &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1564         rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1565         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1566             SYSCTL_CHILDREN(rack_counters),
1567             OID_AUTO, "nfto_resend", CTLFLAG_RD,
1568             &rack_nfto_resend, "Total number of rack_output retransmissions");
1569         rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1570         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1571             SYSCTL_CHILDREN(rack_counters),
1572             OID_AUTO, "nfto_send", CTLFLAG_RD,
1573             &rack_non_fto_send, "Total number of rack_output first sends");
1574         rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1575         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1576             SYSCTL_CHILDREN(rack_counters),
1577             OID_AUTO, "rfo_extended", CTLFLAG_RD,
1578             &rack_extended_rfo, "Total number of times we extended rfo");
1579
1580         rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1581         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1582             SYSCTL_CHILDREN(rack_counters),
1583             OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1584             &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1585         rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1586
1587         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1588             SYSCTL_CHILDREN(rack_counters),
1589             OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1590             &rack_hw_pace_lost, "Total number of times we lost hardware pacing");
1591         rack_badfr = counter_u64_alloc(M_WAITOK);
1592         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1593             SYSCTL_CHILDREN(rack_counters),
1594             OID_AUTO, "badfr", CTLFLAG_RD,
1595             &rack_badfr, "Total number of bad FRs");
1596         rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
1597         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1598             SYSCTL_CHILDREN(rack_counters),
1599             OID_AUTO, "badfr_bytes", CTLFLAG_RD,
1600             &rack_badfr_bytes, "Total bytes of bad FRs");
1601         rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
1602         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1603             SYSCTL_CHILDREN(rack_counters),
1604             OID_AUTO, "prrsndret", CTLFLAG_RD,
1605             &rack_rtm_prr_retran,
1606             "Total number of prr based retransmits");
1607         rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
1608         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1609             SYSCTL_CHILDREN(rack_counters),
1610             OID_AUTO, "prrsndnew", CTLFLAG_RD,
1611             &rack_rtm_prr_newdata,
1612             "Total number of prr based new transmits");
1613         rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
1614         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1615             SYSCTL_CHILDREN(rack_counters),
1616             OID_AUTO, "tsnf", CTLFLAG_RD,
1617             &rack_timestamp_mismatch,
1618             "Total number of times we could not find the reported timestamp");
1619         rack_find_high = counter_u64_alloc(M_WAITOK);
1620         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1621             SYSCTL_CHILDREN(rack_counters),
1622             OID_AUTO, "findhigh", CTLFLAG_RD,
1623             &rack_find_high,
1624             "Total number of FIN causing find-high");
1625         rack_reorder_seen = counter_u64_alloc(M_WAITOK);
1626         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1627             SYSCTL_CHILDREN(rack_counters),
1628             OID_AUTO, "reordering", CTLFLAG_RD,
1629             &rack_reorder_seen,
1630             "Total number of times we added delay due to reordering");
1631         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1632         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1633             SYSCTL_CHILDREN(rack_counters),
1634             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1635             &rack_tlp_tot,
1636             "Total number of tail loss probe expirations");
1637         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1638         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1639             SYSCTL_CHILDREN(rack_counters),
1640             OID_AUTO, "tlp_new", CTLFLAG_RD,
1641             &rack_tlp_newdata,
1642             "Total number of tail loss probe sending new data");
1643         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1644         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1645             SYSCTL_CHILDREN(rack_counters),
1646             OID_AUTO, "tlp_retran", CTLFLAG_RD,
1647             &rack_tlp_retran,
1648             "Total number of tail loss probe sending retransmitted data");
1649         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1650         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1651             SYSCTL_CHILDREN(rack_counters),
1652             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1653             &rack_tlp_retran_bytes,
1654             "Total bytes of tail loss probe sending retransmitted data");
1655         rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
1656         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1657             SYSCTL_CHILDREN(rack_counters),
1658             OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
1659             &rack_tlp_retran_fail,
1660             "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
1661         rack_to_tot = counter_u64_alloc(M_WAITOK);
1662         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1663             SYSCTL_CHILDREN(rack_counters),
1664             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1665             &rack_to_tot,
1666             "Total number of times the rack timeout expired");
1667         rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
1668         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1669             SYSCTL_CHILDREN(rack_counters),
1670             OID_AUTO, "arm_rack", CTLFLAG_RD,
1671             &rack_to_arm_rack,
1672             "Total number of times the rack timer armed");
1673         rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
1674         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1675             SYSCTL_CHILDREN(rack_counters),
1676             OID_AUTO, "arm_tlp", CTLFLAG_RD,
1677             &rack_to_arm_tlp,
1678             "Total number of times the tlp timer armed");
1679         rack_calc_zero = counter_u64_alloc(M_WAITOK);
1680         rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
1681         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1682             SYSCTL_CHILDREN(rack_counters),
1683             OID_AUTO, "calc_zero", CTLFLAG_RD,
1684             &rack_calc_zero,
1685             "Total number of times pacing time worked out to zero");
1686         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1687             SYSCTL_CHILDREN(rack_counters),
1688             OID_AUTO, "calc_nonzero", CTLFLAG_RD,
1689             &rack_calc_nonzero,
1690             "Total number of times pacing time worked out to non-zero");
1691         rack_paced_segments = counter_u64_alloc(M_WAITOK);
1692         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1693             SYSCTL_CHILDREN(rack_counters),
1694             OID_AUTO, "paced", CTLFLAG_RD,
1695             &rack_paced_segments,
1696             "Total number of times a segment send caused hptsi");
1697         rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
1698         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1699             SYSCTL_CHILDREN(rack_counters),
1700             OID_AUTO, "unpaced", CTLFLAG_RD,
1701             &rack_unpaced_segments,
1702             "Total number of times a segment did not cause hptsi");
1703         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1704         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1705             SYSCTL_CHILDREN(rack_counters),
1706             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1707             &rack_saw_enobuf,
1708             "Total number of times a send returned enobuf for non-hdwr paced connections");
1709         rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1710         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1711             SYSCTL_CHILDREN(rack_counters),
1712             OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1713             &rack_saw_enobuf_hw,
1714             "Total number of times a send returned enobuf for hdwr paced connections");
1715         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1716         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1717             SYSCTL_CHILDREN(rack_counters),
1718             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1719             &rack_saw_enetunreach,
1720             "Total number of times a send received an enetunreachable");
1721         rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1722         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1723             SYSCTL_CHILDREN(rack_counters),
1724             OID_AUTO, "alloc_hot", CTLFLAG_RD,
1725             &rack_hot_alloc,
1726             "Total allocations from the top of our list");
1727         rack_to_alloc = counter_u64_alloc(M_WAITOK);
1728         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1729             SYSCTL_CHILDREN(rack_counters),
1730             OID_AUTO, "allocs", CTLFLAG_RD,
1731             &rack_to_alloc,
1732             "Total allocations of tracking structures");
1733         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1734         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1735             SYSCTL_CHILDREN(rack_counters),
1736             OID_AUTO, "allochard", CTLFLAG_RD,
1737             &rack_to_alloc_hard,
1738             "Total allocations done with sleeping the hard way");
1739         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1740         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1741             SYSCTL_CHILDREN(rack_counters),
1742             OID_AUTO, "allocemerg", CTLFLAG_RD,
1743             &rack_to_alloc_emerg,
1744             "Total allocations done from emergency cache");
1745         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1746         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1747             SYSCTL_CHILDREN(rack_counters),
1748             OID_AUTO, "alloc_limited", CTLFLAG_RD,
1749             &rack_to_alloc_limited,
1750             "Total allocations dropped due to limit");
1751         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1752         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1753             SYSCTL_CHILDREN(rack_counters),
1754             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1755             &rack_alloc_limited_conns,
1756             "Connections with allocations dropped due to limit");
1757         rack_split_limited = counter_u64_alloc(M_WAITOK);
1758         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1759             SYSCTL_CHILDREN(rack_counters),
1760             OID_AUTO, "split_limited", CTLFLAG_RD,
1761             &rack_split_limited,
1762             "Split allocations dropped due to limit");
1763
1764         for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
1765                 char name[32];
1766                 sprintf(name, "cmp_ack_cnt_%d", i);
1767                 rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK);
1768                 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1769                                        SYSCTL_CHILDREN(rack_counters),
1770                                        OID_AUTO, name, CTLFLAG_RD,
1771                                        &rack_proc_comp_ack[i],
1772                                        "Number of compressed acks we processed");
1773         }
1774         rack_large_ackcmp = counter_u64_alloc(M_WAITOK);
1775         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1776             SYSCTL_CHILDREN(rack_counters),
1777             OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
1778             &rack_large_ackcmp,
1779             "Number of TCP connections with large mbuf's for compressed acks");
1780         rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
1781         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1782             SYSCTL_CHILDREN(rack_counters),
1783             OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD,
1784             &rack_small_ackcmp,
1785             "Number of TCP connections with small mbuf's for compressed acks");
1786 #ifdef INVARIANTS
1787         rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1788         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1789             SYSCTL_CHILDREN(rack_counters),
1790             OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1791             &rack_adjust_map_bw,
1792             "Number of times we hit the case where the sb went up and down on a sendmap entry");
1793 #endif
1794         rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1795         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1796             SYSCTL_CHILDREN(rack_counters),
1797             OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1798             &rack_multi_single_eq,
1799             "Total number of acks represented by compressed acks");
1800         rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1801         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1802             SYSCTL_CHILDREN(rack_counters),
1803             OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1804             &rack_proc_non_comp_ack,
1805             "Number of non-compressed acks that we processed");
1806
1807
1808         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1809         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1810             SYSCTL_CHILDREN(rack_counters),
1811             OID_AUTO, "sack_long", CTLFLAG_RD,
1812             &rack_sack_proc_all,
1813             "Total times we had to walk whole list for sack processing");
1814         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1815         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1816             SYSCTL_CHILDREN(rack_counters),
1817             OID_AUTO, "sack_restart", CTLFLAG_RD,
1818             &rack_sack_proc_restart,
1819             "Total times we had to walk whole list due to a restart");
1820         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1821         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1822             SYSCTL_CHILDREN(rack_counters),
1823             OID_AUTO, "sack_short", CTLFLAG_RD,
1824             &rack_sack_proc_short,
1825             "Total times we took shortcut for sack processing");
1826         rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
1827         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1828             SYSCTL_CHILDREN(rack_counters),
1829             OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
1830             &rack_enter_tlp_calc,
1831             "Total times we called calc-tlp");
1832         rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
1833         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1834             SYSCTL_CHILDREN(rack_counters),
1835             OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
1836             &rack_used_tlpmethod,
1837             "Total number of times we hit TLP method 1");
1838         rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
1839         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1840             SYSCTL_CHILDREN(rack_counters),
1841             OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
1842             &rack_used_tlpmethod2,
1843             "Total number of times we hit TLP method 2");
1844         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
1845         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1846             SYSCTL_CHILDREN(rack_attack),
1847             OID_AUTO, "skipacked", CTLFLAG_RD,
1848             &rack_sack_skipped_acked,
1849             "Total number of times we skipped previously sacked");
1850         rack_sack_splits = counter_u64_alloc(M_WAITOK);
1851         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1852             SYSCTL_CHILDREN(rack_attack),
1853             OID_AUTO, "ofsplit", CTLFLAG_RD,
1854             &rack_sack_splits,
1855             "Total number of times we did the old fashion tree split");
1856         rack_progress_drops = counter_u64_alloc(M_WAITOK);
1857         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1858             SYSCTL_CHILDREN(rack_counters),
1859             OID_AUTO, "prog_drops", CTLFLAG_RD,
1860             &rack_progress_drops,
1861             "Total number of progress drops");
1862         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1863         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1864             SYSCTL_CHILDREN(rack_counters),
1865             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1866             &rack_input_idle_reduces,
1867             "Total number of idle reductions on input");
1868         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1869         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1870             SYSCTL_CHILDREN(rack_counters),
1871             OID_AUTO, "collapsed_win", CTLFLAG_RD,
1872             &rack_collapsed_win,
1873             "Total number of collapsed windows");
1874         rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
1875         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1876             SYSCTL_CHILDREN(rack_counters),
1877             OID_AUTO, "tlp_nada", CTLFLAG_RD,
1878             &rack_tlp_does_nada,
1879             "Total number of nada tlp calls");
1880         rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1881         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1882             SYSCTL_CHILDREN(rack_counters),
1883             OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1884             &rack_try_scwnd,
1885             "Total number of scwnd attempts");
1886
1887         rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
1888         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1889             SYSCTL_CHILDREN(rack_counters),
1890             OID_AUTO, "timer_hole", CTLFLAG_RD,
1891             &rack_per_timer_hole,
1892             "Total number of persist timers started in a timer hole");
1893
1894         rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK);
1895         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1896             SYSCTL_CHILDREN(rack_counters),
1897             OID_AUTO, "sndptr_wrong", CTLFLAG_RD,
1898             &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorrect");
1899         rack_sbsndptr_right = counter_u64_alloc(M_WAITOK);
1900         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1901             SYSCTL_CHILDREN(rack_counters),
1902             OID_AUTO, "sndptr_right", CTLFLAG_RD,
1903             &rack_sbsndptr_right, "Total number of times the saved sbsndptr was correct");
1904
1905         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1906         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1907             OID_AUTO, "outsize", CTLFLAG_RD,
1908             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1909         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1910         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1911             OID_AUTO, "opts", CTLFLAG_RD,
1912             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1913         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1914             SYSCTL_CHILDREN(rack_sysctl_root),
1915             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1916             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1917 }
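/*
 * Illustrative usage note (editorial addition, not from the original
 * source): the nodes registered above surface as sysctl(8) knobs once
 * the stack is loaded.  Assuming rack_sysctl_root hangs off
 * net.inet.tcp.rack (the exact root path is an assumption; check the
 * node created elsewhere in this file), an administrator could do:
 *
 *   sysctl net.inet.tcp.rack.timers.minrto        # read the minimum RTO (usec)
 *   sysctl net.inet.tcp.rack.tlp.tlpminto=20000   # raise the TLP minimum timeout to 20ms
 *   sysctl net.inet.tcp.rack.clear=1              # invoke sysctl_rack_clear() to zero counters
 */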
1918
1919 static __inline int
1920 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1921 {
1922         if (SEQ_GEQ(b->r_start, a->r_start) &&
1923             SEQ_LT(b->r_start, a->r_end)) {
1924                 /*
1925                  * The entry b is within the
1926                  * block a. i.e.:
1927                  * a --   |-------------|
1928                  * b --   |----|
1929                  * <or>
1930                  * b --       |------|
1931                  * <or>
1932                  * b --       |-----------|
1933                  */
1934                 return (0);
1935         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1936                 /*
1937                  * b falls as the next
1938                  * sequence block after a, so a
1939                  * is said to be smaller than b.
1940                  * i.e:
1941                  * a --   |------|
1942                  * b --          |--------|
1943                  * or
1944                  * b --              |-----|
1945                  */
1946                 return (1);
1947         }
1948         /*
1949          * What's left is where a is
1950          * larger than b. i.e:
1951          * a --         |-------|
1952          * b --  |---|
1953          * or even possibly
1954          * b --   |--------------|
1955          */
1956         return (-1);
1957 }
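/*
 * Worked example for rb_map_cmp() (editorial sketch, sequence numbers
 * are illustrative): with a covering [100, 200):
 *
 *   b->r_start = 150 -> returns  0  (b overlaps/lies within a)
 *   b->r_start = 200 -> returns  1  (b starts at or past a->r_end)
 *   b->r_start =  50 -> returns -1  (b starts before a)
 *
 * Because overlapping blocks compare as "equal", an RB tree lookup
 * keyed on a sequence range lands on the sendmap entry that contains it.
 */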
1958
1959 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1960 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1961
1962 static uint32_t
1963 rc_init_window(struct tcp_rack *rack)
1964 {
1965         uint32_t win;
1966
1967         if (rack->rc_init_win == 0) {
1968                 /*
1969                  * Nothing set by the user, use the system stack
1970                  * default.
1971                  */
1972                 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1973         }
1974         win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
1975         return (win);
1976 }
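/*
 * Worked example (editorial sketch, values are illustrative): with a
 * user-set rc_init_win of 20 and a fixed maxseg of 1460 bytes,
 * rc_init_window() returns 20 * 1460 = 29200 bytes.  With rc_init_win
 * left at 0 it defers to tcp_compute_initwnd() and the system default
 * initial window is used instead.
 */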
1977
1978 static uint64_t
1979 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1980 {
1981         if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
1982                 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1983         else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1984                 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1985         else
1986                 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1987 }
1988
1989 static uint64_t
1990 rack_get_bw(struct tcp_rack *rack)
1991 {
1992         if (rack->use_fixed_rate) {
1993                 /* Return the fixed pacing rate */
1994                 return (rack_get_fixed_pacing_bw(rack));
1995         }
1996         if (rack->r_ctl.gp_bw == 0) {
1997                 /*
1998                  * We have no b/w measurement yet,
1999                  * if we have a user set initial bw
2000                  * return it. If we don't have that and
2001                  * we have an srtt, use the tcp IW (10) to
2002                  * calculate a fictional b/w over the SRTT
2003                  * which is more or less a guess. Note
2004                  * we don't use our IW from rack on purpose
2005                  * so if we have like IW=30, we are not
2006                  * calculating a "huge" b/w.
2007                  */
2008                 uint64_t bw, srtt;
2009                 if (rack->r_ctl.init_rate)
2010                         return (rack->r_ctl.init_rate);
2011
2012                 /* Has the user set a max peak rate? */
2013 #ifdef NETFLIX_PEAKRATE
2014                 if (rack->rc_tp->t_maxpeakrate)
2015                         return (rack->rc_tp->t_maxpeakrate);
2016 #endif
2017                 /* Ok let's come up with the IW guess, if we have an srtt */
2018                 if (rack->rc_tp->t_srtt == 0) {
2019                         /*
2020                          * Go with old pacing method
2021                          * i.e. burst mitigation only.
2022                          */
2023                         return (0);
2024                 }
2025                 /* Ok let's get the initial TCP win (not rack's) */
2026                 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
2027                 srtt = (uint64_t)rack->rc_tp->t_srtt;
2028                 bw *= (uint64_t)USECS_IN_SECOND;
2029                 bw /= srtt;
2030                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
2031                         bw = rack->r_ctl.bw_rate_cap;
2032                 return (bw);
2033         } else {
2034                 uint64_t bw;
2035
2036                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
2037                         /* Averaging is done, we can return the value */
2038                         bw = rack->r_ctl.gp_bw;
2039                 } else {
2040                         /* Still doing initial average must calculate */
2041                         bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
2042                 }
2043 #ifdef NETFLIX_PEAKRATE
2044                 if ((rack->rc_tp->t_maxpeakrate) &&
2045                     (bw > rack->rc_tp->t_maxpeakrate)) {
2046                         /* The user has set a peak rate to pace at
2047                          * don't allow us to pace faster than that.
2048                          */
2049                         return (rack->rc_tp->t_maxpeakrate);
2050                 }
2051 #endif
2052                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
2053                         bw = rack->r_ctl.bw_rate_cap;
2054                 return (bw);
2055         }
2056 }
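/*
 * Worked example of the IW/SRTT guess in rack_get_bw() (editorial
 * sketch, numbers are illustrative): with no goodput measurement and no
 * user init_rate, an initial window of 10 * 1448 = 14480 bytes and a
 * t_srtt of 25000 usec give
 *
 *   bw = 14480 * USECS_IN_SECOND / 25000 = 579200 bytes/sec (~4.6 Mbit/s)
 *
 * which is then clamped to bw_rate_cap if a cap has been configured.
 */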
2057
2058 static uint16_t
2059 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
2060 {
2061         if (rack->use_fixed_rate) {
2062                 return (100);
2063         } else if (rack->in_probe_rtt && (rsm == NULL))
2064                 return (rack->r_ctl.rack_per_of_gp_probertt);
2065         else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
2066                   rack->r_ctl.rack_per_of_gp_rec)) {
2067                 if (rsm) {
2068                         /* a retransmission always use the recovery rate */
2069                         return (rack->r_ctl.rack_per_of_gp_rec);
2070                 } else if (rack->rack_rec_nonrxt_use_cr) {
2071                         /* Directed to use the configured rate */
2072                         goto configured_rate;
2073                 } else if (rack->rack_no_prr &&
2074                            (rack->r_ctl.rack_per_of_gp_rec > 100)) {
2075                         /* No PRR, lets just use the b/w estimate only */
2076                         return (100);
2077                 } else {
2078                         /*
2079                          * Here we may have a non-retransmit but we
2080                          * have no overrides, so just use the recovery
2081                          * rate (prr is in effect).
2082                          */
2083                         return (rack->r_ctl.rack_per_of_gp_rec);
2084                 }
2085         }
2086 configured_rate:
2087         /* For the configured rate we look at our cwnd vs the ssthresh */
2088         if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2089                 return (rack->r_ctl.rack_per_of_gp_ss);
2090         else
2091                 return (rack->r_ctl.rack_per_of_gp_ca);
2092 }
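/*
 * Editorial summary of the gain selection above: a fixed pacing rate
 * always returns 100 (no gain); probe-rtt without an rsm uses the
 * probertt percentage; a retransmission while in fast recovery (with a
 * recovery percentage configured) uses the recovery percentage, as do
 * non-retransmits unless they are directed to the configured rate or
 * no-PRR caps them at 100; everything else uses the SS or CA percentage
 * depending on cwnd_to_use versus snd_ssthresh.
 */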
2093
2094 static void
2095 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
2096 {
2097         /*
2098          * Types of logs (mod value)
2099          * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
2100          * 2 = a dsack round begins, persist is reset to 16.
2101          * 3 = a dsack round ends
2102          * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
2103          * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
2104          * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
2105          */
2106         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2107                 union tcp_log_stackspecific log;
2108                 struct timeval tv;
2109
2110                 memset(&log, 0, sizeof(log));
2111                 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
2112                 log.u_bbr.flex1 <<= 1;
2113                 log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
2114                 log.u_bbr.flex1 <<= 1;
2115                 log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
2116                 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
2117                 log.u_bbr.flex3 = rack->r_ctl.num_dsack;
2118                 log.u_bbr.flex4 = flex4;
2119                 log.u_bbr.flex5 = flex5;
2120                 log.u_bbr.flex6 = flex6;
2121                 log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
2122                 log.u_bbr.flex8 = mod;
2123                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2124                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2125                     &rack->rc_inp->inp_socket->so_rcv,
2126                     &rack->rc_inp->inp_socket->so_snd,
2127                     RACK_DSACK_HANDLING, 0,
2128                     0, &log, false, &tv);
2129         }
2130 }
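/*
 * Editorial note on the flex1 packing above (sketch): the three flags
 * are shifted into the low bits of flex1, so a consumer of the log
 * record can recover them as
 *
 *   rc_rack_tmr_std_based = (flex1 >> 2) & 1;
 *   rc_rack_use_dsack     = (flex1 >> 1) & 1;
 *   rc_dsack_round_seen   =  flex1       & 1;
 */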
2131
2132 static void
2133 rack_log_hdwr_pacing(struct tcp_rack *rack,
2134                      uint64_t rate, uint64_t hw_rate, int line,
2135                      int error, uint16_t mod)
2136 {
2137         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2138                 union tcp_log_stackspecific log;
2139                 struct timeval tv;
2140                 const struct ifnet *ifp;
2141
2142                 memset(&log, 0, sizeof(log));
2143                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2144                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2145                 if (rack->r_ctl.crte) {
2146                         ifp = rack->r_ctl.crte->ptbl->rs_ifp;
2147                 } else if (rack->rc_inp->inp_route.ro_nh &&
2148                            rack->rc_inp->inp_route.ro_nh->nh_ifp) {
2149                         ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
2150                 } else
2151                         ifp = NULL;
2152                 if (ifp) {
2153                         log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
2154                         log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2155                 }
2156                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2157                 log.u_bbr.bw_inuse = rate;
2158                 log.u_bbr.flex5 = line;
2159                 log.u_bbr.flex6 = error;
2160                 log.u_bbr.flex7 = mod;
2161                 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
2162                 log.u_bbr.flex8 = rack->use_fixed_rate;
2163                 log.u_bbr.flex8 <<= 1;
2164                 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
2165                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
2166                 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
2167                 if (rack->r_ctl.crte)
2168                         log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
2169                 else
2170                         log.u_bbr.cur_del_rate = 0;
2171                 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
2172                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2173                     &rack->rc_inp->inp_socket->so_rcv,
2174                     &rack->rc_inp->inp_socket->so_snd,
2175                     BBR_LOG_HDWR_PACE, 0,
2176                     0, &log, false, &tv);
2177         }
2178 }
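/*
 * Editorial note (sketch): the 64-bit hw_rate is split across two
 * 32-bit log fields above, high half in flex1 and low half in flex2, so
 * a log consumer would reassemble it as
 *
 *   uint64_t hw_rate = ((uint64_t)log.u_bbr.flex1 << 32) |
 *                       (uint64_t)log.u_bbr.flex2;
 *
 * The same high/low split is applied to the ifp pointer in flex3/flex4
 * when an interface is known.
 */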
2179
2180 static uint64_t
2181 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2182 {
2183         /*
2184          * We allow rack_per_of_gp_xx to dictate our bw rate we want.
2185          */
2186         uint64_t bw_est, high_rate;
2187         uint64_t gain;
2188
2189         gain = (uint64_t)rack_get_output_gain(rack, rsm);
2190         bw_est = bw * gain;
2191         bw_est /= (uint64_t)100;
2192         /* Never fall below the minimum (def 64kbps) */
2193         if (bw_est < RACK_MIN_BW)
2194                 bw_est = RACK_MIN_BW;
2195         if (rack->r_rack_hw_rate_caps) {
2196                 /* Rate caps are in place */
2197                 if (rack->r_ctl.crte != NULL) {
2198                         /* We have a hdwr rate already */
2199                         high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2200                         if (bw_est >= high_rate) {
2201                                 /* We are capping bw at the highest rate table entry */
2202                                 rack_log_hdwr_pacing(rack,
2203                                                      bw_est, high_rate, __LINE__,
2204                                                      0, 3);
2205                                 bw_est = high_rate;
2206                                 if (capped)
2207                                         *capped = 1;
2208                         }
2209                 } else if ((rack->rack_hdrw_pacing == 0) &&
2210                            (rack->rack_hdw_pace_ena) &&
2211                            (rack->rack_attempt_hdwr_pace == 0) &&
2212                            (rack->rc_inp->inp_route.ro_nh != NULL) &&
2213                            (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2214                         /*
2215                          * Special case, we have not yet attempted hardware
2216                          * pacing, and yet we may, when we do, find out if we are
2217                          * above the highest rate. We need to know the maxbw for the interface
2218                          * in question (if it supports ratelimiting). We get back
2219                          * a 0, if the interface is not found in the RL lists.
2220                          */
2221                         high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2222                         if (high_rate) {
2223                         /* Yep, we have a rate; is it above this rate? */
2224                                 if (bw_est > high_rate) {
2225                                         bw_est = high_rate;
2226                                         if (capped)
2227                                                 *capped = 1;
2228                                 }
2229                         }
2230                 }
2231         }
2232         return (bw_est);
2233 }
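/*
 * Worked example (editorial sketch, numbers are illustrative): with a
 * measured bw of 1,000,000 bytes/sec and a congestion-avoidance gain of
 * 150 percent, bw_est = 1,000,000 * 150 / 100 = 1,500,000 bytes/sec.
 * If hardware rate caps are enabled and the highest rate-table entry
 * for the established crte is 1,200,000 bytes/sec, bw_est is clamped to
 * that value and *capped is set for the caller.
 */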
2234
2235 static void
2236 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
2237 {
2238         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2239                 union tcp_log_stackspecific log;
2240                 struct timeval tv;
2241
2242                 if ((mod != 1) && (rack_verbose_logging == 0)) {
2243                         /*
2244                          * We get 3 values currently for mod
2245                          * 1 - We are retransmitting and this tells the reason.
2246                          * 2 - We are clearing a dup-ack count.
2247                          * 3 - We are incrementing a dup-ack count.
2248                          *
2249                          * The clear/increment are only logged
2250                          * if you have BBverbose on.
2251                          */
2252                         return;
2253                 }
2254                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2255                 log.u_bbr.flex1 = tsused;
2256                 log.u_bbr.flex2 = thresh;
2257                 log.u_bbr.flex3 = rsm->r_flags;
2258                 log.u_bbr.flex4 = rsm->r_dupack;
2259                 log.u_bbr.flex5 = rsm->r_start;
2260                 log.u_bbr.flex6 = rsm->r_end;
2261                 log.u_bbr.flex8 = mod;
2262                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2263                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2264                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2265                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2266                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2267                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2268                 log.u_bbr.pacing_gain = rack->r_must_retran;
2269                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2270                     &rack->rc_inp->inp_socket->so_rcv,
2271                     &rack->rc_inp->inp_socket->so_snd,
2272                     BBR_LOG_SETTINGS_CHG, 0,
2273                     0, &log, false, &tv);
2274         }
2275 }
2276
2277 static void
2278 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2279 {
2280         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2281                 union tcp_log_stackspecific log;
2282                 struct timeval tv;
2283
2284                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2285                 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2286                 log.u_bbr.flex2 = to;
2287                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2288                 log.u_bbr.flex4 = slot;
2289                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
2290                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2291                 log.u_bbr.flex7 = rack->rc_in_persist;
2292                 log.u_bbr.flex8 = which;
2293                 if (rack->rack_no_prr)
2294                         log.u_bbr.pkts_out = 0;
2295                 else
2296                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2297                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2298                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2299                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2300                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2301                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; /* note: overrides the PRR value set above */
2302                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2303                 log.u_bbr.pacing_gain = rack->r_must_retran;
2304                 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2305                 log.u_bbr.lost = rack_rto_min;
2306                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2307                     &rack->rc_inp->inp_socket->so_rcv,
2308                     &rack->rc_inp->inp_socket->so_snd,
2309                     BBR_LOG_TIMERSTAR, 0,
2310                     0, &log, false, &tv);
2311         }
2312 }
2313
2314 static void
2315 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2316 {
2317         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2318                 union tcp_log_stackspecific log;
2319                 struct timeval tv;
2320
2321                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2322                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2323                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2324                 log.u_bbr.flex8 = to_num;
2325                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2326                 log.u_bbr.flex2 = rack->rc_rack_rtt;
2327                 if (rsm == NULL)
2328                         log.u_bbr.flex3 = 0;
2329                 else
2330                         log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2331                 if (rack->rack_no_prr)
2332                         log.u_bbr.flex5 = 0;
2333                 else
2334                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2335                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2336                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2337                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2338                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2339                 log.u_bbr.pacing_gain = rack->r_must_retran;
2340                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2341                     &rack->rc_inp->inp_socket->so_rcv,
2342                     &rack->rc_inp->inp_socket->so_snd,
2343                     BBR_LOG_RTO, 0,
2344                     0, &log, false, &tv);
2345         }
2346 }
2347
2348 static void
2349 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
2350                  struct rack_sendmap *prev,
2351                  struct rack_sendmap *rsm,
2352                  struct rack_sendmap *next,
2353                  int flag, uint32_t th_ack, int line)
2354 {
2355         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2356                 union tcp_log_stackspecific log;
2357                 struct timeval tv;
2358
2359                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2360                 log.u_bbr.flex8 = flag;
2361                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2362                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2363                 log.u_bbr.cur_del_rate = (uint64_t)prev;
2364                 log.u_bbr.delRate = (uint64_t)rsm;
2365                 log.u_bbr.rttProp = (uint64_t)next;
2366                 log.u_bbr.flex7 = 0;
2367                 if (prev) {
2368                         log.u_bbr.flex1 = prev->r_start;
2369                         log.u_bbr.flex2 = prev->r_end;
2370                         log.u_bbr.flex7 |= 0x4;
2371                 }
2372                 if (rsm) {
2373                         log.u_bbr.flex3 = rsm->r_start;
2374                         log.u_bbr.flex4 = rsm->r_end;
2375                         log.u_bbr.flex7 |= 0x2;
2376                 }
2377                 if (next) {
2378                         log.u_bbr.flex5 = next->r_start;
2379                         log.u_bbr.flex6 = next->r_end;
2380                         log.u_bbr.flex7 |= 0x1;
2381                 }
2382                 log.u_bbr.applimited = line;
2383                 log.u_bbr.pkts_out = th_ack;
2384                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2385                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2386                 if (rack->rack_no_prr)
2387                         log.u_bbr.lost = 0;
2388                 else
2389                         log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
2390                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2391                     &rack->rc_inp->inp_socket->so_rcv,
2392                     &rack->rc_inp->inp_socket->so_snd,
2393                     TCP_LOG_MAPCHG, 0,
2394                     0, &log, false, &tv);
2395         }
2396 }
2397
2398 static void
2399 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
2400                  struct rack_sendmap *rsm, int conf)
2401 {
2402         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2403                 union tcp_log_stackspecific log;
2404                 struct timeval tv;
2405                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2406                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2407                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2408                 log.u_bbr.flex1 = t;
2409                 log.u_bbr.flex2 = len;
2410                 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
2411                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
2412                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
2413                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2414                 log.u_bbr.flex7 = conf;
2415                 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
2416                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
2417                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2418                 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2419                 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
2420                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2421                 if (rsm) {
2422                         log.u_bbr.pkt_epoch = rsm->r_start;
2423                         log.u_bbr.lost = rsm->r_end;
2424                         log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
2425                         /* We lose any bits above the lower 16 of r_flags */
2426                         log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
2427                 } else {
2428                         /* It's a SYN */
2429                         log.u_bbr.pkt_epoch = rack->rc_tp->iss;
2430                         log.u_bbr.lost = 0;
2431                         log.u_bbr.cwnd_gain = 0;
2432                         log.u_bbr.pacing_gain = 0;
2433                 }
2434                 /* Write out general bits of interest rrs here */
2435                 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
2436                 log.u_bbr.use_lt_bw <<= 1;
2437                 log.u_bbr.use_lt_bw |= rack->forced_ack;
2438                 log.u_bbr.use_lt_bw <<= 1;
2439                 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
2440                 log.u_bbr.use_lt_bw <<= 1;
2441                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
2442                 log.u_bbr.use_lt_bw <<= 1;
2443                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
2444                 log.u_bbr.use_lt_bw <<= 1;
2445                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
2446                 log.u_bbr.use_lt_bw <<= 1;
2447                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
2448                 log.u_bbr.use_lt_bw <<= 1;
2449                 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
2450                 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
2451                 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
2452                 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
2453                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
2454                 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
2455                 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
2456                 log.u_bbr.bw_inuse <<= 32;
2457                 if (rsm)
2458                         log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
2459                 TCP_LOG_EVENTP(tp, NULL,
2460                     &rack->rc_inp->inp_socket->so_rcv,
2461                     &rack->rc_inp->inp_socket->so_snd,
2462                     BBR_LOG_BBRRTT, 0,
2463                     0, &log, false, &tv);
2464
2465
2466         }
2467 }
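/*
 * Illustrative sketch (not part of rack.c): how a post-processing tool could
 * unpack the eight state bits that the function above shifts into
 * log.u_bbr.use_lt_bw.  The first flag OR'd in (rc_highly_buffered) ends up
 * in the most significant of the eight bits, the last one
 * (rc_dragged_bottom) in bit 0.  The struct below is a hypothetical decoded
 * form, not a kernel structure.
 */
#include <stdint.h>

struct rack_rtt_upd_flags {
	unsigned highly_buffered:1;
	unsigned forced_ack:1;
	unsigned gp_dyn_mul:1;
	unsigned in_probe_rtt:1;
	unsigned measure_saw_probe_rtt:1;
	unsigned app_limited_needs_set:1;
	unsigned gp_filled:1;
	unsigned dragged_bottom:1;
};

static struct rack_rtt_upd_flags
decode_use_lt_bw(uint8_t v)
{
	struct rack_rtt_upd_flags f;

	f.highly_buffered = (v >> 7) & 1;
	f.forced_ack = (v >> 6) & 1;
	f.gp_dyn_mul = (v >> 5) & 1;
	f.in_probe_rtt = (v >> 4) & 1;
	f.measure_saw_probe_rtt = (v >> 3) & 1;
	f.app_limited_needs_set = (v >> 2) & 1;
	f.gp_filled = (v >> 1) & 1;
	f.dragged_bottom = v & 1;
	return (f);
}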
2468
2469 static void
2470 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
2471 {
2472         /*
2473          * Log the rtt sample we are
2474          * applying to the srtt algorithm, in
2475          * microseconds.
2476          */
2477         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2478                 union tcp_log_stackspecific log;
2479                 struct timeval tv;
2480
2481                 /* Convert our ms to a microsecond */
2482                 memset(&log, 0, sizeof(log));
2483                 log.u_bbr.flex1 = rtt;
2484                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2485                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
2486                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2487                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
2488                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2489                 log.u_bbr.flex7 = 1;
2490                 log.u_bbr.flex8 = rack->sack_attack_disable;
2491                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2492                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2493                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2494                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2495                 log.u_bbr.pacing_gain = rack->r_must_retran;
2496                 /*
2497                  * We capture in delRate the upper 32 bits as
2498                  * the confidence level we had declared, and the
2499                  * lower 32 bits as the actual RTT using the arrival
2500                  * timestamp.
2501                  */
2502                 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
2503                 log.u_bbr.delRate <<= 32;
2504                 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
2505                 /* Let's capture all the things that make up t_rxtcur */
2506                 log.u_bbr.applimited = rack_rto_min;
2507                 log.u_bbr.epoch = rack_rto_max;
2508                 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
2509                 log.u_bbr.lost = rack_rto_min;
2510                 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
2511                 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
2512                 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
2513                 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
2514                 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
2515                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2516                     &rack->rc_inp->inp_socket->so_rcv,
2517                     &rack->rc_inp->inp_socket->so_snd,
2518                     TCP_LOG_RTT, 0,
2519                     0, &log, false, &tv);
2520         }
2521 }
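/*
 * Illustrative sketch (not part of rack.c): the 64-bit packing used above
 * for log.u_bbr.delRate, with the declared confidence level in the upper 32
 * bits and the microsecond RTT in the lower 32 bits, plus the matching
 * unpack a log reader would use.  Function names are hypothetical.
 */
#include <stdint.h>

static uint64_t
pack_conf_and_rtt(uint32_t confidence, uint32_t us_rtt)
{
	uint64_t v;

	v = confidence;
	v <<= 32;
	v |= us_rtt;
	return (v);
}

static void
unpack_conf_and_rtt(uint64_t v, uint32_t *confidence, uint32_t *us_rtt)
{
	*confidence = (uint32_t)(v >> 32);
	*us_rtt = (uint32_t)(v & 0xffffffffULL);
}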
2522
2523 static void
2524 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
2525 {
2526         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2527                 union tcp_log_stackspecific log;
2528                 struct timeval tv;
2529
2530                 /* Convert our ms to a microsecond */
2531                 memset(&log, 0, sizeof(log));
2532                 log.u_bbr.flex1 = rtt;
2533                 log.u_bbr.flex2 = send_time;
2534                 log.u_bbr.flex3 = ack_time;
2535                 log.u_bbr.flex4 = where;
2536                 log.u_bbr.flex7 = 2;
2537                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2538                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2539                     &rack->rc_inp->inp_socket->so_rcv,
2540                     &rack->rc_inp->inp_socket->so_snd,
2541                     TCP_LOG_RTT, 0,
2542                     0, &log, false, &tv);
2543         }
2544 }
2545
2546
2547
2548 static inline void
2549 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
2550 {
2551         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2552                 union tcp_log_stackspecific log;
2553                 struct timeval tv;
2554
2555                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2556                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2557                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2558                 log.u_bbr.flex1 = line;
2559                 log.u_bbr.flex2 = tick;
2560                 log.u_bbr.flex3 = tp->t_maxunacktime;
2561                 log.u_bbr.flex4 = tp->t_acktime;
2562                 log.u_bbr.flex8 = event;
2563                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2564                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2565                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2566                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2567                 log.u_bbr.pacing_gain = rack->r_must_retran;
2568                 TCP_LOG_EVENTP(tp, NULL,
2569                     &rack->rc_inp->inp_socket->so_rcv,
2570                     &rack->rc_inp->inp_socket->so_snd,
2571                     BBR_LOG_PROGRESS, 0,
2572                     0, &log, false, &tv);
2573         }
2574 }
2575
2576 static void
2577 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
2578 {
2579         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2580                 union tcp_log_stackspecific log;
2581
2582                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2583                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2584                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2585                 log.u_bbr.flex1 = slot;
2586                 if (rack->rack_no_prr)
2587                         log.u_bbr.flex2 = 0;
2588                 else
2589                         log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
2590                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
2591                 log.u_bbr.flex8 = rack->rc_in_persist;
2592                 log.u_bbr.timeStamp = cts;
2593                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2594                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2595                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2596                 log.u_bbr.pacing_gain = rack->r_must_retran;
2597                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2598                     &rack->rc_inp->inp_socket->so_rcv,
2599                     &rack->rc_inp->inp_socket->so_snd,
2600                     BBR_LOG_BBRSND, 0,
2601                     0, &log, false, tv);
2602         }
2603 }
2604
2605 static void
2606 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
2607 {
2608         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2609                 union tcp_log_stackspecific log;
2610                 struct timeval tv;
2611
2612                 memset(&log, 0, sizeof(log));
2613                 log.u_bbr.flex1 = did_out;
2614                 log.u_bbr.flex2 = nxt_pkt;
2615                 log.u_bbr.flex3 = way_out;
2616                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2617                 if (rack->rack_no_prr)
2618                         log.u_bbr.flex5 = 0;
2619                 else
2620                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2621                 log.u_bbr.flex6 = nsegs;
2622                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
2623                 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;        /* Do we have ack-can-send set */
2624                 log.u_bbr.flex7 <<= 1;
2625                 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */
2626                 log.u_bbr.flex7 <<= 1;
2627                 log.u_bbr.flex7 |= rack->r_wanted_output;       /* Do we want output */
2628                 log.u_bbr.flex8 = rack->rc_in_persist;
2629                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2630                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2631                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2632                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2633                 log.u_bbr.use_lt_bw <<= 1;
2634                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2635                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2636                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2637                 log.u_bbr.pacing_gain = rack->r_must_retran;
2638                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2639                     &rack->rc_inp->inp_socket->so_rcv,
2640                     &rack->rc_inp->inp_socket->so_snd,
2641                     BBR_LOG_DOSEG_DONE, 0,
2642                     0, &log, false, &tv);
2643         }
2644 }
2645
2646 static void
2647 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
2648 {
2649         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2650                 union tcp_log_stackspecific log;
2651                 struct timeval tv;
2652                 uint32_t cts;
2653
2654                 memset(&log, 0, sizeof(log));
2655                 cts = tcp_get_usecs(&tv);
2656                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
2657                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
2658                 log.u_bbr.flex4 = arg1;
2659                 log.u_bbr.flex5 = arg2;
2660                 log.u_bbr.flex6 = arg3;
2661                 log.u_bbr.flex8 = frm;
2662                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2663                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2664                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2665                 log.u_bbr.applimited = rack->r_ctl.rc_sacked;
2666                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2667                 log.u_bbr.pacing_gain = rack->r_must_retran;
2668                 TCP_LOG_EVENTP(tp, NULL,
2669                     &tp->t_inpcb->inp_socket->so_rcv,
2670                     &tp->t_inpcb->inp_socket->so_snd,
2671                     TCP_HDWR_PACE_SIZE, 0,
2672                     0, &log, false, &tv);
2673         }
2674 }
2675
2676 static void
2677 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
2678                           uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
2679 {
2680         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2681                 union tcp_log_stackspecific log;
2682                 struct timeval tv;
2683
2684                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2685                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2686                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2687                 log.u_bbr.flex1 = slot;
2688                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
2689                 log.u_bbr.flex4 = reason;
2690                 if (rack->rack_no_prr)
2691                         log.u_bbr.flex5 = 0;
2692                 else
2693                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2694                 log.u_bbr.flex7 = hpts_calling;
2695                 log.u_bbr.flex8 = rack->rc_in_persist;
2696                 log.u_bbr.lt_epoch = cwnd_to_use;
2697                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2698                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2699                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2700                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2701                 log.u_bbr.pacing_gain = rack->r_must_retran;
2702                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2703                     &rack->rc_inp->inp_socket->so_rcv,
2704                     &rack->rc_inp->inp_socket->so_snd,
2705                     BBR_LOG_JUSTRET, 0,
2706                     tlen, &log, false, &tv);
2707         }
2708 }
2709
2710 static void
2711 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
2712                    struct timeval *tv, uint32_t flags_on_entry)
2713 {
2714         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2715                 union tcp_log_stackspecific log;
2716
2717                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2718                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
2719                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
2720                 log.u_bbr.flex1 = line;
2721                 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
2722                 log.u_bbr.flex3 = flags_on_entry;
2723                 log.u_bbr.flex4 = us_cts;
2724                 if (rack->rack_no_prr)
2725                         log.u_bbr.flex5 = 0;
2726                 else
2727                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2728                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2729                 log.u_bbr.flex7 = hpts_removed;
2730                 log.u_bbr.flex8 = 1;
2731                 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
2732                 log.u_bbr.timeStamp = us_cts;
2733                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2734                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2735                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2736                 log.u_bbr.pacing_gain = rack->r_must_retran;
2737                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2738                     &rack->rc_inp->inp_socket->so_rcv,
2739                     &rack->rc_inp->inp_socket->so_snd,
2740                     BBR_LOG_TIMERCANC, 0,
2741                     0, &log, false, tv);
2742         }
2743 }
2744
2745 static void
2746 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
2747                           uint32_t flex1, uint32_t flex2,
2748                           uint32_t flex3, uint32_t flex4,
2749                           uint32_t flex5, uint32_t flex6,
2750                           uint16_t flex7, uint8_t mod)
2751 {
2752         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2753                 union tcp_log_stackspecific log;
2754                 struct timeval tv;
2755
2756                 if (mod == 1) {
2757                         /* No, you can't use 1; it's reserved for the real timer cancel */
2758                         return;
2759                 }
2760                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2761                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2762                 log.u_bbr.flex1 = flex1;
2763                 log.u_bbr.flex2 = flex2;
2764                 log.u_bbr.flex3 = flex3;
2765                 log.u_bbr.flex4 = flex4;
2766                 log.u_bbr.flex5 = flex5;
2767                 log.u_bbr.flex6 = flex6;
2768                 log.u_bbr.flex7 = flex7;
2769                 log.u_bbr.flex8 = mod;
2770                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2771                     &rack->rc_inp->inp_socket->so_rcv,
2772                     &rack->rc_inp->inp_socket->so_snd,
2773                     BBR_LOG_TIMERCANC, 0,
2774                     0, &log, false, &tv);
2775         }
2776 }
2777
2778 static void
2779 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
2780 {
2781         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2782                 union tcp_log_stackspecific log;
2783                 struct timeval tv;
2784
2785                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2786                 log.u_bbr.flex1 = timers;
2787                 log.u_bbr.flex2 = ret;
2788                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
2789                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2790                 log.u_bbr.flex5 = cts;
2791                 if (rack->rack_no_prr)
2792                         log.u_bbr.flex6 = 0;
2793                 else
2794                         log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
2795                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2796                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2797                 log.u_bbr.pacing_gain = rack->r_must_retran;
2798                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2799                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2800                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2801                     &rack->rc_inp->inp_socket->so_rcv,
2802                     &rack->rc_inp->inp_socket->so_snd,
2803                     BBR_LOG_TO_PROCESS, 0,
2804                     0, &log, false, &tv);
2805         }
2806 }
2807
2808 static void
2809 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd)
2810 {
2811         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2812                 union tcp_log_stackspecific log;
2813                 struct timeval tv;
2814
2815                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2816                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
2817                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
2818                 if (rack->rack_no_prr)
2819                         log.u_bbr.flex3 = 0;
2820                 else
2821                         log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
2822                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
2823                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
2824                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
2825                 log.u_bbr.flex8 = frm;
2826                 log.u_bbr.pkts_out = orig_cwnd;
2827                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2828                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2829                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2830                 log.u_bbr.use_lt_bw <<= 1;
2831                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2832                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2833                     &rack->rc_inp->inp_socket->so_rcv,
2834                     &rack->rc_inp->inp_socket->so_snd,
2835                     BBR_LOG_BBRUPD, 0,
2836                     0, &log, false, &tv);
2837         }
2838 }
2839
2840 #ifdef NETFLIX_EXP_DETECTION
2841 static void
2842 rack_log_sad(struct tcp_rack *rack, int event)
2843 {
2844         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2845                 union tcp_log_stackspecific log;
2846                 struct timeval tv;
2847
2848                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2849                 log.u_bbr.flex1 = rack->r_ctl.sack_count;
2850                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2851                 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
2852                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2853                 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
2854                 log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
2855                 log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
2856                 log.u_bbr.lt_epoch = (tcp_force_detection << 8);
2857                 log.u_bbr.lt_epoch |= rack->do_detection;
2858                 log.u_bbr.applimited = tcp_map_minimum;
2859                 log.u_bbr.flex7 = rack->sack_attack_disable;
2860                 log.u_bbr.flex8 = event;
2861                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2862                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2863                 log.u_bbr.delivered = tcp_sad_decay_val;
2864                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2865                     &rack->rc_inp->inp_socket->so_rcv,
2866                     &rack->rc_inp->inp_socket->so_snd,
2867                     TCP_SAD_DETECTION, 0,
2868                     0, &log, false, &tv);
2869         }
2870 }
2871 #endif
2872
2873 static void
2874 rack_counter_destroy(void)
2875 {
2876         int i;
2877
2878         counter_u64_free(rack_fto_send);
2879         counter_u64_free(rack_fto_rsm_send);
2880         counter_u64_free(rack_nfto_resend);
2881         counter_u64_free(rack_hw_pace_init_fail);
2882         counter_u64_free(rack_hw_pace_lost);
2883         counter_u64_free(rack_non_fto_send);
2884         counter_u64_free(rack_extended_rfo);
2885         counter_u64_free(rack_ack_total);
2886         counter_u64_free(rack_express_sack);
2887         counter_u64_free(rack_sack_total);
2888         counter_u64_free(rack_move_none);
2889         counter_u64_free(rack_move_some);
2890         counter_u64_free(rack_sack_attacks_detected);
2891         counter_u64_free(rack_sack_attacks_reversed);
2892         counter_u64_free(rack_sack_used_next_merge);
2893         counter_u64_free(rack_sack_used_prev_merge);
2894         counter_u64_free(rack_badfr);
2895         counter_u64_free(rack_badfr_bytes);
2896         counter_u64_free(rack_rtm_prr_retran);
2897         counter_u64_free(rack_rtm_prr_newdata);
2898         counter_u64_free(rack_timestamp_mismatch);
2899         counter_u64_free(rack_find_high);
2900         counter_u64_free(rack_reorder_seen);
2901         counter_u64_free(rack_tlp_tot);
2902         counter_u64_free(rack_tlp_newdata);
2903         counter_u64_free(rack_tlp_retran);
2904         counter_u64_free(rack_tlp_retran_bytes);
2905         counter_u64_free(rack_tlp_retran_fail);
2906         counter_u64_free(rack_to_tot);
2907         counter_u64_free(rack_to_arm_rack);
2908         counter_u64_free(rack_to_arm_tlp);
2909         counter_u64_free(rack_calc_zero);
2910         counter_u64_free(rack_calc_nonzero);
2911         counter_u64_free(rack_paced_segments);
2912         counter_u64_free(rack_unpaced_segments);
2913         counter_u64_free(rack_saw_enobuf);
2914         counter_u64_free(rack_saw_enobuf_hw);
2915         counter_u64_free(rack_saw_enetunreach);
2916         counter_u64_free(rack_hot_alloc);
2917         counter_u64_free(rack_to_alloc);
2918         counter_u64_free(rack_to_alloc_hard);
2919         counter_u64_free(rack_to_alloc_emerg);
2920         counter_u64_free(rack_to_alloc_limited);
2921         counter_u64_free(rack_alloc_limited_conns);
2922         counter_u64_free(rack_split_limited);
2923         for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
2924                 counter_u64_free(rack_proc_comp_ack[i]);
2925         }
2926         counter_u64_free(rack_multi_single_eq);
2927         counter_u64_free(rack_proc_non_comp_ack);
2928         counter_u64_free(rack_sack_proc_all);
2929         counter_u64_free(rack_sack_proc_restart);
2930         counter_u64_free(rack_sack_proc_short);
2931         counter_u64_free(rack_enter_tlp_calc);
2932         counter_u64_free(rack_used_tlpmethod);
2933         counter_u64_free(rack_used_tlpmethod2);
2934         counter_u64_free(rack_sack_skipped_acked);
2935         counter_u64_free(rack_sack_splits);
2936         counter_u64_free(rack_progress_drops);
2937         counter_u64_free(rack_input_idle_reduces);
2938         counter_u64_free(rack_collapsed_win);
2939         counter_u64_free(rack_tlp_does_nada);
2940         counter_u64_free(rack_try_scwnd);
2941         counter_u64_free(rack_per_timer_hole);
2942         counter_u64_free(rack_large_ackcmp);
2943         counter_u64_free(rack_small_ackcmp);
2944 #ifdef INVARIANTS
2945         counter_u64_free(rack_adjust_map_bw);
2946 #endif
2947         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
2948         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
2949 }
2950
2951 static struct rack_sendmap *
2952 rack_alloc(struct tcp_rack *rack)
2953 {
2954         struct rack_sendmap *rsm;
2955
2956         /*
2957          * First get the top of the list; in
2958          * theory it is the "hottest" rsm we have,
2959          * possibly just freed by ack processing.
2960          */
2961         if (rack->rc_free_cnt > rack_free_cache) {
2962                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
2963                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2964                 counter_u64_add(rack_hot_alloc, 1);
2965                 rack->rc_free_cnt--;
2966                 return (rsm);
2967         }
2968         /*
2969          * Once we get under our free cache we probably
2970          * no longer have a "hot" one available. Let's
2971          * get one from UMA.
2972          */
2973         rsm = uma_zalloc(rack_zone, M_NOWAIT);
2974         if (rsm) {
2975                 rack->r_ctl.rc_num_maps_alloced++;
2976                 counter_u64_add(rack_to_alloc, 1);
2977                 return (rsm);
2978         }
2979         /*
2980          * Dig in to our aux rsm's (the last two) since
2981          * UMA failed to get us one.
2982          */
2983         if (rack->rc_free_cnt) {
2984                 counter_u64_add(rack_to_alloc_emerg, 1);
2985                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
2986                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2987                 rack->rc_free_cnt--;
2988                 return (rsm);
2989         }
2990         return (NULL);
2991 }
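/*
 * Illustrative sketch (not part of rack.c): the three-tier allocation order
 * rack_alloc() implements -- (1) reuse a "hot" entry from the per-connection
 * free list while it holds more than the cache target, (2) fall back to the
 * general allocator, (3) as an emergency, dip into the cached entries that
 * were being held back.  A plain array and malloc() stand in for the TAILQ
 * and uma zone so the sketch stands alone; all names are hypothetical.
 */
#include <stdlib.h>

#define FREE_CACHE_TARGET 2	/* stand-in for rack_free_cache */

struct sendmap_cache {
	void *free_list[16];
	int free_cnt;
};

static void *
sendmap_alloc(struct sendmap_cache *c)
{
	void *p;

	if (c->free_cnt > FREE_CACHE_TARGET)	/* (1) hot entry */
		return (c->free_list[--c->free_cnt]);
	p = malloc(64);				/* (2) general allocator */
	if (p != NULL)
		return (p);
	if (c->free_cnt > 0)			/* (3) emergency reserve */
		return (c->free_list[--c->free_cnt]);
	return (NULL);
}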
2992
2993 static struct rack_sendmap *
2994 rack_alloc_full_limit(struct tcp_rack *rack)
2995 {
2996         if ((V_tcp_map_entries_limit > 0) &&
2997             (rack->do_detection == 0) &&
2998             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
2999                 counter_u64_add(rack_to_alloc_limited, 1);
3000                 if (!rack->alloc_limit_reported) {
3001                         rack->alloc_limit_reported = 1;
3002                         counter_u64_add(rack_alloc_limited_conns, 1);
3003                 }
3004                 return (NULL);
3005         }
3006         return (rack_alloc(rack));
3007 }
3008
3009 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3010 static struct rack_sendmap *
3011 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
3012 {
3013         struct rack_sendmap *rsm;
3014
3015         if (limit_type) {
3016                 /* currently there is only one limit type */
3017                 if (V_tcp_map_split_limit > 0 &&
3018                     (rack->do_detection == 0) &&
3019                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
3020                         counter_u64_add(rack_split_limited, 1);
3021                         if (!rack->alloc_limit_reported) {
3022                                 rack->alloc_limit_reported = 1;
3023                                 counter_u64_add(rack_alloc_limited_conns, 1);
3024                         }
3025                         return (NULL);
3026                 }
3027         }
3028
3029         /* allocate and mark in the limit type, if set */
3030         rsm = rack_alloc(rack);
3031         if (rsm != NULL && limit_type) {
3032                 rsm->r_limit_type = limit_type;
3033                 rack->r_ctl.rc_num_split_allocs++;
3034         }
3035         return (rsm);
3036 }
3037
3038 static void
3039 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
3040 {
3041         if (rsm->r_flags & RACK_APP_LIMITED) {
3042                 if (rack->r_ctl.rc_app_limited_cnt > 0) {
3043                         rack->r_ctl.rc_app_limited_cnt--;
3044                 }
3045         }
3046         if (rsm->r_limit_type) {
3047                 /* currently there is only one limit type */
3048                 rack->r_ctl.rc_num_split_allocs--;
3049         }
3050         if (rsm == rack->r_ctl.rc_first_appl) {
3051                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3052                         rack->r_ctl.rc_first_appl = NULL;
3053                 else {
3054                         /* Follow the next one out */
3055                         struct rack_sendmap fe;
3056
3057                         fe.r_start = rsm->r_nseq_appl;
3058                         rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
3059                 }
3060         }
3061         if (rsm == rack->r_ctl.rc_resend)
3062                 rack->r_ctl.rc_resend = NULL;
3063         if (rsm == rack->r_ctl.rc_rsm_at_retran)
3064                 rack->r_ctl.rc_rsm_at_retran = NULL;
3065         if (rsm == rack->r_ctl.rc_end_appl)
3066                 rack->r_ctl.rc_end_appl = NULL;
3067         if (rack->r_ctl.rc_tlpsend == rsm)
3068                 rack->r_ctl.rc_tlpsend = NULL;
3069         if (rack->r_ctl.rc_sacklast == rsm)
3070                 rack->r_ctl.rc_sacklast = NULL;
3071         memset(rsm, 0, sizeof(struct rack_sendmap));
3072         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
3073         rack->rc_free_cnt++;
3074 }
3075
3076 static void
3077 rack_free_trim(struct tcp_rack *rack)
3078 {
3079         struct rack_sendmap *rsm;
3080
3081         /*
3082          * Free up all the tail entries until
3083          * we get our list down to the limit.
3084          */
3085         while (rack->rc_free_cnt > rack_free_cache) {
3086                 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
3087                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3088                 rack->rc_free_cnt--;
3089                 uma_zfree(rack_zone, rsm);
3090         }
3091 }
3092
3093
3094 static uint32_t
3095 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
3096 {
3097         uint64_t srtt, bw, len, tim;
3098         uint32_t segsiz, def_len, minl;
3099
3100         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3101         def_len = rack_def_data_window * segsiz;
3102         if (rack->rc_gp_filled == 0) {
3103                 /*
3104                  * We have no measurement (IW is in flight?) so
3105                  * we can only guess using our data_window sysctl
3106                  * value (usually 20MSS).
3107                  */
3108                 return (def_len);
3109         }
3110         /*
3111          * Now we have a number of factors to consider.
3112          *
3113          * 1) We have a desired BDP which is usually
3114          *    at least 2.
3115          * 2) We have a minimum number of rtt's, usually 1 SRTT,
3116          *    but we allow it to be more.
3117          * 3) We want to make sure a measurement lasts N useconds (if
3118          *    we have set rack_min_measure_usec).
3119          *
3120          * We handle the first concern here by trying to create a data
3121          * window of max(rack_def_data_window, DesiredBDP). The
3122          * second concern we handle in not letting the measurement
3123          * window end normally until at least the required SRTT's
3124          * have gone by which is done further below in
3125          * rack_enough_for_measurement(). Finally the third concern
3126          * we also handle here by calculating how long that time
3127          * would take at the current BW and then return the
3128          * max of our first calculation and that length. Note
3129          * that if rack_min_measure_usec is 0, we don't deal
3130          * with concern 3. Also for both Concern 1 and 3 an
3131          * application limited period could end the measurement
3132          * earlier.
3133          *
3134          * So let's calculate the BDP with the "known" b/w, using
3135          * the SRTT as our rtt, and then multiply it by the
3136          * goal.
3137          */
3138         bw = rack_get_bw(rack);
3139         srtt = (uint64_t)tp->t_srtt;
3140         len = bw * srtt;
3141         len /= (uint64_t)HPTS_USEC_IN_SEC;
3142         len *= max(1, rack_goal_bdp);
3143         /* Now we need to round up to the nearest MSS */
3144         len = roundup(len, segsiz);
3145         if (rack_min_measure_usec) {
3146                 /* Now calculate our min length for this b/w */
3147                 tim = rack_min_measure_usec;
3148                 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
3149                 if (minl == 0)
3150                         minl = 1;
3151                 minl = roundup(minl, segsiz);
3152                 if (len < minl)
3153                         len = minl;
3154         }
3155         /*
3156          * Now if we have a very small window we want
3157          * to attempt to get the window that is
3158          * as small as possible. This happens on
3159          * low b/w connections and we don't want to
3160          * span huge numbers of rtt's between measurements.
3161          *
3162          * We basically include 2 over our "MIN window" so
3163          * that the measurement can be shortened (possibly) by
3164          * an ack'ed packet.
3165          */
3166         if (len < def_len)
3167                 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
3168         else
3169                 return (max((uint32_t)len, def_len));
3170
3171 }
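/*
 * Illustrative sketch (not part of rack.c): the arithmetic
 * rack_get_measure_window() performs, as a standalone worked example.  With
 * bw = 12,500,000 bytes/sec (100 Mbps), srtt = 40,000 usec, a goal BDP of 2
 * and a 1448-byte segment, len = bw * srtt / 1e6 * goal = 1,000,000 bytes,
 * then rounded up to a segment multiple.  All constants here are example
 * values, not the sysctl defaults.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t bw = 12500000;		/* bytes per second */
	uint64_t srtt = 40000;		/* microseconds */
	uint64_t goal_bdp = 2;
	uint64_t segsiz = 1448;
	uint64_t len;

	len = bw * srtt;
	len /= 1000000;			/* one BDP in bytes */
	len *= goal_bdp;
	len = ((len + segsiz - 1) / segsiz) * segsiz;	/* round up to MSS */
	printf("measurement window: %llu bytes\n", (unsigned long long)len);
	return (0);
}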
3172
3173 static int
3174 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
3175 {
3176         uint32_t tim, srtts, segsiz;
3177
3178         /*
3179          * Has enough time passed for the GP measurement to be valid?
3180          */
3181         if ((tp->snd_max == tp->snd_una) ||
3182             (th_ack == tp->snd_max)){
3183                 /* All is acked */
3184                 *quality = RACK_QUALITY_ALLACKED;
3185                 return (1);
3186         }
3187         if (SEQ_LT(th_ack, tp->gput_seq)) {
3188                 /* Not enough bytes yet */
3189                 return (0);
3190         }
3191         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3192         if (SEQ_LT(th_ack, tp->gput_ack) &&
3193             ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
3194                 /* Not enough bytes yet */
3195                 return (0);
3196         }
3197         if (rack->r_ctl.rc_first_appl &&
3198             (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
3199                 /*
3200                  * We are up to the app limited send point;
3201                  * we have to measure irrespective of the time.
3202                  */
3203                 *quality = RACK_QUALITY_APPLIMITED;
3204                 return (1);
3205         }
3206         /* Now what about time? */
3207         srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
3208         tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
3209         if (tim >= srtts) {
3210                 *quality = RACK_QUALITY_HIGH;
3211                 return (1);
3212         }
3213         /* Nope not even a full SRTT has passed */
3214         return (0);
3215 }
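/*
 * Illustrative sketch (not part of rack.c): the time-based test at the end
 * of rack_enough_for_measurement().  A measurement is considered high
 * quality once the elapsed time covers rack_min_srtts worth of the current
 * goodput SRTT.  Parameter names below are examples, not kernel fields.
 */
#include <stdint.h>

static int
enough_time_elapsed(uint32_t now_us, uint32_t gput_ts_us,
    uint32_t gp_srtt_us, uint32_t min_srtts)
{
	uint32_t tim = now_us - gput_ts_us;		/* elapsed, usec */
	uint32_t srtts = gp_srtt_us * min_srtts;	/* required, usec */

	return (tim >= srtts);
}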
3216
3217 static void
3218 rack_log_timely(struct tcp_rack *rack,
3219                 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
3220                 uint64_t up_bnd, int line, uint8_t method)
3221 {
3222         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3223                 union tcp_log_stackspecific log;
3224                 struct timeval tv;
3225
3226                 memset(&log, 0, sizeof(log));
3227                 log.u_bbr.flex1 = logged;
3228                 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
3229                 log.u_bbr.flex2 <<= 4;
3230                 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
3231                 log.u_bbr.flex2 <<= 4;
3232                 log.u_bbr.flex2 |= rack->rc_gp_incr;
3233                 log.u_bbr.flex2 <<= 4;
3234                 log.u_bbr.flex2 |= rack->rc_gp_bwred;
3235                 log.u_bbr.flex3 = rack->rc_gp_incr;
3236                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3237                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
3238                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
3239                 log.u_bbr.flex7 = rack->rc_gp_bwred;
3240                 log.u_bbr.flex8 = method;
3241                 log.u_bbr.cur_del_rate = cur_bw;
3242                 log.u_bbr.delRate = low_bnd;
3243                 log.u_bbr.bw_inuse = up_bnd;
3244                 log.u_bbr.rttProp = rack_get_bw(rack);
3245                 log.u_bbr.pkt_epoch = line;
3246                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3247                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3248                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3249                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3250                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3251                 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
3252                 log.u_bbr.cwnd_gain <<= 1;
3253                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
3254                 log.u_bbr.cwnd_gain <<= 1;
3255                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
3256                 log.u_bbr.cwnd_gain <<= 1;
3257                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
3258                 log.u_bbr.lost = rack->r_ctl.rc_loss_count;
3259                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3260                     &rack->rc_inp->inp_socket->so_rcv,
3261                     &rack->rc_inp->inp_socket->so_snd,
3262                     TCP_TIMELY_WORK, 0,
3263                     0, &log, false, &tv);
3264         }
3265 }
3266
3267 static int
3268 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
3269 {
3270         /*
3271          * Before we increase we need to know if
3272          * the estimate just made was less than
3273          * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
3274          *
3275          * If we already are pacing at a fast enough
3276          * rate to push us faster there is no sense of
3277          * increasing.
3278          *
3279          * We first calculate our actual pacing rate (ss or ca multiplier
3280          * times our cur_bw).
3281          *
3282          * Then we take the last measured rate and multiply by our
3283          * maximum pacing overage to give us a max allowable rate.
3284          *
3285          * If our act_rate is smaller than our max_allowable rate
3286          * then we should increase. Else we should hold steady.
3287          *
3288          */
3289         uint64_t act_rate, max_allow_rate;
3290
3291         if (rack_timely_no_stopping)
3292                 return (1);
3293
3294         if ((cur_bw == 0) || (last_bw_est == 0)) {
3295                 /*
3296                  * Initial startup case or
3297                  * everything is acked case.
3298                  */
3299                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3300                                 __LINE__, 9);
3301                 return (1);
3302         }
3303         if (mult <= 100) {
3304                 /*
3305                  * We can always pace at or slightly above our rate.
3306                  */
3307                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3308                                 __LINE__, 9);
3309                 return (1);
3310         }
3311         act_rate = cur_bw * (uint64_t)mult;
3312         act_rate /= 100;
3313         max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
3314         max_allow_rate /= 100;
3315         if (act_rate < max_allow_rate) {
3316                 /*
3317                  * Here the rate we are actually pacing at
3318                  * is smaller than 10% above our last measurement.
3319                  * This means we are pacing below what we would
3320                  * like to try to achieve (plus some wiggle room).
3321                  */
3322                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3323                                 __LINE__, 9);
3324                 return (1);
3325         } else {
3326                 /*
3327                  * Here we are already pacing at least rack_max_per_above (10%)
3328                  * above what we are getting back. This most likely indicates
3329                  * that we are being limited (cwnd/rwnd/app) and can't
3330                  * get any more b/w. There is no sense in trying to
3331                  * raise the pacing rate; it's not speeding us up
3332                  * and we are already pacing faster than we are getting.
3333                  */
3334                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3335                                 __LINE__, 8);
3336                 return (0);
3337         }
3338 }
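/*
 * Illustrative sketch (not part of rack.c): the comparison that
 * rack_bw_can_be_raised() makes, with concrete example numbers.  With
 * cur_bw = 1,000,000 bytes/sec, a 120% pacing multiplier, and
 * last_bw_est = 1,150,000 bytes/sec with a 10% allowed overage, the actual
 * pacing rate (1,200,000) is below the allowed ceiling (1,265,000), so the
 * multiplier may still be raised.  All values are examples.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t cur_bw = 1000000;	/* rate we are pacing from */
	uint64_t mult = 120;		/* current ss/ca multiplier, percent */
	uint64_t last_bw_est = 1150000;	/* last measured goodput */
	uint64_t max_per_above = 10;	/* allowed overage, percent */
	uint64_t act_rate, max_allow_rate;

	act_rate = cur_bw * mult / 100;
	max_allow_rate = last_bw_est * (100 + max_per_above) / 100;
	printf("act %llu vs allowed %llu -> %s\n",
	    (unsigned long long)act_rate, (unsigned long long)max_allow_rate,
	    (act_rate < max_allow_rate) ? "can raise" : "hold");
	return (0);
}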
3339
3340 static void
3341 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
3342 {
3343         /*
3344          * When we drag bottom, we want to ensure
3345          * that no multiplier is below 1.0; if one is,
3346          * we want to restore it to at least that.
3347          */
3348         if (rack->r_ctl.rack_per_of_gp_rec  < 100) {
3349                 /* This is unlikely; we usually do not touch recovery */
3350                 rack->r_ctl.rack_per_of_gp_rec = 100;
3351         }
3352         if (rack->r_ctl.rack_per_of_gp_ca < 100) {
3353                 rack->r_ctl.rack_per_of_gp_ca = 100;
3354         }
3355         if (rack->r_ctl.rack_per_of_gp_ss < 100) {
3356                 rack->r_ctl.rack_per_of_gp_ss = 100;
3357         }
3358 }
3359
3360 static void
3361 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
3362 {
3363         if (rack->r_ctl.rack_per_of_gp_ca > 100) {
3364                 rack->r_ctl.rack_per_of_gp_ca = 100;
3365         }
3366         if (rack->r_ctl.rack_per_of_gp_ss > 100) {
3367                 rack->r_ctl.rack_per_of_gp_ss = 100;
3368         }
3369 }
3370
3371 static void
3372 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
3373 {
3374         int32_t  calc, logged, plus;
3375
3376         logged = 0;
3377
3378         if (override) {
3379                 /*
3380                  * override is passed when we are
3381                  * losing b/w and making one last
3382                  * gasp at trying not to lose out
3383                  * to a new-reno flow.
3384                  */
3385                 goto extra_boost;
3386         }
3387         /* In classic timely we boost by 5x if we have 5 increases in a row, let's not */
3388         if (rack->rc_gp_incr &&
3389             ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
3390                 /*
3391                  * Reset and get 5 strokes more before the boost. Note
3392                  * that the count is 0 based so we have to add one.
3393                  */
3394 extra_boost:
3395                 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
3396                 rack->rc_gp_timely_inc_cnt = 0;
3397         } else
3398                 plus = (uint32_t)rack_gp_increase_per;
3399         /* Must be at least 1% increase for true timely increases */
3400         if ((plus < 1) &&
3401             ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
3402                 plus = 1;
3403         if (rack->rc_gp_saw_rec &&
3404             (rack->rc_gp_no_rec_chg == 0) &&
3405             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3406                                   rack->r_ctl.rack_per_of_gp_rec)) {
3407                 /* We have been in recovery; ding it too */
3408                 calc = rack->r_ctl.rack_per_of_gp_rec + plus;
3409                 if (calc > 0xffff)
3410                         calc = 0xffff;
3411                 logged |= 1;
3412                 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
3413                 if (rack_per_upper_bound_ss &&
3414                     (rack->rc_dragged_bottom == 0) &&
3415                     (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
3416                         rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
3417         }
3418         if (rack->rc_gp_saw_ca &&
3419             (rack->rc_gp_saw_ss == 0) &&
3420             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3421                                   rack->r_ctl.rack_per_of_gp_ca)) {
3422                 /* In CA */
3423                 calc = rack->r_ctl.rack_per_of_gp_ca + plus;
3424                 if (calc > 0xffff)
3425                         calc = 0xffff;
3426                 logged |= 2;
3427                 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
3428                 if (rack_per_upper_bound_ca &&
3429                     (rack->rc_dragged_bottom == 0) &&
3430                     (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
3431                         rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
3432         }
3433         if (rack->rc_gp_saw_ss &&
3434             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3435                                   rack->r_ctl.rack_per_of_gp_ss)) {
3436                 /* In SS */
3437                 calc = rack->r_ctl.rack_per_of_gp_ss + plus;
3438                 if (calc > 0xffff)
3439                         calc = 0xffff;
3440                 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
3441                 if (rack_per_upper_bound_ss &&
3442                     (rack->rc_dragged_bottom == 0) &&
3443                     (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
3444                         rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
3445                 logged |= 4;
3446         }
3447         if (logged &&
3448             (rack->rc_gp_incr == 0)){
3449                 /* Go into increment mode */
3450                 rack->rc_gp_incr = 1;
3451                 rack->rc_gp_timely_inc_cnt = 0;
3452         }
3453         if (rack->rc_gp_incr &&
3454             logged &&
3455             (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
3456                 rack->rc_gp_timely_inc_cnt++;
3457         }
3458         rack_log_timely(rack,  logged, plus, 0, 0,
3459                         __LINE__, 1);
3460 }
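/*
 * Illustrative sketch (not part of rack.c): how the "plus" increment in
 * rack_increase_bw_mul() grows.  Normally the multiplier grows by
 * rack_gp_increase_per each time; once RACK_TIMELY_CNT_BOOST consecutive
 * increases have been seen (or on override) a one-shot boost of
 * increase_per * RACK_TIMELY_CNT_BOOST is applied and the counter resets.
 * The constant and helper below are stand-ins, not the kernel names.
 */
#include <stdint.h>

#define EX_TIMELY_CNT_BOOST 5	/* stand-in for RACK_TIMELY_CNT_BOOST */

static uint32_t
next_plus(uint32_t increase_per, uint32_t *inc_cnt, int override)
{
	if (override || (*inc_cnt + 1) >= EX_TIMELY_CNT_BOOST) {
		*inc_cnt = 0;
		return (increase_per * EX_TIMELY_CNT_BOOST);
	}
	return (increase_per);
}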
3461
3462 static uint32_t
3463 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
3464 {
3465         /*
3466          * norm_grad = rtt_diff / minrtt;
3467          * new_per = curper * (1 - B * norm_grad)
3468          *
3469          * B = rack_gp_decrease_per (default 10%)
3470          * rtt_diff = input var current rtt-diff
3471          * curper = input var current percentage
3472          * minrtt = from rack filter
3473          *
3474          */
3475         uint64_t perf;
3476
3477         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3478                     ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
3479                      (((uint64_t)rtt_diff * (uint64_t)1000000)/
3480                       (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
3481                      (uint64_t)1000000)) /
3482                 (uint64_t)1000000);
3483         if (perf > curper) {
3484                 /* TSNH */
3485                 perf = curper - 1;
3486         }
3487         return ((uint32_t)perf);
3488 }
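/*
 * Worked example of the fixed-point math above (illustrative numbers,
 * not values taken from this file): assume curper = 200 (a 200% pacing
 * multiplier), rtt_diff = 5000 usecs, a filtered min_rtt of 50000 usecs
 * and rack_gp_decrease_per = 10 (10%).
 *
 *   norm_grad = rtt_diff / minrtt = 5000 / 50000 = 0.1
 *   new_per   = curper * (1 - 0.10 * 0.1) = 200 * 0.99 = 198
 *
 * In the scaled integer form used by rack_get_decrease() that is:
 *   (200 * (1000000 - ((10 * 10000 * ((5000 * 1000000) / 50000))
 *       / 1000000))) / 1000000 = (200 * 990000) / 1000000 = 198
 */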
3489
3490 static uint32_t
3491 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
3492 {
3493         /*
3494          *                                        highrttthresh
3495          * result = curper * (1 - (B * (1 -  -----------------  )))
3496          *                                           gp_srtt
3497          *
3498          * B = rack_gp_decrease_per (default 10%)
3499          * highrttthresh = filter_min * rack_gp_rtt_maxmul
3500          */
3501         uint64_t perf;
3502         uint32_t highrttthresh;
3503
3504         highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3505
3506         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3507                                      ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
3508                                         ((uint64_t)highrttthresh * (uint64_t)1000000) /
3509                                                     (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
3510         return (perf);
3511 }
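/*
 * Worked example of the computation above (illustrative numbers, not
 * values taken from this file): assume a filtered min_rtt of 20000 usecs
 * and rack_gp_rtt_maxmul = 3, so highrttthresh = 60000 usecs; the current
 * gp_srtt based rtt = 120000 usecs, rack_gp_decrease_per = 10 and
 * curper = 200.
 *
 *   result = 200 * (1 - 0.10 * (1 - 60000 / 120000)) = 200 * 0.95 = 190
 *
 * i.e. the further the rtt is above the high-RTT threshold, the closer
 * the reduction gets to the full rack_gp_decrease_per percentage.
 */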
3512
3513 static void
3514 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
3515 {
3516         uint64_t logvar, logvar2, logvar3;
3517         uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
3518
3519         if (rack->rc_gp_incr) {
3520                 /* Turn off increment counting */
3521                 rack->rc_gp_incr = 0;
3522                 rack->rc_gp_timely_inc_cnt = 0;
3523         }
3524         ss_red = ca_red = rec_red = 0;
3525         logged = 0;
3526         /* Calculate the reduction value */
3527         if (rtt_diff < 0) {
3528                 rtt_diff *= -1;
3529         }
3530         /* Must be at least 1% reduction */
3531         if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
3532                 /* We have been in recovery, so reduce it as well */
3533                 if (timely_says == 2) {
3534                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
3535                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3536                         if (alt < new_per)
3537                                 val = alt;
3538                         else
3539                                 val = new_per;
3540                 } else
3541                          val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3542                 if (rack->r_ctl.rack_per_of_gp_rec > val) {
3543                         rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
3544                         rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
3545                 } else {
3546                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3547                         rec_red = 0;
3548                 }
3549                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
3550                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3551                 logged |= 1;
3552         }
3553         if (rack->rc_gp_saw_ss) {
3554                 /* Sent in SS */
3555                 if (timely_says == 2) {
3556                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
3557                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3558                         if (alt < new_per)
3559                                 val = alt;
3560                         else
3561                                 val = new_per;
3562                 } else
3563                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3564                 if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
3565                         ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
3566                         rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
3567                 } else {
3568                         ss_red = new_per;
3569                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3570                         logvar = new_per;
3571                         logvar <<= 32;
3572                         logvar |= alt;
3573                         logvar2 = (uint32_t)rtt;
3574                         logvar2 <<= 32;
3575                         logvar2 |= (uint32_t)rtt_diff;
3576                         logvar3 = rack_gp_rtt_maxmul;
3577                         logvar3 <<= 32;
3578                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3579                         rack_log_timely(rack, timely_says,
3580                                         logvar2, logvar3,
3581                                         logvar, __LINE__, 10);
3582                 }
3583                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
3584                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3585                 logged |= 4;
3586         } else if (rack->rc_gp_saw_ca) {
3587                 /* Sent in CA */
3588                 if (timely_says == 2) {
3589                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
3590                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3591                         if (alt < new_per)
3592                                 val = alt;
3593                         else
3594                                 val = new_per;
3595                 } else
3596                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3597                 if (rack->r_ctl.rack_per_of_gp_ca > val) {
3598                         ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
3599                         rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
3600                 } else {
3601                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3602                         ca_red = 0;
3603                         logvar = new_per;
3604                         logvar <<= 32;
3605                         logvar |= alt;
3606                         logvar2 = (uint32_t)rtt;
3607                         logvar2 <<= 32;
3608                         logvar2 |= (uint32_t)rtt_diff;
3609                         logvar3 = rack_gp_rtt_maxmul;
3610                         logvar3 <<= 32;
3611                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3612                         rack_log_timely(rack, timely_says,
3613                                         logvar2, logvar3,
3614                                         logvar, __LINE__, 10);
3615                 }
3616                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
3617                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3618                 logged |= 2;
3619         }
3620         if (rack->rc_gp_timely_dec_cnt < 0x7) {
3621                 rack->rc_gp_timely_dec_cnt++;
3622                 if (rack_timely_dec_clear &&
3623                     (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
3624                         rack->rc_gp_timely_dec_cnt = 0;
3625         }
3626         logvar = ss_red;
3627         logvar <<= 32;
3628         logvar |= ca_red;
3629         rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
3630                         __LINE__, 2);
3631 }
3632
3633 static void
3634 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
3635                      uint32_t rtt, uint32_t line, uint8_t reas)
3636 {
3637         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3638                 union tcp_log_stackspecific log;
3639                 struct timeval tv;
3640
3641                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3642                 log.u_bbr.flex1 = line;
3643                 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
3644                 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
3645                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3646                 log.u_bbr.flex5 = rtt;
3647                 log.u_bbr.flex6 = rack->rc_highly_buffered;
3648                 log.u_bbr.flex6 <<= 1;
3649                 log.u_bbr.flex6 |= rack->forced_ack;
3650                 log.u_bbr.flex6 <<= 1;
3651                 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
3652                 log.u_bbr.flex6 <<= 1;
3653                 log.u_bbr.flex6 |= rack->in_probe_rtt;
3654                 log.u_bbr.flex6 <<= 1;
3655                 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
3656                 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
3657                 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
3658                 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
3659                 log.u_bbr.flex8 = reas;
3660                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3661                 log.u_bbr.delRate = rack_get_bw(rack);
3662                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
3663                 log.u_bbr.cur_del_rate <<= 32;
3664                 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
3665                 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
3666                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3667                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3668                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3669                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3670                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
3671                 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
3672                 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3673                 log.u_bbr.rttProp = us_cts;
3674                 log.u_bbr.rttProp <<= 32;
3675                 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
3676                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3677                     &rack->rc_inp->inp_socket->so_rcv,
3678                     &rack->rc_inp->inp_socket->so_snd,
3679                     BBR_LOG_RTT_SHRINKS, 0,
3680                     0, &log, false, &rack->r_ctl.act_rcv_time);
3681         }
3682 }
3683
3684 static void
3685 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
3686 {
3687         uint64_t bwdp;
3688
3689         bwdp = rack_get_bw(rack);
3690         bwdp *= (uint64_t)rtt;
3691         bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
3692         rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
3693         if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
3694                 /*
3695                  * A window protocol must be able to have 4 packets
3696                  * outstanding as the floor in order to function
3697                  * (especially considering delayed ack :D).
3698                  */
3699                 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
3700         }
3701 }
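/*
 * Worked example of the target flight calculation above (illustrative
 * numbers, not values taken from this file): with an estimated b/w of
 * 12500000 bytes/sec (100 Mbps), rtt = 40000 usecs and segsiz = 1448:
 *
 *   bwdp = (12500000 * 40000) / 1000000 = 500000 bytes
 *   rc_target_probertt_flight = roundup(500000, 1448) = 501008 bytes
 *
 * The (segsiz * rack_timely_min_segs) floor only kicks in when the
 * computed BDP falls below a handful of segments.
 */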
3702
3703 static void
3704 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
3705 {
3706         /**
3707          * ProbeRTT is a bit different in rack_pacing than in
3708          * BBR. It is like BBR in that it uses the lowering of
3709          * the RTT as a signal that we saw something new, and
3710          * counts from there to decide how long to wait between
3711          * probes. But it is different in that it is quite simple.
3712          * It does not play with the cwnd, wait until we get
3713          * down to N segments outstanding, and hold that for
3714          * 200ms. Instead it just sets the pacing reduction
3715          * rate to a set percentage (70 by default) and holds
3716          * that for a number of recent GP Srtt's.
3717          */
3718         uint32_t segsiz;
3719
3720         if (rack->rc_gp_dyn_mul == 0)
3721                 return;
3722
3723         if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
3724                 /* We are idle */
3725                 return;
3726         }
3727         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3728             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3729                 /*
3730                  * Stop the goodput measurement now; the idea here is
3731                  * that future measurements taken with in_probe_rtt
3732                  * set won't register unless they are greater, so
3733                  * we want to capture what info (if any) is available
3734                  * now.
3735                  */
3736                 rack_do_goodput_measurement(rack->rc_tp, rack,
3737                                             rack->rc_tp->snd_una, __LINE__,
3738                                             RACK_QUALITY_PROBERTT);
3739         }
3740         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3741         rack->r_ctl.rc_time_probertt_entered = us_cts;
3742         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3743                      rack->r_ctl.rc_pace_min_segs);
3744         rack->in_probe_rtt = 1;
3745         rack->measure_saw_probe_rtt = 1;
3746         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3747         rack->r_ctl.rc_time_probertt_starts = 0;
3748         rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
3749         if (rack_probertt_use_min_rtt_entry)
3750                 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3751         else
3752                 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
3753         rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3754                              __LINE__, RACK_RTTS_ENTERPROBE);
3755 }
3756
3757 static void
3758 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
3759 {
3760         struct rack_sendmap *rsm;
3761         uint32_t segsiz;
3762
3763         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3764                      rack->r_ctl.rc_pace_min_segs);
3765         rack->in_probe_rtt = 0;
3766         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3767             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3768                 /*
3769                  * Stop the goodput measurement now; the idea here is
3770                  * that future measurements taken with in_probe_rtt
3771                  * set won't register unless they are greater, so
3772                  * we want to capture what info (if any) is available
3773                  * now.
3774                  */
3775                 rack_do_goodput_measurement(rack->rc_tp, rack,
3776                                             rack->rc_tp->snd_una, __LINE__,
3777                                             RACK_QUALITY_PROBERTT);
3778         } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
3779                 /*
3780                  * We don't have enough data to make a measurement.
3781                  * So lets just stop and start here after exiting
3782                  * probe-rtt. We probably are not interested in
3783                  * the results anyway.
3784                  */
3785                 rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
3786         }
3787         /*
3788          * Measurements through the current snd_max are going
3789          * to be limited by the slower pacing rate.
3790          *
3791          * We need to mark these as app-limited so we
3792          * don't collapse the b/w.
3793          */
3794         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3795         if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
3796                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3797                         rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
3798                 else {
3799                         /*
3800                          * Go out to the end of the app-limited chain, mark
3801                          * this new one as the next entry, and move end_appl up
3802                          * to this guy.
3803                          */
3804                         if (rack->r_ctl.rc_end_appl)
3805                                 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
3806                         rack->r_ctl.rc_end_appl = rsm;
3807                 }
3808                 rsm->r_flags |= RACK_APP_LIMITED;
3809                 rack->r_ctl.rc_app_limited_cnt++;
3810         }
3811         /*
3812          * Now, we need to examine our pacing rate multipliers.
3813          * If it is under 100%, we need to kick it back up to
3814          * 100%. We also don't let it be over our "max" above
3815          * the actual rate i.e. 100% + rack_clamp_atexit_prtt.
3816          * Note setting clamp_atexit_prtt to 0 has the effect
3817          * of setting CA/SS to 100% always at exit (which is
3818          * the default behavior).
3819          */
3820         if (rack_probertt_clear_is) {
3821                 rack->rc_gp_incr = 0;
3822                 rack->rc_gp_bwred = 0;
3823                 rack->rc_gp_timely_inc_cnt = 0;
3824                 rack->rc_gp_timely_dec_cnt = 0;
3825         }
3826         /* Do we do any clamping at exit? */
3827         if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
3828                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
3829                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
3830         }
3831         if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
3832                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
3833                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
3834         }
3835         /*
3836          * Lets set rtt_diff to 0, so that we will get a "boost"
3837          * after exiting.
3838          */
3839         rack->r_ctl.rc_rtt_diff = 0;
3840
3841         /* Clear all flags so we start fresh */
3842         rack->rc_tp->t_bytes_acked = 0;
3843         rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND;
3844         /*
3845          * If configured to, set the cwnd and ssthresh to
3846          * our targets.
3847          */
3848         if (rack_probe_rtt_sets_cwnd) {
3849                 uint64_t ebdp;
3850                 uint32_t setto;
3851
3852                 /* Set ssthresh so we get into CA once we hit our target */
3853                 if (rack_probertt_use_min_rtt_exit == 1) {
3854                         /* Set to min rtt */
3855                         rack_set_prtt_target(rack, segsiz,
3856                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3857                 } else if (rack_probertt_use_min_rtt_exit == 2) {
3858                         /* Set to current gp rtt */
3859                         rack_set_prtt_target(rack, segsiz,
3860                                              rack->r_ctl.rc_gp_srtt);
3861                 } else if (rack_probertt_use_min_rtt_exit == 3) {
3862                         /* Set to entry gp rtt */
3863                         rack_set_prtt_target(rack, segsiz,
3864                                              rack->r_ctl.rc_entry_gp_rtt);
3865                 } else {
3866                         uint64_t sum;
3867                         uint32_t setval;
3868
3869                         sum = rack->r_ctl.rc_entry_gp_rtt;
3870                         sum *= 10;
3871                         sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
3872                         if (sum >= 20) {
3873                                 /*
3874                                  * A highly buffered path needs
3875                                  * cwnd space for timely to work.
3876                                  * Lets set things up as if
3877                                  * we are heading back here again.
3878                                  */
3879                                 setval = rack->r_ctl.rc_entry_gp_rtt;
3880                         } else if (sum >= 15) {
3881                                 /*
3882                                  * Lets take the smaller of the
3883                                  * two since we are just somewhat
3884                                  * buffered.
3885                                  */
3886                                 setval = rack->r_ctl.rc_gp_srtt;
3887                                 if (setval > rack->r_ctl.rc_entry_gp_rtt)
3888                                         setval = rack->r_ctl.rc_entry_gp_rtt;
3889                         } else {
3890                                 /*
3891                                  * Here we are not highly buffered
3892                                  * and should pick the min we can to
3893                                  * keep from causing loss.
3894                                  */
3895                                 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3896                         }
3897                         rack_set_prtt_target(rack, segsiz,
3898                                              setval);
3899                 }
3900                 if (rack_probe_rtt_sets_cwnd > 1) {
3901                         /* There is a percentage here to boost */
3902                         ebdp = rack->r_ctl.rc_target_probertt_flight;
3903                         ebdp *= rack_probe_rtt_sets_cwnd;
3904                         ebdp /= 100;
3905                         setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
3906                 } else
3907                         setto = rack->r_ctl.rc_target_probertt_flight;
3908                 rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
3909                 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
3910                         /* Enforce a min */
3911                         rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
3912                 }
3913                 /* If we set in the cwnd also set the ssthresh point so we are in CA */
3914                 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
3915         }
3916         rack_log_rtt_shrinks(rack,  us_cts,
3917                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3918                              __LINE__, RACK_RTTS_EXITPROBE);
3919         /* Clear times last so log has all the info */
3920         rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
3921         rack->r_ctl.rc_time_probertt_entered = us_cts;
3922         rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3923         rack->r_ctl.rc_time_of_last_probertt = us_cts;
3924 }
3925
3926 static void
3927 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
3928 {
3929         /* Check in on probe-rtt */
3930         if (rack->rc_gp_filled == 0) {
3931                 /* We do not do p-rtt unless we have gp measurements */
3932                 return;
3933         }
3934         if (rack->in_probe_rtt) {
3935                 uint64_t no_overflow;
3936                 uint32_t endtime, must_stay;
3937
3938                 if (rack->r_ctl.rc_went_idle_time &&
3939                     ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
3940                         /*
3941                          * We went idle during prtt, just exit now.
3942                          */
3943                         rack_exit_probertt(rack, us_cts);
3944                 } else if (rack_probe_rtt_safety_val &&
3945                     TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
3946                     ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
3947                         /*
3948                          * Probe RTT safety value triggered!
3949                          */
3950                         rack_log_rtt_shrinks(rack,  us_cts,
3951                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3952                                              __LINE__, RACK_RTTS_SAFETY);
3953                         rack_exit_probertt(rack, us_cts);
3954                 }
3955                 /* Calculate the max we will wait */
3956                 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
3957                 if (rack->rc_highly_buffered)
3958                         endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
3959                 /* Calculate the min we must wait */
3960                 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
3961                 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
3962                     TSTMP_LT(us_cts, endtime)) {
3963                         uint32_t calc;
3964                         /* Do we lower more? */
3965 no_exit:
3966                         if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
3967                                 calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
3968                         else
3969                                 calc = 0;
3970                         calc /= max(rack->r_ctl.rc_gp_srtt, 1);
3971                         if (calc) {
3972                                 /* Maybe */
3973                                 calc *= rack_per_of_gp_probertt_reduce;
3974                                 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
3975                                 /* Limit it too */
3976                                 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
3977                                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
3978                         }
3979                         /* We must reach target or the time set */
3980                         return;
3981                 }
3982                 if (rack->r_ctl.rc_time_probertt_starts == 0) {
3983                         if ((TSTMP_LT(us_cts, must_stay) &&
3984                              rack->rc_highly_buffered) ||
3985                              (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
3986                               rack->r_ctl.rc_target_probertt_flight)) {
3987                                 /* We are not past the must_stay time */
3988                                 goto no_exit;
3989                         }
3990                         rack_log_rtt_shrinks(rack,  us_cts,
3991                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3992                                              __LINE__, RACK_RTTS_REACHTARGET);
3993                         rack->r_ctl.rc_time_probertt_starts = us_cts;
3994                         if (rack->r_ctl.rc_time_probertt_starts == 0)
3995                                 rack->r_ctl.rc_time_probertt_starts = 1;
3996                         /* Restore back to our rate we want to pace at in prtt */
3997                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3998                 }
3999                 /*
4000                  * Setup our end time, some number of gp_srtts plus 200ms.
4001                  */
4002                 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
4003                                (uint64_t)rack_probertt_gpsrtt_cnt_mul);
4004                 if (rack_probertt_gpsrtt_cnt_div)
4005                         endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
4006                 else
4007                         endtime = 0;
4008                 endtime += rack_min_probertt_hold;
4009                 endtime += rack->r_ctl.rc_time_probertt_starts;
4010                 if (TSTMP_GEQ(us_cts,  endtime)) {
4011                         /* yes, exit probertt */
4012                         rack_exit_probertt(rack, us_cts);
4013                 }
4014
4015         } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
4016                 /* Go into probertt, its been too long since we went lower */
4017                 rack_enter_probertt(rack, us_cts);
4018         }
4019 }
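/*
 * Worked example of the probe-rtt end time computed above (illustrative
 * numbers, not values taken from this file): once the target flight size
 * is reached and rc_time_probertt_starts is stamped,
 *
 *   endtime = (rc_gp_srtt * rack_probertt_gpsrtt_cnt_mul) /
 *             rack_probertt_gpsrtt_cnt_div
 *             + rack_min_probertt_hold + rc_time_probertt_starts
 *
 * so assuming rc_gp_srtt = 50000 usecs, a cnt_mul/cnt_div ratio of 3/1
 * and a 200000 usec minimum hold, probe-rtt exits 350000 usecs after
 * the start stamp.
 */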
4020
4021 static void
4022 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
4023                        uint32_t rtt, int32_t rtt_diff)
4024 {
4025         uint64_t cur_bw, up_bnd, low_bnd, subfr;
4026         uint32_t losses;
4027
4028         if ((rack->rc_gp_dyn_mul == 0) ||
4029             (rack->use_fixed_rate) ||
4030             (rack->in_probe_rtt) ||
4031             (rack->rc_always_pace == 0)) {
4032                 /* No dynamic GP multiplier in play */
4033                 return;
4034         }
4035         losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
4036         cur_bw = rack_get_bw(rack);
4037         /* Calculate our up and down range */
4038         up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
4039         up_bnd /= 100;
4040         up_bnd += rack->r_ctl.last_gp_comp_bw;
4041
4042         subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
4043         subfr /= 100;
4044         low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
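        /*
         * Worked example of the bands above (illustrative numbers, not
         * values taken from this file): assuming last_gp_comp_bw =
         * 1000000 bytes/sec, rack_gp_per_bw_mul_up = 2 and
         * rack_gp_per_bw_mul_down = 4, then up_bnd = 1020000 and
         * low_bnd = 960000; estimates inside that range fall through to
         * the pure timely decision further below.
         */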
4045         if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
4046                 /*
4047                  * This is the case where our RTT is above
4048                  * the max target and we have been configured
4049                  * to just do timely (no bonus increases) in that case.
4050                  *
4051                  * There are two configurations: set to 1, and we
4052                  * just do timely if we are over our max. If it is
4053                  * set above 1 then we slam the multipliers down
4054                  * to 100 and then decrement per timely.
4055                  */
4056                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4057                                 __LINE__, 3);
4058                 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
4059                         rack_validate_multipliers_at_or_below_100(rack);
4060                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4061         } else if ((last_bw_est < low_bnd) && !losses) {
4062                 /*
4063                  * We are decreasing; this is a bit complicated. It
4064                  * means we are losing ground. This could be
4065                  * because another flow entered and we are competing
4066                  * for b/w with it. This will push the RTT up which
4067                  * makes timely unusable unless we want to get shoved
4068                  * into a corner and just be backed off (the age
4069                  * old problem with delay based CC).
4070                  *
4071                  * On the other hand if it was a route change we
4072                  * would like to stay somewhat contained and not
4073                  * blow out the buffers.
4074                  */
4075                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4076                                 __LINE__, 3);
4077                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4078                 if (rack->rc_gp_bwred == 0) {
4079                         /* Go into reduction counting */
4080                         rack->rc_gp_bwred = 1;
4081                         rack->rc_gp_timely_dec_cnt = 0;
4082                 }
4083                 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
4084                     (timely_says == 0)) {
4085                         /*
4086                          * Push another time with a faster pacing
4087                          * to try to gain back (we include override to
4088                          * get a full raise factor).
4089                          */
4090                         if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
4091                             (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
4092                             (timely_says == 0) ||
4093                             (rack_down_raise_thresh == 0)) {
4094                                 /*
4095                                  * Do an override up in b/w if we were
4096                                  * below the threshold or if the threshold
4097                                  * is zero we always do the raise.
4098                                  */
4099                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
4100                         } else {
4101                                 /* Log it stays the same */
4102                                 rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
4103                                                 __LINE__, 11);
4104                         }
4105                         rack->rc_gp_timely_dec_cnt++;
4106                         /* We are not really incrementing, so don't count it */
4107                         rack->rc_gp_incr = 0;
4108                         rack->rc_gp_timely_inc_cnt = 0;
4109                 } else {
4110                         /*
4111                          * Lets just use the RTT
4112                          * information and give up
4113                          * pushing.
4114                          */
4115                         goto use_timely;
4116                 }
4117         } else if ((timely_says != 2) &&
4118                     !losses &&
4119                     (last_bw_est > up_bnd)) {
4120                 /*
4121                  * We are increasing b/w; lets keep going, updating
4122                  * our b/w and ignoring any timely input, unless
4123                  * of course we are at our max raise (if there is one).
4124                  */
4125
4126                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4127                                 __LINE__, 3);
4128                 rack->r_ctl.last_gp_comp_bw = cur_bw;
4129                 if (rack->rc_gp_saw_ss &&
4130                     rack_per_upper_bound_ss &&
4131                      (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
4132                             /*
4133                              * In cases where we can't go higher
4134                              * we should just use timely.
4135                              */
4136                             goto use_timely;
4137                 }
4138                 if (rack->rc_gp_saw_ca &&
4139                     rack_per_upper_bound_ca &&
4140                     (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
4141                             /*
4142                              * In cases where we can't go higher
4143                              * we should just use timely.
4144                              */
4145                             goto use_timely;
4146                 }
4147                 rack->rc_gp_bwred = 0;
4148                 rack->rc_gp_timely_dec_cnt = 0;
4149                 /* You get a set number of pushes if timely is trying to reduce */
4150                 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
4151                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4152                 } else {
4153                         /* Log it stays the same */
4154                         rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
4155                             __LINE__, 12);
4156                 }
4157                 return;
4158         } else {
4159                 /*
4160                  * We are staying between the lower and upper range bounds
4161                  * so use timely to decide.
4162                  */
4163                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
4164                                 __LINE__, 3);
4165 use_timely:
4166                 if (timely_says) {
4167                         rack->rc_gp_incr = 0;
4168                         rack->rc_gp_timely_inc_cnt = 0;
4169                         if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
4170                             !losses &&
4171                             (last_bw_est < low_bnd)) {
4172                                 /* We are losing ground */
4173                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4174                                 rack->rc_gp_timely_dec_cnt++;
4175                                 /* We are not really incrementing, so don't count it */
4176                                 rack->rc_gp_incr = 0;
4177                                 rack->rc_gp_timely_inc_cnt = 0;
4178                         } else
4179                                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4180                 } else {
4181                         rack->rc_gp_bwred = 0;
4182                         rack->rc_gp_timely_dec_cnt = 0;
4183                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4184                 }
4185         }
4186 }
4187
4188 static int32_t
4189 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4190 {
4191         int32_t timely_says;
4192         uint64_t log_mult, log_rtt_a_diff;
4193
4194         log_rtt_a_diff = rtt;
4195         log_rtt_a_diff <<= 32;
4196         log_rtt_a_diff |= (uint32_t)rtt_diff;
4197         if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4198                     rack_gp_rtt_maxmul)) {
4199                 /* Reduce the b/w multiplier */
4200                 timely_says = 2;
4201                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4202                 log_mult <<= 32;
4203                 log_mult |= prev_rtt;
4204                 rack_log_timely(rack,  timely_says, log_mult,
4205                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4206                                 log_rtt_a_diff, __LINE__, 4);
4207         } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4208                            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4209                             max(rack_gp_rtt_mindiv , 1)))) {
4210                 /* Increase the b/w multiplier */
4211                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4212                         ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4213                          max(rack_gp_rtt_mindiv , 1));
4214                 log_mult <<= 32;
4215                 log_mult |= prev_rtt;
4216                 timely_says = 0;
4217                 rack_log_timely(rack,  timely_says, log_mult ,
4218                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4219                                 log_rtt_a_diff, __LINE__, 5);
4220         } else {
4221                 /*
4222                  * Use a gradient to find it; the timely gradient
4223                  * is:
4224                  * grad = rc_rtt_diff / min_rtt;
4225                  *
4226                  * anything below or equal to 0 will be
4227                  * an increase indication. Anything above
4228                  * zero is a decrease. Note we take care
4229                  * of the actual gradient calculation
4230                  * in the reduction (it is not needed for
4231                  * an increase).
4232                  */
4233                 log_mult = prev_rtt;
4234                 if (rtt_diff <= 0) {
4235                         /*
4236                          * Rtt_diff is less than or equal to zero; increase
4237                          * the b/w multiplier (it is 0 or negative)
4238                          */
4239                         timely_says = 0;
4240                         rack_log_timely(rack,  timely_says, log_mult,
4241                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4242                 } else {
4243                         /* Reduce the b/w multiplier */
4244                         timely_says = 1;
4245                         rack_log_timely(rack,  timely_says, log_mult,
4246                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4247                 }
4248         }
4249         return (timely_says);
4250 }
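/*
 * Worked example of the judgement above (illustrative numbers, not values
 * taken from this file): assume a filtered min_rtt of 10000 usecs,
 * rack_gp_rtt_maxmul = 3, rack_gp_rtt_minmul = 1 and rack_gp_rtt_mindiv = 4.
 *
 *   rtt >= 30000 usecs  -> timely_says = 2 (high RTT, hard decrease)
 *   rtt <= 12500 usecs  -> timely_says = 0 (increase)
 *   otherwise           -> gradient on rc_rtt_diff: <= 0 returns 0
 *                          (increase), > 0 returns 1 (decrease)
 */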
4251
4252 static void
4253 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4254                             tcp_seq th_ack, int line, uint8_t quality)
4255 {
4256         uint64_t tim, bytes_ps, ltim, stim, utim;
4257         uint32_t segsiz, bytes, reqbytes, us_cts;
4258         int32_t gput, new_rtt_diff, timely_says;
4259         uint64_t  resid_bw, subpart = 0, addpart = 0, srtt;
4260         int did_add = 0;
4261
4262         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4263         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4264         if (TSTMP_GEQ(us_cts, tp->gput_ts))
4265                 tim = us_cts - tp->gput_ts;
4266         else
4267                 tim = 0;
4268         if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4269                 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4270         else
4271                 stim = 0;
4272         /*
4273          * Use the larger of the send time or ack time. This prevents us
4274          * from being influenced by ack artifacts to come up with too
4275          * high a measurement. Note that since we are spanning over many more
4276          * bytes in most of our measurements hopefully that is less likely to
4277          * occur.
4278          */
4279         if (tim > stim)
4280                 utim = max(tim, 1);
4281         else
4282                 utim = max(stim, 1);
4283         /* Lets get a msec time ltim too for the old stuff */
4284         ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
4285         gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
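        /*
         * Worked example of the legacy gput value above (illustrative
         * numbers, not values taken from this file): acking 1448000 bytes
         * over utim = 100000 usecs gives ltim = 100 msecs and
         * gput = (1448000 << 3) / 100 = 115840 bits/msec, roughly 115 Mbps.
         */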
4286         reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4287         if ((tim == 0) && (stim == 0)) {
4288                 /*
4289                  * Invalid measurement time, maybe
4290                  * all on one ack/one send?
4291                  */
4292                 bytes = 0;
4293                 bytes_ps = 0;
4294                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4295                                            0, 0, 0, 10, __LINE__, NULL, quality);
4296                 goto skip_measurement;
4297         }
4298         if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4299                 /* We never made a us_rtt measurement? */
4300                 bytes = 0;
4301                 bytes_ps = 0;
4302                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4303                                            0, 0, 0, 10, __LINE__, NULL, quality);
4304                 goto skip_measurement;
4305         }
4306         /*
4307          * Calculate the maximum possible b/w this connection
4308          * could have. We base our calculation on the lowest
4309          * rtt we have seen during the measurement and the
4310          * largest rwnd the client has given us in that time. This
4311          * forms a BDP that is the maximum that we could ever
4312          * get to the client. Anything larger is not valid.
4313          *
4314          * I originally had code here that rejected measurements
4315          * where the time was less than 1/2 the latest us_rtt.
4316          * But after thinking on that I realized it is wrong since,
4317          * say, you had a 150Mbps or even 1Gbps link, and you
4318          * were a long way away. For example, I am in Europe (100ms rtt)
4319          * talking to my 1Gbps link in S.C. Now measuring say 150,000
4320          * bytes, my time would be 1.2ms, and yet my rtt would say
4321          * the measurement was invalid because the time was < 50ms. The
4322          * same thing is true for 150Mb (8ms of time).
4323          *
4324          * A better way I realized is to look at what the maximum
4325          * the connection could possibly do. This is gated on
4326          * the lowest RTT we have seen and the highest rwnd.
4327          * We should in theory never exceed that; if we do,
4328          * then something on the path is storing up packets
4329          * and then feeding them all at once to our endpoint
4330          * messing up our measurement.
4331          */
4332         rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4333         rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4334         rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
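        /*
         * Worked example of the cap above (illustrative numbers, not values
         * taken from this file): with a highest seen rwnd of 1000000 bytes
         * and a lowest measured rtt of 10000 usecs,
         * last_max_bw = (1000000 * 1000000) / 10000 = 100000000 bytes/sec;
         * any measurement above that is treated as an artifact below.
         */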
4335         if (SEQ_LT(th_ack, tp->gput_seq)) {
4336                 /* No measurement can be made */
4337                 bytes = 0;
4338                 bytes_ps = 0;
4339                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4340                                            0, 0, 0, 10, __LINE__, NULL, quality);
4341                 goto skip_measurement;
4342         } else
4343                 bytes = (th_ack - tp->gput_seq);
4344         bytes_ps = (uint64_t)bytes;
4345         /*
4346          * Don't measure a b/w for pacing unless we have gotten at least
4347          * an initial window's worth of data in this measurement interval.
4348          *
4349          * Small numbers of bytes get badly influenced by delayed ack and
4350          * other artifacts. Note we take the initial window or our
4351          * defined minimum GP (defaulting to 10 which hopefully is the
4352          * IW).
4353          */
4354         if (rack->rc_gp_filled == 0) {
4355                 /*
4356                  * The initial estimate is special. We
4357                  * have blasted out an IW worth of packets
4358                  * without a real valid ack ts result. We
4359                  * then set the app_limited_needs_set flag;
4360                  * this should get the first ack in (probably 2
4361                  * MSS worth) to be recorded as the timestamp.
4362                  * We thus allow a smaller number of bytes i.e.
4363                  * IW - 2MSS.
4364                  */
4365                 reqbytes -= (2 * segsiz);
4366                 /* Also lets fill previous for our first measurement to be neutral */
4367                 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4368         }
4369         if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4370                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4371                                            rack->r_ctl.rc_app_limited_cnt,
4372                                            0, 0, 10, __LINE__, NULL, quality);
4373                 goto skip_measurement;
4374         }
4375         /*
4376          * We now need to calculate the Timely like status so
4377          * we can update (possibly) the b/w multipliers.
4378          */
4379         new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4380         if (rack->rc_gp_filled == 0) {
4381                 /* No previous reading */
4382                 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4383         } else {
4384                 if (rack->measure_saw_probe_rtt == 0) {
4385                         /*
4386                          * We don't want a probertt to be counted
4387                          * since it would incorrectly be negative. We
4388                          * expect to be reducing the RTT when we
4389                          * pace at a slower rate.
4390                          */
4391                         rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4392                         rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4393                 }
4394         }
4395         timely_says = rack_make_timely_judgement(rack,
4396                 rack->r_ctl.rc_gp_srtt,
4397                 rack->r_ctl.rc_rtt_diff,
4398                 rack->r_ctl.rc_prev_gp_srtt
4399                 );
4400         bytes_ps *= HPTS_USEC_IN_SEC;
4401         bytes_ps /= utim;
4402         if (bytes_ps > rack->r_ctl.last_max_bw) {
4403                 /*
4404                  * Something on the path is playing games,
4405                  * since this b/w is not possible based
4406                  * on our BDP (highest rwnd and lowest rtt
4407                  * we saw in the measurement window).
4408                  *
4409                  * Another option here would be to
4410                  * instead skip the measurement.
4411                  */
4412                 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4413                                            bytes_ps, rack->r_ctl.last_max_bw, 0,
4414                                            11, __LINE__, NULL, quality);
4415                 bytes_ps = rack->r_ctl.last_max_bw;
4416         }
4417         /* We store gp for b/w in bytes per second */
4418         if (rack->rc_gp_filled == 0) {
4419                 /* Initial measurement */
4420                 if (bytes_ps) {
4421                         rack->r_ctl.gp_bw = bytes_ps;
4422                         rack->rc_gp_filled = 1;
4423                         rack->r_ctl.num_measurements = 1;
4424                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4425                 } else {
4426                         rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4427                                                    rack->r_ctl.rc_app_limited_cnt,
4428                                                    0, 0, 10, __LINE__, NULL, quality);
4429                 }
4430                 if (rack->rc_inp->inp_in_hpts &&
4431                     (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4432                         /*
4433                          * Ok we can't trust the pacer in this case
4434                          * where we transition from un-paced to paced.
4435                          * Or for that matter when the burst mitigation
4436                          * was making a wild guess and got it wrong.
4437                          * Stop the pacer and clear up all the aggregate
4438                          * delays etc.
4439                          */
4440                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
4441                         rack->r_ctl.rc_hpts_flags = 0;
4442                         rack->r_ctl.rc_last_output_to = 0;
4443                 }
4444                 did_add = 2;
4445         } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4446                 /* Still a small number run an average */
4447                 rack->r_ctl.gp_bw += bytes_ps;
4448                 addpart = rack->r_ctl.num_measurements;
4449                 rack->r_ctl.num_measurements++;
4450                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4451                         /* We have collected enough to move forward */
4452                         rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4453                 }
4454                 did_add = 3;
4455         } else {
4456                 /*
4457                  * We want to blend a fraction of the new goodput into the old
4458                  * value, weighted by the measurement time over the srtt. So if your
4459                  * measurement period is say 2 SRTT's long you would get 1/4 as the
4460                  * weight, if it was like 1/2 SRTT then you would get 1/16th.
4461                  *
4462                  * But we must be careful not to take too much i.e. if the
4463                  * srtt is say 20ms and the measurement is taken over
4464                  * 400ms our weight would be 400/20 i.e. 20. On the
4465                  * other hand if we get a measurement over 1ms with a
4466                  * 10ms rtt we only want to take a much smaller portion.
4467                  */
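                /*
                 * Worked example of the weighting described above
                 * (illustrative numbers, not values taken from this file):
                 * in the non-dynamic branch below, with srtt = 40000 usecs
                 * and utim = 80000 usecs (2 srtts), the weight is
                 * utim / (srtt * 8) = 1/4, so the update is
                 *   gp_bw = (3/4) * old gp_bw + (1/4) * bytes_ps
                 * capped so that a single sample never replaces more than
                 * half of the running total.
                 */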
4468                 if (rack->r_ctl.num_measurements < 0xff) {
4469                         rack->r_ctl.num_measurements++;
4470                 }
4471                 srtt = (uint64_t)tp->t_srtt;
4472                 if (srtt == 0) {
4473                         /*
4474                          * Strange why did t_srtt go back to zero?
4475                          */
4476                         if (rack->r_ctl.rc_rack_min_rtt)
4477                                 srtt = rack->r_ctl.rc_rack_min_rtt;
4478                         else
4479                                 srtt = HPTS_USEC_IN_MSEC;
4480                 }
4481                 /*
4482                  * XXXrrs: Note for reviewers, in playing with
4483                  * dynamic pacing I discovered this GP calculation
4484                  * as done originally leads to some undesired results.
4485                  * Basically you can get longer measurements contributing
4486                  * too much to the WMA. Thus I changed it: if you are doing
4487                  * dynamic adjustments we only do the apportioned adjustment
4488                  * when we have a very small (time wise) measurement. Longer
4489                  * measurements just get their weight (defaulting to 1/8)
4490                  * added to the WMA. We may want to think about changing
4491                  * this to always do that for both sides i.e. dynamic
4492                  * and non-dynamic... but considering lots of folks
4493                  * were playing with this I did not want to change the
4494                  * calculation per se without your thoughts.. Lawerence?
4495                  * Peter??
4496                  */
4497                 if (rack->rc_gp_dyn_mul == 0) {
4498                         subpart = rack->r_ctl.gp_bw * utim;
4499                         subpart /= (srtt * 8);
4500                         if (subpart < (rack->r_ctl.gp_bw / 2)) {
4501                                 /*
4502                                  * The b/w update takes no more
4503                                  * away than 1/2 our running total,
4504                                  * so factor it in.
4505                                  */
4506                                 addpart = bytes_ps * utim;
4507                                 addpart /= (srtt * 8);
4508                         } else {
4509                                 /*
4510                                  * Don't allow a single measurement
4511                                  * to account for more than 1/2 of the
4512                                  * WMA. This could happen on a retransmission
4513                                  * where utim becomes huge compared to
4514                                  * srtt (multiple retransmissions when using
4515                                  * the sending rate which factors in all the
4516                                  * transmissions from the first one).
4517                                  */
4518                                 subpart = rack->r_ctl.gp_bw / 2;
4519                                 addpart = bytes_ps / 2;
4520                         }
4521                         resid_bw = rack->r_ctl.gp_bw - subpart;
4522                         rack->r_ctl.gp_bw = resid_bw + addpart;
4523                         did_add = 1;
4524                 } else {
4525                         if ((utim / srtt) <= 1) {
4526                                 /*
4527                                  * The b/w update was over a small period
4528                                  * of time. The idea here is to prevent a small
4529                                  * measurement time period from counting
4530                                  * too much. So we scale it based on the
4531                                  * time so it attributes less than 1/rack_wma_divisor
4532                                  * time so it contributes less than 1/rack_wma_divisor
4533                                  */
4534                                 subpart = rack->r_ctl.gp_bw * utim;
4535                                 subpart /= (srtt * rack_wma_divisor);
4536                                 addpart = bytes_ps * utim;
4537                                 addpart /= (srtt * rack_wma_divisor);
4538                         } else {
4539                                 /*
4540                                  * The scaled measurement was long
4541                                  * enough so let's just add in the
4542                                  * portion of the measurement i.e. 1/rack_wma_divisor
4543                                  */
4544                                 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
4545                                 addpart = bytes_ps / rack_wma_divisor;
4546                         }
4547                         if ((rack->measure_saw_probe_rtt == 0) ||
4548                             (bytes_ps > rack->r_ctl.gp_bw)) {
4549                                 /*
4550                                  * For probe-rtt we only add it in
4551                                  * if it's larger; all others we just
4552                                  * add in.
4553                                  */
4554                                 did_add = 1;
4555                                 resid_bw = rack->r_ctl.gp_bw - subpart;
4556                                 rack->r_ctl.gp_bw = resid_bw + addpart;
4557                         }
4558                 }
4559         }
4560         if ((rack->gp_ready == 0) &&
4561             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
4562                 /* We have enough measurements now */
4563                 rack->gp_ready = 1;
4564                 rack_set_cc_pacing(rack);
4565                 if (rack->defer_options)
4566                         rack_apply_deferred_options(rack);
4567         }
4568         rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
4569                                    rack_get_bw(rack), 22, did_add, NULL, quality);
4570         /* We do not update any multipliers if we are in or have seen a probe-rtt */
4571         if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
4572                 rack_update_multiplier(rack, timely_says, bytes_ps,
4573                                        rack->r_ctl.rc_gp_srtt,
4574                                        rack->r_ctl.rc_rtt_diff);
4575         rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
4576                                    rack_get_bw(rack), 3, line, NULL, quality);
4577         /* reset the gp srtt and setup the new prev */
4578         rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4579         /* Record the lost count for the next measurement */
4580         rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
4581         /*
4582          * We restart our diffs based on the gpsrtt in the
4583          * measurement window.
4584          */
4585         rack->rc_gp_rtt_set = 0;
4586         rack->rc_gp_saw_rec = 0;
4587         rack->rc_gp_saw_ca = 0;
4588         rack->rc_gp_saw_ss = 0;
4589         rack->rc_dragged_bottom = 0;
4590 skip_measurement:
4591
4592 #ifdef STATS
4593         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
4594                                  gput);
4595         /*
4596          * XXXLAS: This is a temporary hack, and should be
4597          * chained off VOI_TCP_GPUT when stats(9) grows an
4598          * API to deal with chained VOIs.
4599          */
4600         if (tp->t_stats_gput_prev > 0)
4601                 stats_voi_update_abs_s32(tp->t_stats,
4602                                          VOI_TCP_GPUT_ND,
4603                                          ((gput - tp->t_stats_gput_prev) * 100) /
4604                                          tp->t_stats_gput_prev);
4605 #endif
4606         tp->t_flags &= ~TF_GPUTINPROG;
4607         tp->t_stats_gput_prev = gput;
4608         /*
4609          * Now, are we app limited and is there space from where we
4610          * were to where we want to go?
4611          *
4612          * We don't do the other case i.e. non-applimited here since
4613          * the next send will trigger us picking up the missing data.
4614          */
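        /*
         * A sketch of the check below: rc_first_appl marks the earliest
         * app-limited point in the send map. We only start a new send-based
         * measurement if the connection is established, we have recorded
         * app-limited periods, that point starts beyond th_ack, and the
         * span from th_ack to its end is larger than the initial window
         * (or MIN_GP_WIN segments, whichever is bigger), since anything
         * smaller cannot produce a useful goodput sample.
         */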
4615         if (rack->r_ctl.rc_first_appl &&
4616             TCPS_HAVEESTABLISHED(tp->t_state) &&
4617             rack->r_ctl.rc_app_limited_cnt &&
4618             (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
4619             ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
4620              max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
4621                 /*
4622                  * Yep there is enough outstanding to make a measurement here.
4623                  */
4624                 struct rack_sendmap *rsm, fe;
4625
4626                 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
4627                 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
4628                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4629                 rack->app_limited_needs_set = 0;
4630                 tp->gput_seq = th_ack;
4631                 if (rack->in_probe_rtt)
4632                         rack->measure_saw_probe_rtt = 1;
4633                 else if ((rack->measure_saw_probe_rtt) &&
4634                          (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
4635                         rack->measure_saw_probe_rtt = 0;
4636                 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
4637                         /* There is a full window to gain info from */
4638                         tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
4639                 } else {
4640                         /* We can only measure up to the applimited point */
4641                         tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
4642                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
4643                                 /*
4644                                  * We don't have enough to make a measurement.
4645                                  */
4646                                 tp->t_flags &= ~TF_GPUTINPROG;
4647                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
4648                                                            0, 0, 0, 6, __LINE__, NULL, quality);
4649                                 return;
4650                         }
4651                 }
4652                 if (tp->t_state >= TCPS_FIN_WAIT_1) {
4653                         /*
4654                          * We will get no more data into the SB
4655                          * so we need to have the data available
4656                          * before we start a measurement.
4657                          */
4658                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) {
4659                                 /* Nope not enough data. */
4660                                 return;
4661                         }
4662                 }
4663                 tp->t_flags |= TF_GPUTINPROG;
4664                 /*
4665                  * Now we need to find the timestamp of the send at tp->gput_seq
4666                  * for the send based measurement.
4667                  */
4668                 fe.r_start = tp->gput_seq;
4669                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4670                 if (rsm) {
4671                         /* Ok send-based limit is set */
4672                         if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
4673                                 /*
4674                                  * Move back to include the earlier part
4675                                  * so our ack time lines up right (this may
4676                                  * make an overlapping measurement but that's
4677                                  * ok).
4678                                  */
4679                                 tp->gput_seq = rsm->r_start;
4680                         }
4681                         if (rsm->r_flags & RACK_ACKED)
4682                                 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
4683                         else
4684                                 rack->app_limited_needs_set = 1;
4685                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
4686                 } else {
4687                         /*
4688                          * If we don't find the rsm due to some
4689                          * send-limit, set the current time, which
4690                          * basically disables the send-limit.
4691                          */
4692                         struct timeval tv;
4693
4694                         microuptime(&tv);
4695                         rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
4696                 }
4697                 rack_log_pacing_delay_calc(rack,
4698                                            tp->gput_seq,
4699                                            tp->gput_ack,
4700                                            (uint64_t)rsm,
4701                                            tp->gput_ts,
4702                                            rack->r_ctl.rc_app_limited_cnt,
4703                                            9,
4704                                            __LINE__, NULL, quality);
4705         }
4706 }
4707
4708 /*
4709  * CC wrapper hook functions
4710  */
4711 static void
4712 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
4713     uint16_t type, int32_t recovery)
4714 {
4715         uint32_t prior_cwnd, acked;
4716         struct tcp_log_buffer *lgb = NULL;
4717         uint8_t labc_to_use, quality;
4718
4719         INP_WLOCK_ASSERT(tp->t_inpcb);
4720         tp->ccv->nsegs = nsegs;
4721         acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una);
4722         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
4723                 uint32_t max;
4724
4725                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
4726                 if (tp->ccv->bytes_this_ack > max) {
4727                         tp->ccv->bytes_this_ack = max;
4728                 }
4729         }
4730 #ifdef STATS
4731         stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
4732             ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
4733 #endif
4734         quality = RACK_QUALITY_NONE;
4735         if ((tp->t_flags & TF_GPUTINPROG) &&
4736             rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
4737                 /* Measure the Goodput */
4738                 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
4739 #ifdef NETFLIX_PEAKRATE
4740                 if ((type == CC_ACK) &&
4741                     (tp->t_maxpeakrate)) {
4742                         /*
4743                          * We update t_peakrate_thr. This gives us roughly
4744                          * one update per round trip time. Note
4745                          * it will only be used if pace_always is off, i.e.
4746                          * we don't do this for paced flows.
4747                          */
4748                         rack_update_peakrate_thr(tp);
4749                 }
4750 #endif
4751         }
4752         /* Which way are we limited? If not cwnd limited, no advance in CA */
4753         if (tp->snd_cwnd <= tp->snd_wnd)
4754                 tp->ccv->flags |= CCF_CWND_LIMITED;
4755         else
4756                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
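        /*
         * The accounting below is appropriate-byte-counting (ABC) style:
         * in congestion avoidance (cwnd above ssthresh) we accumulate acked
         * bytes, capped per ack at nsegs * abc_l_var segments, and once a
         * full cwnd's worth (cwnd_to_use) has accumulated we flag
         * CCF_ABC_SENTAWND so the CC module may advance cwnd; in slow start
         * the counter is simply reset.
         */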
4757         if (tp->snd_cwnd > tp->snd_ssthresh) {
4758                 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
4759                          nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
4760                 /* For the setting of a window passed, use the actual scwnd we are using */
4761                 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
4762                         tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
4763                         tp->ccv->flags |= CCF_ABC_SENTAWND;
4764                 }
4765         } else {
4766                 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
4767                 tp->t_bytes_acked = 0;
4768         }
4769         prior_cwnd = tp->snd_cwnd;
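        /*
         * Choose the ABC limit handed to the CC module: normally the
         * rack-configured labc, but while in recovery (unless one of the
         * exceptions checked below applies) we cap it at
         * rack_max_abc_post_recovery so cwnd does not grow too aggressively
         * right after a loss episode.
         */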
4770         if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
4771             (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
4772                 labc_to_use = rack->rc_labc;
4773         else
4774                 labc_to_use = rack_max_abc_post_recovery;
4775         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4776                 union tcp_log_stackspecific log;
4777                 struct timeval tv;
4778
4779                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4780                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4781                 log.u_bbr.flex1 = th_ack;
4782                 log.u_bbr.flex2 = tp->ccv->flags;
4783                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4784                 log.u_bbr.flex4 = tp->ccv->nsegs;
4785                 log.u_bbr.flex5 = labc_to_use;
4786                 log.u_bbr.flex6 = prior_cwnd;
4787                 log.u_bbr.flex7 = V_tcp_do_newsack;
4788                 log.u_bbr.flex8 = 1;
4789                 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4790                                      0, &log, false, NULL, NULL, 0, &tv);
4791         }
4792         if (CC_ALGO(tp)->ack_received != NULL) {
4793                 /* XXXLAS: Find a way to live without this */
4794                 tp->ccv->curack = th_ack;
4795                 tp->ccv->labc = labc_to_use;
4796                 tp->ccv->flags |= CCF_USE_LOCAL_ABC;
4797                 CC_ALGO(tp)->ack_received(tp->ccv, type);
4798         }
4799         if (lgb) {
4800                 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
4801         }
4802         if (rack->r_must_retran) {
4803                 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
4804                         /*
4805                          * We now are beyond the rxt point so let's disable
4806                          * the flag.
4807                          */
4808                         rack->r_ctl.rc_out_at_rto = 0;
4809                         rack->r_must_retran = 0;
4810                 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
4811                         /*
4812                          * Only decrement the rc_out_at_rto if the cwnd advances
4813                          * at least a whole segment. Otherwise next time the peer
4814                          * acks, we won't be able to send; this generally happens
4815                          * when we are in Congestion Avoidance.
4816                          */
4817                         if (acked <= rack->r_ctl.rc_out_at_rto){
4818                                 rack->r_ctl.rc_out_at_rto -= acked;
4819                         } else {
4820                                 rack->r_ctl.rc_out_at_rto = 0;
4821                         }
4822                 }
4823         }
4824 #ifdef STATS
4825         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
4826 #endif
4827         if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
4828                 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
4829         }
4830 #ifdef NETFLIX_PEAKRATE
4831         /* we enforce max peak rate if it is set and we are not pacing */
4832         if ((rack->rc_always_pace == 0) &&
4833             tp->t_peakrate_thr &&
4834             (tp->snd_cwnd > tp->t_peakrate_thr)) {
4835                 tp->snd_cwnd = tp->t_peakrate_thr;
4836         }
4837 #endif
4838 }
4839
4840 static void
4841 tcp_rack_partialack(struct tcpcb *tp)
4842 {
4843         struct tcp_rack *rack;
4844
4845         rack = (struct tcp_rack *)tp->t_fb_ptr;
4846         INP_WLOCK_ASSERT(tp->t_inpcb);
4847         /*
4848          * If we are doing PRR and have enough
4849          * room to send <or> we are pacing and prr
4850          * is disabled we will want to see if we
4851          * can send data (by setting r_wanted_output to
4852          * true).
4853          */
4854         if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
4855             rack->rack_no_prr)
4856                 rack->r_wanted_output = 1;
4857 }
4858
4859 static void
4860 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
4861 {
4862         struct tcp_rack *rack;
4863         uint32_t orig_cwnd;
4864
4865         orig_cwnd = tp->snd_cwnd;
4866         INP_WLOCK_ASSERT(tp->t_inpcb);
4867         rack = (struct tcp_rack *)tp->t_fb_ptr;
4868         /* only alert CC if we alerted when we entered */
4869         if (CC_ALGO(tp)->post_recovery != NULL) {
4870                 tp->ccv->curack = th_ack;
4871                 CC_ALGO(tp)->post_recovery(tp->ccv);
4872                 if (tp->snd_cwnd < tp->snd_ssthresh) {
4873                         /*
4874                          * Rack has burst control and pacing
4875                          * so let's not set this any lower than
4876                          * snd_ssthresh per RFC-6582 (option 2).
4877                          */
4878                         tp->snd_cwnd = tp->snd_ssthresh;
4879                 }
4880         }
4881         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4882                 union tcp_log_stackspecific log;
4883                 struct timeval tv;
4884
4885                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4886                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4887                 log.u_bbr.flex1 = th_ack;
4888                 log.u_bbr.flex2 = tp->ccv->flags;
4889                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4890                 log.u_bbr.flex4 = tp->ccv->nsegs;
4891                 log.u_bbr.flex5 = V_tcp_abc_l_var;
4892                 log.u_bbr.flex6 = orig_cwnd;
4893                 log.u_bbr.flex7 = V_tcp_do_newsack;
4894                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
4895                 log.u_bbr.flex8 = 2;
4896                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4897                                0, &log, false, NULL, NULL, 0, &tv);
4898         }
4899         if ((rack->rack_no_prr == 0) &&
4900             (rack->no_prr_addback == 0) &&
4901             (rack->r_ctl.rc_prr_sndcnt > 0)) {
4902                 /*
4903                  * Suck the next prr cnt back into cwnd, but
4904                  * only do that if we are not application limited.
4905                  */
4906                 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
4907                         /*
4908                          * We are allowed to add back to the cwnd the amount we did
4909                          * not get out if:
4910                          * a) no_prr_addback is off.
4911                          * b) we are not app limited
4912                          * c) we are doing prr
4913                          * <and>
4914                          * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
4915                          */
4916                         tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
4917                                             rack->r_ctl.rc_prr_sndcnt);
4918                 }
4919                 rack->r_ctl.rc_prr_sndcnt = 0;
4920                 rack_log_to_prr(rack, 1, 0);
4921         }
4922         rack_log_to_prr(rack, 14, orig_cwnd);
4923         tp->snd_recover = tp->snd_una;
4924         if (rack->r_ctl.dsack_persist) {
4925                 rack->r_ctl.dsack_persist--;
4926                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
4927                         rack->r_ctl.num_dsack = 0;
4928                 }
4929                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
4930         }
4931         EXIT_RECOVERY(tp->t_flags);
4932 }
4933
4934 static void
4935 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack)
4936 {
4937         struct tcp_rack *rack;
4938         uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
4939
4940         INP_WLOCK_ASSERT(tp->t_inpcb);
4941 #ifdef STATS
4942         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
4943 #endif
4944         if (IN_RECOVERY(tp->t_flags) == 0) {
4945                 in_rec_at_entry = 0;
4946                 ssthresh_enter = tp->snd_ssthresh;
4947                 cwnd_enter = tp->snd_cwnd;
4948         } else
4949                 in_rec_at_entry = 1;
4950         rack = (struct tcp_rack *)tp->t_fb_ptr;
4951         switch (type) {
4952         case CC_NDUPACK:
4953                 tp->t_flags &= ~TF_WASFRECOVERY;
4954                 tp->t_flags &= ~TF_WASCRECOVERY;
4955                 if (!IN_FASTRECOVERY(tp->t_flags)) {
4956                         rack->r_ctl.rc_prr_delivered = 0;
4957                         rack->r_ctl.rc_prr_out = 0;
4958                         if (rack->rack_no_prr == 0) {
4959                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4960                                 rack_log_to_prr(rack, 2, in_rec_at_entry);
4961                         }
4962                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
4963                         tp->snd_recover = tp->snd_max;
4964                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4965                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4966                 }
4967                 break;
4968         case CC_ECN:
4969                 if (!IN_CONGRECOVERY(tp->t_flags) ||
4970                     /*
4971                      * Allow ECN reaction on ACK to CWR, if
4972                      * that data segment was also CE marked.
4973                      */
4974                     SEQ_GEQ(ack, tp->snd_recover)) {
4975                         EXIT_CONGRECOVERY(tp->t_flags);
4976                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
4977                         tp->snd_recover = tp->snd_max + 1;
4978                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4979                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4980                 }
4981                 break;
4982         case CC_RTO:
4983                 tp->t_dupacks = 0;
4984                 tp->t_bytes_acked = 0;
4985                 EXIT_RECOVERY(tp->t_flags);
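                /*
                 * Illustrative example of the ssthresh computation below
                 * (hypothetical numbers): with snd_wnd = 100000,
                 * cwnd_to_use = 80000 and a 1448 byte maxseg, we take
                 * min(100000, 80000) / 2 = 40000, round down to whole
                 * segments (27 * 1448 = 39096), and never go below two
                 * segments.
                 */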
4986                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
4987                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
4988                 orig_cwnd = tp->snd_cwnd;
4989                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
4990                 rack_log_to_prr(rack, 16, orig_cwnd);
4991                 if (tp->t_flags2 & TF2_ECN_PERMIT)
4992                         tp->t_flags2 |= TF2_ECN_SND_CWR;
4993                 break;
4994         case CC_RTO_ERR:
4995                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
4996                 /* RTO was unnecessary, so reset everything. */
4997                 tp->snd_cwnd = tp->snd_cwnd_prev;
4998                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
4999                 tp->snd_recover = tp->snd_recover_prev;
5000                 if (tp->t_flags & TF_WASFRECOVERY) {
5001                         ENTER_FASTRECOVERY(tp->t_flags);
5002                         tp->t_flags &= ~TF_WASFRECOVERY;
5003                 }
5004                 if (tp->t_flags & TF_WASCRECOVERY) {
5005                         ENTER_CONGRECOVERY(tp->t_flags);
5006                         tp->t_flags &= ~TF_WASCRECOVERY;
5007                 }
5008                 tp->snd_nxt = tp->snd_max;
5009                 tp->t_badrxtwin = 0;
5010                 break;
5011         }
5012         if ((CC_ALGO(tp)->cong_signal != NULL)  &&
5013             (type != CC_RTO)){
5014                 tp->ccv->curack = ack;
5015                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
5016         }
5017         if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
5018                 rack_log_to_prr(rack, 15, cwnd_enter);
5019                 rack->r_ctl.dsack_byte_cnt = 0;
5020                 rack->r_ctl.retran_during_recovery = 0;
5021                 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
5022                 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
5023                 rack->r_ent_rec_ns = 1;
5024         }
5025 }
5026
5027 static inline void
5028 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
5029 {
5030         uint32_t i_cwnd;
5031
5032         INP_WLOCK_ASSERT(tp->t_inpcb);
5033
5034 #ifdef NETFLIX_STATS
5035         KMOD_TCPSTAT_INC(tcps_idle_restarts);
5036         if (tp->t_state == TCPS_ESTABLISHED)
5037                 KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
5038 #endif
5039         if (CC_ALGO(tp)->after_idle != NULL)
5040                 CC_ALGO(tp)->after_idle(tp->ccv);
5041
5042         if (tp->snd_cwnd == 1)
5043                 i_cwnd = tp->t_maxseg;          /* SYN(-ACK) lost */
5044         else
5045                 i_cwnd = rc_init_window(rack);
5046
5047         /*
5048          * Being idle is no different than the initial window. If the cc
5049          * clamps it down below the initial window, raise it to the initial
5050          * window.
5051          */
5052         if (tp->snd_cwnd < i_cwnd) {
5053                 tp->snd_cwnd = i_cwnd;
5054         }
5055 }
5056
5057 /*
5058  * Indicate whether this ack should be delayed.  We can delay the ack if
5059  * following conditions are met:
5060  *      - There is no delayed ack timer in progress.
5061  *      - Our last ack wasn't a 0-sized window. We never want to delay
5062  *        the ack that opens up a 0-sized window.
5063  *      - LRO wasn't used for this segment. We make sure by checking that the
5064  *        segment size is not larger than the MSS.
5065  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
5066  *        connection.
5067  */
5068 #define DELAY_ACK(tp, tlen)                      \
5069         (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
5070         ((tp->t_flags & TF_DELACK) == 0) &&      \
5071         (tlen <= tp->t_maxseg) &&                \
5072         (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
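/*
 * Illustrative use of DELAY_ACK (a sketch only; see the actual call sites in
 * the input-path handlers later in this file):
 *
 *	if (DELAY_ACK(tp, tlen))
 *		tp->t_flags |= TF_DELACK;
 *	else
 *		tp->t_flags |= TF_ACKNOW;
 */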
5073
5074 static struct rack_sendmap *
5075 rack_find_lowest_rsm(struct tcp_rack *rack)
5076 {
5077         struct rack_sendmap *rsm;
5078
5079         /*
5080          * Walk the time-order transmitted list looking for an rsm that is
5081          * not acked. This will be the one that was sent the longest time
5082          * ago that is still outstanding.
5083          */
5084         TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
5085                 if (rsm->r_flags & RACK_ACKED) {
5086                         continue;
5087                 }
5088                 goto finish;
5089         }
5090 finish:
5091         return (rsm);
5092 }
5093
5094 static struct rack_sendmap *
5095 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
5096 {
5097         struct rack_sendmap *prsm;
5098
5099         /*
5100          * Walk the sequence-order list backward until we arrive at
5101          * the highest seq not acked. In theory when this is called it
5102          * should be the last segment (which it was not).
5103          */
5104         counter_u64_add(rack_find_high, 1);
5105         prsm = rsm;
5106         RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
5107                 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
5108                         continue;
5109                 }
5110                 return (prsm);
5111         }
5112         return (NULL);
5113 }
5114
5115 static uint32_t
5116 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
5117 {
5118         int32_t lro;
5119         uint32_t thresh;
5120
5121         /*
5122          * If it gets set we have seen reordering. The reorder logic
5123          * works in one of two ways:
5124          *
5125          * If reorder-fade is configured, then we track the last time we saw
5126          * re-ordering occur. If we reach the point where enough time has
5127          * passed we no longer consider reordering to be occurring.
5128          *
5129          * Or if reorder-fade is 0, then once we see reordering we consider
5130          * the connection to always be subject to reordering and just set lro
5131          * to 1.
5132          * to 1.
5133          *
5134          * In the end if lro is non-zero we add the extra time for
5135          * reordering in.
5136          */
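        /*
         * Roughly, the threshold computed below is (a summary; see the
         * code for the exact conditions):
         *
         *   thresh = srtt + rc_pkt_delay          (or srtt + srtt/4 for the
         *                                          standards-based timer)
         *          + (srtt >> rc_reorder_shift)   only if reordering was seen
         *          + num_dsack * (srtt >> 2)      only if DSACK growth is on
         *
         * capped at 2 * srtt and at rack_rto_max.
         */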
5137         if (srtt == 0)
5138                 srtt = 1;
5139         if (rack->r_ctl.rc_reorder_ts) {
5140                 if (rack->r_ctl.rc_reorder_fade) {
5141                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
5142                                 lro = cts - rack->r_ctl.rc_reorder_ts;
5143                                 if (lro == 0) {
5144                                         /*
5145                                          * No time has passed since the last
5146                                          * reorder, mark it as reordering.
5147                                          */
5148                                         lro = 1;
5149                                 }
5150                         } else {
5151                                 /* Negative time? */
5152                                 lro = 0;
5153                         }
5154                         if (lro > rack->r_ctl.rc_reorder_fade) {
5155                                 /* Turn off reordering seen too */
5156                                 rack->r_ctl.rc_reorder_ts = 0;
5157                                 lro = 0;
5158                         }
5159                 } else {
5160                         /* Reordering does not fade */
5161                         lro = 1;
5162                 }
5163         } else {
5164                 lro = 0;
5165         }
5166         if (rack->rc_rack_tmr_std_based == 0) {
5167                 thresh = srtt + rack->r_ctl.rc_pkt_delay;
5168         } else {
5169                 /* Standards based pkt-delay is 1/4 srtt */
5170                 thresh = srtt +  (srtt >> 2);
5171         }
5172         if (lro && (rack->rc_rack_tmr_std_based == 0)) {
5173                 /* It must be set, if not you get 1/4 rtt */
5174                 if (rack->r_ctl.rc_reorder_shift)
5175                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
5176                 else
5177                         thresh += (srtt >> 2);
5178         }
5179         if (rack->rc_rack_use_dsack &&
5180             lro &&
5181             (rack->r_ctl.num_dsack > 0)) {
5182                 /*
5183                  * We only increase the reordering window if we
5184                  * have seen reordering <and> we have a DSACK count.
5185                  */
5186                 thresh += rack->r_ctl.num_dsack * (srtt >> 2);
5187                 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
5188         }
5189         /* SRTT * 2 is the ceiling */
5190         if (thresh > (srtt * 2)) {
5191                 thresh = srtt * 2;
5192         }
5193         /* And we don't want it above the RTO max either */
5194         if (thresh > rack_rto_max) {
5195                 thresh = rack_rto_max;
5196         }
5197         rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
5198         return (thresh);
5199 }
5200
5201 static uint32_t
5202 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
5203                      struct rack_sendmap *rsm, uint32_t srtt)
5204 {
5205         struct rack_sendmap *prsm;
5206         uint32_t thresh, len;
5207         int segsiz;
5208
5209         if (srtt == 0)
5210                 srtt = 1;
5211         if (rack->r_ctl.rc_tlp_threshold)
5212                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
5213         else
5214                 thresh = (srtt * 2);
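        /*
         * For example (illustrative numbers): with srtt = 40000 usec and
         * rc_tlp_threshold = 2, the base threshold is 40000 + 20000 = 60000
         * usec; with the threshold divisor unset it is simply 2 * srtt.
         * The checks below may raise this for delayed-ack compensation or
         * inter-packet gaps, and the result is capped at t_rxtcur and
         * rack_rto_max and raised to at least rack_tlp_min.
         */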
5215
5216         /* Get the previous sent packet, if any */
5217         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
5218         counter_u64_add(rack_enter_tlp_calc, 1);
5219         len = rsm->r_end - rsm->r_start;
5220         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
5221                 /* Exactly like the ID */
5222                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
5223                         uint32_t alt_thresh;
5224                         /*
5225                          * Compensate for delayed-ack with the d-ack time.
5226                          */
5227                         counter_u64_add(rack_used_tlpmethod, 1);
5228                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5229                         if (alt_thresh > thresh)
5230                                 thresh = alt_thresh;
5231                 }
5232         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
5233                 /* 2.1 behavior */
5234                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
5235                 if (prsm && (len <= segsiz)) {
5236                         /*
5237                          * Two packets outstanding, thresh should be (2*srtt) +
5238                          * possible inter-packet delay (if any).
5239                          */
5240                         uint32_t inter_gap = 0;
5241                         int idx, nidx;
5242
5243                         counter_u64_add(rack_used_tlpmethod, 1);
5244                         idx = rsm->r_rtr_cnt - 1;
5245                         nidx = prsm->r_rtr_cnt - 1;
5246                         if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) {
5247                                 /* Yes it was sent later (or at the same time) */
5248                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
5249                         }
5250                         thresh += inter_gap;
5251                 } else if (len <= segsiz) {
5252                         /*
5253                          * Possibly compensate for delayed-ack.
5254                          */
5255                         uint32_t alt_thresh;
5256
5257                         counter_u64_add(rack_used_tlpmethod2, 1);
5258                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5259                         if (alt_thresh > thresh)
5260                                 thresh = alt_thresh;
5261                 }
5262         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
5263                 /* 2.2 behavior */
5264                 if (len <= segsiz) {
5265                         uint32_t alt_thresh;
5266                         /*
5267                          * Compensate for delayed-ack with the d-ack time.
5268                          */
5269                         counter_u64_add(rack_used_tlpmethod, 1);
5270                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5271                         if (alt_thresh > thresh)
5272                                 thresh = alt_thresh;
5273                 }
5274         }
5275         /* Not above an RTO */
5276         if (thresh > tp->t_rxtcur) {
5277                 thresh = tp->t_rxtcur;
5278         }
5279         /* Not above a RTO max */
5280         if (thresh > rack_rto_max) {
5281                 thresh = rack_rto_max;
5282         }
5283         /* Apply user supplied min TLP */
5284         if (thresh < rack_tlp_min) {
5285                 thresh = rack_tlp_min;
5286         }
5287         return (thresh);
5288 }
5289
5290 static uint32_t
5291 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
5292 {
5293         /*
5294          * We want the rack_rtt which is the
5295          * last rtt we measured. However if that
5296          * does not exist we fall back to the srtt (which
5297          * we probably will never do) and then as a last
5298          * resort we use RACK_INITIAL_RTO if no srtt is
5299          * yet set.
5300          */
5301         if (rack->rc_rack_rtt)
5302                 return (rack->rc_rack_rtt);
5303         else if (tp->t_srtt == 0)
5304                 return (RACK_INITIAL_RTO);
5305         return (tp->t_srtt);
5306 }
5307
5308 static struct rack_sendmap *
5309 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
5310 {
5311         /*
5312          * Check to see whether we need to fall into recovery. We will
5313          * need to do so if our oldest transmit is past the time we should
5314          * have had an ack.
5315          */
5316         struct tcp_rack *rack;
5317         struct rack_sendmap *rsm;
5318         int32_t idx;
5319         uint32_t srtt, thresh;
5320
5321         rack = (struct tcp_rack *)tp->t_fb_ptr;
5322         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
5323                 return (NULL);
5324         }
5325         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5326         if (rsm == NULL)
5327                 return (NULL);
5328
5329         if (rsm->r_flags & RACK_ACKED) {
5330                 rsm = rack_find_lowest_rsm(rack);
5331                 if (rsm == NULL)
5332                         return (NULL);
5333         }
5334         idx = rsm->r_rtr_cnt - 1;
5335         srtt = rack_grab_rtt(tp, rack);
5336         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
5337         if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
5338                 return (NULL);
5339         }
5340         if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
5341                 return (NULL);
5342         }
5343         /* Ok if we reach here we are overdue and this guy can be sent */
5344         if (IN_RECOVERY(tp->t_flags) == 0) {
5345                 /*
5346                  * For the one that enters us into recovery, record undo
5347                  * info.
5348                  */
5349                 rack->r_ctl.rc_rsm_start = rsm->r_start;
5350                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
5351                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
5352         }
5353         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
5354         return (rsm);
5355 }
5356
5357 static uint32_t
5358 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
5359 {
5360         int32_t t;
5361         int32_t tt;
5362         uint32_t ret_val;
5363
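        /*
         * A brief sketch of what follows: the persist timer value starts
         * from an RTO-like quantity (srtt + 4 * rttvar), is scaled by the
         * standard exponential backoff table indexed by t_rxtshift, and is
         * then clamped between rack_persist_min and rack_persist_max (with
         * the configured timer slop).
         */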
5364         t = (tp->t_srtt + (tp->t_rttvar << 2));
5365         RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
5366             rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
5367         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
5368         ret_val = (uint32_t)tt;
5369         return (ret_val);
5370 }
5371
5372 static uint32_t
5373 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
5374 {
5375         /*
5376          * Start the FR timer, we do this based on getting the first one in
5377          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
5378          * events we need to stop the running timer (if it's running) before
5379          * starting the new one.
5380          */
5381         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
5382         uint32_t srtt_cur;
5383         int32_t idx;
5384         int32_t is_tlp_timer = 0;
5385         struct rack_sendmap *rsm;
5386
5387         if (rack->t_timers_stopped) {
5388                 /* All timers have been stopped; none are to run */
5389                 return (0);
5390         }
5391         if (rack->rc_in_persist) {
5392                 /* We can't start any timer in persists */
5393                 return (rack_get_persists_timer_val(tp, rack));
5394         }
5395         rack->rc_on_min_to = 0;
5396         if ((tp->t_state < TCPS_ESTABLISHED) ||
5397             ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
5398                 goto activate_rxt;
5399         }
5400         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5401         if ((rsm == NULL) || sup_rack) {
5402                 /* Nothing on the send map or no rack */
5403 activate_rxt:
5404                 time_since_sent = 0;
5405                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5406                 if (rsm) {
5407                         /*
5408                          * Should we discount the RTX timer any?
5409                          *
5410                          * We want to discount it the smallest amount.
5411                          * If a timer (Rack/TLP or RXT) has gone off more
5412                          * recently, that's the discount we want to use (now - timer time).
5413                          * If the retransmit of the oldest packet was more recent, then
5414                          * we want to use that (now - oldest-packet-last_transmit_time).
5415                          *
5416                          */
5417                         idx = rsm->r_rtr_cnt - 1;
5418                         if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
5419                                 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5420                         else
5421                                 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5422                         if (TSTMP_GT(cts, tstmp_touse))
5423                             time_since_sent = cts - tstmp_touse;
5424                 }
5425                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
5426                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
5427                         to = tp->t_rxtcur;
5428                         if (to > time_since_sent)
5429                                 to -= time_since_sent;
5430                         else
5431                                 to = rack->r_ctl.rc_min_to;
5432                         if (to == 0)
5433                                 to = 1;
5434                         /* Special case for KEEPINIT */
5435                         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
5436                             (TP_KEEPINIT(tp) != 0) &&
5437                             rsm) {
5438                                 /*
5439                                  * We have to put a ceiling on the rxt timer
5440                                  * of the keep-init timeout.
5441                                  */
5442                                 uint32_t max_time, red;
5443
5444                                 max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
5445                                 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
5446                                         red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
5447                                         if (red < max_time)
5448                                                 max_time -= red;
5449                                         else
5450                                                 max_time = 1;
5451                                 }
5452                                 /* Reduce timeout to the keep value if needed */
5453                                 if (max_time < to)
5454                                         to = max_time;
5455                         }
5456                         return (to);
5457                 }
5458                 return (0);
5459         }
5460         if (rsm->r_flags & RACK_ACKED) {
5461                 rsm = rack_find_lowest_rsm(rack);
5462                 if (rsm == NULL) {
5463                         /* No lowest? */
5464                         goto activate_rxt;
5465                 }
5466         }
5467         if (rack->sack_attack_disable) {
5468                 /*
5469                  * We don't want to do
5470                  * any TLP's if you are an attacker.
5471                  * Though if you are doing what
5472                  * is expected you may still have
5473                  * SACK-PASSED marks.
5474                  */
5475                 goto activate_rxt;
5476         }
5477         /* Convert from ms to usecs */
5478         if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
5479                 if ((tp->t_flags & TF_SENTFIN) &&
5480                     ((tp->snd_max - tp->snd_una) == 1) &&
5481                     (rsm->r_flags & RACK_HAS_FIN)) {
5482                         /*
5483                          * We don't start a rack timer if all we have is a
5484                          * FIN outstanding.
5485                          */
5486                         goto activate_rxt;
5487                 }
5488                 if ((rack->use_rack_rr == 0) &&
5489                     (IN_FASTRECOVERY(tp->t_flags)) &&
5490                     (rack->rack_no_prr == 0) &&
5491                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
5492                         /*
5493                          * We are not cheating, we are in recovery and
5494                          * do not yet have enough acks to get our next
5495                          * retransmission out.
5496                          *
5497                          * Note that classified attackers do not
5498                          * get to use the rack-cheat.
5499                          */
5500                         goto activate_tlp;
5501                 }
5502                 srtt = rack_grab_rtt(tp, rack);
5503                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
5504                 idx = rsm->r_rtr_cnt - 1;
5505                 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
5506                 if (SEQ_GEQ(exp, cts)) {
5507                         to = exp - cts;
5508                         if (to < rack->r_ctl.rc_min_to) {
5509                                 to = rack->r_ctl.rc_min_to;
5510                                 if (rack->r_rr_config == 3)
5511                                         rack->rc_on_min_to = 1;
5512                         }
5513                 } else {
5514                         to = rack->r_ctl.rc_min_to;
5515                         if (rack->r_rr_config == 3)
5516                                 rack->rc_on_min_to = 1;
5517                 }
5518         } else {
5519                 /* Ok we need to do a TLP not RACK */
5520 activate_tlp:
5521                 if ((rack->rc_tlp_in_progress != 0) &&
5522                     (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
5523                         /*
5524                          * The previous send was a TLP and we have sent
5525                          * N TLPs without sending new data.
5526                          */
5527                         goto activate_rxt;
5528                 }
5529                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
5530                 if (rsm == NULL) {
5531                         /* We found no rsm to TLP with. */
5532                         goto activate_rxt;
5533                 }
5534                 if (rsm->r_flags & RACK_HAS_FIN) {
5535                         /* If it's a FIN we don't do TLP */
5536                         rsm = NULL;
5537                         goto activate_rxt;
5538                 }
5539                 idx = rsm->r_rtr_cnt - 1;
5540                 time_since_sent = 0;
5541                 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
5542                         tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5543                 else
5544                         tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5545                 if (TSTMP_GT(cts, tstmp_touse))
5546                     time_since_sent = cts - tstmp_touse;
5547                 is_tlp_timer = 1;
5548                 if (tp->t_srtt) {
5549                         if ((rack->rc_srtt_measure_made == 0) &&
5550                             (tp->t_srtt == 1)) {
5551                                 /*
5552                                  * If another stack has run and set srtt to 1,
5553                                  * then the srtt was 0, so let's use the initial.
5554                                  */
5555                                 srtt = RACK_INITIAL_RTO;
5556                         } else {
5557                                 srtt_cur = tp->t_srtt;
5558                                 srtt = srtt_cur;
5559                         }
5560                 } else
5561                         srtt = RACK_INITIAL_RTO;
5562                 /*
5563                  * If the SRTT is not keeping up and the
5564                  * rack RTT has spiked, we want to use
5565                  * the last RTT, not the smoothed one.
5566                  */
5567                 if (rack_tlp_use_greater &&
5568                     tp->t_srtt &&
5569                     (srtt < rack_grab_rtt(tp, rack))) {
5570                         srtt = rack_grab_rtt(tp, rack);
5571                 }
5572                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
5573                 if (thresh > time_since_sent) {
5574                         to = thresh - time_since_sent;
5575                 } else {
5576                         to = rack->r_ctl.rc_min_to;
5577                         rack_log_alt_to_to_cancel(rack,
5578                                                   thresh,               /* flex1 */
5579                                                   time_since_sent,      /* flex2 */
5580                                                   tstmp_touse,          /* flex3 */
5581                                                   rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
5582                                                   (uint32_t)rsm->r_tim_lastsent[idx],
5583                                                   srtt,
5584                                                   idx, 99);
5585                 }
5586                 if (to < rack_tlp_min) {
5587                         to = rack_tlp_min;
5588                 }
5589                 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
5590                         /*
5591                          * If the TLP time works out to larger than the max
5592                          * RTO, let's not do TLP.. just RTO.
5593                          */
5594                         goto activate_rxt;
5595                 }
5596         }
5597         if (is_tlp_timer == 0) {
5598                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
5599         } else {
5600                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
5601         }
5602         if (to == 0)
5603                 to = 1;
5604         return (to);
5605 }
5606
5607 static void
5608 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5609 {
5610         if (rack->rc_in_persist == 0) {
5611                 if (tp->t_flags & TF_GPUTINPROG) {
5612                         /*
5613                          * Stop the goodput now, the calling of the
5614                          * measurement function clears the flag.
5615                          */
5616                         rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
5617                                                     RACK_QUALITY_PERSIST);
5618                 }
5619 #ifdef NETFLIX_SHARED_CWND
5620                 if (rack->r_ctl.rc_scw) {
5621                         tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5622                         rack->rack_scwnd_is_idle = 1;
5623                 }
5624 #endif
5625                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
5626                 if (rack->r_ctl.rc_went_idle_time == 0)
5627                         rack->r_ctl.rc_went_idle_time = 1;
5628                 rack_timer_cancel(tp, rack, cts, __LINE__);
5629                 tp->t_rxtshift = 0;
5630                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5631                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5632                 rack->rc_in_persist = 1;
5633         }
5634 }
5635
5636 static void
5637 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5638 {
5639         if (rack->rc_inp->inp_in_hpts) {
5640                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
5641                 rack->r_ctl.rc_hpts_flags = 0;
5642         }
5643 #ifdef NETFLIX_SHARED_CWND
5644         if (rack->r_ctl.rc_scw) {
5645                 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5646                 rack->rack_scwnd_is_idle = 0;
5647         }
5648 #endif
5649         if (rack->rc_gp_dyn_mul &&
5650             (rack->use_fixed_rate == 0) &&
5651             (rack->rc_always_pace)) {
5652                 /*
5653                  * Do we count this as if a probe-rtt just
5654                  * finished?
5655                  */
5656                 uint32_t time_idle, idle_min;
5657
5658                 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
5659                 idle_min = rack_min_probertt_hold;
5660                 if (rack_probertt_gpsrtt_cnt_div) {
5661                         uint64_t extra;
5662                         extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
5663                                 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
5664                         extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
5665                         idle_min += (uint32_t)extra;
5666                 }
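                /*
                 * Illustrative numbers (hypothetical): with
                 * rack_min_probertt_hold = 200000 usec, rc_gp_srtt = 40000
                 * usec and a mul/div ratio of 4/1, idle_min becomes
                 * 200000 + 160000 = 360000 usec; only an idle period at
                 * least that long is treated as an implicit probe-rtt.
                 */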
5667                 if (time_idle >= idle_min) {
5668                         /* Yes, we count it as a probe-rtt. */
5669                         uint32_t us_cts;
5670
5671                         us_cts = tcp_get_usecs(NULL);
5672                         if (rack->in_probe_rtt == 0) {
5673                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
5674                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
5675                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
5676                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
5677                         } else {
5678                                 rack_exit_probertt(rack, us_cts);
5679                         }
5680                 }
5681         }
5682         rack->rc_in_persist = 0;
5683         rack->r_ctl.rc_went_idle_time = 0;
5684         tp->t_rxtshift = 0;
5685         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5686            rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5687         rack->r_ctl.rc_agg_delayed = 0;
5688         rack->r_early = 0;
5689         rack->r_late = 0;
5690         rack->r_ctl.rc_agg_early = 0;
5691 }
5692
5693 static void
5694 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
5695                    struct hpts_diag *diag, struct timeval *tv)
5696 {
5697         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5698                 union tcp_log_stackspecific log;
5699
5700                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5701                 log.u_bbr.flex1 = diag->p_nxt_slot;
5702                 log.u_bbr.flex2 = diag->p_cur_slot;
5703                 log.u_bbr.flex3 = diag->slot_req;
5704                 log.u_bbr.flex4 = diag->inp_hptsslot;
5705                 log.u_bbr.flex5 = diag->slot_remaining;
5706                 log.u_bbr.flex6 = diag->need_new_to;
5707                 log.u_bbr.flex7 = diag->p_hpts_active;
5708                 log.u_bbr.flex8 = diag->p_on_min_sleep;
5709                 /* Hijack other fields as needed */
5710                 log.u_bbr.epoch = diag->have_slept;
5711                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
5712                 log.u_bbr.pkts_out = diag->co_ret;
5713                 log.u_bbr.applimited = diag->hpts_sleep_time;
5714                 log.u_bbr.delivered = diag->p_prev_slot;
5715                 log.u_bbr.inflight = diag->p_runningslot;
5716                 log.u_bbr.bw_inuse = diag->wheel_slot;
5717                 log.u_bbr.rttProp = diag->wheel_cts;
5718                 log.u_bbr.timeStamp = cts;
5719                 log.u_bbr.delRate = diag->maxslots;
5720                 log.u_bbr.cur_del_rate = diag->p_curtick;
5721                 log.u_bbr.cur_del_rate <<= 32;
5722                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
5723                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5724                     &rack->rc_inp->inp_socket->so_rcv,
5725                     &rack->rc_inp->inp_socket->so_snd,
5726                     BBR_LOG_HPTSDIAG, 0,
5727                     0, &log, false, tv);
5728         }
5729
5730 }
5731
5732 static void
5733 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
5734 {
5735         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5736                 union tcp_log_stackspecific log;
5737                 struct timeval tv;
5738
5739                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5740                 log.u_bbr.flex1 = sb->sb_flags;
5741                 log.u_bbr.flex2 = len;
5742                 log.u_bbr.flex3 = sb->sb_state;
5743                 log.u_bbr.flex8 = type;
5744                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5745                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5746                     &rack->rc_inp->inp_socket->so_rcv,
5747                     &rack->rc_inp->inp_socket->so_snd,
5748                     TCP_LOG_SB_WAKE, 0,
5749                     len, &log, false, &tv);
5750         }
5751 }
5752
5753 static void
5754 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
5755       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
5756 {
5757         struct hpts_diag diag;
5758         struct inpcb *inp;
5759         struct timeval tv;
5760         uint32_t delayed_ack = 0;
5761         uint32_t hpts_timeout;
5762         uint32_t entry_slot = slot;
5763         uint8_t stopped;
5764         uint32_t left = 0;
5765         uint32_t us_cts;
5766
5767         inp = tp->t_inpcb;
5768         if ((tp->t_state == TCPS_CLOSED) ||
5769             (tp->t_state == TCPS_LISTEN)) {
5770                 return;
5771         }
5772         if (inp->inp_in_hpts) {
5773                 /* Already on the pacer */
5774                 return;
5775         }
5776         stopped = rack->rc_tmr_stopped;
5777         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
5778                 left = rack->r_ctl.rc_timer_exp - cts;
5779         }
5780         rack->r_ctl.rc_timer_exp = 0;
5781         rack->r_ctl.rc_hpts_flags = 0;
5782         us_cts = tcp_get_usecs(&tv);
5783         /* Now early/late accounting */
5784         rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
5785         if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
5786                 /*
5787                  * We have an early carry over set,
5788                  * we can always add more time so we
5789                  * can always make this compensation.
5790                  *
5791                  * Note that if acks are allowed to wake us, we do not
5792                  * penalize the next timer for being awakened
5793                  * by an ack, aka the rc_agg_early (non-paced mode).
5794                  */
5795                 slot += rack->r_ctl.rc_agg_early;
5796                 rack->r_early = 0;
5797                 rack->r_ctl.rc_agg_early = 0;
5798         }
5799         if (rack->r_late) {
5800                 /*
5801                  * This is harder, we can
5802                  * compensate some but it
5803                  * really depends on what
5804                  * the current pacing time is.
5805                  */
5806                 if (rack->r_ctl.rc_agg_delayed >= slot) {
5807                         /*
5808                          * We can't compensate for it all.
5809                          * And we have to have some time
5810                          * on the clock. We always have a min
5811                          * 10 slots (10 x 10 i.e. 100 usecs).
5812                          */
5813                         if (slot <= HPTS_TICKS_PER_SLOT) {
5814                                 /* We gain delay */
5815                                 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
5816                                 slot = HPTS_TICKS_PER_SLOT;
5817                         } else {
5818                                 /* We take off some */
5819                                 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
5820                                 slot = HPTS_TICKS_PER_SLOT;
5821                         }
5822                 } else {
5823                         slot -= rack->r_ctl.rc_agg_delayed;
5824                         rack->r_ctl.rc_agg_delayed = 0;
5825                         /* Make sure we have 100 useconds at minimum */
5826                         if (slot < HPTS_TICKS_PER_SLOT) {
5827                                 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
5828                                 slot = HPTS_TICKS_PER_SLOT;
5829                         }
5830                         if (rack->r_ctl.rc_agg_delayed == 0)
5831                                 rack->r_late = 0;
5832                 }
5833         }
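        /*
         * Illustrative example (hypothetical numbers): if the previous send
         * went out 3000 usec early (rc_agg_early = 3000) and acks cannot send
         * data, a new pacing slot of 10000 usec is stretched to 13000 usec.
         * Conversely, if we were 3000 usec late (rc_agg_delayed = 3000) the
         * slot is trimmed to 7000 usec, but never below HPTS_TICKS_PER_SLOT.
         */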
5834         if (slot) {
5835                 /* We are pacing too */
5836                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
5837         }
5838         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
5839 #ifdef NETFLIX_EXP_DETECTION
5840         if (rack->sack_attack_disable &&
5841             (slot < tcp_sad_pacing_interval)) {
5842                 /*
5843                  * We have a potential attacker on
5844                  * the line. We have possibly some
5845                  * (or no) pacing time set. We want to
5846                  * slow down the processing of sacks by some
5847                  * amount (if it is an attacker). Set the default
5848                  * slot for attackers in place (unless the original
5849                  * interval is longer). It is stored in
5850                  * micro-seconds, so let's convert to msecs.
5851                  */
5852                 slot = tcp_sad_pacing_interval;
5853         }
5854 #endif
5855         if (tp->t_flags & TF_DELACK) {
5856                 delayed_ack = TICKS_2_USEC(tcp_delacktime);
5857                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
5858         }
5859         if (delayed_ack && ((hpts_timeout == 0) ||
5860                             (delayed_ack < hpts_timeout)))
5861                 hpts_timeout = delayed_ack;
5862         else
5863                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
5864         /*
5865          * If no timers are going to run and we will fall off the hptsi
5866          * wheel, we resort to a keep-alive timer if it is configured.
5867          */
5868         if ((hpts_timeout == 0) &&
5869             (slot == 0)) {
5870                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
5871                     (tp->t_state <= TCPS_CLOSING)) {
5872                         /*
5873                          * Ok we have no timer (persists, rack, tlp, rxt  or
5874                          * del-ack), we don't have segments being paced. So
5875                          * all that is left is the keepalive timer.
5876                          */
5877                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
5878                                 /* Get the established keep-alive time */
5879                                 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
5880                         } else {
5881                                 /*
5882                                  * Get the initial setup keep-alive time,
5883                                  * note that this is probably not going to
5884                                  * happen, since rack will be running a rxt timer
5885                                  * if a SYN of some sort is outstanding. It is
5886                                  * actually handled in rack_timeout_rxt().
5887                                  */
5888                                 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
5889                         }
5890                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
5891                         if (rack->in_probe_rtt) {
5892                                 /*
5893                                  * We want to instead not wake up a long time from
5894                                  * now but to wake up about the time we would
5895                                  * exit probe-rtt and initiate a keep-alive ack.
5896                                  * This will get us out of probe-rtt and update
5897                                  * our min-rtt.
5898                                  */
5899                                 hpts_timeout = rack_min_probertt_hold;
5900                         }
5901                 }
5902         }
5903         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
5904             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
5905                 /*
5906                  * RACK, TLP, persists and RXT timers all are restartable
5907          * based on input actions, i.e. we received a packet (ack
5908          * or sack) and that changes things (rwnd, or snd_una, etc.).
5909                  * Thus we can restart them with a new value. For
5910                  * keep-alive, delayed_ack we keep track of what was left
5911                  * and restart the timer with a smaller value.
5912                  */
5913                 if (left < hpts_timeout)
5914                         hpts_timeout = left;
5915         }
5916         if (hpts_timeout) {
5917                 /*
5918                  * Hack alert for now we can't time-out over 2,147,483
5919                  * seconds (a bit more than 596 hours), which is probably ok
5920                  * :).
5921                  */
5922                 if (hpts_timeout > 0x7ffffffe)
5923                         hpts_timeout = 0x7ffffffe;
5924                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
5925         }
5926         rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
5927         if ((rack->gp_ready == 0) &&
5928             (rack->use_fixed_rate == 0) &&
5929             (hpts_timeout < slot) &&
5930             (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
5931                 /*
5932                  * We have no good estimate yet for the
5933                  * old clunky burst mitigation or the
5934                  * real pacing. And the tlp or rxt is smaller
5935                  * than the pacing calculation. Let's not
5936                  * pace that long since we know the calculation
5937                  * so far is not accurate.
5938                  */
5939                 slot = hpts_timeout;
5940         }
5941         rack->r_ctl.last_pacing_time = slot;
5942         /**
5943          * Turn off all the flags for queuing by default. The
5944          * flags have important meanings to what happens when
5945          * LRO interacts with the transport. Most likely (by default now)
5946          * mbuf_queueing and ack compression are on. So the transport
5947          * has a couple of flags that control what happens (if those
5948          * are not on then these flags won't have any effect since it
5949          * won't go through the queuing LRO path).
5950          *
5951          * INP_MBUF_QUEUE_READY - This flag says that I am busy
5952          *                        pacing output, so don't disturb. But
5953          *                        it also means LRO can wake me if there
5954          *                        is a SACK arrival.
5955          *
5956          * INP_DONT_SACK_QUEUE - This flag is used in conjunction
5957          *                       with the above flag (QUEUE_READY) and
5958          *                       when present it says don't even wake me
5959          *                       if a SACK arrives.
5960          *
5961          * The idea behind these flags is that if we are pacing we
5962          * set the MBUF_QUEUE_READY and only get woken up if
5963          * a SACK arrives (which could change things) or if
5964          * our pacing timer expires. If, however, we have a rack
5965          * timer running, then we don't even want a sack to wake
5966          * us since the rack timer has to expire before we can send.
5967          *
5968          * Other cases should usually have none of the flags set
5969          * so LRO can call into us.
5970          */
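        /*
         * Summary of the settings made below (illustrative, derived from the
         * code that follows):
         *
         *   pacing slot set, no rack timer                 -> INP_MBUF_QUEUE_READY only
         *   pacing slot set + rack timer, r_rr_config != 3 -> both flags set
         *   rc_ack_can_sendout_data set                    -> neither flag (acks may send)
         *   no pacing slot (timer only, or none)           -> neither flag
         */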
5971         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
5972         if (slot) {
5973                 rack->r_ctl.rc_last_output_to = us_cts + slot;
5974                 /*
5975                  * A pacing timer (slot) is being set, in
5976                  * such a case we cannot send (we are blocked by
5977                  * the timer). So lets tell LRO that it should not
5978                  * wake us unless there is a SACK. Note this only
5979                  * will be effective if mbuf queueing is on or
5980                  * compressed acks are being processed.
5981                  */
5982                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
5983                 /*
5984                  * But wait if we have a Rack timer running
5985                  * even a SACK should not disturb us (with
5986                  * the exception of r_rr_config 3).
5987                  */
5988                 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
5989                     (rack->r_rr_config != 3))
5990                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
5991                 if (rack->rc_ack_can_sendout_data) {
5992                         /*
5993                          * Ahh but wait, this is that special case
5994                          * where the pacing timer can be disturbed;
5995                          * back out the changes (used for non-paced
5996                          * burst limiting).
5997                          */
5998                         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
5999                 }
6000                 if ((rack->use_rack_rr) &&
6001                     (rack->r_rr_config < 2) &&
6002                     ((hpts_timeout) && (hpts_timeout < slot))) {
6003                         /*
6004                          * Arrange for the hpts to kick back in after the
6005                          * t-o if the t-o does not cause a send.
6006                          */
6007                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
6008                                                    __LINE__, &diag);
6009                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6010                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6011                 } else {
6012                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
6013                                                    __LINE__, &diag);
6014                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6015                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
6016                 }
6017         } else if (hpts_timeout) {
6018                 /*
6019                  * With respect to inp_flags2 here, let's let any new acks wake
6020                  * us up. Since we are not pacing (no pacing timer), output
6021                  * can happen, so we should let it. If it's a Rack timer, then any inbound
6022                  * packet probably won't change the sending (we will be blocked)
6023                  * but it may change the prr stats, so letting it in (the defaults
6024                  * set at the start of this block) is good enough.
6025                  */
6026                 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
6027                                            __LINE__, &diag);
6028                 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6029                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
6030         } else {
6031                 /* No timer starting */
6032 #ifdef INVARIANTS
6033                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
6034                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
6035                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
6036                 }
6037 #endif
6038         }
6039         rack->rc_tmr_stopped = 0;
6040         if (slot)
6041                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
6042 }
6043
6044 /*
6045  * RACK Timer, here we simply do logging and housekeeping.
6046  * The normal rack_output() function will call the
6047  * appropriate thing to check if we need to do a RACK retransmit.
6048  * We return 1, saying don't proceed with rack_output only
6049  * when all timers have been stopped (destroyed PCB?).
6050  */
6051 static int
6052 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6053 {
6054         /*
6055          * This timer simply provides an internal trigger to send out data.
6056          * The check_recovery_mode call will see if there are needed
6057          * retransmissions, if so we will enter fast-recovery. The output
6058          * call may or may not do the same thing depending on sysctl
6059          * settings.
6060          */
6061         struct rack_sendmap *rsm;
6062
6063         if (tp->t_timers->tt_flags & TT_STOPPED) {
6064                 return (1);
6065         }
6066         counter_u64_add(rack_to_tot, 1);
6067         if (rack->r_state && (rack->r_state != tp->t_state))
6068                 rack_set_state(tp, rack);
6069         rack->rc_on_min_to = 0;
6070         rsm = rack_check_recovery_mode(tp, cts);
6071         rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
6072         if (rsm) {
6073                 rack->r_ctl.rc_resend = rsm;
6074                 rack->r_timer_override = 1;
6075                 if (rack->use_rack_rr) {
6076                         /*
6077                          * Don't accumulate extra pacing delay;
6078                          * we are allowing the rack timer to
6079                          * over-ride pacing, i.e. rrr takes precedence
6080                          * if the pacing interval is longer than the rrr
6081                          * time (in other words we get the min of the pacing
6082                          * time versus the rrr pacing time).
6083                          */
6084                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6085                 }
6086         }
6087         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
6088         if (rsm == NULL) {
6089                 /* restart a timer and return 1 */
6090                 rack_start_hpts_timer(rack, tp, cts,
6091                                       0, 0, 0);
6092                 return (1);
6093         }
6094         return (0);
6095 }
6096
6097 static void
6098 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
6099 {
6100         if (rsm->m->m_len > rsm->orig_m_len) {
6101                 /*
6102                  * Mbuf grew, caused by sbcompress, our offset does
6103                  * not change.
6104                  */
6105                 rsm->orig_m_len = rsm->m->m_len;
6106         } else if (rsm->m->m_len < rsm->orig_m_len) {
6107                 /*
6108                  * Mbuf shrank, trimmed off the top by an ack, our
6109                  * offset changes.
6110                  */
6111                 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
6112                 rsm->orig_m_len = rsm->m->m_len;
6113         }
6114 }
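/*
 * Illustrative example (hypothetical values): if orig_m_len was 1448 and an
 * ack trimmed the front of the mbuf down to 1000 bytes, soff is reduced by
 * 448 so it still points at the same byte of the send buffer, and orig_m_len
 * is resynced to the new m_len.
 */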
6115
6116 static void
6117 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
6118 {
6119         struct mbuf *m;
6120         uint32_t soff;
6121
6122         if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) {
6123                 /* Fix up the orig_m_len and possibly the mbuf offset */
6124                 rack_adjust_orig_mlen(src_rsm);
6125         }
6126         m = src_rsm->m;
6127         soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
6128         while (soff >= m->m_len) {
6129                 /* Move out past this mbuf */
6130                 soff -= m->m_len;
6131                 m = m->m_next;
6132                 KASSERT((m != NULL),
6133                         ("rsm:%p nrsm:%p hit at soff:%u null m",
6134                          src_rsm, rsm, soff));
6135         }
6136         rsm->m = m;
6137         rsm->soff = soff;
6138         rsm->orig_m_len = m->m_len;
6139 }
6140
6141 static __inline void
6142 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
6143                struct rack_sendmap *rsm, uint32_t start)
6144 {
6145         int idx;
6146
6147         nrsm->r_start = start;
6148         nrsm->r_end = rsm->r_end;
6149         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
6150         nrsm->r_flags = rsm->r_flags;
6151         nrsm->r_dupack = rsm->r_dupack;
6152         nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
6153         nrsm->r_rtr_bytes = 0;
6154         nrsm->r_fas = rsm->r_fas;
6155         rsm->r_end = nrsm->r_start;
6156         nrsm->r_just_ret = rsm->r_just_ret;
6157         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
6158                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
6159         }
6160         /* Now if we have SYN flag we keep it on the left edge */
6161         if (nrsm->r_flags & RACK_HAS_SYN)
6162                 nrsm->r_flags &= ~RACK_HAS_SYN;
6163         /* Now if we have a FIN flag we keep it on the right edge */
6164         if (rsm->r_flags & RACK_HAS_FIN)
6165                 rsm->r_flags &= ~RACK_HAS_FIN;
6166         /* Push bit must go to the right edge as well */
6167         if (rsm->r_flags & RACK_HAD_PUSH)
6168                 rsm->r_flags &= ~RACK_HAD_PUSH;
6169         /* Clone over the state of the hw_tls flag */
6170         nrsm->r_hw_tls = rsm->r_hw_tls;
6171         /*
6172          * Now we need to find nrsm's new location in the mbuf chain;
6173          * we basically calculate a new offset, which is soff +
6174          * how much is left in the original rsm. Then we walk out the mbuf
6175          * chain to find the right position; it may be the same mbuf
6176          * or maybe not.
6177          */
6178         KASSERT(((rsm->m != NULL) ||
6179                  (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
6180                 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
6181         if (rsm->m)
6182                 rack_setup_offset_for_rsm(rsm, nrsm);
6183 }
6184
6185 static struct rack_sendmap *
6186 rack_merge_rsm(struct tcp_rack *rack,
6187                struct rack_sendmap *l_rsm,
6188                struct rack_sendmap *r_rsm)
6189 {
6190         /*
6191          * We are merging two ack'd RSM's,
6192          * the l_rsm is on the left (lower seq
6193          * values) and the r_rsm is on the right
6194          * (higher seq value). The simplest way
6195          * to merge these is to move the right
6196          * one into the left. I don't think there
6197          * is any reason we need to try to find
6198          * the oldest (or last oldest retransmitted).
6199          */
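        /*
         * For example (hypothetical sequence numbers): merging l_rsm
         * [1000, 2448) with r_rsm [2448, 3896) leaves a single map entry
         * covering [1000, 3896); r_rsm is then freed.
         */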
6200         struct rack_sendmap *rm;
6201
6202         rack_log_map_chg(rack->rc_tp, rack, NULL,
6203                          l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
6204         l_rsm->r_end = r_rsm->r_end;
6205         if (l_rsm->r_dupack < r_rsm->r_dupack)
6206                 l_rsm->r_dupack = r_rsm->r_dupack;
6207         if (r_rsm->r_rtr_bytes)
6208                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
6209         if (r_rsm->r_in_tmap) {
6210                 /* This really should not happen */
6211                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
6212                 r_rsm->r_in_tmap = 0;
6213         }
6214
6215         /* Now the flags */
6216         if (r_rsm->r_flags & RACK_HAS_FIN)
6217                 l_rsm->r_flags |= RACK_HAS_FIN;
6218         if (r_rsm->r_flags & RACK_TLP)
6219                 l_rsm->r_flags |= RACK_TLP;
6220         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
6221                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
6222         if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
6223             ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
6224                 /*
6225                  * If both are app-limited then let the
6226                  * free lower the count. If right is app
6227                  * limited and left is not, transfer.
6228                  */
6229                 l_rsm->r_flags |= RACK_APP_LIMITED;
6230                 r_rsm->r_flags &= ~RACK_APP_LIMITED;
6231                 if (r_rsm == rack->r_ctl.rc_first_appl)
6232                         rack->r_ctl.rc_first_appl = l_rsm;
6233         }
6234         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
6235 #ifdef INVARIANTS
6236         if (rm != r_rsm) {
6237                 panic("removing head in rack:%p rsm:%p rm:%p",
6238                       rack, r_rsm, rm);
6239         }
6240 #endif
6241         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
6242                 /* Transfer the split limit to the map we free */
6243                 r_rsm->r_limit_type = l_rsm->r_limit_type;
6244                 l_rsm->r_limit_type = 0;
6245         }
6246         rack_free(rack, r_rsm);
6247         return (l_rsm);
6248 }
6249
6250 /*
6251  * TLP Timer, here we simply setup what segment we want to
6252  * have the TLP expire on, the normal rack_output() will then
6253  * send it out.
6254  *
6255  * We return 1, saying don't proceed with rack_output only
6256  * when all timers have been stopped (destroyed PCB?).
6257  */
6258 static int
6259 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
6260 {
6261         /*
6262          * Tail Loss Probe.
6263          */
6264         struct rack_sendmap *rsm = NULL;
6265         struct rack_sendmap *insret;
6266         struct socket *so;
6267         uint32_t amm;
6268         uint32_t out, avail;
6269         int collapsed_win = 0;
6270
6271         if (tp->t_timers->tt_flags & TT_STOPPED) {
6272                 return (1);
6273         }
6274         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
6275                 /* Its not time yet */
6276                 return (0);
6277         }
6278         if (ctf_progress_timeout_check(tp, true)) {
6279                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6280                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6281                 return (1);
6282         }
6283         /*
6284          * A TLP timer has expired. We have been idle for 2 rtts. So we now
6285          * need to figure out how to force a full MSS segment out.
6286          */
6287         rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
6288         rack->r_ctl.retran_during_recovery = 0;
6289         rack->r_ctl.dsack_byte_cnt = 0;
6290         counter_u64_add(rack_tlp_tot, 1);
6291         if (rack->r_state && (rack->r_state != tp->t_state))
6292                 rack_set_state(tp, rack);
6293         so = tp->t_inpcb->inp_socket;
6294         avail = sbavail(&so->so_snd);
6295         out = tp->snd_max - tp->snd_una;
6296         if (out > tp->snd_wnd) {
6297                 /* special case, we need a retransmission */
6298                 collapsed_win = 1;
6299                 goto need_retran;
6300         }
6301         if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
6302                 rack->r_ctl.dsack_persist--;
6303                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6304                         rack->r_ctl.num_dsack = 0;
6305                 }
6306                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6307         }
6308         if ((tp->t_flags & TF_GPUTINPROG) &&
6309             (rack->r_ctl.rc_tlp_cnt_out == 1)) {
6310                 /*
6311                  * If this is the second in a row
6312                  * TLP and we are doing a measurement
6313                  * it's time to abandon the measurement.
6314                  * Something is likely broken on
6315                  * the client's network and measuring a
6316                  * broken network does us no good.
6317                  */
6318                 tp->t_flags &= ~TF_GPUTINPROG;
6319                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6320                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6321                                            tp->gput_seq,
6322                                            0, 0, 18, __LINE__, NULL, 0);
6323         }
6324         /*
6325          * Check our send-oldest-always setting, and if
6326          * there is an oldest segment to send, jump to need_retran.
6327          */
6328         if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
6329                 goto need_retran;
6330
6331         if (avail > out) {
6332                 /* New data is available */
6333                 amm = avail - out;
6334                 if (amm > ctf_fixed_maxseg(tp)) {
6335                         amm = ctf_fixed_maxseg(tp);
6336                         if ((amm + out) > tp->snd_wnd) {
6337                                 /* We are rwnd limited */
6338                                 goto need_retran;
6339                         }
6340                 } else if (amm < ctf_fixed_maxseg(tp)) {
6341                         /* not enough to fill an MTU */
6342                         goto need_retran;
6343                 }
6344                 if (IN_FASTRECOVERY(tp->t_flags)) {
6345                         /* Unlikely */
6346                         if (rack->rack_no_prr == 0) {
6347                                 if (out + amm <= tp->snd_wnd) {
6348                                         rack->r_ctl.rc_prr_sndcnt = amm;
6349                                         rack->r_ctl.rc_tlp_new_data = amm;
6350                                         rack_log_to_prr(rack, 4, 0);
6351                                 }
6352                         } else
6353                                 goto need_retran;
6354                 } else {
6355                         /* Set the send-new override */
6356                         if (out + amm <= tp->snd_wnd)
6357                                 rack->r_ctl.rc_tlp_new_data = amm;
6358                         else
6359                                 goto need_retran;
6360                 }
6361                 rack->r_ctl.rc_tlpsend = NULL;
6362                 counter_u64_add(rack_tlp_newdata, 1);
6363                 goto send;
6364         }
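        /*
         * Illustrative example (hypothetical values): with avail = 5000,
         * out = 3000 and a 1448-byte maxseg, amm is clamped to 1448; as long
         * as out + amm (4448) still fits in snd_wnd, the TLP probes new data
         * rather than retransmitting. If another full segment cannot fit in
         * the window we fall through to need_retran below.
         */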
6365 need_retran:
6366         /*
6367          * Ok we need to arrange the last un-acked segment to be re-sent, or
6368          * optionally the first un-acked segment.
6369          */
6370         if (collapsed_win == 0) {
6371                 if (rack_always_send_oldest)
6372                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6373                 else {
6374                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6375                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
6376                                 rsm = rack_find_high_nonack(rack, rsm);
6377                         }
6378                 }
6379                 if (rsm == NULL) {
6380                         counter_u64_add(rack_tlp_does_nada, 1);
6381 #ifdef TCP_BLACKBOX
6382                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6383 #endif
6384                         goto out;
6385                 }
6386         } else {
6387                 /*
6388                  * We must find the last segment
6389                  * that was acceptable to the client.
6390                  */
6391                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6392                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
6393                                 /* Found one */
6394                                 break;
6395                         }
6396                 }
6397                 if (rsm == NULL) {
6398                         /* None? if so send the first */
6399                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6400                         if (rsm == NULL) {
6401                                 counter_u64_add(rack_tlp_does_nada, 1);
6402 #ifdef TCP_BLACKBOX
6403                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6404 #endif
6405                                 goto out;
6406                         }
6407                 }
6408         }
6409         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
6410                 /*
6411                  * We need to split this, the last segment, in two.
6412                  */
6413                 struct rack_sendmap *nrsm;
6414
6415                 nrsm = rack_alloc_full_limit(rack);
6416                 if (nrsm == NULL) {
6417                         /*
6418                          * No memory to split, we will just exit and punt
6419                          * off to the RXT timer.
6420                          */
6421                         counter_u64_add(rack_tlp_does_nada, 1);
6422                         goto out;
6423                 }
6424                 rack_clone_rsm(rack, nrsm, rsm,
6425                                (rsm->r_end - ctf_fixed_maxseg(tp)));
6426                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
6427                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6428 #ifdef INVARIANTS
6429                 if (insret != NULL) {
6430                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6431                               nrsm, insret, rack, rsm);
6432                 }
6433 #endif
6434                 if (rsm->r_in_tmap) {
6435                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6436                         nrsm->r_in_tmap = 1;
6437                 }
6438                 rsm = nrsm;
6439         }
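        /*
         * Illustrative example (hypothetical sequence numbers): if the chosen
         * rsm covered [10000, 14096) with a 1448-byte maxseg, the clone above
         * becomes [12648, 14096) and it is that trailing segment the TLP will
         * (re)send.
         */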
6440         rack->r_ctl.rc_tlpsend = rsm;
6441 send:
6442         /* Make sure output path knows we are doing a TLP */
6443         *doing_tlp = 1;
6444         rack->r_timer_override = 1;
6445         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6446         return (0);
6447 out:
6448         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6449         return (0);
6450 }
6451
6452 /*
6453  * Delayed ack Timer, here we simply need to set the
6454  * TF_ACKNOW flag and remove the TF_DELACK flag. From there
6455  * the output routine will send the ack out.
6456  *
6457  * We only return 1, saying don't proceed, if all timers
6458  * are stopped (destroyed PCB?).
6459  */
6460 static int
6461 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6462 {
6463         if (tp->t_timers->tt_flags & TT_STOPPED) {
6464                 return (1);
6465         }
6466         rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
6467         tp->t_flags &= ~TF_DELACK;
6468         tp->t_flags |= TF_ACKNOW;
6469         KMOD_TCPSTAT_INC(tcps_delack);
6470         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
6471         return (0);
6472 }
6473
6474 /*
6475  * Persists timer, here we simply send the
6476  * same thing as a keepalive would:
6477  * the one byte send.
6478  *
6479  * We only return 1, saying don't proceed, if all timers
6480  * are stopped (destroyed PCB?).
6481  */
6482 static int
6483 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6484 {
6485         struct tcptemp *t_template;
6486         struct inpcb *inp;
6487         int32_t retval = 1;
6488
6489         inp = tp->t_inpcb;
6490
6491         if (tp->t_timers->tt_flags & TT_STOPPED) {
6492                 return (1);
6493         }
6494         if (rack->rc_in_persist == 0)
6495                 return (0);
6496         if (ctf_progress_timeout_check(tp, false)) {
6497                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6498                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6499                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
6500                 return (1);
6501         }
6502         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
6503         /*
6504          * Persistence timer into zero window. Force a byte to be output, if
6505          * possible.
6506          */
6507         KMOD_TCPSTAT_INC(tcps_persisttimeo);
6508         /*
6509          * Hack: if the peer is dead/unreachable, we do not time out if the
6510          * window is closed.  After a full backoff, drop the connection if
6511          * the idle time (no responses to probes) reaches the maximum
6512          * backoff that we would use if retransmitting.
6513          */
6514         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
6515             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
6516              TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
6517                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6518                 retval = 1;
6519                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6520                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6521                 goto out;
6522         }
6523         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
6524             tp->snd_una == tp->snd_max)
6525                 rack_exit_persist(tp, rack, cts);
6526         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
6527         /*
6528          * If the user has closed the socket then drop a persisting
6529          * connection after a much reduced timeout.
6530          */
6531         if (tp->t_state > TCPS_CLOSE_WAIT &&
6532             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
6533                 retval = 1;
6534                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6535                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6536                 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6537                 goto out;
6538         }
6539         t_template = tcpip_maketemplate(rack->rc_inp);
6540         if (t_template) {
6541                 /* only set it if we were answered */
6542                 if (rack->forced_ack == 0) {
6543                         rack->forced_ack = 1;
6544                         rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6545                 }
6546                 tcp_respond(tp, t_template->tt_ipgen,
6547                             &t_template->tt_t, (struct mbuf *)NULL,
6548                             tp->rcv_nxt, tp->snd_una - 1, 0);
6549                 /* This sends an ack */
6550                 if (tp->t_flags & TF_DELACK)
6551                         tp->t_flags &= ~TF_DELACK;
6552                 free(t_template, M_TEMP);
6553         }
6554         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
6555                 tp->t_rxtshift++;
6556 out:
6557         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
6558         rack_start_hpts_timer(rack, tp, cts,
6559                               0, 0, 0);
6560         return (retval);
6561 }
6562
6563 /*
6564  * If a keepalive goes off, we had no other timers
6565  * happening. We always return 1 here since this
6566  * routine either drops the connection or sends
6567  * out a segment via tcp_respond().
6568  */
6569 static int
6570 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6571 {
6572         struct tcptemp *t_template;
6573         struct inpcb *inp;
6574
6575         if (tp->t_timers->tt_flags & TT_STOPPED) {
6576                 return (1);
6577         }
6578         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
6579         inp = tp->t_inpcb;
6580         rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
6581         /*
6582          * Keep-alive timer went off; send something or drop connection if
6583          * idle for too long.
6584          */
6585         KMOD_TCPSTAT_INC(tcps_keeptimeo);
6586         if (tp->t_state < TCPS_ESTABLISHED)
6587                 goto dropit;
6588         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
6589             tp->t_state <= TCPS_CLOSING) {
6590                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
6591                         goto dropit;
6592                 /*
6593                  * Send a packet designed to force a response if the peer is
6594                  * up and reachable: either an ACK if the connection is
6595                  * still alive, or an RST if the peer has closed the
6596                  * connection due to timeout or reboot. Using sequence
6597                  * number tp->snd_una-1 causes the transmitted zero-length
6598                  * segment to lie outside the receive window; by the
6599                  * protocol spec, this requires the correspondent TCP to
6600                  * respond.
6601                  */
6602                 KMOD_TCPSTAT_INC(tcps_keepprobe);
6603                 t_template = tcpip_maketemplate(inp);
6604                 if (t_template) {
6605                         if (rack->forced_ack == 0) {
6606                                 rack->forced_ack = 1;
6607                                 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6608                         }
6609                         tcp_respond(tp, t_template->tt_ipgen,
6610                             &t_template->tt_t, (struct mbuf *)NULL,
6611                             tp->rcv_nxt, tp->snd_una - 1, 0);
6612                         free(t_template, M_TEMP);
6613                 }
6614         }
6615         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
6616         return (1);
6617 dropit:
6618         KMOD_TCPSTAT_INC(tcps_keepdrops);
6619         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6620         tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
6621         return (1);
6622 }
6623
6624 /*
6625  * Retransmit helper function, clear up all the ack
6626  * flags and take care of important bookkeeping.
6627  */
6628 static void
6629 rack_remxt_tmr(struct tcpcb *tp)
6630 {
6631         /*
6632          * The retransmit timer went off, all sack'd blocks must be
6633          * un-acked.
6634          */
6635         struct rack_sendmap *rsm, *trsm = NULL;
6636         struct tcp_rack *rack;
6637
6638         rack = (struct tcp_rack *)tp->t_fb_ptr;
6639         rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
6640         rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
6641         if (rack->r_state && (rack->r_state != tp->t_state))
6642                 rack_set_state(tp, rack);
6643         /*
6644          * Ideally we would like to be able to
6645          * mark SACK-PASS on anything not acked here.
6646          *
6647          * However, if we do that we would burst out
6648          * all that data 1ms apart. This would be unwise,
6649          * so for now we will just let the normal rxt timer
6650          * and tlp timer take care of it.
6651          *
6652          * Also we really need to stick them back in sequence
6653          * order. This way we send in the proper order and any
6654          * sacks that come floating in will "re-ack" the data.
6655          * To do this we zap the tmap with an INIT and then
6656          * walk through and place every rsm in the RB tree
6657          * back in its seq ordered place.
6658          */
6659         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6660         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6661                 rsm->r_dupack = 0;
6662                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
6663                 /* We must re-add it back to the tlist */
6664                 if (trsm == NULL) {
6665                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6666                 } else {
6667                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
6668                 }
6669                 rsm->r_in_tmap = 1;
6670                 trsm = rsm;
6671                 if (rsm->r_flags & RACK_ACKED)
6672                         rsm->r_flags |= RACK_WAS_ACKED;
6673                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
6674                 rsm->r_flags |= RACK_MUST_RXT;
6675         }
6676         /* Clear the count (we just un-acked them) */
6677         rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
6678         rack->r_ctl.rc_sacked = 0;
6679         rack->r_ctl.rc_sacklast = NULL;
6680         rack->r_ctl.rc_agg_delayed = 0;
6681         rack->r_early = 0;
6682         rack->r_ctl.rc_agg_early = 0;
6683         rack->r_late = 0;
6684         /* Clear the tlp rtx mark */
6685         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6686         if (rack->r_ctl.rc_resend != NULL)
6687                 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
6688         rack->r_ctl.rc_prr_sndcnt = 0;
6689         rack_log_to_prr(rack, 6, 0);
6690         rack->r_timer_override = 1;
6691         if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
6692 #ifdef NETFLIX_EXP_DETECTION
6693             || (rack->sack_attack_disable != 0)
6694 #endif
6695                     ) && ((tp->t_flags & TF_SENTFIN) == 0)) {
6696                 /*
6697                  * For non-sack customers new data
6698                  * needs to go out as retransmits until
6699                  * we retransmit up to snd_max.
6700                  */
6701                 rack->r_must_retran = 1;
6702                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
6703                                                 rack->r_ctl.rc_sacked);
6704         }
6705         rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
6706 }
6707
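/*
 * Note on the conversion below (illustrative, hypothetical values): the
 * legacy t_srtt is kept in ticks shifted left by TCP_RTT_SHIFT (5), so the
 * low 5 bits are 1/32nds of a tick. With hz = 1000 and t_srtt = 100,
 * val = 100 >> 5 = 3 ticks = 3000 usec and frac = 4, which contributes
 * 4 * 1000 / 32 = 125 usec, for a converted srtt of 3125 usec. t_rttvar is
 * converted along the same lines.
 */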
6708 static void
6709 rack_convert_rtts(struct tcpcb *tp)
6710 {
6711         if (tp->t_srtt > 1) {
6712                 uint32_t val, frac;
6713
6714                 val = tp->t_srtt >> TCP_RTT_SHIFT;
6715                 frac = tp->t_srtt & 0x1f;
6716                 tp->t_srtt = TICKS_2_USEC(val);
6717                 /*
6718                  * frac is the fractional part of the srtt (if any)
6719                  * but it is in ticks and every bit represents
6720                  * 1/32nd of a hz.
6721                  */
6722                 if (frac) {
6723                         if (hz == 1000) {
6724                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
6725                         } else {
6726                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
6727                         }
6728                         tp->t_srtt += frac;
6729                 }
6730         }
6731         if (tp->t_rttvar) {
6732                 uint32_t val, frac;
6733
6734                 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
6735                 frac = tp->t_rttvar & 0x1f;
6736                 tp->t_rttvar = TICKS_2_USEC(val);
6737                 /*
6738                  * frac is the fractional part of the rttvar (if any)
6739                  * but it is in ticks and every bit represents
6740                  * 1/32nd of a hz.
6741                  */
6742                 if (frac) {
6743                         if (hz == 1000) {
6744                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
6745                         } else {
6746                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
6747                         }
6748                         tp->t_rttvar += frac;
6749                 }
6750         }
6751         tp->t_rxtcur = RACK_REXMTVAL(tp);
6752         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
6753                 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
6754         }
6755         if (tp->t_rxtcur > rack_rto_max) {
6756                 tp->t_rxtcur = rack_rto_max;
6757         }
6758 }
6759
6760 static void
6761 rack_cc_conn_init(struct tcpcb *tp)
6762 {
6763         struct tcp_rack *rack;
6764         uint32_t srtt;
6765
6766         rack = (struct tcp_rack *)tp->t_fb_ptr;
6767         srtt = tp->t_srtt;
6768         cc_conn_init(tp);
6769         /*
6770          * Now convert to rack's internal format,
6771          * if required.
6772          */
6773         if ((srtt == 0) && (tp->t_srtt != 0))
6774                 rack_convert_rtts(tp);
6775         /*
6776          * We want a chance to stay in slowstart as
6777          * we create a connection. TCP spec says that
6778          * initially ssthresh is infinite. For our
6779          * purposes that is the snd_wnd.
6780          */
6781         if (tp->snd_ssthresh < tp->snd_wnd) {
6782                 tp->snd_ssthresh = tp->snd_wnd;
6783         }
6784         /*
6785          * We also want to assure an IW worth of
6786          * data can get in flight.
6787          */
6788         if (rc_init_window(rack) < tp->snd_cwnd)
6789                 tp->snd_cwnd = rc_init_window(rack);
6790 }
6791
6792 /*
6793  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
6794  * we will setup to retransmit the lowest seq number outstanding.
6795  */
6796 static int
6797 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6798 {
6799         int32_t rexmt;
6800         struct inpcb *inp;
6801         int32_t retval = 0;
6802         bool isipv6;
6803
6804         inp = tp->t_inpcb;
6805         if (tp->t_timers->tt_flags & TT_STOPPED) {
6806                 return (1);
6807         }
6808         if ((tp->t_flags & TF_GPUTINPROG) &&
6809             (tp->t_rxtshift)) {
6810                 /*
6811                  * We have had a second timeout;
6812                  * measurements on successive rxt's are not profitable.
6813                  * It is unlikely to be of any use (the network is
6814                  * broken or the client went away).
6815                  */
6816                 tp->t_flags &= ~TF_GPUTINPROG;
6817                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6818                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6819                                            tp->gput_seq,
6820                                            0, 0, 18, __LINE__, NULL, 0);
6821         }
6822         if (ctf_progress_timeout_check(tp, false)) {
6823                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
6824                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6825                 tcp_set_inp_to_drop(inp, ETIMEDOUT);
6826                 return (1);
6827         }
6828         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
6829         rack->r_ctl.retran_during_recovery = 0;
6830         rack->r_ctl.dsack_byte_cnt = 0;
6831         if (IN_FASTRECOVERY(tp->t_flags))
6832                 tp->t_flags |= TF_WASFRECOVERY;
6833         else
6834                 tp->t_flags &= ~TF_WASFRECOVERY;
6835         if (IN_CONGRECOVERY(tp->t_flags))
6836                 tp->t_flags |= TF_WASCRECOVERY;
6837         else
6838                 tp->t_flags &= ~TF_WASCRECOVERY;
6839         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
6840             (tp->snd_una == tp->snd_max)) {
6841                 /* Nothing outstanding .. nothing to do */
6842                 return (0);
6843         }
6844         if (rack->r_ctl.dsack_persist) {
6845                 rack->r_ctl.dsack_persist--;
6846                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6847                         rack->r_ctl.num_dsack = 0;
6848                 }
6849                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6850         }
6851         /*
6852          * Rack can only run one timer  at a time, so we cannot
6853          * run a KEEPINIT (gating SYN sending) and a retransmit
6854          * timer for the SYN. So if we are in a front state and
6855          * have a KEEPINIT timer we need to check the first transmit
6856          * against now to see if we have exceeded the KEEPINIT time
6857          * (if one is set).
6858          */
6859         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
6860             (TP_KEEPINIT(tp) != 0)) {
6861                 struct rack_sendmap *rsm;
6862
6863                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6864                 if (rsm) {
6865                         /* Ok we have something outstanding to test keepinit with */
6866                         if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
6867                             ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
6868                                 /* We have exceeded the KEEPINIT time */
6869                                 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6870                                 goto drop_it;
6871                         }
6872                 }
6873         }
6874         /*
6875          * Retransmission timer went off.  Message has not been acked within
6876          * retransmit interval.  Back off to a longer retransmit interval
6877          * and retransmit one segment.
6878          */
6879         rack_remxt_tmr(tp);
6880         if ((rack->r_ctl.rc_resend == NULL) ||
6881             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
6882                 /*
6883                  * If the rwnd collapsed on
6884                  * the one we are retransmitting,
6885                  * it does not count against the
6886                  * rxt count.
6887                  */
6888                 tp->t_rxtshift++;
6889         }
6890         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
6891                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
6892 drop_it:
6893                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
6894                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
6895                 retval = 1;
6896                 tcp_set_inp_to_drop(rack->rc_inp,
6897                     (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
6898                 goto out;
6899         }
6900         if (tp->t_state == TCPS_SYN_SENT) {
6901                 /*
6902                  * If the SYN was retransmitted, indicate CWND to be limited
6903                  * to 1 segment in cc_conn_init().
6904                  */
6905                 tp->snd_cwnd = 1;
6906         } else if (tp->t_rxtshift == 1) {
6907                 /*
6908                  * first retransmit; record ssthresh and cwnd so they can be
6909                  * recovered if this turns out to be a "bad" retransmit. A
6910                  * retransmit is considered "bad" if an ACK for this segment
6911                  * is received within RTT/2 interval; the assumption here is
6912                  * that the ACK was already in flight.  See "On Estimating
6913                  * End-to-End Network Path Properties" by Allman and Paxson
6914                  * for more details.
6915                  */
6916                 tp->snd_cwnd_prev = tp->snd_cwnd;
6917                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
6918                 tp->snd_recover_prev = tp->snd_recover;
6919                 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
6920                 tp->t_flags |= TF_PREVVALID;
6921         } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
6922                 tp->t_flags &= ~TF_PREVVALID;
6923         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
6924         if ((tp->t_state == TCPS_SYN_SENT) ||
6925             (tp->t_state == TCPS_SYN_RECEIVED))
6926                 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
6927         else
6928                 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
6929
6930         RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
6931            max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
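        /*
         * Illustrative example (hypothetical values): with t_srtt = 50000 usec,
         * t_rttvar = 10000 usec and t_rxtshift = 2, the base value is
         * max(rack_rto_min, 50000 + 4 * 10000) usec; assuming rack_rto_min is
         * below that, tcp_backoff[2] = 4 gives rexmt = 360000 usec, which the
         * RANGESET above then bounds by rack_rto_max and pads with the
         * configured timer_slop.
         */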
6932         /*
6933          * We enter the path for PLMTUD if the connection is established or
6934          * in FIN_WAIT_1 status; the reason for the latter is that if the
6935          * amount of data we send is very small, we could send it in a couple
6936          * of packets and proceed straight to FIN. In that case we won't
6937          * catch the ESTABLISHED state.
6938          */
6939 #ifdef INET6
6940         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
6941 #else
6942         isipv6 = false;
6943 #endif
6944         if (((V_tcp_pmtud_blackhole_detect == 1) ||
6945             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
6946             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
6947             ((tp->t_state == TCPS_ESTABLISHED) ||
6948             (tp->t_state == TCPS_FIN_WAIT_1))) {
6949                 /*
6950                  * The idea here is that each stage of the MTU probe (usually
6951                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
6952                  * before clamping down further. 'tp->t_rxtshift % 2 == 0'
6953                  * should take care of that.
6954                  */
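                /*
                 * A sketch of the resulting schedule (derived from the
                 * conditions below and the restore branch further down):
                 *   t_rxtshift == 2: first clamp of t_maxseg (blackhole MSS)
                 *   t_rxtshift == 4: second clamp (default MSS, PMTUD disabled)
                 *   t_rxtshift >= 6: give up, restore t_pmtud_saved_maxseg
                 */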
6955                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
6956                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
6957                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
6958                     tp->t_rxtshift % 2 == 0)) {
6959                         /*
6960                          * Enter Path MTU Black-hole Detection mechanism:
6961                          * - Disable Path MTU Discovery (IP "DF" bit).
6962                          * - Reduce MTU to a lower value than what we
6963                          *   negotiated with the peer.
6964                          */
6965                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
6966                                 /* Record that we may have found a black hole. */
6967                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
6968                                 /* Keep track of previous MSS. */
6969                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
6970                         }
6971
6972                         /*
6973                          * Reduce the MSS to blackhole value or to the
6974                          * default in an attempt to retransmit.
6975                          */
6976 #ifdef INET6
6977                         if (isipv6 &&
6978                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
6979                                 /* Use the sysctl tuneable blackhole MSS. */
6980                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
6981                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
6982                         } else if (isipv6) {
6983                                 /* Use the default MSS. */
6984                                 tp->t_maxseg = V_tcp_v6mssdflt;
6985                                 /*
6986                                  * Disable Path MTU Discovery when we switch
6987                                  * to minmss.
6988                                  */
6989                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
6990                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
6991                         }
6992 #endif
6993 #if defined(INET6) && defined(INET)
6994                         else
6995 #endif
6996 #ifdef INET
6997                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
6998                                 /* Use the sysctl tuneable blackhole MSS. */
6999                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
7000                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7001                         } else {
7002                                 /* Use the default MSS. */
7003                                 tp->t_maxseg = V_tcp_mssdflt;
7004                                 /*
7005                                  * Disable Path MTU Discovery when we switch
7006                                  * to minmss.
7007                                  */
7008                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7009                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7010                         }
7011 #endif
7012                 } else {
7013                         /*
7014                          * If further retransmissions are still unsuccessful
7015                          * with a lowered MTU, maybe this isn't a blackhole
7016                          * and we restore the previous MSS and blackhole
7017                          * detection flags. The limit '6' is determined by
7018                          * giving each probe stage (1448, 1188, 524) 2
7019                          * chances to recover.
7020                          */
7021                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
7022                             (tp->t_rxtshift >= 6)) {
7023                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
7024                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
7025                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
7026                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
7027                         }
7028                 }
7029         }
7030         /*
7031          * Disable RFC1323 and SACK if we haven't got any response to
7032          * our third SYN to work-around some broken terminal servers
7033          * (most of which have hopefully been retired) that have bad VJ
7034          * header compression code which trashes TCP segments containing
7035          * unknown-to-them TCP options.
7036          */
7037         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
7038             (tp->t_rxtshift == 3))
7039                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
7040         /*
7041          * If we backed off this far, our srtt estimate is probably bogus.
7042          * Clobber it so we'll take the next rtt measurement as our srtt;
7043          * move the current srtt into rttvar to keep the current retransmit
7044          * times until then.
7045          */
7046         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
7047 #ifdef INET6
7048                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
7049                         in6_losing(tp->t_inpcb);
7050                 else
7051 #endif
7052                         in_losing(tp->t_inpcb);
7053                 tp->t_rttvar += tp->t_srtt;
7054                 tp->t_srtt = 0;
7055         }
7056         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
7057         tp->snd_recover = tp->snd_max;
7058         tp->t_flags |= TF_ACKNOW;
7059         tp->t_rtttime = 0;
7060         rack_cong_signal(tp, CC_RTO, tp->snd_una);
7061 out:
7062         return (retval);
7063 }
7064
7065 static int
7066 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
7067 {
7068         int32_t ret = 0;
7069         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
7070
7071         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
7072             (tp->t_flags & TF_GPUTINPROG)) {
7073                 /*
7074                  * We have a goodput in progress
7075                  * and we have entered a late state.
7076                  * Do we have enough data in the sb
7077                  * to handle the GPUT request?
7078                  */
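                /*
                 * Illustrative arithmetic (a sketch): with snd_una = 1000,
                 * gput_seq = 3000 and gput_ack = 8000, bytes becomes
                 * (8000 - 3000) + (3000 - 1000) = 7000, i.e. everything from
                 * snd_una up to gput_ack must still be in the send buffer for
                 * the measurement to complete.
                 */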
7079                 uint32_t bytes;
7080
7081                 bytes = tp->gput_ack - tp->gput_seq;
7082                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
7083                         bytes += tp->gput_seq - tp->snd_una;
7084                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
7085                         /*
7086                          * There are not enough bytes in the socket
7087                          * buffer that have been sent to cover this
7088                          * measurement. Cancel it.
7089                          */
7090                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7091                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
7092                                                    tp->gput_seq,
7093                                                    0, 0, 18, __LINE__, NULL, 0);
7094                         tp->t_flags &= ~TF_GPUTINPROG;
7095                 }
7096         }
7097         if (timers == 0) {
7098                 return (0);
7099         }
7100         if (tp->t_state == TCPS_LISTEN) {
7101                 /* no timers on listen sockets */
7102                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
7103                         return (0);
7104                 return (1);
7105         }
7106         if ((timers & PACE_TMR_RACK) &&
7107             rack->rc_on_min_to) {
7108                 /*
7109                  * For the rack timer, when we
7110                  * are on a min-timeout (which means rrr_conf = 3)
7111                  * we don't want to check the timer. It may
7112                  * be going off for a pace and that's ok; we
7113                  * want to send the retransmit (if it's ready).
7114                  *
7115                  * If it's on a normal rack timer (non-min) then
7116                  * we will check if it's expired.
7117                  */
7118                 goto skip_time_check;
7119         }
7120         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
7121                 uint32_t left;
7122
7123                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
7124                         ret = -1;
7125                         rack_log_to_processing(rack, cts, ret, 0);
7126                         return (0);
7127                 }
7128                 if (hpts_calling == 0) {
7129                         /*
7130                          * A user send or a queued mbuf (sack) has called us. We
7131                          * return 0 and let the pacing guards
7132                          * decide whether or not they
7133                          * should cause a send.
7134                          */
7135                         ret = -2;
7136                         rack_log_to_processing(rack, cts, ret, 0);
7137                         return (0);
7138                 }
7139                 /*
7140                  * Ok, our timer went off early and we are not paced; false
7141                  * alarm, go back to sleep.
7142                  */
7143                 ret = -3;
7144                 left = rack->r_ctl.rc_timer_exp - cts;
7145                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
7146                 rack_log_to_processing(rack, cts, ret, left);
7147                 return (1);
7148         }
7149 skip_time_check:
7150         rack->rc_tmr_stopped = 0;
7151         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
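        /*
         * Note that only one expired timer is serviced per call; the
         * if/else chain below implies the priority order DELACK, RACK,
         * TLP, RXT, PERSIT and then KEEP.
         */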
7152         if (timers & PACE_TMR_DELACK) {
7153                 ret = rack_timeout_delack(tp, rack, cts);
7154         } else if (timers & PACE_TMR_RACK) {
7155                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7156                 rack->r_fast_output = 0;
7157                 ret = rack_timeout_rack(tp, rack, cts);
7158         } else if (timers & PACE_TMR_TLP) {
7159                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7160                 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
7161         } else if (timers & PACE_TMR_RXT) {
7162                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7163                 rack->r_fast_output = 0;
7164                 ret = rack_timeout_rxt(tp, rack, cts);
7165         } else if (timers & PACE_TMR_PERSIT) {
7166                 ret = rack_timeout_persist(tp, rack, cts);
7167         } else if (timers & PACE_TMR_KEEP) {
7168                 ret = rack_timeout_keepalive(tp, rack, cts);
7169         }
7170         rack_log_to_processing(rack, cts, ret, timers);
7171         return (ret);
7172 }
7173
7174 static void
7175 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
7176 {
7177         struct timeval tv;
7178         uint32_t us_cts, flags_on_entry;
7179         uint8_t hpts_removed = 0;
7180
7181         flags_on_entry = rack->r_ctl.rc_hpts_flags;
7182         us_cts = tcp_get_usecs(&tv);
7183         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
7184             ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
7185              ((tp->snd_max - tp->snd_una) == 0))) {
7186                 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7187                 hpts_removed = 1;
7188                 /* If we were not delayed cancel out the flag. */
7189                 if ((tp->snd_max - tp->snd_una) == 0)
7190                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
7191                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7192         }
7193         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7194                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7195                 if (rack->rc_inp->inp_in_hpts &&
7196                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
7197                         /*
7198                          * Canceling timers when we have no output being
7199                          * paced. We also must remove ourselves from the
7200                          * hpts.
7201                          */
7202                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
7203                         hpts_removed = 1;
7204                 }
7205                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
7206         }
7207         if (hpts_removed == 0)
7208                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7209 }
7210
7211 static void
7212 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
7213 {
7214         return;
7215 }
7216
7217 static int
7218 rack_stopall(struct tcpcb *tp)
7219 {
7220         struct tcp_rack *rack;
7221         rack = (struct tcp_rack *)tp->t_fb_ptr;
7222         rack->t_timers_stopped = 1;
7223         return (0);
7224 }
7225
7226 static void
7227 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
7228 {
7229         return;
7230 }
7231
7232 static int
7233 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
7234 {
7235         return (0);
7236 }
7237
7238 static void
7239 rack_stop_all_timers(struct tcpcb *tp)
7240 {
7241         struct tcp_rack *rack;
7242
7243         /*
7244          * Assure no timers are running.
7245          */
7246         if (tcp_timer_active(tp, TT_PERSIST)) {
7247                 /* We enter in persists, set the flag appropriately */
7248                 rack = (struct tcp_rack *)tp->t_fb_ptr;
7249                 rack->rc_in_persist = 1;
7250         }
7251         tcp_timer_suspend(tp, TT_PERSIST);
7252         tcp_timer_suspend(tp, TT_REXMT);
7253         tcp_timer_suspend(tp, TT_KEEP);
7254         tcp_timer_suspend(tp, TT_DELACK);
7255 }
7256
7257 static void
7258 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
7259     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
7260 {
7261         int32_t idx;
7262
7263         rsm->r_rtr_cnt++;
7264         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7265         rsm->r_dupack = 0;
7266         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
7267                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
7268                 rsm->r_flags |= RACK_OVERMAX;
7269         }
7270         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
7271                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
7272                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
7273         }
7274         idx = rsm->r_rtr_cnt - 1;
7275         rsm->r_tim_lastsent[idx] = ts;
7276         /*
7277          * Here we don't add in the len of the send, since it's already
7278          * in snd_una <-> snd_max.
7279          */
7280         rsm->r_fas = ctf_flight_size(rack->rc_tp,
7281                                      rack->r_ctl.rc_sacked);
7282         if (rsm->r_flags & RACK_ACKED) {
7283                 /* Probably MTU discovery messing with us */
7284                 rsm->r_flags &= ~RACK_ACKED;
7285                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7286         }
7287         if (rsm->r_in_tmap) {
7288                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7289                 rsm->r_in_tmap = 0;
7290         }
7291         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7292         rsm->r_in_tmap = 1;
7293         if (rsm->r_flags & RACK_SACK_PASSED) {
7294                 /* We have retransmitted due to the SACK pass */
7295                 rsm->r_flags &= ~RACK_SACK_PASSED;
7296                 rsm->r_flags |= RACK_WAS_SACKPASS;
7297         }
7298 }
7299
7300 static uint32_t
7301 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
7302     struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
7303 {
7304         /*
7305          * We (re-)transmitted starting at rsm->r_start for some length
7306          * (possibly ending before r_end).
7307          */
7308         struct rack_sendmap *nrsm, *insret;
7309         uint32_t c_end;
7310         int32_t len;
7311
7312         len = *lenp;
7313         c_end = rsm->r_start + len;
7314         if (SEQ_GEQ(c_end, rsm->r_end)) {
7315                 /*
7316                  * We retransmitted the whole piece or more than the whole
7317                  * piece, slopping over into the next rsm.
7318                  */
7319                 rack_update_rsm(tp, rack, rsm, ts, add_flag);
7320                 if (c_end == rsm->r_end) {
7321                         *lenp = 0;
7322                         return (0);
7323                 } else {
7324                         int32_t act_len;
7325
7326                         /* Hangs over the end; return what's left */
7327                         act_len = rsm->r_end - rsm->r_start;
7328                         *lenp = (len - act_len);
7329                         return (rsm->r_end);
7330                 }
7331                 /* We don't get out of this block. */
7332         }
7333         /*
7334          * Here we retransmitted less than the whole thing which means we
7335          * have to split this into what was transmitted and what was not.
7336          */
7337         nrsm = rack_alloc_full_limit(rack);
7338         if (nrsm == NULL) {
7339                 /*
7340                  * We can't get memory, so let's not proceed.
7341                  */
7342                 *lenp = 0;
7343                 return (0);
7344         }
7345         /*
7346          * So here we are going to take the original rsm and make it what we
7347          * retransmitted. nrsm will be the tail portion we did not
7348          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
7349          * we retransmitted 5 bytes i.e. sequences 1-5 (c_end = 6). The
7350          * original piece shrinks to 1, 6 and the new piece will be 6, 11.
7351          */
7352         rack_clone_rsm(rack, nrsm, rsm, c_end);
7353         nrsm->r_dupack = 0;
7354         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
7355         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7356 #ifdef INVARIANTS
7357         if (insret != NULL) {
7358                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7359                       nrsm, insret, rack, rsm);
7360         }
7361 #endif
7362         if (rsm->r_in_tmap) {
7363                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7364                 nrsm->r_in_tmap = 1;
7365         }
7366         rsm->r_flags &= (~RACK_HAS_FIN);
7367         rack_update_rsm(tp, rack, rsm, ts, add_flag);
7368         /* Log a split of rsm into rsm and nrsm */
7369         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7370         *lenp = 0;
7371         return (0);
7372 }
7373
7374 static void
7375 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
7376                 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts,
7377                 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
7378 {
7379         struct tcp_rack *rack;
7380         struct rack_sendmap *rsm, *nrsm, *insret, fe;
7381         register uint32_t snd_max, snd_una;
7382
7383         /*
7384          * Add to the RACK log of packets in flight or retransmitted. If
7385          * there is a TS option we will use the TS echoed, if not we will
7386          * grab a TS.
7387          *
7388          * Retransmissions will increment the count and move the ts to its
7389          * proper place. Note that if options do not include TS's then we
7390          * won't be able to effectively use the ACK for an RTT on a retran.
7391          *
7392          * Notes about r_start and r_end. Let's consider a send starting at
7393          * sequence 1 for 10 bytes. In such an example the r_start would be
7394          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
7395          * This means that r_end is actually the first sequence for the next
7396          * slot (11).
7397          *
7398          */
7399         /*
7400          * If err is set what do we do XXXrrs? should we not add the thing?
7401          * -- i.e. return if err != 0 or should we pretend we sent it? --
7402          * i.e. proceed with add ** do this for now.
7403          */
7404         INP_WLOCK_ASSERT(tp->t_inpcb);
7405         if (err)
7406                 /*
7407                  * We don't log errors -- we could but snd_max does not
7408                  * advance in this case either.
7409                  */
7410                 return;
7411
7412         if (th_flags & TH_RST) {
7413                 /*
7414                  * We don't log resets and we return immediately from
7415                  * sending
7416                  */
7417                 return;
7418         }
7419         rack = (struct tcp_rack *)tp->t_fb_ptr;
7420         snd_una = tp->snd_una;
7421         snd_max = tp->snd_max;
7422         if (th_flags & (TH_SYN | TH_FIN)) {
7423                 /*
7424                  * The call to rack_log_output is made before bumping
7425                  * snd_max. This means we can record one extra byte on a SYN
7426                  * or FIN if seq_out is adding more on and a FIN is present
7427                  * (and we are not resending).
7428                  */
7429                 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
7430                         len++;
7431                 if (th_flags & TH_FIN)
7432                         len++;
7433                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
7434                         /*
7435                          * The add/update has not been done for the FIN/SYN
7436                          * yet.
7437                          */
7438                         snd_max = tp->snd_nxt;
7439                 }
7440         }
7441         if (SEQ_LEQ((seq_out + len), snd_una)) {
7442                 /* Are we sending an old segment to induce an ack (keep-alive)? */
7443                 return;
7444         }
7445         if (SEQ_LT(seq_out, snd_una)) {
7446                 /* huh? should we panic? */
7447                 uint32_t end;
7448
7449                 end = seq_out + len;
7450                 seq_out = snd_una;
7451                 if (SEQ_GEQ(end, seq_out))
7452                         len = end - seq_out;
7453                 else
7454                         len = 0;
7455         }
7456         if (len == 0) {
7457                 /* We don't log zero window probes */
7458                 return;
7459         }
7460         rack->r_ctl.rc_time_last_sent = cts;
7461         if (IN_FASTRECOVERY(tp->t_flags)) {
7462                 rack->r_ctl.rc_prr_out += len;
7463         }
7464         /* First question is it a retransmission or new? */
7465         if (seq_out == snd_max) {
7466                 /* It's new */
7467 again:
7468                 rsm = rack_alloc(rack);
7469                 if (rsm == NULL) {
7470                         /*
7471                          * Hmm out of memory and the tcb got destroyed while
7472                          * we tried to wait.
7473                          */
7474                         return;
7475                 }
7476                 if (th_flags & TH_FIN) {
7477                         rsm->r_flags = RACK_HAS_FIN|add_flag;
7478                 } else {
7479                         rsm->r_flags = add_flag;
7480                 }
7481                 if (hw_tls)
7482                         rsm->r_hw_tls = 1;
7483                 rsm->r_tim_lastsent[0] = cts;
7484                 rsm->r_rtr_cnt = 1;
7485                 rsm->r_rtr_bytes = 0;
7486                 if (th_flags & TH_SYN) {
7487                         /* The data space is one beyond snd_una */
7488                         rsm->r_flags |= RACK_HAS_SYN;
7489                 }
7490                 rsm->r_start = seq_out;
7491                 rsm->r_end = rsm->r_start + len;
7492                 rsm->r_dupack = 0;
7493                 /*
7494                  * Save off the mbuf location that
7495                  * sndmbuf_noadv returned (which is
7496                  * where we started copying from).
7497                  */
7498                 rsm->m = s_mb;
7499                 rsm->soff = s_moff;
7500                 /*
7501                  * Here we do add in the len of the send, since it's not yet
7502                  * reflected in snd_una <-> snd_max.
7503                  */
7504                 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
7505                                               rack->r_ctl.rc_sacked) +
7506                               (rsm->r_end - rsm->r_start));
7507                 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
7508                 if (rsm->m) {
7509                         if (rsm->m->m_len <= rsm->soff) {
7510                                 /*
7511                                  * XXXrrs Question, will this happen?
7512                                  *
7513                                  * If sbsndptr is set at the correct place
7514                                  * then s_moff should always be somewhere
7515                                  * within rsm->m. But if the sbsndptr was
7516                                  * off then that won't be true. If it occurs
7517                                  * we need to walk out to the correct location.
7518                                  */
7519                                 struct mbuf *lm;
7520
7521                                 lm = rsm->m;
7522                                 while (lm->m_len <= rsm->soff) {
7523                                         rsm->soff -= lm->m_len;
7524                                         lm = lm->m_next;
7525                                         KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
7526                                                              __func__, rack, s_moff, s_mb, rsm->soff));
7527                                 }
7528                                 rsm->m = lm;
7529                                 counter_u64_add(rack_sbsndptr_wrong, 1);
7530                         } else
7531                                 counter_u64_add(rack_sbsndptr_right, 1);
7532                         rsm->orig_m_len = rsm->m->m_len;
7533                 } else
7534                         rsm->orig_m_len = 0;
7535                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7536                 /* Log a new rsm */
7537                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
7538                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7539 #ifdef INVARIANTS
7540                 if (insret != NULL) {
7541                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7542                               nrsm, insret, rack, rsm);
7543                 }
7544 #endif
7545                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7546                 rsm->r_in_tmap = 1;
7547                 /*
7548                  * Special case detection, is there just a single
7549                  * packet outstanding when we are not in recovery?
7550                  *
7551                  * If this is true mark it so.
7552                  */
7553                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
7554                     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
7555                         struct rack_sendmap *prsm;
7556
7557                         prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7558                         if (prsm)
7559                                 prsm->r_one_out_nr = 1;
7560                 }
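                /*
                 * Note: r_one_out_nr is a hint consumed later in
                 * tcp_rack_xmit_timer(), where it lowers our confidence in
                 * RTT samples taken when only a single small send was
                 * outstanding.
                 */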
7561                 return;
7562         }
7563         /*
7564          * If we reach here it's a retransmission and we need to find it.
7565          */
7566         memset(&fe, 0, sizeof(fe));
7567 more:
7568         if (hintrsm && (hintrsm->r_start == seq_out)) {
7569                 rsm = hintrsm;
7570                 hintrsm = NULL;
7571         } else {
7572                 /* No hints sorry */
7573                 rsm = NULL;
7574         }
7575         if ((rsm) && (rsm->r_start == seq_out)) {
7576                 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7577                 if (len == 0) {
7578                         return;
7579                 } else {
7580                         goto more;
7581                 }
7582         }
7583         /* Ok, it was not the last pointer; go through it the hard way. */
7584 refind:
7585         fe.r_start = seq_out;
7586         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
7587         if (rsm) {
7588                 if (rsm->r_start == seq_out) {
7589                         seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7590                         if (len == 0) {
7591                                 return;
7592                         } else {
7593                                 goto refind;
7594                         }
7595                 }
7596                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
7597                         /* Transmitted within this piece */
7598                         /*
7599                          * Ok we must split off the front and then let the
7600                          * update do the rest
7601                          */
7602                         nrsm = rack_alloc_full_limit(rack);
7603                         if (nrsm == NULL) {
7604                                 rack_update_rsm(tp, rack, rsm, cts, add_flag);
7605                                 return;
7606                         }
7607                         /*
7608                          * copy rsm to nrsm and then trim the front of rsm
7609                          * to not include this part.
7610                          */
7611                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
7612                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7613                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7614 #ifdef INVARIANTS
7615                         if (insret != NULL) {
7616                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7617                                       nrsm, insret, rack, rsm);
7618                         }
7619 #endif
7620                         if (rsm->r_in_tmap) {
7621                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7622                                 nrsm->r_in_tmap = 1;
7623                         }
7624                         rsm->r_flags &= (~RACK_HAS_FIN);
7625                         seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
7626                         if (len == 0) {
7627                                 return;
7628                         } else if (len > 0)
7629                                 goto refind;
7630                 }
7631         }
7632         /*
7633          * Hmm, not found in the map; did they retransmit both old data and
7634          * on into the new?
7635          */
7636         if (seq_out == tp->snd_max) {
7637                 goto again;
7638         } else if (SEQ_LT(seq_out, tp->snd_max)) {
7639 #ifdef INVARIANTS
7640                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
7641                        seq_out, len, tp->snd_una, tp->snd_max);
7642                 printf("Starting Dump of all rack entries\n");
7643                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
7644                         printf("rsm:%p start:%u end:%u\n",
7645                                rsm, rsm->r_start, rsm->r_end);
7646                 }
7647                 printf("Dump complete\n");
7648                 panic("seq_out not found rack:%p tp:%p",
7649                       rack, tp);
7650 #endif
7651         } else {
7652 #ifdef INVARIANTS
7653                 /*
7654                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
7655                  * flag)
7656                  */
7657                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
7658                       seq_out, len, tp->snd_max, tp);
7659 #endif
7660         }
7661 }
7662
7663 /*
7664  * Record one of the RTT updates from an ack into
7665  * our sample structure.
7666  */
7667
7668 static void
7669 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
7670                     int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
7671 {
7672         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7673             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
7674                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
7675         }
7676         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7677             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
7678                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
7679         }
7680         if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
7681             if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
7682                 rack->r_ctl.rc_gp_lowrtt = us_rtt;
7683             if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
7684                     rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
7685         }
7686         if ((confidence == 1) &&
7687             ((rsm == NULL) ||
7688              (rsm->r_just_ret) ||
7689              (rsm->r_one_out_nr &&
7690               len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
7691                 /*
7692                  * If the rsm had a just-return
7693                  * hit on it then we can't trust the
7694                  * rtt measurement for buffer determination.
7695                  * Note that a confidence of 2 indicates
7696                  * SACK'd, which overrides the r_just_ret or
7697                  * the r_one_out_nr. If it was a CUM-ACK and
7698                  * we had only two outstanding, but got an
7699                  * ack for only 1, then that also lowers our
7700                  * confidence.
7701                  */
7702                 confidence = 0;
7703         }
7704         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7705             (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
7706                 if (rack->r_ctl.rack_rs.confidence == 0) {
7707                         /*
7708                          * We take anything with no current confidence
7709                          * saved.
7710                          */
7711                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7712                         rack->r_ctl.rack_rs.confidence = confidence;
7713                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7714                 } else if (confidence || rack->r_ctl.rack_rs.confidence) {
7715                         /*
7716                          * Once we have a confident number,
7717                          * we can update it with a smaller
7718                          * value since this confident number
7719                          * may include the DSACK time until
7720                          * the next segment (the second one) arrived.
7721                          */
7722                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7723                         rack->r_ctl.rack_rs.confidence = confidence;
7724                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7725                 }
7726         }
7727         rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
7728         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
7729         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
7730         rack->r_ctl.rack_rs.rs_rtt_cnt++;
7731 }
7732
7733 /*
7734  * Collect new round-trip time estimate
7735  * and update averages and current timeout.
7736  */
7737 static void
7738 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
7739 {
7740         int32_t delta;
7741         uint32_t o_srtt, o_var;
7742         int32_t hrtt_up = 0;
7743         int32_t rtt;
7744
7745         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
7746                 /* No valid sample */
7747                 return;
7748         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
7749                 /* We are to use the lowest RTT seen in a single ack */
7750                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
7751         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
7752                 /* We are to use the highest RTT seen in a single ack */
7753                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
7754         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
7755                 /* We are to use the average RTT seen in a single ack */
7756                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
7757                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
7758         } else {
7759 #ifdef INVARIANTS
7760                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
7761 #endif
7762                 return;
7763         }
7764         if (rtt == 0)
7765                 rtt = 1;
7766         if (rack->rc_gp_rtt_set == 0) {
7767                 /*
7768                  * With no RTT we have to accept
7769                  * even one we are not confident of.
7770                  */
7771                 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
7772                 rack->rc_gp_rtt_set = 1;
7773         } else if (rack->r_ctl.rack_rs.confidence) {
7774                 /* update the running gp srtt */
7775                 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
7776                 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
7777         }
7778         if (rack->r_ctl.rack_rs.confidence) {
7779                 /*
7780                  * Record the low and high for highly-buffered-path computation;
7781                  * we only do this if we are confident (not a retransmission).
7782                  */
7783                 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
7784                         rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7785                         hrtt_up = 1;
7786                 }
7787                 if (rack->rc_highly_buffered == 0) {
7788                         /*
7789                          * Currently once we declare a path as
7790                          * highly buffered there is no going
7791                          * back, which may be a problem...
7792                          */
7793                         if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
7794                                 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
7795                                                      rack->r_ctl.rc_highest_us_rtt,
7796                                                      rack->r_ctl.rc_lowest_us_rtt,
7797                                                      RACK_RTTS_SEEHBP);
7798                                 rack->rc_highly_buffered = 1;
7799                         }
7800                 }
7801         }
7802         if ((rack->r_ctl.rack_rs.confidence) ||
7803             (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
7804                 /*
7805                  * If we are highly confident of it <or> it was
7806                  * never retransmitted we accept it as the last us_rtt.
7807                  */
7808                 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7809                 /* The lowest rtt can be set if it was not retransmitted */
7810                 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
7811                         rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7812                         if (rack->r_ctl.rc_lowest_us_rtt == 0)
7813                                 rack->r_ctl.rc_lowest_us_rtt = 1;
7814                 }
7815         }
7816         o_srtt = tp->t_srtt;
7817         o_var = tp->t_rttvar;
7818         rack = (struct tcp_rack *)tp->t_fb_ptr;
7819         if (tp->t_srtt != 0) {
7820                 /*
7821                  * We keep a simple srtt in microseconds, like our rtt
7822                  * measurement. We don't need to do any tricks with shifting
7823                  * etc. Instead we just add in 1/8th of the new measurement
7824                  * and subtract out 1/8 of the old srtt. We do the same with
7825                  * the variance after finding the absolute value of the
7826                  * difference between this sample and the current srtt.
7827                  */
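                /*
                 * Illustrative arithmetic (a sketch): with t_srtt = 100000 us,
                 * t_rttvar = 20000 us and a new sample rtt = 80000 us:
                 *   delta    = 100000 - 80000           = 20000
                 *   t_srtt   = 100000 - 12500 + 10000   = 97500
                 *   t_rttvar = 20000 - 2500 + 2500      = 20000
                 */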
7828                 delta = tp->t_srtt - rtt;
7829                 /* Take off 1/8th of the current sRTT */
7830                 tp->t_srtt -= (tp->t_srtt >> 3);
7831                 /* Add in 1/8th of the new RTT just measured */
7832                 tp->t_srtt += (rtt >> 3);
7833                 if (tp->t_srtt <= 0)
7834                         tp->t_srtt = 1;
7835                 /* Now let's take the absolute value for the variance */
7836                 if (delta < 0)
7837                         delta = -delta;
7838                 /* Subtract out 1/8th */
7839                 tp->t_rttvar -= (tp->t_rttvar >> 3);
7840                 /* Add in 1/8th of the new variance we just saw */
7841                 tp->t_rttvar += (delta >> 3);
7842                 if (tp->t_rttvar <= 0)
7843                         tp->t_rttvar = 1;
7844                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
7845                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7846         } else {
7847                 /*
7848                  * No rtt measurement yet - use the unsmoothed rtt. Set the
7849                  * variance to half the rtt (so our first retransmit happens
7850                  * at 3*rtt).
7851                  */
7852                 tp->t_srtt = rtt;
7853                 tp->t_rttvar = rtt >> 1;
7854                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7855         }
7856         rack->rc_srtt_measure_made = 1;
7857         KMOD_TCPSTAT_INC(tcps_rttupdated);
7858         tp->t_rttupdated++;
7859 #ifdef STATS
7860         if (rack_stats_gets_ms_rtt == 0) {
7861                 /* Send in the microsecond rtt used for rxt timeout purposes */
7862                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
7863         } else if (rack_stats_gets_ms_rtt == 1) {
7864                 /* Send in the millisecond rtt used for rxt timeout purposes */
7865                 int32_t ms_rtt;
7866
7867                 /* Round up */
7868                 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7869                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7870         } else if (rack_stats_gets_ms_rtt == 2) {
7871                 /* Send in the millisecond rtt as close to the path RTT as we can get */
7872                 int32_t ms_rtt;
7873
7874                 /* Round up */
7875                 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7876                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7877         }  else {
7878                 /* Send in the microsecond rtt as close to the path RTT as we can get */
7879                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
7880         }
7881
7882 #endif
7883         /*
7884          * the retransmit should happen at rtt + 4 * rttvar. Because of the
7885          * way we do the smoothing, srtt and rttvar will each average +1/2
7886          * tick of bias.  When we compute the retransmit timer, we want 1/2
7887          * tick of rounding and 1 extra tick because of +-1/2 tick
7888          * uncertainty in the firing of the timer.  The bias will give us
7889          * exactly the 1.5 tick we need.  But, because the bias is
7890          * statistical, we have to test that we don't drop below the minimum
7891          * feasible timer (which is 2 ticks).
7892          */
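        /*
         * Illustrative (a sketch): with t_srtt = 97500 us and t_rttvar =
         * 20000 us, rtt + 4 * rttvar is roughly 177500 us, which
         * RACK_TCPT_RANGESET() then bounds by max(rack_rto_min, rtt + 2)
         * below and rack_rto_max above.
         */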
7893         tp->t_rxtshift = 0;
7894         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
7895                       max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
7896         rack_log_rtt_sample(rack, rtt);
7897         tp->t_softerror = 0;
7898 }
7899
7900
7901 static void
7902 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
7903 {
7904         /*
7905          * Apply the inbound us-rtt at us_cts to the filter.
7906          */
7907         uint32_t old_rtt;
7908
7909         old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
7910         apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
7911                                us_rtt, us_cts);
7912         if (rack->r_ctl.last_pacing_time &&
7913             rack->rc_gp_dyn_mul &&
7914             (rack->r_ctl.last_pacing_time > us_rtt))
7915                 rack->pacing_longer_than_rtt = 1;
7916         else
7917                 rack->pacing_longer_than_rtt = 0;
7918         if (old_rtt > us_rtt) {
7919                 /* We just hit a new lower rtt time */
7920                 rack_log_rtt_shrinks(rack,  us_cts,  old_rtt,
7921                                      __LINE__, RACK_RTTS_NEWRTT);
7922                 /*
7923                  * Only count it if it's lower than what we saw within our
7924                  * calculated range.
7925                  */
7926                 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
7927                         if (rack_probertt_lower_within &&
7928                             rack->rc_gp_dyn_mul &&
7929                             (rack->use_fixed_rate == 0) &&
7930                             (rack->rc_always_pace)) {
7931                                 /*
7932                                  * We are seeing a new lower rtt very close
7933                                  * to the time that we would have entered probe-rtt.
7934                                  * This is probably due to the fact that a peer flow
7935                                  * has entered probe-rtt. Let's go in now too.
7936                                  */
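                                /*
                                 * Illustrative (a sketch, assuming
                                 * rack_probertt_lower_within = 10): val is
                                 * 10% of rack_time_between_probertt, so we
                                 * follow the peer into probe-rtt once 90% of
                                 * the normal inter-probe-rtt interval has
                                 * elapsed since rc_lower_rtt_us_cts was last
                                 * updated.
                                 */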
7937                                 uint32_t val;
7938
7939                                 val = rack_probertt_lower_within * rack_time_between_probertt;
7940                                 val /= 100;
7941                                 if ((rack->in_probe_rtt == 0)  &&
7942                                     ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
7943                                         rack_enter_probertt(rack, us_cts);
7944                                 }
7945                         }
7946                         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
7947                 }
7948         }
7949 }
7950
7951 static int
7952 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
7953     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
7954 {
7955         uint32_t us_rtt;
7956         int32_t i, all;
7957         uint32_t t, len_acked;
7958
7959         if ((rsm->r_flags & RACK_ACKED) ||
7960             (rsm->r_flags & RACK_WAS_ACKED))
7961                 /* Already done */
7962                 return (0);
7963         if (rsm->r_no_rtt_allowed) {
7964                 /* Not allowed */
7965                 return (0);
7966         }
7967         if (ack_type == CUM_ACKED) {
7968                 if (SEQ_GT(th_ack, rsm->r_end)) {
7969                         len_acked = rsm->r_end - rsm->r_start;
7970                         all = 1;
7971                 } else {
7972                         len_acked = th_ack - rsm->r_start;
7973                         all = 0;
7974                 }
7975         } else {
7976                 len_acked = rsm->r_end - rsm->r_start;
7977                 all = 0;
7978         }
7979         if (rsm->r_rtr_cnt == 1) {
7980
7981                 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
7982                 if ((int)t <= 0)
7983                         t = 1;
7984                 if (!tp->t_rttlow || tp->t_rttlow > t)
7985                         tp->t_rttlow = t;
7986                 if (!rack->r_ctl.rc_rack_min_rtt ||
7987                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
7988                         rack->r_ctl.rc_rack_min_rtt = t;
7989                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
7990                                 rack->r_ctl.rc_rack_min_rtt = 1;
7991                         }
7992                 }
7993                 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
7994                         us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
7995                 else
7996                         us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
7997                 if (us_rtt == 0)
7998                         us_rtt = 1;
7999                 if (CC_ALGO(tp)->rttsample != NULL) {
8000                         /* Kick the RTT to the CC */
8001                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
8002                 }
8003                 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
8004                 if (ack_type == SACKED) {
8005                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
8006                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
8007                 } else {
8008                         /*
8009                          * We need to set up what our confidence
8010                          * is in this ack.
8011                          *
8012                          * If the rsm was app limited and it is
8013                          * less than an mss in length (the end
8014                          * of the send) then we have a gap. If we
8015                          * were app limited but say we were sending
8016                          * multiple MSS's then we are more confident
8017                          * in it.
8018                          *
8019                          * When we are not app-limited we see if
8020                          * the rsm is being included in the current
8021                          * measurement; we tell this by the app_limited_needs_set
8022                          * flag.
8023                          *
8024                          * Note that being cwnd blocked is not app-limited,
8025                          * and the pacing delay between packets when we
8026                          * are sending only 1 or 2 MSS's will also show up
8027                          * in the RTT. We probably need to examine this algorithm
8028                          * a bit more and enhance it to account for the delay
8029                          * between rsm's. We could do that by saving off the
8030                          * pacing delay of each rsm (in an rsm) and then
8031                          * factoring that in somehow, though for now I am
8032                          * not sure how :)
8033                          */
8034                         int calc_conf = 0;
8035
8036                         if (rsm->r_flags & RACK_APP_LIMITED) {
8037                                 if (all && (len_acked <= ctf_fixed_maxseg(tp)))
8038                                         calc_conf = 0;
8039                                 else
8040                                         calc_conf = 1;
8041                         } else if (rack->app_limited_needs_set == 0) {
8042                                 calc_conf = 1;
8043                         } else {
8044                                 calc_conf = 0;
8045                         }
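                        /*
                         * A sketch of the confidence values used in this
                         * function: 2 means the sample came from a SACK'd
                         * segment, 1 means a confident cum-ack sample, and
                         * 0 means we do not trust the sample for buffer
                         * estimation.
                         */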
8046                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
8047                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
8048                                             calc_conf, rsm, rsm->r_rtr_cnt);
8049                 }
8050                 if ((rsm->r_flags & RACK_TLP) &&
8051                     (!IN_FASTRECOVERY(tp->t_flags))) {
8052                         /* Segment was a TLP and our retrans matched */
8053                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
8054                                 rack->r_ctl.rc_rsm_start = tp->snd_max;
8055                                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
8056                                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
8057                                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
8058                         }
8059                 }
8060                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
8061                         /* New more recent rack_tmit_time */
8062                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8063                         rack->rc_rack_rtt = t;
8064                 }
8065                 return (1);
8066         }
8067         /*
8068          * We clear the soft/rxtshift since we got an ack.
8069          * There is no assurance we will call the commit() function
8070          * so we need to clear these to avoid incorrect handling.
8071          */
8072         tp->t_rxtshift = 0;
8073         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8074                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
8075         tp->t_softerror = 0;
8076         if (to && (to->to_flags & TOF_TS) &&
8077             (ack_type == CUM_ACKED) &&
8078             (to->to_tsecr) &&
8079             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
8080                 /*
8081                  * Now which timestamp does it match? In this block the ACK
8082                  * must be coming from a previous transmission.
8083                  */
8084                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
8085                         if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
8086                                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8087                                 if ((int)t <= 0)
8088                                         t = 1;
8089                                 if (CC_ALGO(tp)->rttsample != NULL) {
8090                                         /*
8091                                          * Kick the RTT to the CC. Here
8092                                          * we lie a bit in that we treat the
8093                                          * sample as valid even though
8094                                          * we retransmitted. This is because
8095                                          * we match the timestamps.
8096                                          */
8097                                         if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
8098                                                 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
8099                                         else
8100                                                 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
8101                                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
8102                                 }
8103                                 if ((i + 1) < rsm->r_rtr_cnt) {
8104                                         /*
8105                                          * The peer ack'd from our previous
8106                                          * transmission. We have a spurious
8107                                          * retransmission and thus we don't
8108                                          * want to update our rack_rtt.
8109                                          *
8110                                          * Hmm should there be a CC revert here?
8111                                          *
8112                                          */
8113                                         return (0);
8114                                 }
8115                                 if (!tp->t_rttlow || tp->t_rttlow > t)
8116                                         tp->t_rttlow = t;
8117                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8118                                         rack->r_ctl.rc_rack_min_rtt = t;
8119                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
8120                                                 rack->r_ctl.rc_rack_min_rtt = 1;
8121                                         }
8122                                 }
8123                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8124                                            (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
8125                                         /* New more recent rack_tmit_time */
8126                                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8127                                         rack->rc_rack_rtt = t;
8128                                 }
8129                                 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
8130                                 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
8131                                                     rsm->r_rtr_cnt);
8132                                 return (1);
8133                         }
8134                 }
8135                 goto ts_not_found;
8136         } else {
8137                 /*
8138                  * Ok, it's a SACK block that we retransmitted, or a Windows
8139                  * machine without timestamps. We can tell nothing from the
8140                  * timestamp since either it's not there, or it reflects the time
8141                  * the peer last received a segment that moved its cum-ack point forward.
8142                  */
8143 ts_not_found:
8144                 i = rsm->r_rtr_cnt - 1;
8145                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8146                 if ((int)t <= 0)
8147                         t = 1;
8148                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8149                         /*
8150                          * We retransmitted and the ack came back in less
8151                          * than the smallest rtt we have observed. We most
8152                          * likely did an improper retransmit as outlined in
8153                          * 6.2 Step 2 point 2 in the rack-draft so we
8154                          * don't want to update our rack_rtt. We in
8155                          * theory (in future) might want to think about reverting our
8156                          * cwnd state but we won't for now.
8157                          */
8158                         return (0);
8159                 } else if (rack->r_ctl.rc_rack_min_rtt) {
8160                         /*
8161                          * We retransmitted it and the retransmit did the
8162                          * job.
8163                          */
8164                         if (!rack->r_ctl.rc_rack_min_rtt ||
8165                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8166                                 rack->r_ctl.rc_rack_min_rtt = t;
8167                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8168                                         rack->r_ctl.rc_rack_min_rtt = 1;
8169                                 }
8170                         }
8171                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
8172                                 /* New more recent rack_tmit_time */
8173                                 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
8174                                 rack->rc_rack_rtt = t;
8175                         }
8176                         return (1);
8177                 }
8178         }
8179         return (0);
8180 }
8181
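/*
 * Added note (illustrative, not from the original author): if segments
 * A, B and C were sent in that order and C has just been SACKed, walking
 * the tmap backwards from C tags B and A with RACK_SACK_PASSED, which is
 * what lets the RACK loss detection treat those earlier segments as
 * having been "passed" by a newer delivery.
 */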
8182 /*
8183  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
8184  */
8185 static void
8186 rack_log_sack_passed(struct tcpcb *tp,
8187     struct tcp_rack *rack, struct rack_sendmap *rsm)
8188 {
8189         struct rack_sendmap *nrsm;
8190
8191         nrsm = rsm;
8192         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
8193             rack_head, r_tnext) {
8194                 if (nrsm == rsm) {
8195                         /* Skip the original segment; it is acked */
8196                         continue;
8197                 }
8198                 if (nrsm->r_flags & RACK_ACKED) {
8199                         /*
8200                          * Skip ack'd segments, though we
8201                          * should not see these, since tmap
8202                          * should not have ack'd segments.
8203                          */
8204                         continue;
8205                 }
8206                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8207                         /*
8208                          * We found one that is already marked
8209                          * passed; we have been here before, so
8210                          * all others below this are already marked.
8211                          */
8212                         break;
8213                 }
8214                 nrsm->r_flags |= RACK_SACK_PASSED;
8215                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
8216         }
8217 }
8218
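/*
 * Added descriptive note (hedged): when a goodput measurement is in
 * progress (TF_GPUTINPROG) and the ack'd rsm reaches the measurement
 * start, rack_need_set_test() re-anchors tp->gput_seq (using the rsm
 * start, end, or th_ack depending on use_which), refreshes gput_ts and
 * gput_ack, and may abandon a measurement that has become too small to
 * be trusted.
 */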
8219 static void
8220 rack_need_set_test(struct tcpcb *tp,
8221                    struct tcp_rack *rack,
8222                    struct rack_sendmap *rsm,
8223                    tcp_seq th_ack,
8224                    int line,
8225                    int use_which)
8226 {
8227
8228         if ((tp->t_flags & TF_GPUTINPROG) &&
8229             SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8230                 /*
8231                  * We were app limited, and this ack
8232                  * butts up against or goes beyond the point where we want
8233                  * to start our next measurement. We need
8234                  * to record the new gput_ts here and
8235                  * possibly update the start sequence.
8236                  */
8237                 uint32_t seq, ts;
8238
8239                 if (rsm->r_rtr_cnt > 1) {
8240                         /*
8241                          * This is a retransmit, can we
8242                          * really make any assessment at this
8243                          * point?  We are not really sure of
8244                          * the timestamp, is it this or the
8245                          * previous transmission?
8246                          *
8247                          * Let's wait for something better that
8248                          * is not retransmitted.
8249                          */
8250                         return;
8251                 }
8252                 seq = tp->gput_seq;
8253                 ts = tp->gput_ts;
8254                 rack->app_limited_needs_set = 0;
8255                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
8256                 /* Do we start at a new end? */
8257                 if ((use_which == RACK_USE_BEG) &&
8258                     SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
8259                         /*
8260                          * When we get an ACK that just eats
8261                          * up some of the rsm, we set RACK_USE_BEG
8262                          * since what's at r_start (i.e. th_ack)
8263                          * is left unacked and that's where the
8264                          * measurement now starts.
8265                          */
8266                         tp->gput_seq = rsm->r_start;
8267                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8268                 }
8269                 if ((use_which == RACK_USE_END) &&
8270                     SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8271                             /*
8272                              * We use the end when the cumack
8273                              * is moving forward and completely
8274                              * deleting the rsm passed so basically
8275                              * r_end holds th_ack.
8276                              *
8277                              * For SACKs we also want to use the end
8278                              * since this piece just got sacked and
8279                              * we want to target anything after that
8280                              * in our measurement.
8281                              */
8282                             tp->gput_seq = rsm->r_end;
8283                             rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8284                 }
8285                 if (use_which == RACK_USE_END_OR_THACK) {
8286                         /*
8287                          * special case for ack moving forward,
8288                          * not a sack, we need to move all the
8289                          * way up to where this ack cum-ack moves
8290                          * to.
8291                          */
8292                         if (SEQ_GT(th_ack, rsm->r_end))
8293                                 tp->gput_seq = th_ack;
8294                         else
8295                                 tp->gput_seq = rsm->r_end;
8296                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8297                 }
8298                 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
8299                         /*
8300                          * We moved beyond this guy's range, re-calculate
8301                          * the new end point.
8302                          */
8303                         if (rack->rc_gp_filled == 0) {
8304                                 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
8305                         } else {
8306                                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
8307                         }
8308                 }
8309                 /*
8310                  * We are moving the goal post, we may be able to clear the
8311                  * measure_saw_probe_rtt flag.
8312                  */
8313                 if ((rack->in_probe_rtt == 0) &&
8314                     (rack->measure_saw_probe_rtt) &&
8315                     (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
8316                         rack->measure_saw_probe_rtt = 0;
8317                 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
8318                                            seq, tp->gput_seq, 0, 5, line, NULL, 0);
8319                 if (rack->rc_gp_filled &&
8320                     ((tp->gput_ack - tp->gput_seq) <
8321                      max(rc_init_window(rack), (MIN_GP_WIN *
8322                                                 ctf_fixed_maxseg(tp))))) {
8323                         uint32_t ideal_amount;
8324
8325                         ideal_amount = rack_get_measure_window(tp, rack);
8326                         if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8327                                  * There is no sense in continuing this measurement
8328                                  * because it's too small to gain us anything we
8329                                  * because its too small to gain us anything we
8330                                  * trust. Skip it and that way we can start a new
8331                                  * measurement quicker.
8332                                  */
8333                                 tp->t_flags &= ~TF_GPUTINPROG;
8334                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
8335                                                            0, 0, 0, 6, __LINE__, NULL, 0);
8336                         } else {
8337                                 /*
8338                                  * Reset the window further out.
8339                                  */
8340                                 tp->gput_ack = tp->gput_seq + ideal_amount;
8341                         }
8342                 }
8343         }
8344 }
8345
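/*
 * Added note (hedged): returns 1 only when the rsm overlaps the recorded
 * last-TLP range [last_tlp_acked_start, last_tlp_acked_end], i.e. it is
 * a sub-part of the TLP we previously declared.
 */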
8346 static inline int
8347 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
8348 {
8349         if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
8350                 /* Behind our TLP definition or right at */
8351                 return (0);
8352         }
8353         if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
8354                 /* The start is beyond or right at our end of TLP definition */
8355                 return (0);
8356         }
8357         /* It has to be a sub-part of the original TLP recorded */
8358         return (1);
8359 }
8360
8361
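/*
 * Added descriptive note (hedged): rack_proc_sack_blk() applies one SACK
 * block [start, end) to the send map.  It may split rack_sendmap entries,
 * borrow a stack copy to account for pieces merged into an adjacent
 * already-acked entry, update RTT and reordering state, and finally try
 * to merge the worked-on entry with its acked neighbors.  The return
 * value is the count of newly SACKed bytes; *moved_two reports how much
 * the walk skipped over already-acked material.
 */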
8362 static uint32_t
8363 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
8364                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
8365 {
8366         uint32_t start, end, changed = 0;
8367         struct rack_sendmap stack_map;
8368         struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next;
8369         int32_t used_ref = 1;
8370         int moved = 0;
8371
8372         start = sack->start;
8373         end = sack->end;
8374         rsm = *prsm;
8375         memset(&fe, 0, sizeof(fe));
8376 do_rest_ofb:
8377         if ((rsm == NULL) ||
8378             (SEQ_LT(end, rsm->r_start)) ||
8379             (SEQ_GEQ(start, rsm->r_end)) ||
8380             (SEQ_LT(start, rsm->r_start))) {
8381                 /*
8382                  * We are not in the right spot,
8383                  * find the correct spot in the tree.
8384                  */
8385                 used_ref = 0;
8386                 fe.r_start = start;
8387                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
8388                 moved++;
8389         }
8390         if (rsm == NULL) {
8391                 /* TSNH */
8392                 goto out;
8393         }
8394         /* Ok we have an ACK for some piece of this rsm */
8395         if (rsm->r_start != start) {
8396                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8397                         /*
8398                          * Before any splitting or hookery is
8399                          * done, is it a TLP of interest, i.e. a rxt?
8400                          */
8401                         if ((rsm->r_flags & RACK_TLP) &&
8402                             (rsm->r_rtr_cnt > 1)) {
8403                                 /*
8404                                  * We are splitting a rxt TLP, check
8405                                  * if we need to save off the start/end
8406                                  */
8407                                 if (rack->rc_last_tlp_acked_set &&
8408                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8409                                         /*
8410                                          * We already turned this on since we are inside
8411                                          * the previous one; it was a partial sack, now we
8412                                          * are getting another one (maybe all of it).
8413                                          *
8414                                          */
8415                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8416                                         /*
8417                                          * Let's make sure we have all of it though.
8418                                          */
8419                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8420                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8421                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8422                                                                      rack->r_ctl.last_tlp_acked_end);
8423                                         }
8424                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8425                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8426                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8427                                                                      rack->r_ctl.last_tlp_acked_end);
8428                                         }
8429                                 } else {
8430                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8431                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8432                                         rack->rc_last_tlp_past_cumack = 0;
8433                                         rack->rc_last_tlp_acked_set = 1;
8434                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8435                                 }
8436                         }
8437                         /**
8438                          * Need to split this in two pieces the before and after,
8439                          * the before remains in the map, the after must be
8440                          * added. In other words we have:
8441                          * rsm        |--------------|
8442                          * sackblk        |------->
8443                          * rsm will become
8444                          *     rsm    |---|
8445                          * and nrsm will be the sacked piece
8446                          *     nrsm       |----------|
8447                          *
8448                          * But before we start down that path let's
8449                          * see if the sack spans over on top of
8450                          * the next guy and it is already sacked.
8451                          *
8452                          */
8453                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8454                         if (next && (next->r_flags & RACK_ACKED) &&
8455                             SEQ_GEQ(end, next->r_start)) {
8456                                 /**
8457                                  * So the next one is already acked, and
8458                                  * we can thus by hookery use our stack_map
8459                                  * to reflect the piece being sacked and
8460                                  * then adjust the two tree entries moving
8461                                  * the start and ends around. So we start like:
8462                                  *  rsm     |------------|             (not-acked)
8463                                  *  next                 |-----------| (acked)
8464                                  *  sackblk        |-------->
8465                                  *  We want to end like so:
8466                                  *  rsm     |------|                   (not-acked)
8467                                  *  next           |-----------------| (acked)
8468                                  *  nrsm           |-----|
8469                                  * Where nrsm is a temporary stack piece we
8470                                  * use to update all the gizmos.
8471                                  */
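                                /*
                                 * Worked example (added, assumed numbers): rsm [100,200)
                                 * not acked, next [200,300) acked, sackblk starting at 150.
                                 * After the adjustment rsm becomes [100,150), next becomes
                                 * [150,300), and the stack copy nrsm [150,200) is used only
                                 * to update RTT/stats for the newly sacked bytes.
                                 */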
8472                                 /* Copy up our fudge block */
8473                                 nrsm = &stack_map;
8474                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8475                                 /* Now adjust our tree blocks */
8476                                 rsm->r_end = start;
8477                                 next->r_start = start;
8478                                 /* Now we must adjust back where next->m is */
8479                                 rack_setup_offset_for_rsm(rsm, next);
8480
8481                                 /* We don't need to adjust rsm, it did not change */
8482                                 /* Clear out the dup ack count of the remainder */
8483                                 rsm->r_dupack = 0;
8484                                 rsm->r_just_ret = 0;
8485                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8486                                 /* Now lets make sure our fudge block is right */
8487                                 nrsm->r_start = start;
8488                                 /* Now lets update all the stats and such */
8489                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8490                                 if (rack->app_limited_needs_set)
8491                                         rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8492                                 changed += (nrsm->r_end - nrsm->r_start);
8493                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8494                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8495                                         counter_u64_add(rack_reorder_seen, 1);
8496                                         rack->r_ctl.rc_reorder_ts = cts;
8497                                 }
8498                                 /*
8499                                  * Now we want to go up from rsm (the
8500                                  * one left un-acked) to the next one
8501                                  * in the tmap. We do this so when
8502                                  * we walk backwards we include marking
8503                                  * sack-passed on rsm (The one passed in
8504                                  * is skipped since it is generally called
8505                                  * on something sacked before removing it
8506                                  * from the tmap).
8507                                  */
8508                                 if (rsm->r_in_tmap) {
8509                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
8510                                         /*
8511                                          * Now that we have the next
8512                                          * one walk backwards from there.
8513                                          */
8514                                         if (nrsm && nrsm->r_in_tmap)
8515                                                 rack_log_sack_passed(tp, rack, nrsm);
8516                                 }
8517                                 /* Now are we done? */
8518                                 if (SEQ_LT(end, next->r_end) ||
8519                                     (end == next->r_end)) {
8520                                         /* Done with block */
8521                                         goto out;
8522                                 }
8523                                 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
8524                                 counter_u64_add(rack_sack_used_next_merge, 1);
8525                                 /* Position for the next block */
8526                                 start = next->r_end;
8527                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
8528                                 if (rsm == NULL)
8529                                         goto out;
8530                         } else {
8531                                 /**
8532                                  * We can't use any hookery here, so we
8533                                  * need to split the map. We enter like
8534                                  * so:
8535                                  *  rsm      |--------|
8536                                  *  sackblk       |----->
8537                                  * We will add the new block nrsm and
8538                                  * that will be the new portion, and then
8539                                  * fall through after resetting rsm. So we
8540                                  * split and look like this:
8541                                  *  rsm      |----|
8542                                  *  sackblk       |----->
8543                                  *  nrsm          |---|
8544                                  * We then fall through resetting
8545                                  * rsm to nrsm, so the next block
8546                                  * picks it up.
8547                                  */
8548                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8549                                 if (nrsm == NULL) {
8550                                         /*
8551                                          * failed XXXrrs what can we do but lose the sack
8552                                          * info?
8553                                          */
8554                                         goto out;
8555                                 }
8556                                 counter_u64_add(rack_sack_splits, 1);
8557                                 rack_clone_rsm(rack, nrsm, rsm, start);
8558                                 rsm->r_just_ret = 0;
8559                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8560 #ifdef INVARIANTS
8561                                 if (insret != NULL) {
8562                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8563                                               nrsm, insret, rack, rsm);
8564                                 }
8565 #endif
8566                                 if (rsm->r_in_tmap) {
8567                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8568                                         nrsm->r_in_tmap = 1;
8569                                 }
8570                                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
8571                                 rsm->r_flags &= (~RACK_HAS_FIN);
8572                                 /* Position us to point to the new nrsm that starts the sack blk */
8573                                 rsm = nrsm;
8574                         }
8575                 } else {
8576                         /* Already sacked this piece */
8577                         counter_u64_add(rack_sack_skipped_acked, 1);
8578                         moved++;
8579                         if (end == rsm->r_end) {
8580                                 /* Done with block */
8581                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8582                                 goto out;
8583                         } else if (SEQ_LT(end, rsm->r_end)) {
8584                                 /* A partial sack to an already sacked block */
8585                                 moved++;
8586                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8587                                 goto out;
8588                         } else {
8589                                 /*
8590                                  * The end goes beyond this guy;
8591                                  * reposition the start to the
8592                                  * next block.
8593                                  */
8594                                 start = rsm->r_end;
8595                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8596                                 if (rsm == NULL)
8597                                         goto out;
8598                         }
8599                 }
8600         }
8601         if (SEQ_GEQ(end, rsm->r_end)) {
8602                 /**
8603                  * The end of this block is either beyond this guy or right
8604                  * at this guy. I.e.:
8605                  *  rsm ---                 |-----|
8606                  *  end                     |-----|
8607                  *  <or>
8608                  *  end                     |---------|
8609                  */
8610                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8611                         /*
8612                          * Is it a TLP of interest?
8613                          */
8614                         if ((rsm->r_flags & RACK_TLP) &&
8615                             (rsm->r_rtr_cnt > 1)) {
8616                                 /*
8617                                  * We are splitting a rxt TLP, check
8618                                  * if we need to save off the start/end
8619                                  */
8620                                 if (rack->rc_last_tlp_acked_set &&
8621                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8622                                         /*
8623                                          * We already turned this on since we are inside
8624                                          * the previous one; it was a partial sack, now we
8625                                          * are getting another one (maybe all of it).
8626                                          */
8627                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8628                                         /*
8629                                          * Let's make sure we have all of it though.
8630                                          */
8631                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8632                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8633                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8634                                                                      rack->r_ctl.last_tlp_acked_end);
8635                                         }
8636                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8637                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8638                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8639                                                                      rack->r_ctl.last_tlp_acked_end);
8640                                         }
8641                                 } else {
8642                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8643                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8644                                         rack->rc_last_tlp_past_cumack = 0;
8645                                         rack->rc_last_tlp_acked_set = 1;
8646                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8647                                 }
8648                         }
8649                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8650                         changed += (rsm->r_end - rsm->r_start);
8651                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8652                         if (rsm->r_in_tmap) /* should be true */
8653                                 rack_log_sack_passed(tp, rack, rsm);
8654                         /* Is reordering occurring? */
8655                         if (rsm->r_flags & RACK_SACK_PASSED) {
8656                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8657                                 counter_u64_add(rack_reorder_seen, 1);
8658                                 rack->r_ctl.rc_reorder_ts = cts;
8659                         }
8660                         if (rack->app_limited_needs_set)
8661                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8662                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8663                         rsm->r_flags |= RACK_ACKED;
8664                         if (rsm->r_in_tmap) {
8665                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8666                                 rsm->r_in_tmap = 0;
8667                         }
8668                         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
8669                 } else {
8670                         counter_u64_add(rack_sack_skipped_acked, 1);
8671                         moved++;
8672                 }
8673                 if (end == rsm->r_end) {
8674                         /* This block only - done, setup for next */
8675                         goto out;
8676                 }
8677                 /*
8678                  * There is more not covered by this rsm; move on
8679                  * to the next block in the RB tree.
8680                  */
8681                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8682                 start = rsm->r_end;
8683                 rsm = nrsm;
8684                 if (rsm == NULL)
8685                         goto out;
8686                 goto do_rest_ofb;
8687         }
8688         /**
8689          * The end of this sack block is smaller than
8690          * our rsm i.e.:
8691          *  rsm ---                 |-----|
8692          *  end                     |--|
8693          */
8694         if ((rsm->r_flags & RACK_ACKED) == 0) {
8695                 /*
8696                  * Is it a TLP of interest?
8697                  */
8698                 if ((rsm->r_flags & RACK_TLP) &&
8699                     (rsm->r_rtr_cnt > 1)) {
8700                         /*
8701                          * We are splitting a rxt TLP, check
8702                          * if we need to save off the start/end
8703                          */
8704                         if (rack->rc_last_tlp_acked_set &&
8705                             (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8706                                 /*
8707                                  * We already turned this on since we are inside
8708                                  * the previous one; it was a partial sack, now we
8709                                  * are getting another one (maybe all of it).
8710                                  */
8711                                 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8712                                 /*
8713                                  * Let's make sure we have all of it though.
8714                                  */
8715                                 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8716                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8717                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8718                                                              rack->r_ctl.last_tlp_acked_end);
8719                                 }
8720                                 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8721                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8722                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8723                                                              rack->r_ctl.last_tlp_acked_end);
8724                                 }
8725                         } else {
8726                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8727                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8728                                 rack->rc_last_tlp_past_cumack = 0;
8729                                 rack->rc_last_tlp_acked_set = 1;
8730                                 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8731                         }
8732                 }
8733                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8734                 if (prev &&
8735                     (prev->r_flags & RACK_ACKED)) {
8736                         /**
8737                          * Goal, we want the right remainder of rsm to shrink
8738                          * in place and span from (rsm->r_start = end) to rsm->r_end.
8739                          * We want to expand prev to go all the way
8740                          * to prev->r_end <- end.
8741                          * so in the tree we have before:
8742                          *   prev     |--------|         (acked)
8743                          *   rsm               |-------| (non-acked)
8744                          *   sackblk           |-|
8745                          * We churn it so we end up with
8746                          *   prev     |----------|       (acked)
8747                          *   rsm                 |-----| (non-acked)
8748                          *   nrsm              |-| (temporary)
8749                          *
8750                          * Note if either prev/rsm is a TLP we don't
8751                          * do this.
8752                          */
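                        /*
                         * Worked example (added, assumed numbers): prev [100,200)
                         * acked, rsm [200,300) not acked, sackblk [200,250).
                         * prev grows to [100,250), rsm shrinks to [250,300), and
                         * the stack copy nrsm [200,250) carries the newly sacked
                         * bytes for the RTT/stat updates below.
                         */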
8753                         nrsm = &stack_map;
8754                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8755                         prev->r_end = end;
8756                         rsm->r_start = end;
8757                         /* Now adjust nrsm (stack copy) to be
8758                          * the small piece that
8759                          * was "sacked".
8760                          */
8761                         nrsm->r_end = end;
8762                         rsm->r_dupack = 0;
8763                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8764                         /*
8765                          * Now that the rsm has had its start moved forward
8766                          * lets go ahead and get its new place in the world.
8767                          */
8768                         rack_setup_offset_for_rsm(prev, rsm);
8769                         /*
8770                          * Now nrsm is our new little piece
8771                          * that is acked (which was merged
8772                          * to prev). Update the rtt and changed
8773                          * based on that. Also check for reordering.
8774                          */
8775                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8776                         if (rack->app_limited_needs_set)
8777                                 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8778                         changed += (nrsm->r_end - nrsm->r_start);
8779                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8780                         if (nrsm->r_flags & RACK_SACK_PASSED) {
8781                                 counter_u64_add(rack_reorder_seen, 1);
8782                                 rack->r_ctl.rc_reorder_ts = cts;
8783                         }
8784                         rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
8785                         rsm = prev;
8786                         counter_u64_add(rack_sack_used_prev_merge, 1);
8787                 } else {
8788                         /**
8789                          * This is the case where our previous
8790                          * block is not acked either, so we must
8791                          * split the block in two.
8792                          */
8793                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8794                         if (nrsm == NULL) {
8795                                 /* failed rrs what can we do but lose the sack info? */
8796                                 goto out;
8797                         }
8798                         if ((rsm->r_flags & RACK_TLP) &&
8799                             (rsm->r_rtr_cnt > 1)) {
8800                                 /*
8801                                  * We are splitting a rxt TLP, check
8802                                  * if we need to save off the start/end
8803                                  */
8804                                 if (rack->rc_last_tlp_acked_set &&
8805                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8806                                             /*
8807                                              * We already turned this on since this block is inside
8808                                              * the previous one; it was a partial sack, now we
8809                                              * are getting another one (maybe all of it).
8810                                              */
8811                                             rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8812                                             /*
8813                                              * Let's make sure we have all of it though.
8814                                              */
8815                                             if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8816                                                     rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8817                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8818                                                                          rack->r_ctl.last_tlp_acked_end);
8819                                             }
8820                                             if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8821                                                     rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8822                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8823                                                                          rack->r_ctl.last_tlp_acked_end);
8824                                             }
8825                                     } else {
8826                                             rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8827                                             rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8828                                             rack->rc_last_tlp_acked_set = 1;
8829                                             rack->rc_last_tlp_past_cumack = 0;
8830                                             rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8831                                     }
8832                         }
8833                         /**
8834                          * In this case nrsm becomes
8835                          * nrsm->r_start = end;
8836                          * nrsm->r_end = rsm->r_end;
8837                          * which is un-acked.
8838                          * <and>
8839                          * rsm->r_end = nrsm->r_start;
8840                          * i.e. the remaining un-acked
8841                          * piece is left on the left
8842                          * hand side.
8843                          *
8844                          * So we start like this
8845                          * rsm      |----------| (not acked)
8846                          * sackblk  |---|
8847                          * build it so we have
8848                          * rsm      |---|         (acked)
8849                          * nrsm         |------|  (not acked)
8850                          */
8851                         counter_u64_add(rack_sack_splits, 1);
8852                         rack_clone_rsm(rack, nrsm, rsm, end);
8853                         rsm->r_flags &= (~RACK_HAS_FIN);
8854                         rsm->r_just_ret = 0;
8855                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8856 #ifdef INVARIANTS
8857                         if (insret != NULL) {
8858                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8859                                       nrsm, insret, rack, rsm);
8860                         }
8861 #endif
8862                         if (rsm->r_in_tmap) {
8863                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8864                                 nrsm->r_in_tmap = 1;
8865                         }
8866                         nrsm->r_dupack = 0;
8867                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
8868                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8869                         changed += (rsm->r_end - rsm->r_start);
8870                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8871                         if (rsm->r_in_tmap) /* should be true */
8872                                 rack_log_sack_passed(tp, rack, rsm);
8873                         /* Is reordering occurring? */
8874                         if (rsm->r_flags & RACK_SACK_PASSED) {
8875                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8876                                 counter_u64_add(rack_reorder_seen, 1);
8877                                 rack->r_ctl.rc_reorder_ts = cts;
8878                         }
8879                         if (rack->app_limited_needs_set)
8880                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8881                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8882                         rsm->r_flags |= RACK_ACKED;
8883                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
8884                         if (rsm->r_in_tmap) {
8885                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8886                                 rsm->r_in_tmap = 0;
8887                         }
8888                 }
8889         } else if (start != end){
8890                 /*
8891                  * The block was already acked.
8892                  */
8893                 counter_u64_add(rack_sack_skipped_acked, 1);
8894                 moved++;
8895         }
8896 out:
8897         if (rsm &&
8898             ((rsm->r_flags & RACK_TLP) == 0) &&
8899             (rsm->r_flags & RACK_ACKED)) {
8900                 /*
8901                  * Now can we merge where we worked
8902                  * with either the previous or
8903                  * next block?
8904                  */
8905                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8906                 while (next) {
8907                         if (next->r_flags & RACK_TLP)
8908                                 break;
8909                         if (next->r_flags & RACK_ACKED) {
8910                         /* yep this and next can be merged */
8911                                 rsm = rack_merge_rsm(rack, rsm, next);
8912                                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8913                         } else
8914                                 break;
8915                 }
8916                 /* Now what about the previous? */
8917                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8918                 while (prev) {
8919                         if (prev->r_flags & RACK_TLP)
8920                                 break;
8921                         if (prev->r_flags & RACK_ACKED) {
8922                                 /* yep the previous and this can be merged */
8923                                 rsm = rack_merge_rsm(rack, prev, rsm);
8924                                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8925                         } else
8926                                 break;
8927                 }
8928         }
8929         if (used_ref == 0) {
8930                 counter_u64_add(rack_sack_proc_all, 1);
8931         } else {
8932                 counter_u64_add(rack_sack_proc_short, 1);
8933         }
8934         /* Save off the next one for quick reference. */
8935         if (rsm)
8936                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8937         else
8938                 nrsm = NULL;
8939         *prsm = rack->r_ctl.rc_sacklast = nrsm;
8940         /* Pass back the moved. */
8941         *moved_two = moved;
8942         return (changed);
8943 }
8944
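/*
 * Added note (hedged): the peer has reneged on data it previously SACKed.
 * Walk forward from rsm while entries are still marked RACK_ACKED, strip
 * the SACK-related flags, re-link each entry at the front of the tmap in
 * order so it is again tracked for (re)transmission, and clear the sack
 * filter from th_ack forward.
 */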
8945 static void inline
8946 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
8947 {
8948         struct rack_sendmap *tmap;
8949
8950         tmap = NULL;
8951         while (rsm && (rsm->r_flags & RACK_ACKED)) {
8952                 /* It's no longer sacked, mark it so */
8953                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
8954 #ifdef INVARIANTS
8955                 if (rsm->r_in_tmap) {
8956                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
8957                               rack, rsm, rsm->r_flags);
8958                 }
8959 #endif
8960                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
8961                 /* Rebuild it into our tmap */
8962                 if (tmap == NULL) {
8963                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8964                         tmap = rsm;
8965                 } else {
8966                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
8967                         tmap = rsm;
8968                 }
8969                 tmap->r_in_tmap = 1;
8970                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8971         }
8972         /*
8973          * Now let's possibly clear the sack filter so we start
8974          * recognizing sacks that cover this area.
8975          */
8976         sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
8977
8978 }
8979
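/*
 * Added note (hedged): rack_do_decay() counts input packets and, once at
 * least a second has passed (or the connection is in persist or idle),
 * records its tracking values and then decays the SACK-attack detection
 * counters (ack_count, sack_count, sack_moved_extra, sack_noextra_move)
 * by tcp_sad_decay_val; persisting, idle, or low-pps connections are
 * left undecayed.
 */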
8980 static void
8981 rack_do_decay(struct tcp_rack *rack)
8982 {
8983         struct timeval res;
8984
8985 #define timersub(tvp, uvp, vvp)                                         \
8986         do {                                                            \
8987                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
8988                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
8989                 if ((vvp)->tv_usec < 0) {                               \
8990                         (vvp)->tv_sec--;                                \
8991                         (vvp)->tv_usec += 1000000;                      \
8992                 }                                                       \
8993         } while (0)
8994
8995         timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
8996 #undef timersub
8997
8998         rack->r_ctl.input_pkt++;
8999         if ((rack->rc_in_persist) ||
9000             (res.tv_sec >= 1) ||
9001             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
9002                 /*
9003                  * Check for decay of non-SAD,
9004                  * we want all SAD detection metrics to
9005                  * decay by 1/4 for each second (or more) that has passed.
9006                  */
9007                 uint32_t pkt_delta;
9008
9009                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
9010                 /* Update our saved tracking values */
9011                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
9012                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
9013                 /* Now do we escape without decay? */
9014 #ifdef NETFLIX_EXP_DETECTION
9015                 if (rack->rc_in_persist ||
9016                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
9017                     (pkt_delta < tcp_sad_low_pps)){
9018                         /*
9019                          * We don't decay idle connections
9020                          * or ones that have a low input pps.
9021                          */
9022                         return;
9023                 }
9024                 /* Decay the counters */
9025                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
9026                                                         tcp_sad_decay_val);
9027                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
9028                                                          tcp_sad_decay_val);
9029                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
9030                                                                tcp_sad_decay_val);
9031                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
9032                                                                 tcp_sad_decay_val);
9033 #endif
9034         }
9035 }
9036
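/*
 * Added note (hedged): rack_process_to_cumack() handles the cumulative
 * ACK point advancing to th_ack.  It first ages out the recorded
 * last-TLP state once the cum-ack has moved far enough past it, then
 * walks the send map from the lowest entry, computing RTTs for entries
 * now covered by the cum-ack.
 */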
9037 static void
9038 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
9039 {
9040         struct rack_sendmap *rsm, *rm;
9041
9042         /*
9043          * The ACK point is advancing to th_ack; we must drop off
9044          * the packets in the rack log and calculate any eligible
9045          * RTTs.
9046          */
9047         rack->r_wanted_output = 1;
9048
9049         /* Tend any TLP that has been marked for 1/2 the seq space (it's old) */
9050         if ((rack->rc_last_tlp_acked_set == 1)&&
9051             (rack->rc_last_tlp_past_cumack == 1) &&
9052             (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
9053                 /*
9054                  * We have reached the point where our last rack
9055                  * tlp retransmit sequence is ahead of the cum-ack.
9056                  * This can only happen when the cum-ack moves all
9057                  * the way around (it has been a full 2^31+1 bytes
9058                  * or more since we sent a retransmitted TLP). Let's
9059                  * turn off the valid flag since it's not really valid.
9060                  *
9061                  * Note: since SACKs also turn on this event we have
9062                  * a complication; we have to wait to age it out until
9063                  * the cum-ack is past the TLP before checking, which is
9064                  * what the next else clause does.
9065                  */
9066                 rack_log_dsack_event(rack, 9, __LINE__,
9067                                      rack->r_ctl.last_tlp_acked_start,
9068                                      rack->r_ctl.last_tlp_acked_end);
9069                 rack->rc_last_tlp_acked_set = 0;
9070                 rack->rc_last_tlp_past_cumack = 0;
9071         } else if ((rack->rc_last_tlp_acked_set == 1) &&
9072                    (rack->rc_last_tlp_past_cumack == 0) &&
9073                    (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
9074                 /*
9075                  * It is safe to start aging TLPs out.
9076                  */
9077                 rack->rc_last_tlp_past_cumack = 1;
9078         }
9079         /* We do the same for the tlp send seq as well */
9080         if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
9081             (rack->rc_last_sent_tlp_past_cumack == 1) &&
9082             (SEQ_GT(rack->r_ctl.last_sent_tlp_seq,  th_ack))) {
9083                 rack_log_dsack_event(rack, 9, __LINE__,
9084                                      rack->r_ctl.last_sent_tlp_seq,
9085                                      (rack->r_ctl.last_sent_tlp_seq +
9086                                       rack->r_ctl.last_sent_tlp_len));
9087                 rack->rc_last_sent_tlp_seq_valid = 0;
9088                 rack->rc_last_sent_tlp_past_cumack = 0;
9089         } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
9090                    (rack->rc_last_sent_tlp_past_cumack == 0) &&
9091                    (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
9092                 /*
9093                  * It is safe to start aging the TLP send seq out.
9094                  */
9095                 rack->rc_last_sent_tlp_past_cumack = 1;
9096         }
9097 more:
9098         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9099         if (rsm == NULL) {
9100                 if ((th_ack - 1) == tp->iss) {
9101                         /*
9102                          * For the SYN incoming case we will not
9103                          * have called tcp_output for the sending of
9104                          * the SYN, so there will be no map. All
9105                          * other cases should probably be a panic.
9106                          */
9107                         return;
9108                 }
9109                 if (tp->t_flags & TF_SENTFIN) {
9110                         /* if we sent a FIN we often will not have a map */
9111                         return;
9112                 }
9113 #ifdef INVARIANTS
9114                 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
9115                       tp,
9116                       tp->t_state, th_ack, rack,
9117                       tp->snd_una, tp->snd_max, tp->snd_nxt);
9118 #endif
9119                 return;
9120         }
9121         if (SEQ_LT(th_ack, rsm->r_start)) {
9122                 /* Huh map is missing this */
9123 #ifdef INVARIANTS
9124                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
9125                        rsm->r_start,
9126                        th_ack, tp->t_state, rack->r_state);
9127 #endif
9128                 return;
9129         }
9130         rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
9131
9132         /* Now was it a retransmitted TLP? */
9133         if ((rsm->r_flags & RACK_TLP) &&
9134             (rsm->r_rtr_cnt > 1)) {
9135                 /*
9136                  * Yes, this rsm was a TLP and retransmitted, remember that
9137                  * since if a DSACK comes back on this we don't want
9138                  * to think of it as a reordered segment. This may
9139                  * get updated again with possibly even other TLPs
9140                  * in flight, but that's ok. Only when we don't send
9141                  * a retransmitted TLP for 1/2 the sequence space
9142                  * will it get turned off (above).
9143                  */
9144                 if (rack->rc_last_tlp_acked_set &&
9145                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9146                         /*
9147                          * We already turned this on since the end matches,
9148                          * the previous one was a partial ack, now we
9149                          * are getting another one (maybe all of it).
9150                          */
9151                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9152                         /*
9153                          * Lets make sure we have all of it though.
9154                          */
9155                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9156                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9157                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9158                                                      rack->r_ctl.last_tlp_acked_end);
9159                         }
9160                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9161                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9162                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9163                                                      rack->r_ctl.last_tlp_acked_end);
9164                         }
9165                 } else {
9166                         rack->rc_last_tlp_past_cumack = 1;
9167                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9168                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9169                         rack->rc_last_tlp_acked_set = 1;
9170                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9171                 }
9172         }
9173         /* Now do we consume the whole thing? */
9174         if (SEQ_GEQ(th_ack, rsm->r_end)) {
9175                 /* Its all consumed. */
9176                 uint32_t left;
9177                 uint8_t newly_acked;
9178
9179                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
9180                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
9181                 rsm->r_rtr_bytes = 0;
9182                 /* Record the time of highest cumack sent */
9183                 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9184                 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
9185 #ifdef INVARIANTS
9186                 if (rm != rsm) {
9187                         panic("removing head in rack:%p rsm:%p rm:%p",
9188                               rack, rsm, rm);
9189                 }
9190 #endif
9191                 if (rsm->r_in_tmap) {
9192                         TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9193                         rsm->r_in_tmap = 0;
9194                 }
9195                 newly_acked = 1;
9196                 if (rsm->r_flags & RACK_ACKED) {
9197                         /*
9198                          * It was acked on the scoreboard -- remove
9199                          * it from total
9200                          */
9201                         rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
9202                         newly_acked = 0;
9203                 } else if (rsm->r_flags & RACK_SACK_PASSED) {
9204                         /*
9205                          * There are segments ACKED on the
9206                          * scoreboard further up. We are seeing
9207                          * reordering.
9208                          */
9209                         rsm->r_flags &= ~RACK_SACK_PASSED;
9210                         counter_u64_add(rack_reorder_seen, 1);
9211                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9212                         rsm->r_flags |= RACK_ACKED;
9213                         rack->r_ctl.rc_reorder_ts = cts;
9214                         if (rack->r_ent_rec_ns) {
9215                                 /*
9216                          * We have sent no more, and we saw a sack
9217                          * then an ack arrive.
9218                                  */
9219                                 rack->r_might_revert = 1;
9220                         }
9221                 }
9222                 if ((rsm->r_flags & RACK_TO_REXT) &&
9223                     (tp->t_flags & TF_RCVD_TSTMP) &&
9224                     (to->to_flags & TOF_TS) &&
9225                     (to->to_tsecr != 0) &&
9226                     (tp->t_flags & TF_PREVVALID)) {
9227                         /*
9228                          * We can use the timestamp to see
9229                          * if this retransmission was from the
9230                          * first transmit. If so we made a mistake.
9231                          */
9232                         tp->t_flags &= ~TF_PREVVALID;
9233                         if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
9234                                 /* The first transmit is what this ack is for */
9235                                 rack_cong_signal(tp, CC_RTO_ERR, th_ack);
9236                         }
9237                 }
9238                 left = th_ack - rsm->r_end;
9239                 if (rack->app_limited_needs_set && newly_acked)
9240                         rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
9241                 /* Free back to zone */
9242                 rack_free(rack, rsm);
9243                 if (left) {
9244                         goto more;
9245                 }
9246                 /* Check for reneging */
9247                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9248                 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
9249                         /*
9250                          * The peer has moved snd_una up to
9251                          * the edge of this send, i.e. one
9252                          * that it had previously acked. The only
9253                          * way that can be true is if the peer threw
9254                          * away data (space issues) that it had
9255                          * previously sacked (else it would have
9256                          * given us snd_una up to rsm->r_end).
9257                          * We need to undo the acked markings here.
9258                          *
9259                          * Note we have to look to make sure th_ack is
9260                          * our rsm->r_start in case we get an old ack
9261                          * where th_ack is behind snd_una.
9262                          */
9263                         rack_peer_reneges(rack, rsm, th_ack);
9264                 }
9265                 return;
9266         }
9267         if (rsm->r_flags & RACK_ACKED) {
9268                 /*
9269                  * It was acked on the scoreboard -- remove it from
9270                  * total for the part being cum-acked.
9271                  */
9272                 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
9273         }
9274         /*
9275          * Clear the dup ack count for
9276          * the piece that remains.
9277          */
9278         rsm->r_dupack = 0;
9279         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9280         if (rsm->r_rtr_bytes) {
9281                 /*
9282                  * It was retransmitted adjust the
9283                  * sack holes for what was acked.
9284                  */
9285                 int ack_am;
9286
9287                 ack_am = (th_ack - rsm->r_start);
9288                 if (ack_am >= rsm->r_rtr_bytes) {
9289                         rack->r_ctl.rc_holes_rxt -= ack_am;
9290                         rsm->r_rtr_bytes -= ack_am;
9291                 }
9292         }
9293         /*
9294          * Update where the piece starts and record
9295          * the time of send of highest cumack sent.
9296          */
9297         rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9298         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
9299         /* Now we need to move our offset forward too */
9300         if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
9301                 /* Fix up the orig_m_len and possibly the mbuf offset */
9302                 rack_adjust_orig_mlen(rsm);
9303         }
9304         rsm->soff += (th_ack - rsm->r_start);
9305         rsm->r_start = th_ack;
9306         /* Now do we need to move the mbuf fwd too? */
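        /*
         * Illustrative sketch with assumed numbers: if the cum-ack
         * trims 2000 bytes off the front of this rsm and rsm->soff
         * started at 0 in a 1448-byte mbuf, the walk below advances
         * rsm->m one mbuf and leaves rsm->soff at 2000 - 1448 = 552.
         */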
9307         if (rsm->m) {
9308                 while (rsm->soff >= rsm->m->m_len) {
9309                         rsm->soff -= rsm->m->m_len;
9310                         rsm->m = rsm->m->m_next;
9311                         KASSERT((rsm->m != NULL),
9312                                 (" nrsm:%p hit at soff:%u null m",
9313                                  rsm, rsm->soff));
9314                 }
9315                 rsm->orig_m_len = rsm->m->m_len;
9316         }
9317         if (rack->app_limited_needs_set)
9318                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
9319 }
9320
9321 static void
9322 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
9323 {
9324         struct rack_sendmap *rsm;
9325         int sack_pass_fnd = 0;
9326
9327         if (rack->r_might_revert) {
9328                 /*
9329                  * Ok we have reordering, have not sent anything, we
9330                  * might want to revert the congestion state if nothing
9331                  * further has SACK_PASSED on it. Lets check.
9332                  *
9333                  * We also get here when we have DSACKs come in for
9334                  * all the data that we FR'd. Note that a rxt or tlp
9335                  * timer prevents this from happening.
9336                  */
9337
9338                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
9339                         if (rsm->r_flags & RACK_SACK_PASSED) {
9340                                 sack_pass_fnd = 1;
9341                                 break;
9342                         }
9343                 }
9344                 if (sack_pass_fnd == 0) {
9345                         /*
9346                          * We went into recovery
9347                          * incorrectly due to reordering!
9348                          */
9349                         int orig_cwnd;
9350
9351                         rack->r_ent_rec_ns = 0;
9352                         orig_cwnd = tp->snd_cwnd;
9353                         tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec;
9354                         tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
9355                         tp->snd_recover = tp->snd_una;
9356                         rack_log_to_prr(rack, 14, orig_cwnd);
9357                         EXIT_RECOVERY(tp->t_flags);
9358                 }
9359                 rack->r_might_revert = 0;
9360         }
9361 }
9362
9363 #ifdef NETFLIX_EXP_DETECTION
9364 static void
9365 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t bytes_this_ack, uint32_t segsiz)
9366 {
9367         if ((rack->do_detection || tcp_force_detection) &&
9368             tcp_sack_to_ack_thresh &&
9369             tcp_sack_to_move_thresh &&
9370             ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) {
9371                 /*
9372                  * We have thresholds set to find
9373                  * possible attackers and disable sack.
9374                  * Check them.
9375                  */
9376                 uint64_t ackratio, moveratio, movetotal;
9377
9378                 /* Log detecting */
9379                 rack_log_sad(rack, 1);
9380                 ackratio = (uint64_t)(rack->r_ctl.sack_count);
9381                 ackratio *= (uint64_t)(1000);
9382                 if (rack->r_ctl.ack_count)
9383                         ackratio /= (uint64_t)(rack->r_ctl.ack_count);
9384                 else {
9385                         /* We really should not hit here */
9386                         ackratio = 1000;
9387                 }
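                /*
                 * Illustrative sketch with assumed counts: the ratios
                 * here are scaled by 1000, so sack_count = 500 against
                 * ack_count = 100 gives ackratio = 500 * 1000 / 100 =
                 * 5000, i.e. roughly five SACKed blocks noted per
                 * credited ack.
                 */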
9388                 if ((rack->sack_attack_disable == 0) &&
9389                     (ackratio > rack_highest_sack_thresh_seen))
9390                         rack_highest_sack_thresh_seen = (uint32_t)ackratio;
9391                 movetotal = rack->r_ctl.sack_moved_extra;
9392                 movetotal += rack->r_ctl.sack_noextra_move;
9393                 moveratio = rack->r_ctl.sack_moved_extra;
9394                 moveratio *= (uint64_t)1000;
9395                 if (movetotal)
9396                         moveratio /= movetotal;
9397                 else {
9398                         /* No moves, that's pretty good */
9399                         moveratio = 0;
9400                 }
9401                 if ((rack->sack_attack_disable == 0) &&
9402                     (moveratio > rack_highest_move_thresh_seen))
9403                         rack_highest_move_thresh_seen = (uint32_t)moveratio;
9404                 if (rack->sack_attack_disable == 0) {
9405                         if ((ackratio > tcp_sack_to_ack_thresh) &&
9406                             (moveratio > tcp_sack_to_move_thresh)) {
9407                                 /* Disable sack processing */
9408                                 rack->sack_attack_disable = 1;
9409                                 if (rack->r_rep_attack == 0) {
9410                                         rack->r_rep_attack = 1;
9411                                         counter_u64_add(rack_sack_attacks_detected, 1);
9412                                 }
9413                                 if (tcp_attack_on_turns_on_logging) {
9414                                         /*
9415                                          * Turn on logging, used for debugging
9416                                          * false positives.
9417                                          */
9418                                         rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
9419                                 }
9420                                 /* Clamp the cwnd at flight size */
9421                                 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
9422                                 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
9423                                 rack_log_sad(rack, 2);
9424                         }
9425                 } else {
9426                         /* We are sack-disabled check for false positives */
9427                         if ((ackratio <= tcp_restoral_thresh) ||
9428                             (rack->r_ctl.rc_num_maps_alloced  < tcp_map_minimum)) {
9429                                 rack->sack_attack_disable = 0;
9430                                 rack_log_sad(rack, 3);
9431                                 /* Restart counting */
9432                                 rack->r_ctl.sack_count = 0;
9433                                 rack->r_ctl.sack_moved_extra = 0;
9434                                 rack->r_ctl.sack_noextra_move = 1;
9435                                 rack->r_ctl.ack_count = max(1,
9436                                       (bytes_this_ack / segsiz));
9437
9438                                 if (rack->r_rep_reverse == 0) {
9439                                         rack->r_rep_reverse = 1;
9440                                         counter_u64_add(rack_sack_attacks_reversed, 1);
9441                                 }
9442                                 /* Restore the cwnd */
9443                                 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
9444                                         rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
9445                         }
9446                 }
9447         }
9448 }
9449 #endif
9450
9451 static int
9452 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
9453 {
9454
9455         uint32_t am, l_end;
9456         int was_tlp = 0;
9457
9458         if (SEQ_GT(end, start))
9459                 am = end - start;
9460         else
9461                 am = 0;
9462         if ((rack->rc_last_tlp_acked_set ) &&
9463             (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
9464             (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
9465                 /*
9466                  * The DSACK is because of a TLP, so we don't
9467                  * do anything with the reordering window since
9468                  * it was not reordering that caused the DSACK but
9469                  * our previous retransmitted TLP.
9470                  */
9471                 rack_log_dsack_event(rack, 7, __LINE__, start, end);
9472                 was_tlp = 1;
9473                 goto skip_dsack_round;
9474         }
9475         if (rack->rc_last_sent_tlp_seq_valid) {
9476                 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
9477                 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
9478                     (SEQ_LEQ(end, l_end))) {
9479                         /*
9480                          * This dsack is from the last sent TLP, ignore it
9481                          * for reordering purposes.
9482                          */
9483                         rack_log_dsack_event(rack, 7, __LINE__, start, end);
9484                         was_tlp = 1;
9485                         goto skip_dsack_round;
9486                 }
9487         }
9488         if (rack->rc_dsack_round_seen == 0) {
9489                 rack->rc_dsack_round_seen = 1;
9490                 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
9491                 rack->r_ctl.num_dsack++;
9492                 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
9493                 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
9494         }
9495 skip_dsack_round:
9496         /*
9497          * We keep track of how many DSACK blocks we get
9498          * after a recovery incident.
9499          */
9500         rack->r_ctl.dsack_byte_cnt += am;
9501         if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
9502             rack->r_ctl.retran_during_recovery &&
9503             (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
9504                 /*
9505                  * False recovery most likely culprit is reordering. If
9506                  * False recovery; the most likely culprit is reordering. If
9507                  */
9508                 rack->r_might_revert = 1;
9509                 rack_handle_might_revert(rack->rc_tp, rack);
9510                 rack->r_might_revert = 0;
9511                 rack->r_ctl.retran_during_recovery = 0;
9512                 rack->r_ctl.dsack_byte_cnt = 0;
9513         }
9514         return (was_tlp);
9515 }
9516
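/*
 * Explanatory sketch (with assumed illustrative values): the computation
 * below follows the shape of Proportional Rate Reduction (RFC 6937).
 * While pipe exceeds ssthresh the send count is roughly
 *
 *   sndcnt = (rc_prr_delivered * ssthresh) / rc_prr_recovery_fs - rc_prr_out
 *
 * so with, say, ssthresh of 10 segments, a recovery flight size of 20
 * segments, 4 segments delivered and 1 segment already sent in recovery,
 * about (4 * 10) / 20 - 1 = 1 segment may be sent. Once pipe drops to
 * ssthresh or below, the else branch limits sending so that pipe is not
 * pushed past ssthresh.
 */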
9517 static void
9518 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
9519 {
9520         /* Deal with changed and PRR here (in recovery only) */
9521         uint32_t pipe, snd_una;
9522
9523         rack->r_ctl.rc_prr_delivered += changed;
9524
9525         if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
9526                 /*
9527                  * It is all outstanding, we are application limited
9528                  * and thus we don't need more room to send anything.
9529                  * Note we use tp->snd_una here and not th_ack because
9530                  * the data has not yet been cut from the sb.
9531                  */
9532                 rack->r_ctl.rc_prr_sndcnt = 0;
9533                 return;
9534         }
9535         /* Compute prr_sndcnt */
9536         if (SEQ_GT(tp->snd_una, th_ack)) {
9537                 snd_una = tp->snd_una;
9538         } else {
9539                 snd_una = th_ack;
9540         }
9541         pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
9542         if (pipe > tp->snd_ssthresh) {
9543                 long sndcnt;
9544
9545                 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
9546                 if (rack->r_ctl.rc_prr_recovery_fs > 0)
9547                         sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
9548                 else {
9549                         rack->r_ctl.rc_prr_sndcnt = 0;
9550                         rack_log_to_prr(rack, 9, 0);
9551                         sndcnt = 0;
9552                 }
9553                 sndcnt++;
9554                 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
9555                         sndcnt -= rack->r_ctl.rc_prr_out;
9556                 else
9557                         sndcnt = 0;
9558                 rack->r_ctl.rc_prr_sndcnt = sndcnt;
9559                 rack_log_to_prr(rack, 10, 0);
9560         } else {
9561                 uint32_t limit;
9562
9563                 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
9564                         limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
9565                 else
9566                         limit = 0;
9567                 if (changed > limit)
9568                         limit = changed;
9569                 limit += ctf_fixed_maxseg(tp);
9570                 if (tp->snd_ssthresh > pipe) {
9571                         rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
9572                         rack_log_to_prr(rack, 11, 0);
9573                 } else {
9574                         rack->r_ctl.rc_prr_sndcnt = min(0, limit);
9575                         rack_log_to_prr(rack, 12, 0);
9576                 }
9577         }
9578 }
9579
9580 static void
9581 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
9582 {
9583         uint32_t changed;
9584         struct tcp_rack *rack;
9585         struct rack_sendmap *rsm;
9586         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
9587         register uint32_t th_ack;
9588         int32_t i, j, k, num_sack_blks = 0;
9589         uint32_t cts, acked, ack_point, sack_changed = 0;
9590         int loop_start = 0, moved_two = 0;
9591         uint32_t tsused;
9592
9593
9594         INP_WLOCK_ASSERT(tp->t_inpcb);
9595         if (th->th_flags & TH_RST) {
9596                 /* We don't log resets */
9597                 return;
9598         }
9599         rack = (struct tcp_rack *)tp->t_fb_ptr;
9600         cts = tcp_get_usecs(NULL);
9601         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9602         changed = 0;
9603         th_ack = th->th_ack;
9604         if (rack->sack_attack_disable == 0)
9605                 rack_do_decay(rack);
9606         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
9607                 /*
9608                  * You only get credit for
9609                  * MSS and greater (and you get extra
9610                  * credit for larger cum-ack moves).
9611                  */
9612                 int ac;
9613
9614                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
9615                 rack->r_ctl.ack_count += ac;
9616                 counter_u64_add(rack_ack_total, ac);
9617         }
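        /*
         * Illustrative sketch with an assumed 1448-byte maxseg: a
         * cum-ack covering 4344 bytes credits ac = 4344 / 1448 = 3
         * to ack_count above, while a sub-MSS ack adds nothing.
         */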
9618         if (rack->r_ctl.ack_count > 0xfff00000) {
9619                 /*
9620                  * reduce the number to keep us under
9621                  * a uint32_t.
9622                  */
9623                 rack->r_ctl.ack_count /= 2;
9624                 rack->r_ctl.sack_count /= 2;
9625         }
9626         if (SEQ_GT(th_ack, tp->snd_una)) {
9627                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
9628                 tp->t_acktime = ticks;
9629         }
9630         if (rsm && SEQ_GT(th_ack, rsm->r_start))
9631                 changed = th_ack - rsm->r_start;
9632         if (changed) {
9633                 rack_process_to_cumack(tp, rack, th_ack, cts, to);
9634         }
9635         if ((to->to_flags & TOF_SACK) == 0) {
9636                 /* We are done, nothing left and no sack. */
9637                 rack_handle_might_revert(tp, rack);
9638                 /*
9639                  * For cases where we struck a dup-ack
9640                  * with no SACK, add to the changes so
9641                  * PRR will work right.
9642                  */
9643                 if (dup_ack_struck && (changed == 0)) {
9644                         changed += ctf_fixed_maxseg(rack->rc_tp);
9645                 }
9646                 goto out;
9647         }
9648         /* Sack block processing */
9649         if (SEQ_GT(th_ack, tp->snd_una))
9650                 ack_point = th_ack;
9651         else
9652                 ack_point = tp->snd_una;
9653         for (i = 0; i < to->to_nsacks; i++) {
9654                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
9655                       &sack, sizeof(sack));
9656                 sack.start = ntohl(sack.start);
9657                 sack.end = ntohl(sack.end);
9658                 if (SEQ_GT(sack.end, sack.start) &&
9659                     SEQ_GT(sack.start, ack_point) &&
9660                     SEQ_LT(sack.start, tp->snd_max) &&
9661                     SEQ_GT(sack.end, ack_point) &&
9662                     SEQ_LEQ(sack.end, tp->snd_max)) {
9663                         sack_blocks[num_sack_blks] = sack;
9664                         num_sack_blks++;
9665                 } else if (SEQ_LEQ(sack.start, th_ack) &&
9666                            SEQ_LEQ(sack.end, th_ack)) {
9667                         int was_tlp;
9668
9669                         was_tlp = rack_note_dsack(rack, sack.start, sack.end);
9670                         /*
9671                          * Its a D-SACK block.
9672                          */
9673                         tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
9674                 }
9675         }
9676         if (rack->rc_dsack_round_seen) {
9677                 /* Is the dsack round over? */
9678                 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
9679                         /* Yes it is */
9680                         rack->rc_dsack_round_seen = 0;
9681                         rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
9682                 }
9683         }
9684         /*
9685          * Sort the SACK blocks so we can update the rack scoreboard with
9686          * just one pass.
9687          */
9688         num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
9689                                          num_sack_blks, th->th_ack);
9690         ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
9691         if (num_sack_blks == 0) {
9692                 /* Nothing to sack (DSACKs?) */
9693                 goto out_with_totals;
9694         }
9695         if (num_sack_blks < 2) {
9696                 /* Only one, we don't need to sort */
9697                 goto do_sack_work;
9698         }
9699         /* Sort the sacks */
9700         for (i = 0; i < num_sack_blks; i++) {
9701                 for (j = i + 1; j < num_sack_blks; j++) {
9702                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
9703                                 sack = sack_blocks[i];
9704                                 sack_blocks[i] = sack_blocks[j];
9705                                 sack_blocks[j] = sack;
9706                         }
9707                 }
9708         }
9709         /*
9710          * Now are any of the sack block ends the same (yes some
9711          * implementations send these)?
9712          */
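        /*
         * Illustrative sketch with assumed blocks: if [150, 300) and
         * [100, 300) arrive with the same end, the loop below keeps the
         * earlier start (giving one block [100, 300)), slides the
         * remaining blocks down one slot and decrements num_sack_blks
         * before restarting.
         */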
9713 again:
9714         if (num_sack_blks == 0)
9715                 goto out_with_totals;
9716         if (num_sack_blks > 1) {
9717                 for (i = 0; i < num_sack_blks; i++) {
9718                         for (j = i + 1; j < num_sack_blks; j++) {
9719                                 if (sack_blocks[i].end == sack_blocks[j].end) {
9720                                         /*
9721                                          * Ok these two have the same end we
9722                                          * want the smallest end and then
9723                                          * throw away the larger and start
9724                                          * again.
9725                                          */
9726                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
9727                                                 /*
9728                                                  * The second block covers
9729                                                  * more area use that
9730                                                  */
9731                                                 sack_blocks[i].start = sack_blocks[j].start;
9732                                         }
9733                                         /*
9734                                          * Now collapse out the dup-sack and
9735                                          * lower the count
9736                                          */
9737                                         for (k = (j + 1); k < num_sack_blks; k++) {
9738                                                 sack_blocks[j].start = sack_blocks[k].start;
9739                                                 sack_blocks[j].end = sack_blocks[k].end;
9740                                                 j++;
9741                                         }
9742                                         num_sack_blks--;
9743                                         goto again;
9744                                 }
9745                         }
9746                 }
9747         }
9748 do_sack_work:
9749         /*
9750          * First lets look to see if
9751          * we have retransmitted and
9752          * can use the transmit next?
9753          */
9754         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
9755         if (rsm &&
9756             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
9757             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
9758                 /*
9759                  * We probably did the FR and the next
9760                  * SACK coming in continues as we would expect.
9761                  */
9762                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
9763                 if (acked) {
9764                         rack->r_wanted_output = 1;
9765                         changed += acked;
9766                         sack_changed += acked;
9767                 }
9768                 if (num_sack_blks == 1) {
9769                         /*
9770                          * This is what we would expect from
9771                          * a normal implementation to happen
9772                          * after we have retransmitted the FR,
9773                          * i.e. the sack-filter pushes down
9774                          * to 1 block and the next to be retransmitted
9775                          * is the sequence in the sack block (as more
9776                          * are acked). Count this as ACK'd data to boost
9777                          * up the chances of recovering any false positives.
9778                          */
9779                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
9780                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
9781                         counter_u64_add(rack_express_sack, 1);
9782                         if (rack->r_ctl.ack_count > 0xfff00000) {
9783                                 /*
9784                                  * reduce the number to keep us under
9785                                  * a uint32_t.
9786                                  */
9787                                 rack->r_ctl.ack_count /= 2;
9788                                 rack->r_ctl.sack_count /= 2;
9789                         }
9790                         goto out_with_totals;
9791                 } else {
9792                         /*
9793                          * Start the loop through the
9794                          * rest of blocks, past the first block.
9795                          */
9796                         moved_two = 0;
9797                         loop_start = 1;
9798                 }
9799         }
9800         /* Its a sack of some sort */
9801         rack->r_ctl.sack_count++;
9802         if (rack->r_ctl.sack_count > 0xfff00000) {
9803                 /*
9804                  * reduce the number to keep us under
9805                  * a uint32_t.
9806                  */
9807                 rack->r_ctl.ack_count /= 2;
9808                 rack->r_ctl.sack_count /= 2;
9809         }
9810         counter_u64_add(rack_sack_total, 1);
9811         if (rack->sack_attack_disable) {
9812                 /* An attacker disablement is in place */
9813                 if (num_sack_blks > 1) {
9814                         rack->r_ctl.sack_count += (num_sack_blks - 1);
9815                         rack->r_ctl.sack_moved_extra++;
9816                         counter_u64_add(rack_move_some, 1);
9817                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
9818                                 rack->r_ctl.sack_moved_extra /= 2;
9819                                 rack->r_ctl.sack_noextra_move /= 2;
9820                         }
9821                 }
9822                 goto out;
9823         }
9824         rsm = rack->r_ctl.rc_sacklast;
9825         for (i = loop_start; i < num_sack_blks; i++) {
9826                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
9827                 if (acked) {
9828                         rack->r_wanted_output = 1;
9829                         changed += acked;
9830                         sack_changed += acked;
9831                 }
9832                 if (moved_two) {
9833                         /*
9834                          * If we did not get a SACK for at least a MSS and
9835                          * had to move at all, or if we moved more than our
9836                          * threshold, it counts against the "extra" move.
9837                          */
9838                         rack->r_ctl.sack_moved_extra += moved_two;
9839                         counter_u64_add(rack_move_some, 1);
9840                 } else {
9841                         /*
9842                          * else we did not have to move
9843                          * any more than we would expect.
9844                          */
9845                         rack->r_ctl.sack_noextra_move++;
9846                         counter_u64_add(rack_move_none, 1);
9847                 }
9848                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
9849                         /*
9850                          * If the SACK was not a full MSS then
9851                          * we add to sack_count the number of
9852                          * MSS's (or possibly more than
9853                          * a MSS if its a TSO send) we had to skip by.
9854                          */
9855                         rack->r_ctl.sack_count += moved_two;
9856                         counter_u64_add(rack_sack_total, moved_two);
9857                 }
9858                 /*
9859                  * Now we need to setup for the next
9860                  * round. First we make sure we won't
9861                  * exceed the size of our uint32_t on
9862                  * the various counts, and then clear out
9863                  * moved_two.
9864                  */
9865                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
9866                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
9867                         rack->r_ctl.sack_moved_extra /= 2;
9868                         rack->r_ctl.sack_noextra_move /= 2;
9869                 }
9870                 if (rack->r_ctl.sack_count > 0xfff00000) {
9871                         rack->r_ctl.ack_count /= 2;
9872                         rack->r_ctl.sack_count /= 2;
9873                 }
9874                 moved_two = 0;
9875         }
9876 out_with_totals:
9877         if (num_sack_blks > 1) {
9878                 /*
9879                  * You get an extra stroke if
9880                  * you have more than one sack-blk, this
9881                  * could be where we are skipping forward
9882                  * and the sack-filter is still working, or
9883                  * it could be an attacker constantly
9884                  * moving us.
9885                  */
9886                 rack->r_ctl.sack_moved_extra++;
9887                 counter_u64_add(rack_move_some, 1);
9888         }
9889 out:
9890 #ifdef NETFLIX_EXP_DETECTION
9891         rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
9892 #endif
9893         if (changed) {
9894                 /* Something changed cancel the rack timer */
9895                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
9896         }
9897         tsused = tcp_get_usecs(NULL);
9898         rsm = tcp_rack_output(tp, rack, tsused);
9899         if ((!IN_FASTRECOVERY(tp->t_flags)) &&
9900             rsm) {
9901                 /* Enter recovery */
9902                 rack->r_ctl.rc_rsm_start = rsm->r_start;
9903                 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
9904                 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
9905                 entered_recovery = 1;
9906                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
9907                 /*
9908                  * When we enter recovery we need to assure we send
9909                  * one packet.
9910                  */
9911                 if (rack->rack_no_prr == 0) {
9912                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
9913                         rack_log_to_prr(rack, 8, 0);
9914                 }
9915                 rack->r_timer_override = 1;
9916                 rack->r_early = 0;
9917                 rack->r_ctl.rc_agg_early = 0;
9918         } else if (IN_FASTRECOVERY(tp->t_flags) &&
9919                    rsm &&
9920                    (rack->r_rr_config == 3)) {
9921                 /*
9922                  * Assure we can output and we get no
9923                  * remembered pace time except the retransmit.
9924                  */
9925                 rack->r_timer_override = 1;
9926                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
9927                 rack->r_ctl.rc_resend = rsm;
9928         }
9929         if (IN_FASTRECOVERY(tp->t_flags) &&
9930             (rack->rack_no_prr == 0) &&
9931             (entered_recovery == 0)) {
9932                 rack_update_prr(tp, rack, changed, th_ack);
9933                 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
9934                      ((rack->rc_inp->inp_in_hpts == 0) &&
9935                       ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
9936                         /*
9937                          * If you are pacing output you don't want
9938                          * to override.
9939                          */
9940                         rack->r_early = 0;
9941                         rack->r_ctl.rc_agg_early = 0;
9942                         rack->r_timer_override = 1;
9943                 }
9944         }
9945 }
9946
9947 static void
9948 rack_strike_dupack(struct tcp_rack *rack)
9949 {
9950         struct rack_sendmap *rsm;
9951
9952         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
9953         while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
9954                 rsm = TAILQ_NEXT(rsm, r_tnext);
9955         }
9956         if (rsm && (rsm->r_dupack < 0xff)) {
9957                 rsm->r_dupack++;
9958                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
9959                         struct timeval tv;
9960                         uint32_t cts;
9961                         /*
9962                          * Here we see if we need to retransmit. For
9963                          * a SACK type connection if enough time has passed
9964                          * we will get a return of the rsm. For a non-sack
9965                          * connection we will get the rsm returned if the
9966                          * dupack value is 3 or more.
9967                          */
9968                         cts = tcp_get_usecs(&tv);
9969                         rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
9970                         if (rack->r_ctl.rc_resend != NULL) {
9971                                 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
9972                                         rack_cong_signal(rack->rc_tp, CC_NDUPACK,
9973                                                          rack->rc_tp->snd_una);
9974                                 }
9975                                 rack->r_wanted_output = 1;
9976                                 rack->r_timer_override = 1;
9977                                 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
9978                         }
9979                 } else {
9980                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
9981                 }
9982         }
9983 }
9984
9985 static void
9986 rack_check_bottom_drag(struct tcpcb *tp,
9987                        struct tcp_rack *rack,
9988                        struct socket *so, int32_t acked)
9989 {
9990         uint32_t segsiz, minseg;
9991
9992         segsiz = ctf_fixed_maxseg(tp);
9993         minseg = segsiz;
9994
9995         if (tp->snd_max == tp->snd_una) {
9996                 /*
9997                  * We are doing dynamic pacing and we are way
9998                  * under. Basically everything got acked while
9999                  * we were still waiting on the pacer to expire.
10000                  *
10001                  * This means we need to boost the b/w in
10002                  * addition to any earlier boosting of
10003                  * the multiplier.
10004                  */
10005                 rack->rc_dragged_bottom = 1;
10006                 rack_validate_multipliers_at_or_above100(rack);
10007                 /*
10008                  * Lets use the segment bytes acked plus
10009                  * the lowest RTT seen as the basis to
10010                  * form a b/w estimate. This will be off
10011                  * due to the fact that the true estimate
10012                  * should be around 1/2 the time of the RTT
10013                  * but we can settle for that.
10014                  */
10015                 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
10016                     acked) {
10017                         uint64_t bw, calc_bw, rtt;
10018
10019                         rtt = rack->r_ctl.rack_rs.rs_us_rtt;
10020                         if (rtt == 0) {
10021                                 /* no us sample; is there a ms one? */
10022                                 if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
10023                                         rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
10024                                 } else {
10025                                         goto no_measurement;
10026                                 }
10027                         }
10028                         bw = acked;
10029                         calc_bw = bw * 1000000;
10030                         calc_bw /= rtt;
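                        /*
                         * Illustrative sketch with assumed numbers:
                         * rs_us_rtt is in microseconds, so 14600 bytes
                         * acked over a 10000 usec RTT yields
                         * 14600 * 1000000 / 10000, i.e. about 1.46 MB/s
                         * as the bytes-per-second estimate.
                         */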
10031                         if (rack->r_ctl.last_max_bw &&
10032                             (rack->r_ctl.last_max_bw < calc_bw)) {
10033                                 /*
10034                                  * If we have a last calculated max bw
10035                                  * enforce it.
10036                                  */
10037                                 calc_bw = rack->r_ctl.last_max_bw;
10038                         }
10039                         /* now plop it in */
10040                         if (rack->rc_gp_filled == 0) {
10041                                 if (calc_bw > ONE_POINT_TWO_MEG) {
10042                                         /*
10043                                          * If we have no measurement
10044                                          * don't let us set in more than
10045                                          * 1.2Mbps. If we are still too
10046                                          * low after pacing with this we
10047                                          * will hopefully have a max b/w
10048                                          * available to sanity check things.
10049                                          */
10050                                         calc_bw = ONE_POINT_TWO_MEG;
10051                                 }
10052                                 rack->r_ctl.rc_rtt_diff = 0;
10053                                 rack->r_ctl.gp_bw = calc_bw;
10054                                 rack->rc_gp_filled = 1;
10055                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10056                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
10057                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10058                         } else if (calc_bw > rack->r_ctl.gp_bw) {
10059                                 rack->r_ctl.rc_rtt_diff = 0;
10060                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10061                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
10062                                 rack->r_ctl.gp_bw = calc_bw;
10063                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10064                         } else
10065                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10066                         if ((rack->gp_ready == 0) &&
10067                             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
10068                                 /* We have enough measurements now */
10069                                 rack->gp_ready = 1;
10070                                 rack_set_cc_pacing(rack);
10071                                 if (rack->defer_options)
10072                                         rack_apply_deferred_options(rack);
10073                         }
10074                         /*
10075                          * For acks over 1 MSS we do an extra boost to simulate
10076                          * where we would get 2 acks (we want 110 for the mul).
10077                          */
10078                         if (acked > segsiz)
10079                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10080                 } else {
10081                         /*
10082                          * zero rtt possibly? Settle for just an old increase.
10083                          */
10084 no_measurement:
10085                         rack_increase_bw_mul(rack, -1, 0, 0, 1);
10086                 }
10087         } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
10088                    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
10089                                                minseg)) &&
10090                    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10091                    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10092                    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
10093                     (segsiz * rack_req_segs))) {
10094                 /*
10095                  * We are doing dynamic GP pacing and
10096                  * we have everything except 1MSS or less
10097                  * bytes left out. We are still pacing away.
10098                  * And there is data that could be sent, This
10099                  * means we are inserting delayed ack time in
10100                  * our measurements because we are pacing too slow.
10101                  */
10102                 rack_validate_multipliers_at_or_above100(rack);
10103                 rack->rc_dragged_bottom = 1;
10104                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10105         }
10106 }
10107
10108
10109
10110 static void
10111 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
10112 {
10113         /*
10114          * The fast output path is enabled and we
10115          * have moved the cumack forward. Lets see if
10116          * we can expand forward the fast path length by
10117          * that amount. What we would ideally like to
10118          * do is increase the number of bytes in the
10119          * fast path block (left_to_send) by the
10120          * acked amount. However we have to gate that
10121          * by two factors:
10122          * 1) The amount outstanding and the rwnd of the peer
10123          *    (i.e. we don't want to exceed the rwnd of the peer).
10124          *    <and>
10125          * 2) The amount of data left in the socket buffer (i.e.
10126          *    we can't send beyond what is in the buffer).
10127          *
10128          * Note that this does not take into account any increase
10129          * in the cwnd. We will only extend the fast path by
10130          * what was acked.
10131          */
10132         uint32_t new_total, gating_val;
10133
10134         new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
10135         gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
10136                          (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
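        /*
         * Illustrative sketch with assumed values: sbavail = 100000,
         * snd_wnd = 64000 and 50000 bytes already outstanding give a
         * gate of min(100000 - 50000, 64000 - 50000) = 14000 bytes, so
         * left_to_send only grows if new_total stays within that bound.
         */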
10137         if (new_total <= gating_val) {
10138                 /* We can increase left_to_send by the acked amount */
10139                 counter_u64_add(rack_extended_rfo, 1);
10140                 rack->r_ctl.fsb.left_to_send = new_total;
10141                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
10142                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
10143                          rack, rack->r_ctl.fsb.left_to_send,
10144                          sbavail(&rack->rc_inp->inp_socket->so_snd),
10145                          (tp->snd_max - tp->snd_una)));
10146
10147         }
10148 }
10149
10150 static void
10151 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
10152 {
10153         /*
10154          * Here any sendmap entry that points to the
10155          * beginning mbuf must be adjusted to the correct
10156          * offset. This must be called with:
10157          * 1) The socket buffer locked
10158          * 2) snd_una adjusted to its new position.
10159          *
10160          * Note that (2) implies rack_ack_received has also
10161          * been called.
10162          *
10163          * We grab the first mbuf in the socket buffer and
10164          * then go through the front of the sendmap, recalculating
10165          * the stored offset for any sendmap entry that has
10166          * that mbuf. We must use the sb functions to do this
10167          * since it's possible an add was done as well as
10168          * the subtraction we may have just completed. This should
10169          * not be a penalty though, since we just referenced the sb
10170          * to go in and trim off the mbufs that we freed (of course
10171          * there will be a penalty for the sendmap references though).
10172          */
10173         struct mbuf *m;
10174         struct rack_sendmap *rsm;
10175
10176         SOCKBUF_LOCK_ASSERT(sb);
10177         m = sb->sb_mb;
10178         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
10179         if ((rsm == NULL) || (m == NULL)) {
10180                 /* Nothing outstanding */
10181                 return;
10182         }
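        /*
         * Illustrative sketch: only sendmap entries that still reference
         * the first mbuf in the socket buffer are touched below. For an
         * assumed entry with r_start 3000 bytes past snd_una and a
         * leading mbuf of 4096 bytes, sbsndmbuf() would hand back that
         * same mbuf with a recomputed soff of 3000.
         */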
10183         while (rsm->m && (rsm->m == m)) {
10184                 /* one to adjust */
10185 #ifdef INVARIANTS
10186                 struct mbuf *tm;
10187                 uint32_t soff;
10188
10189                 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
10190                 if (rsm->orig_m_len != m->m_len) {
10191                         rack_adjust_orig_mlen(rsm);
10192                 }
10193                 if (rsm->soff != soff) {
10194                         /*
10195                          * This is not a fatal error, we anticipate it
10196                          * might happen (the else code), so we count it here
10197                          * so that under INVARIANTS we can see that it really
10198                          * does happen.
10199                          */
10200                         counter_u64_add(rack_adjust_map_bw, 1);
10201                 }
10202                 rsm->m = tm;
10203                 rsm->soff = soff;
10204                 if (tm)
10205                         rsm->orig_m_len = rsm->m->m_len;
10206                 else
10207                         rsm->orig_m_len = 0;
10208 #else
10209                 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
10210                 if (rsm->m)
10211                         rsm->orig_m_len = rsm->m->m_len;
10212                 else
10213                         rsm->orig_m_len = 0;
10214 #endif
10215                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
10216                               rsm);
10217                 if (rsm == NULL)
10218                         break;
10219         }
10220 }
10221
10222 /*
10223  * A return value of 1 means we do not need to call rack_process_data();
10224  * a return value of 0 means rack_process_data() can be called.
10225  * For ret_val: if it is 0 the TCP is locked, if it is non-zero
10226  * it is unlocked and probably unsafe to touch the TCB.
10227  */
10228 static int
10229 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
10230     struct tcpcb *tp, struct tcpopt *to,
10231     uint32_t tiwin, int32_t tlen,
10232     int32_t * ofia, int32_t thflags, int32_t *ret_val)
10233 {
10234         int32_t ourfinisacked = 0;
10235         int32_t nsegs, acked_amount;
10236         int32_t acked;
10237         struct mbuf *mfree;
10238         struct tcp_rack *rack;
10239         int32_t under_pacing = 0;
10240         int32_t recovery = 0;
10241
10242         rack = (struct tcp_rack *)tp->t_fb_ptr;
10243         if (SEQ_GT(th->th_ack, tp->snd_max)) {
10244                 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
10245                                       &rack->r_ctl.challenge_ack_ts,
10246                                       &rack->r_ctl.challenge_ack_cnt);
10247                 rack->r_wanted_output = 1;
10248                 return (1);
10249         }
10250         if (rack->gp_ready &&
10251             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
10252                 under_pacing = 1;
10253         }
10254         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
10255                 int in_rec, dup_ack_struck = 0;
10256
10257                 in_rec = IN_FASTRECOVERY(tp->t_flags);
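                /*
                 * If we are sitting in persist, this ACK (or its SACK
                 * blocks) shows the peer is still responding; clear the
                 * retransmit backoff and re-derive t_rxtcur below.
                 */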
10258                 if (rack->rc_in_persist) {
10259                         tp->t_rxtshift = 0;
10260                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10261                                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10262                 }
10263                 if ((th->th_ack == tp->snd_una) &&
10264                     (tiwin == tp->snd_wnd) &&
10265                     ((to->to_flags & TOF_SACK) == 0)) {
10266                         rack_strike_dupack(rack);
10267                         dup_ack_struck = 1;
10268                 }
10269                 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
10270         }
10271         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
10272                 /*
10273                  * Old ack, behind (or duplicate to) the last one rcv'd.
10274                  * Note: we mark that reordering is occurring if the ack is
10275                  * strictly less than snd_una and we have not closed our window.
10276                  */
10277                 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
10278                         counter_u64_add(rack_reorder_seen, 1);
10279                         rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
10280                 }
10281                 return (0);
10282         }
10283         /*
10284          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
10285          * something we sent.
10286          */
10287         if (tp->t_flags & TF_NEEDSYN) {
10288                 /*
10289                  * T/TCP: Connection was half-synchronized, and our SYN has
10290                  * been ACK'd (so connection is now fully synchronized).  Go
10291                  * to non-starred state, increment snd_una for ACK of SYN,
10292                  * and check if we can do window scaling.
10293                  */
10294                 tp->t_flags &= ~TF_NEEDSYN;
10295                 tp->snd_una++;
10296                 /* Do window scaling? */
10297                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
10298                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
10299                         tp->rcv_scale = tp->request_r_scale;
10300                         /* Send window already scaled. */
10301                 }
10302         }
10303         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10304         INP_WLOCK_ASSERT(tp->t_inpcb);
10305
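        /*
         * BYTES_THIS_ACK() is essentially (th->th_ack - tp->snd_una):
         * the number of previously unacknowledged bytes this ACK covers.
         */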
10306         acked = BYTES_THIS_ACK(tp, th);
10307         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
10308         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
10309         /*
10310          * If we just performed our first retransmit, and the ACK arrives
10311          * within our recovery window, then it was a mistake to do the
10312          * retransmit in the first place.  Recover our original cwnd and
10313          * ssthresh, and proceed to transmit where we left off.
10314          */
10315         if ((tp->t_flags & TF_PREVVALID) &&
10316             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
10317                 tp->t_flags &= ~TF_PREVVALID;
10318                 if (tp->t_rxtshift == 1 &&
10319                     (int)(ticks - tp->t_badrxtwin) < 0)
10320                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
10321         }
10322         if (acked) {
10323                 /* assure we are not backed off */
10324                 tp->t_rxtshift = 0;
10325                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10326                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10327                 rack->rc_tlp_in_progress = 0;
10328                 rack->r_ctl.rc_tlp_cnt_out = 0;
10329                 /*
10330                  * If it is the RXT timer we want to
10331                  * stop it, so we can restart a TLP.
10332                  */
10333                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
10334                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10335 #ifdef NETFLIX_HTTP_LOGGING
10336                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
10337 #endif
10338         }
10339         /*
10340          * If we have a timestamp reply, update smoothed round trip time. If
10341          * no timestamp is present but transmit timer is running and timed
10342          * sequence number was acked, update smoothed round trip time. Since
10343          * we now have an rtt measurement, cancel the timer backoff (cf.,
10344          * Phil Karn's retransmit alg.). Recompute the initial retransmit
10345          * timer.
10346          *
10347          * Some boxes send broken timestamp replies during the SYN+ACK
10348          * phase, ignore timestamps of 0 or we could calculate a huge RTT
10349          * and blow up the retransmit timer.
10350          */
10351         /*
10352          * If all outstanding data is acked, stop retransmit timer and
10353          * remember to restart (more output or persist). If there is more
10354          * data to be acked, restart retransmit timer, using current
10355          * (possibly backed-off) value.
10356          */
10357         if (acked == 0) {
10358                 if (ofia)
10359                         *ofia = ourfinisacked;
10360                 return (0);
10361         }
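        /*
         * A partial ACK (below snd_recover) while in recovery keeps us in
         * recovery and goes through the RACK partial-ack handling; an ACK
         * at or beyond snd_recover means recovery is complete.
         */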
10362         if (IN_RECOVERY(tp->t_flags)) {
10363                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
10364                     (SEQ_LT(th->th_ack, tp->snd_max))) {
10365                         tcp_rack_partialack(tp);
10366                 } else {
10367                         rack_post_recovery(tp, th->th_ack);
10368                         recovery = 1;
10369                 }
10370         }
10371         /*
10372          * Let the congestion control algorithm update congestion control
10373          * related information. This typically means increasing the
10374          * congestion window.
10375          */
10376         rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
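        /*
         * snd_una is about to advance; shrink snd_wnd by the acked amount
         * so the right edge (snd_una + snd_wnd) stays put until the peer
         * sends a new window update, and trim the acked data from the
         * send socket buffer.
         */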
10377         SOCKBUF_LOCK(&so->so_snd);
10378         acked_amount = min(acked, (int)sbavail(&so->so_snd));
10379         tp->snd_wnd -= acked_amount;
10380         mfree = sbcut_locked(&so->so_snd, acked_amount);
10381         if ((sbused(&so->so_snd) == 0) &&
10382             (acked > acked_amount) &&
10383             (tp->t_state >= TCPS_FIN_WAIT_1) &&
10384             (tp->t_flags & TF_SENTFIN)) {
10385                 /*
10386                  * We must be sure our fin
10387                  * was sent and acked (we can be
10388                  * in FIN_WAIT_1 without having
10389                  * sent the fin).
10390                  */
10391                 ourfinisacked = 1;
10392         }
10393         tp->snd_una = th->th_ack;
10394         if (acked_amount && sbavail(&so->so_snd))
10395                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
10396         rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
10397         /* NB: sowwakeup_locked() does an implicit unlock. */
10398         sowwakeup_locked(so);
10399         m_freem(mfree);
10400         if (SEQ_GT(tp->snd_una, tp->snd_recover))
10401                 tp->snd_recover = tp->snd_una;
10402
10403         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
10404                 tp->snd_nxt = tp->snd_una;
10405         }
10406         if (under_pacing &&
10407             (rack->use_fixed_rate == 0) &&
10408             (rack->in_probe_rtt == 0) &&
10409             rack->rc_gp_dyn_mul &&
10410             rack->rc_always_pace) {
10411                 /* Check if we are dragging bottom */
10412                 rack_check_bottom_drag(tp, rack, so, acked);
10413         }
10414         if (tp->snd_una == tp->snd_max) {
10415                 /* Nothing left outstanding */
10416                 tp->t_flags &= ~TF_PREVVALID;
10417                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
10418                 rack->r_ctl.retran_during_recovery = 0;
10419                 rack->r_ctl.dsack_byte_cnt = 0;
10420                 if (rack->r_ctl.rc_went_idle_time == 0)
10421                         rack->r_ctl.rc_went_idle_time = 1;
10422                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
10423                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
10424                         tp->t_acktime = 0;
10425                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10426                 /* Set need output so persist might get set */
10427                 rack->r_wanted_output = 1;
10428                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
10429                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
10430                     (sbavail(&so->so_snd) == 0) &&
10431                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
10432                         /*
10433                          * The socket was gone and the
10434                          * peer sent data (now or in the past), time to
10435                          * reset him.
10436                          */
10437                         *ret_val = 1;
10438                         /* tcp_close will kill the inp pre-log the Reset */
10439                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
10440                         tp = tcp_close(tp);
10441                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
10442                         return (1);
10443                 }
10444         }
10445         if (ofia)
10446                 *ofia = ourfinisacked;
10447         return (0);
10448 }
10449
10450 static void
10451 rack_collapsed_window(struct tcp_rack *rack)
10452 {
10453         /*
10454          * Now we must walk the
10455          * send map and divide the
10456          * ones left stranded. These
10457          * guys can't cause us to abort
10458          * the connection and are really
10459          * "unsent". However, if a buggy
10460          * client actually did keep some
10461          * of the data (i.e. collapsed the win,
10462          * refused to ack, then opened the
10463          * win and acked that data), we would
10464          * get into an ack war; so the simpler
10465          * method of just pretending we
10466          * did not send those segments
10467          * won't work.
10468          */
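        /*
         * Illustrative example (hypothetical numbers): with snd_una 1000
         * and snd_wnd 500 the collapse point (max_seq) is 1500.  A sendmap
         * entry covering [1400, 1600) gets split at 1500 and the upper
         * piece, along with every later entry, is flagged
         * RACK_RWND_COLLAPSED instead of being forgotten.
         */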
10469         struct rack_sendmap *rsm, *nrsm, fe, *insret;
10470         tcp_seq max_seq;
10471
10472         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
10473         memset(&fe, 0, sizeof(fe));
10474         fe.r_start = max_seq;
10475         /* Find the first seq past or at maxseq */
10476         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
10477         if (rsm == NULL) {
10478                 /* Nothing to do, strange */
10479                 rack->rc_has_collapsed = 0;
10480                 return;
10481         }
10482         /*
10483          * Now do we need to split at
10484          * the collapse point?
10485          */
10486         if (SEQ_GT(max_seq, rsm->r_start)) {
10487                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
10488                 if (nrsm == NULL) {
10489                         /* We can't get a rsm, mark all? */
10490                         nrsm = rsm;
10491                         goto no_split;
10492                 }
10493                 /* Clone it */
10494                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
10495                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
10496 #ifdef INVARIANTS
10497                 if (insret != NULL) {
10498                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
10499                               nrsm, insret, rack, rsm);
10500                 }
10501 #endif
10502                 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
10503                 if (rsm->r_in_tmap) {
10504                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10505                         nrsm->r_in_tmap = 1;
10506                 }
10507                 /*
10508                  * Set in the new RSM as the
10509                  * collapsed starting point
10510                  */
10511                 rsm = nrsm;
10512         }
10513 no_split:
10514         counter_u64_add(rack_collapsed_win, 1);
10515         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
10516                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
10517         }
10518         rack->rc_has_collapsed = 1;
10519 }
10520
10521 static void
10522 rack_un_collapse_window(struct tcp_rack *rack)
10523 {
10524         struct rack_sendmap *rsm;
10525
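        /*
         * Only the tail of the sendmap can be collapsed, so walk it from
         * the highest sequence backwards, clearing RACK_RWND_COLLAPSED,
         * and stop at the first entry that never carried the flag.
         */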
10526         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
10527                 if (rsm->r_flags & RACK_RWND_COLLAPSED)
10528                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
10529                 else
10530                         break;
10531         }
10532         rack->rc_has_collapsed = 0;
10533 }
10534
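/*
 * Decide whether the ACK for this inbound data should be delayed or sent
 * immediately.  When the rc_dack_mode heuristic is enabled, arrivals larger
 * than 500 bytes alternate between delayed and immediate ACKs by way of
 * rc_dack_toggle.
 */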
10535 static void
10536 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
10537                         int32_t tlen, int32_t tfo_syn)
10538 {
10539         if (DELAY_ACK(tp, tlen) || tfo_syn) {
10540                 if (rack->rc_dack_mode &&
10541                     (tlen > 500) &&
10542                     (rack->rc_dack_toggle == 1)) {
10543                         goto no_delayed_ack;
10544                 }
10545                 rack_timer_cancel(tp, rack,
10546                                   rack->r_ctl.rc_rcvtime, __LINE__);
10547                 tp->t_flags |= TF_DELACK;
10548         } else {
10549 no_delayed_ack:
10550                 rack->r_wanted_output = 1;
10551                 tp->t_flags |= TF_ACKNOW;
10552                 if (rack->rc_dack_mode) {
10553                         if (tp->t_flags & TF_DELACK)
10554                                 rack->rc_dack_toggle = 1;
10555                         else
10556                                 rack->rc_dack_toggle = 0;
10557                 }
10558         }
10559 }
10560
10561 static void
10562 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
10563 {
10564         /*
10565          * If fast output is in progress, let's validate that
10566          * the new window did not shrink on us to the point
10567          * where fast output should end.
10568          */
10569         if (rack->r_fast_output) {
10570                 uint32_t out;
10571
10572                 /*
10573                  * Calculate what we will send if left as is
10574                  * and compare that to our send window.
10575                  */
10576                 out = ctf_outstanding(tp);
10577                 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
10578                         /* ok we have an issue */
10579                         if (out >= tp->snd_wnd) {
10580                                 /* Turn off fast output, the window is met or collapsed */
10581                                 rack->r_fast_output = 0;
10582                         } else {
10583                                 /* we have some room left */
10584                                 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
10585                                 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
10586                                         /* If not at least 1 full segment never mind */
10587                                         rack->r_fast_output = 0;
10588                                 }
10589                         }
10590                 }
10591         }
10592 }
10593
10594
10595 /*
10596  * Return value of 1, the TCB is unlocked and most
10597  * likely gone, return value of 0, the TCP is still
10598  * locked.
10599  */
10600 static int
10601 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
10602     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
10603     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
10604 {
10605         /*
10606          * Update window information. Don't look at window if no ACK: TACs
10607          * send garbage on first SYN.
10608          */
10609         int32_t nsegs;
10610         int32_t tfo_syn;
10611         struct tcp_rack *rack;
10612
10613         rack = (struct tcp_rack *)tp->t_fb_ptr;
10614         INP_WLOCK_ASSERT(tp->t_inpcb);
10615         nsegs = max(1, m->m_pkthdr.lro_nsegs);
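        /*
         * Accept a window update only from a segment that is at least as
         * new as the last one used: a higher sequence number, the same
         * sequence with a higher ack, or the same sequence/ack advertising
         * a larger window (tracked via snd_wl1/snd_wl2).
         */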
10616         if ((thflags & TH_ACK) &&
10617             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
10618             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
10619             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
10620                 /* keep track of pure window updates */
10621                 if (tlen == 0 &&
10622                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
10623                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
10624                 tp->snd_wnd = tiwin;
10625                 rack_validate_fo_sendwin_up(tp, rack);
10626                 tp->snd_wl1 = th->th_seq;
10627                 tp->snd_wl2 = th->th_ack;
10628                 if (tp->snd_wnd > tp->max_sndwnd)
10629                         tp->max_sndwnd = tp->snd_wnd;
10630                 rack->r_wanted_output = 1;
10631         } else if (thflags & TH_ACK) {
10632                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
10633                         tp->snd_wnd = tiwin;
10634                         rack_validate_fo_sendwin_up(tp, rack);
10635                         tp->snd_wl1 = th->th_seq;
10636                         tp->snd_wl2 = th->th_ack;
10637                 }
10638         }
10639         if (tp->snd_wnd < ctf_outstanding(tp))
10640                 /* The peer collapsed the window */
10641                 rack_collapsed_window(rack);
10642         else if (rack->rc_has_collapsed)
10643                 rack_un_collapse_window(rack);
10644         /* Was persist timer active and now we have window space? */
10645         if ((rack->rc_in_persist != 0) &&
10646             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
10647                                 rack->r_ctl.rc_pace_min_segs))) {
10648                 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10649                 tp->snd_nxt = tp->snd_max;
10650                 /* Make sure we output to start the timer */
10651                 rack->r_wanted_output = 1;
10652         }
10653         /* Do we enter persists? */
10654         if ((rack->rc_in_persist == 0) &&
10655             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
10656             TCPS_HAVEESTABLISHED(tp->t_state) &&
10657             (tp->snd_max == tp->snd_una) &&
10658             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
10659             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
10660                 /*
10661                  * Here the rwnd is less than
10662                  * the pacing size, we are established,
10663                  * nothing is outstanding, and there is
10664                  * data to send. Enter persists.
10665                  */
10666                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10667         }
10668         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
10669                 m_freem(m);
10670                 return (0);
10671         }
10672         /*
10673          * We don't process the URG bit; ignore it and simply
10674          * drag the urgent pointer (rcv_up) along.
10675          */
10676         tp->rcv_up = tp->rcv_nxt;
10677         INP_WLOCK_ASSERT(tp->t_inpcb);
10678
10679         /*
10680          * Process the segment text, merging it into the TCP sequencing
10681          * queue, and arranging for acknowledgment of receipt if necessary.
10682          * This process logically involves adjusting tp->rcv_wnd as data is
10683          * presented to the user (this happens in tcp_usrreq.c, case
10684          * PRU_RCVD).  If a FIN has already been received on this connection
10685          * then we just ignore the text.
10686          */
10687         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
10688                    IS_FASTOPEN(tp->t_flags));
10689         if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
10690             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
10691                 tcp_seq save_start = th->th_seq;
10692                 tcp_seq save_rnxt  = tp->rcv_nxt;
10693                 int     save_tlen  = tlen;
10694
10695                 m_adj(m, drop_hdrlen);  /* delayed header drop */
10696                 /*
10697                  * Insert segment which includes th into TCP reassembly
10698                  * queue with control block tp.  Set thflags to whether
10699                  * reassembly now includes a segment with FIN.  This handles
10700                  * the common case inline (segment is the next to be
10701                  * received on an established connection, and the queue is
10702                  * empty), avoiding linkage into and removal from the queue
10703                  * and repetition of various conversions. Set DELACK for
10704                  * segments received in order, but ack immediately when
10705                  * segments are out of order (so fast retransmit can work).
10706                  */
10707                 if (th->th_seq == tp->rcv_nxt &&
10708                     SEGQ_EMPTY(tp) &&
10709                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
10710                     tfo_syn)) {
10711 #ifdef NETFLIX_SB_LIMITS
10712                         u_int mcnt, appended;
10713
10714                         if (so->so_rcv.sb_shlim) {
10715                                 mcnt = m_memcnt(m);
10716                                 appended = 0;
10717                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
10718                                     CFO_NOSLEEP, NULL) == false) {
10719                                         counter_u64_add(tcp_sb_shlim_fails, 1);
10720                                         m_freem(m);
10721                                         return (0);
10722                                 }
10723                         }
10724 #endif
10725                         rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
10726                         tp->rcv_nxt += tlen;
10727                         if (tlen &&
10728                             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
10729                             (tp->t_fbyte_in == 0)) {
10730                                 tp->t_fbyte_in = ticks;
10731                                 if (tp->t_fbyte_in == 0)
10732                                         tp->t_fbyte_in = 1;
10733                                 if (tp->t_fbyte_out && tp->t_fbyte_in)
10734                                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
10735                         }
10736                         thflags = th->th_flags & TH_FIN;
10737                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
10738                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
10739                         SOCKBUF_LOCK(&so->so_rcv);
10740                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10741                                 m_freem(m);
10742                         } else
10743 #ifdef NETFLIX_SB_LIMITS
10744                                 appended =
10745 #endif
10746                                         sbappendstream_locked(&so->so_rcv, m, 0);
10747
10748                         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
10749                         /* NB: sorwakeup_locked() does an implicit unlock. */
10750                         sorwakeup_locked(so);
10751 #ifdef NETFLIX_SB_LIMITS
10752                         if (so->so_rcv.sb_shlim && appended != mcnt)
10753                                 counter_fo_release(so->so_rcv.sb_shlim,
10754                                     mcnt - appended);
10755 #endif
10756                 } else {
10757                         /*
10758                          * XXX: Due to the header drop above "th" is
10759                          * theoretically invalid by now.  Fortunately
10760                          * m_adj() doesn't actually free any mbufs when
10761                          * trimming from the head.
10762                          */
10763                         tcp_seq temp = save_start;
10764
10765                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
10766                         tp->t_flags |= TF_ACKNOW;
10767                         if (tp->t_flags & TF_WAKESOR) {
10768                                 tp->t_flags &= ~TF_WAKESOR;
10769                                 /* NB: sorwakeup_locked() does an implicit unlock. */
10770                                 sorwakeup_locked(so);
10771                         }
10772                 }
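                /*
                 * Report back any duplicate (or partially duplicate) data we
                 * just consumed: the branches below pick between a plain SACK
                 * update and a D-SACK block so the peer can detect spurious
                 * retransmissions.
                 */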
10773                 if ((tp->t_flags & TF_SACK_PERMIT) &&
10774                     (save_tlen > 0) &&
10775                     TCPS_HAVEESTABLISHED(tp->t_state)) {
10776                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
10777                                 /*
10778                                  * DSACK actually handled in the fastpath
10779                                  * above.
10780                                  */
10781                                 RACK_OPTS_INC(tcp_sack_path_1);
10782                                 tcp_update_sack_list(tp, save_start,
10783                                     save_start + save_tlen);
10784                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
10785                                 if ((tp->rcv_numsacks >= 1) &&
10786                                     (tp->sackblks[0].end == save_start)) {
10787                                         /*
10788                                          * Partial overlap, recorded at todrop
10789                                          * above.
10790                                          */
10791                                         RACK_OPTS_INC(tcp_sack_path_2a);
10792                                         tcp_update_sack_list(tp,
10793                                             tp->sackblks[0].start,
10794                                             tp->sackblks[0].end);
10795                                 } else {
10796                                         RACK_OPTS_INC(tcp_sack_path_2b);
10797                                         tcp_update_dsack_list(tp, save_start,
10798                                             save_start + save_tlen);
10799                                 }
10800                         } else if (tlen >= save_tlen) {
10801                                 /* Update of sackblks. */
10802                                 RACK_OPTS_INC(tcp_sack_path_3);
10803                                 tcp_update_dsack_list(tp, save_start,
10804                                     save_start + save_tlen);
10805                         } else if (tlen > 0) {
10806                                 RACK_OPTS_INC(tcp_sack_path_4);
10807                                 tcp_update_dsack_list(tp, save_start,
10808                                     save_start + tlen);
10809                         }
10810                 }
10811         } else {
10812                 m_freem(m);
10813                 thflags &= ~TH_FIN;
10814         }
10815
10816         /*
10817          * If FIN is received ACK the FIN and let the user know that the
10818          * connection is closing.
10819          */
10820         if (thflags & TH_FIN) {
10821                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
10822                         /* The socket upcall is handled by socantrcvmore. */
10823                         socantrcvmore(so);
10824                         /*
10825                          * If connection is half-synchronized (ie NEEDSYN
10826                          * flag on) then delay ACK, so it may be piggybacked
10827                          * when SYN is sent. Otherwise, since we received a
10828                          * FIN then no more input can be expected, send ACK
10829                          * now.
10830                          */
10831                         if (tp->t_flags & TF_NEEDSYN) {
10832                                 rack_timer_cancel(tp, rack,
10833                                     rack->r_ctl.rc_rcvtime, __LINE__);
10834                                 tp->t_flags |= TF_DELACK;
10835                         } else {
10836                                 tp->t_flags |= TF_ACKNOW;
10837                         }
10838                         tp->rcv_nxt++;
10839                 }
10840                 switch (tp->t_state) {
10841                         /*
10842                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
10843                          * CLOSE_WAIT state.
10844                          */
10845                 case TCPS_SYN_RECEIVED:
10846                         tp->t_starttime = ticks;
10847                         /* FALLTHROUGH */
10848                 case TCPS_ESTABLISHED:
10849                         rack_timer_cancel(tp, rack,
10850                             rack->r_ctl.rc_rcvtime, __LINE__);
10851                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
10852                         break;
10853
10854                         /*
10855                          * If still in FIN_WAIT_1 STATE FIN has not been
10856                          * acked so enter the CLOSING state.
10857                          */
10858                 case TCPS_FIN_WAIT_1:
10859                         rack_timer_cancel(tp, rack,
10860                             rack->r_ctl.rc_rcvtime, __LINE__);
10861                         tcp_state_change(tp, TCPS_CLOSING);
10862                         break;
10863
10864                         /*
10865                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
10866                          * starting the time-wait timer, turning off the
10867                          * other standard timers.
10868                          */
10869                 case TCPS_FIN_WAIT_2:
10870                         rack_timer_cancel(tp, rack,
10871                             rack->r_ctl.rc_rcvtime, __LINE__);
10872                         tcp_twstart(tp);
10873                         return (1);
10874                 }
10875         }
10876         /*
10877          * Return any desired output.
10878          */
10879         if ((tp->t_flags & TF_ACKNOW) ||
10880             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
10881                 rack->r_wanted_output = 1;
10882         }
10883         INP_WLOCK_ASSERT(tp->t_inpcb);
10884         return (0);
10885 }
10886
10887 /*
10888  * Here nothing is really faster, it's just that we
10889  * have broken out the fast-data path as well, just like
10890  * the fast-ack path.
10891  */
10892 static int
10893 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
10894     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10895     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
10896 {
10897         int32_t nsegs;
10898         int32_t newsize = 0;    /* automatic sockbuf scaling */
10899         struct tcp_rack *rack;
10900 #ifdef NETFLIX_SB_LIMITS
10901         u_int mcnt, appended;
10902 #endif
10903 #ifdef TCPDEBUG
10904         /*
10905          * The size of tcp_saveipgen must be the size of the max ip header,
10906          * now IPv6.
10907          */
10908         u_char tcp_saveipgen[IP6_HDR_LEN];
10909         struct tcphdr tcp_savetcp;
10910         short ostate = 0;
10911
10912 #endif
10913         /*
10914          * If last ACK falls within this segment's sequence numbers, record
10915          * the timestamp. NOTE that the test is modified according to the
10916          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
10917          */
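        /*
         * The checks below keep only the simplest case on this path: the
         * segment is the next expected one, we are not retransmitting, the
         * advertised window is unchanged, no SYN/FIN processing is pending,
         * the timestamp is not old, the ACK is a pure duplicate of snd_una,
         * and the data fits in the receive buffer.
         */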
10918         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
10919                 return (0);
10920         }
10921         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
10922                 return (0);
10923         }
10924         if (tiwin && tiwin != tp->snd_wnd) {
10925                 return (0);
10926         }
10927         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
10928                 return (0);
10929         }
10930         if (__predict_false((to->to_flags & TOF_TS) &&
10931             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
10932                 return (0);
10933         }
10934         if (__predict_false((th->th_ack != tp->snd_una))) {
10935                 return (0);
10936         }
10937         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
10938                 return (0);
10939         }
10940         if ((to->to_flags & TOF_TS) != 0 &&
10941             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
10942                 tp->ts_recent_age = tcp_ts_getticks();
10943                 tp->ts_recent = to->to_tsval;
10944         }
10945         rack = (struct tcp_rack *)tp->t_fb_ptr;
10946         /*
10947          * This is a pure, in-sequence data packet with nothing on the
10948          * reassembly queue and we have enough buffer space to take it.
10949          */
10950         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10951
10952 #ifdef NETFLIX_SB_LIMITS
10953         if (so->so_rcv.sb_shlim) {
10954                 mcnt = m_memcnt(m);
10955                 appended = 0;
10956                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
10957                     CFO_NOSLEEP, NULL) == false) {
10958                         counter_u64_add(tcp_sb_shlim_fails, 1);
10959                         m_freem(m);
10960                         return (1);
10961                 }
10962         }
10963 #endif
10964         /* Clean receiver SACK report if present */
10965         if (tp->rcv_numsacks)
10966                 tcp_clean_sackreport(tp);
10967         KMOD_TCPSTAT_INC(tcps_preddat);
10968         tp->rcv_nxt += tlen;
10969         if (tlen &&
10970             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
10971             (tp->t_fbyte_in == 0)) {
10972                 tp->t_fbyte_in = ticks;
10973                 if (tp->t_fbyte_in == 0)
10974                         tp->t_fbyte_in = 1;
10975                 if (tp->t_fbyte_out && tp->t_fbyte_in)
10976                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
10977         }
10978         /*
10979          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
10980          */
10981         tp->snd_wl1 = th->th_seq;
10982         /*
10983          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
10984          */
10985         tp->rcv_up = tp->rcv_nxt;
10986         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
10987         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
10988 #ifdef TCPDEBUG
10989         if (so->so_options & SO_DEBUG)
10990                 tcp_trace(TA_INPUT, ostate, tp,
10991                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
10992 #endif
10993         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
10994
10995         /* Add data to socket buffer. */
10996         SOCKBUF_LOCK(&so->so_rcv);
10997         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10998                 m_freem(m);
10999         } else {
11000                 /*
11001                  * Set new socket buffer size. Give up when limit is
11002                  * reached.
11003                  */
11004                 if (newsize)
11005                         if (!sbreserve_locked(&so->so_rcv,
11006                             newsize, so, NULL))
11007                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
11008                 m_adj(m, drop_hdrlen);  /* delayed header drop */
11009 #ifdef NETFLIX_SB_LIMITS
11010                 appended =
11011 #endif
11012                         sbappendstream_locked(&so->so_rcv, m, 0);
11013                 ctf_calc_rwin(so, tp);
11014         }
11015         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
11016         /* NB: sorwakeup_locked() does an implicit unlock. */
11017         sorwakeup_locked(so);
11018 #ifdef NETFLIX_SB_LIMITS
11019         if (so->so_rcv.sb_shlim && mcnt != appended)
11020                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
11021 #endif
11022         rack_handle_delayed_ack(tp, rack, tlen, 0);
11023         if (tp->snd_una == tp->snd_max)
11024                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
11025         return (1);
11026 }
11027
11028 /*
11029  * This subfunction is used to try to highly optimize the
11030  * fast path. We again allow window updates that are
11031  * in sequence to remain in the fast-path. We also add
11032  * in the __predict's to attempt to help the compiler.
11033  * Note that if we return a 0, then we can *not* process
11034  * it and the caller should push the packet into the
11035  * slow-path.
11036  */
11037 static int
11038 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
11039     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11040     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
11041 {
11042         int32_t acked;
11043         int32_t nsegs;
11044 #ifdef TCPDEBUG
11045         /*
11046          * The size of tcp_saveipgen must be the size of the max ip header,
11047          * now IPv6.
11048          */
11049         u_char tcp_saveipgen[IP6_HDR_LEN];
11050         struct tcphdr tcp_savetcp;
11051         short ostate = 0;
11052 #endif
11053         int32_t under_pacing = 0;
11054         struct tcp_rack *rack;
11055
11056         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
11057                 /* Old ack, behind (or duplicate to) the last one rcv'd */
11058                 return (0);
11059         }
11060         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
11061                 /* Above what we have sent? */
11062                 return (0);
11063         }
11064         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
11065                 /* We are retransmitting */
11066                 return (0);
11067         }
11068         if (__predict_false(tiwin == 0)) {
11069                 /* zero window */
11070                 return (0);
11071         }
11072         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
11073                 /* We need a SYN or a FIN, unlikely.. */
11074                 return (0);
11075         }
11076         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
11077                 /* Timestamp is behind .. old ack with seq wrap? */
11078                 return (0);
11079         }
11080         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
11081                 /* Still recovering */
11082                 return (0);
11083         }
11084         rack = (struct tcp_rack *)tp->t_fb_ptr;
11085         if (rack->r_ctl.rc_sacked) {
11086                 /* We have sack holes on our scoreboard */
11087                 return (0);
11088         }
11089         /* Ok if we reach here, we can process a fast-ack */
11090         if (rack->gp_ready &&
11091             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11092                 under_pacing = 1;
11093         }
11094         nsegs = max(1, m->m_pkthdr.lro_nsegs);
11095         rack_log_ack(tp, to, th, 0, 0);
11096         /* Did the window get updated? */
11097         if (tiwin != tp->snd_wnd) {
11098                 tp->snd_wnd = tiwin;
11099                 rack_validate_fo_sendwin_up(tp, rack);
11100                 tp->snd_wl1 = th->th_seq;
11101                 if (tp->snd_wnd > tp->max_sndwnd)
11102                         tp->max_sndwnd = tp->snd_wnd;
11103         }
11104         /* Do we exit persists? */
11105         if ((rack->rc_in_persist != 0) &&
11106             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
11107                                rack->r_ctl.rc_pace_min_segs))) {
11108                 rack_exit_persist(tp, rack, cts);
11109         }
11110         /* Do we enter persists? */
11111         if ((rack->rc_in_persist == 0) &&
11112             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
11113             TCPS_HAVEESTABLISHED(tp->t_state) &&
11114             (tp->snd_max == tp->snd_una) &&
11115             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
11116             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
11117                 /*
11118                  * Here the rwnd is less than
11119                  * the pacing size, we are established,
11120                  * nothing is outstanding, and there is
11121                  * data to send. Enter persists.
11122                  */
11123                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
11124         }
11125         /*
11126          * If last ACK falls within this segment's sequence numbers, record
11127          * the timestamp. NOTE that the test is modified according to the
11128          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
11129          */
11130         if ((to->to_flags & TOF_TS) != 0 &&
11131             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
11132                 tp->ts_recent_age = tcp_ts_getticks();
11133                 tp->ts_recent = to->to_tsval;
11134         }
11135         /*
11136          * This is a pure ack for outstanding data.
11137          */
11138         KMOD_TCPSTAT_INC(tcps_predack);
11139
11140         /*
11141          * "bad retransmit" recovery.
11142          */
11143         if ((tp->t_flags & TF_PREVVALID) &&
11144             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
11145                 tp->t_flags &= ~TF_PREVVALID;
11146                 if (tp->t_rxtshift == 1 &&
11147                     (int)(ticks - tp->t_badrxtwin) < 0)
11148                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack);
11149         }
11150         /*
11151          * Recalculate the transmit timer / rtt.
11152          *
11153          * Some boxes send broken timestamp replies during the SYN+ACK
11154          * phase, ignore timestamps of 0 or we could calculate a huge RTT
11155          * and blow up the retransmit timer.
11156          */
11157         acked = BYTES_THIS_ACK(tp, th);
11158
11159 #ifdef TCP_HHOOK
11160         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
11161         hhook_run_tcp_est_in(tp, th, to);
11162 #endif
11163         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
11164         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
11165         if (acked) {
11166                 struct mbuf *mfree;
11167
11168                 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
11169                 SOCKBUF_LOCK(&so->so_snd);
11170                 mfree = sbcut_locked(&so->so_snd, acked);
11171                 tp->snd_una = th->th_ack;
11172                 /* Note we want to hold the sb lock through the sendmap adjust */
11173                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
11174                 /* Wake up the socket if we have room to write more */
11175                 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
11176                 sowwakeup_locked(so);
11177                 m_freem(mfree);
11178                 tp->t_rxtshift = 0;
11179                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11180                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11181                 rack->rc_tlp_in_progress = 0;
11182                 rack->r_ctl.rc_tlp_cnt_out = 0;
11183                 /*
11184                  * If it is the RXT timer we want to
11185                  * stop it, so we can restart a TLP.
11186                  */
11187                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
11188                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11189 #ifdef NETFLIX_HTTP_LOGGING
11190                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
11191 #endif
11192         }
11193         /*
11194          * The congestion control update was already done above via
11195          * rack_ack_received().  Here we check whether the peer has
11196          * collapsed the window on us (or re-opened a collapsed one).
11197          */
11198         if (tp->snd_wnd < ctf_outstanding(tp)) {
11199                 /* The peer collapsed the window */
11200                 rack_collapsed_window(rack);
11201         } else if (rack->rc_has_collapsed)
11202                 rack_un_collapse_window(rack);
11203
11204         /*
11205          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
11206          */
11207         tp->snd_wl2 = th->th_ack;
11208         tp->t_dupacks = 0;
11209         m_freem(m);
11210         /* ND6_HINT(tp);         *//* Some progress has been made. */
11211
11212         /*
11213          * If all outstanding data are acked, stop retransmit timer,
11214          * otherwise restart timer using current (possibly backed-off)
11215          * value. If process is waiting for space, wakeup/selwakeup/signal.
11216          * If data are ready to send, let tcp_output decide between more
11217          * output or persist.
11218          */
11219 #ifdef TCPDEBUG
11220         if (so->so_options & SO_DEBUG)
11221                 tcp_trace(TA_INPUT, ostate, tp,
11222                     (void *)tcp_saveipgen,
11223                     &tcp_savetcp, 0);
11224 #endif
11225         if (under_pacing &&
11226             (rack->use_fixed_rate == 0) &&
11227             (rack->in_probe_rtt == 0) &&
11228             rack->rc_gp_dyn_mul &&
11229             rack->rc_always_pace) {
11230                 /* Check if we are dragging bottom */
11231                 rack_check_bottom_drag(tp, rack, so, acked);
11232         }
11233         if (tp->snd_una == tp->snd_max) {
11234                 tp->t_flags &= ~TF_PREVVALID;
11235                 rack->r_ctl.retran_during_recovery = 0;
11236                 rack->r_ctl.dsack_byte_cnt = 0;
11237                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
11238                 if (rack->r_ctl.rc_went_idle_time == 0)
11239                         rack->r_ctl.rc_went_idle_time = 1;
11240                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
11241                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
11242                         tp->t_acktime = 0;
11243                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11244         }
11245         if (acked && rack->r_fast_output)
11246                 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
11247         if (sbavail(&so->so_snd)) {
11248                 rack->r_wanted_output = 1;
11249         }
11250         return (1);
11251 }
11252
11253 /*
11254  * Return value of 1, the TCB is unlocked and most
11255  * likely gone, return value of 0, the TCP is still
11256  * locked.
11257  */
11258 static int
11259 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
11260     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11261     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11262 {
11263         int32_t ret_val = 0;
11264         int32_t todrop;
11265         int32_t ourfinisacked = 0;
11266         struct tcp_rack *rack;
11267
11268         ctf_calc_rwin(so, tp);
11269         /*
11270          * If the state is SYN_SENT: if the segment contains an ACK, but not
11271          * for our SYN, drop the input.  If the segment contains a RST, drop
11272          * the connection.  If the segment does not contain a SYN, drop it.
11273          * Otherwise this is an acceptable SYN segment: initialize tp->rcv_nxt
11274          * and tp->irs; if the segment contains an ack then advance tp->snd_una;
11275          * if the segment contains an ECE and ECN support is enabled, the stream
11276          * is ECN capable; if our SYN has been acked change to ESTABLISHED,
11277          * else to the SYN_RCVD state; arrange for the segment to be acked
11278          * (eventually); continue processing the rest of the data/controls.
11279          */
11280         if ((thflags & TH_ACK) &&
11281             (SEQ_LEQ(th->th_ack, tp->iss) ||
11282             SEQ_GT(th->th_ack, tp->snd_max))) {
11283                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11284                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11285                 return (1);
11286         }
11287         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
11288                 TCP_PROBE5(connect__refused, NULL, tp,
11289                     mtod(m, const char *), tp, th);
11290                 tp = tcp_drop(tp, ECONNREFUSED);
11291                 ctf_do_drop(m, tp);
11292                 return (1);
11293         }
11294         if (thflags & TH_RST) {
11295                 ctf_do_drop(m, tp);
11296                 return (1);
11297         }
11298         if (!(thflags & TH_SYN)) {
11299                 ctf_do_drop(m, tp);
11300                 return (1);
11301         }
11302         tp->irs = th->th_seq;
11303         tcp_rcvseqinit(tp);
11304         rack = (struct tcp_rack *)tp->t_fb_ptr;
11305         if (thflags & TH_ACK) {
11306                 int tfo_partial = 0;
11307
11308                 KMOD_TCPSTAT_INC(tcps_connects);
11309                 soisconnected(so);
11310 #ifdef MAC
11311                 mac_socketpeer_set_from_mbuf(m, so);
11312 #endif
11313                 /* Do window scaling on this connection? */
11314                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11315                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11316                         tp->rcv_scale = tp->request_r_scale;
11317                 }
11318                 tp->rcv_adv += min(tp->rcv_wnd,
11319                     TCP_MAXWIN << tp->rcv_scale);
11320                 /*
11321                  * If not all the data that was sent in the TFO SYN
11322                  * has been acked, resend the remainder right away.
11323                  */
11324                 if (IS_FASTOPEN(tp->t_flags) &&
11325                     (tp->snd_una != tp->snd_max)) {
11326                         tp->snd_nxt = th->th_ack;
11327                         tfo_partial = 1;
11328                 }
11329                 /*
11330                  * If there's data, delay ACK; if there's also a FIN ACKNOW
11331                  * will be turned on later.
11332                  */
11333                 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
11334                         rack_timer_cancel(tp, rack,
11335                                           rack->r_ctl.rc_rcvtime, __LINE__);
11336                         tp->t_flags |= TF_DELACK;
11337                 } else {
11338                         rack->r_wanted_output = 1;
11339                         tp->t_flags |= TF_ACKNOW;
11340                         rack->rc_dack_toggle = 0;
11341                 }
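                /*
                 * The peer echoed ECE (without CWR) in its SYN|ACK and ECN
                 * is administratively enabled, so mark the connection as
                 * ECN capable.
                 */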
11342                 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
11343                     (V_tcp_do_ecn == 1)) {
11344                         tp->t_flags2 |= TF2_ECN_PERMIT;
11345                         KMOD_TCPSTAT_INC(tcps_ecn_shs);
11346                 }
11347                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
11348                         /*
11349                          * We advance snd_una for the
11350                          * fast open case. If th_ack is
11351                          * acknowledging data beyond
11352                          * snd_una we can't just call
11353                          * ack-processing since the
11354                          * data stream in our send-map
11355                          * will start at snd_una + 1 (one
11356                          * beyond the SYN). If it's just
11357                          * equal we don't need to do that
11358                          * and there is no send_map.
11359                          */
11360                         tp->snd_una++;
11361                 }
11362                 /*
11363                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
11364                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
11365                  */
11366                 tp->t_starttime = ticks;
11367                 if (tp->t_flags & TF_NEEDFIN) {
11368                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
11369                         tp->t_flags &= ~TF_NEEDFIN;
11370                         thflags &= ~TH_SYN;
11371                 } else {
11372                         tcp_state_change(tp, TCPS_ESTABLISHED);
11373                         TCP_PROBE5(connect__established, NULL, tp,
11374                             mtod(m, const char *), tp, th);
11375                         rack_cc_conn_init(tp);
11376                 }
11377         } else {
11378                 /*
11379                  * Received initial SYN in SYN-SENT[*] state => simultaneous
11380                  * open.  If segment contains CC option and there is a
11381                  * cached CC, apply TAO test. If it succeeds, connection is *
11382                  * half-synchronized. Otherwise, do 3-way handshake:
11383                  * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
11384                  * there was no CC option, clear cached CC value.
11385                  */
11386                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
11387                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
11388         }
11389         INP_WLOCK_ASSERT(tp->t_inpcb);
11390         /*
11391          * Advance th->th_seq to correspond to first data byte. If data,
11392          * trim to stay within window, dropping FIN if necessary.
11393          */
11394         th->th_seq++;
11395         if (tlen > tp->rcv_wnd) {
11396                 todrop = tlen - tp->rcv_wnd;
11397                 m_adj(m, -todrop);
11398                 tlen = tp->rcv_wnd;
11399                 thflags &= ~TH_FIN;
11400                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
11401                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
11402         }
11403         tp->snd_wl1 = th->th_seq - 1;
11404         tp->rcv_up = th->th_seq;
11405         /*
11406          * Client side of transaction: already sent SYN and data. If the
11407          * remote host used T/TCP to validate the SYN, our data will be
11408          * ACK'd; if so, enter normal data segment processing in the middle
11409          * of step 5, ack processing. Otherwise, goto step 6.
11410          */
11411         if (thflags & TH_ACK) {
11412                 /* For syn-sent we need to possibly update the rtt */
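                /*
                 * The TCP timestamp clock ticks in milliseconds, so the
                 * echo delta (mcts - tsecr) is scaled by HPTS_USEC_IN_MSEC
                 * to give a microsecond RTT sample; e.g. an echo that is
                 * 30 ticks old yields t = 30 * 1000 = 30000 usec (30 ms).
                 */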
11413                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11414                         uint32_t t, mcts;
11415
11416                         mcts = tcp_ts_getticks();
11417                         t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
11418                         if (!tp->t_rttlow || tp->t_rttlow > t)
11419                                 tp->t_rttlow = t;
11420                         rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
11421                         tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11422                         tcp_rack_xmit_timer_commit(rack, tp);
11423                 }
11424                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
11425                         return (ret_val);
11426                 /* We may have changed to FIN_WAIT_1 above */
11427                 if (tp->t_state == TCPS_FIN_WAIT_1) {
11428                         /*
11429                          * In FIN_WAIT_1 STATE in addition to the processing
11430                          * for the ESTABLISHED state if our FIN is now
11431                          * acknowledged then enter FIN_WAIT_2.
11432                          */
11433                         if (ourfinisacked) {
11434                                 /*
11435                                  * If we can't receive any more data, then
11436                                  * closing user can proceed. Starting the
11437                                  * timer is contrary to the specification,
11438                                  * but if we don't get a FIN we'll hang
11439                                  * forever.
11440                                  *
11441                                  * XXXjl: we should release the tp also, and
11442                                  * use a compressed state.
11443                                  */
11444                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11445                                         soisdisconnected(so);
11446                                         tcp_timer_activate(tp, TT_2MSL,
11447                                             (tcp_fast_finwait2_recycle ?
11448                                             tcp_finwait2_timeout :
11449                                             TP_MAXIDLE(tp)));
11450                                 }
11451                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
11452                         }
11453                 }
11454         }
11455         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11456            tiwin, thflags, nxt_pkt));
11457 }
11458
11459 /*
11460  * Return value of 1: the TCB is unlocked and most
11461  * likely gone.  Return value of 0: the TCP is still
11462  * locked.
11463  */
11464 static int
11465 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
11466     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11467     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11468 {
11469         struct tcp_rack *rack;
11470         int32_t ret_val = 0;
11471         int32_t ourfinisacked = 0;
11472
11473         ctf_calc_rwin(so, tp);
11474         if ((thflags & TH_ACK) &&
11475             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
11476             SEQ_GT(th->th_ack, tp->snd_max))) {
11477                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11478                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11479                 return (1);
11480         }
11481         rack = (struct tcp_rack *)tp->t_fb_ptr;
11482         if (IS_FASTOPEN(tp->t_flags)) {
11483                 /*
11484                  * When a TFO connection is in SYN_RECEIVED, the
11485                  * only valid packets are the initial SYN, a
11486                  * retransmit/copy of the initial SYN (possibly with
11487                  * a subset of the original data), a valid ACK, a
11488                  * FIN, or a RST.
11489                  */
11490                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
11491                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11492                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11493                         return (1);
11494                 } else if (thflags & TH_SYN) {
11495                         /* non-initial SYN is ignored */
11496                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
11497                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
11498                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
11499                                 ctf_do_drop(m, NULL);
11500                                 return (0);
11501                         }
11502                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
11503                         ctf_do_drop(m, NULL);
11504                         return (0);
11505                 }
11506         }
11507         if ((thflags & TH_RST) ||
11508             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11509                 return (ctf_process_rst(m, th, so, tp));
11510         /*
11511          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11512          * it's less than ts_recent, drop it.
11513          */
11514         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11515             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11516                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11517                         return (ret_val);
11518         }
11519         /*
11520          * In the SYN-RECEIVED state, validate that the packet belongs to
11521          * this connection before trimming the data to fit the receive
11522          * window.  Check the sequence number versus IRS since we know the
11523          * sequence numbers haven't wrapped.  This is a partial fix for the
11524          * "LAND" DoS attack.
11525          */
11526         if (SEQ_LT(th->th_seq, tp->irs)) {
11527                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11528                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11529                 return (1);
11530         }
11531         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11532                               &rack->r_ctl.challenge_ack_ts,
11533                               &rack->r_ctl.challenge_ack_cnt)) {
11534                 return (ret_val);
11535         }
11536         /*
11537          * If last ACK falls within this segment's sequence numbers, record
11538          * its timestamp. NOTE: 1) That the test incorporates suggestions
11539          * from the latest proposal of the tcplw@cray.com list (Braden
11540          * 1993/04/26). 2) That updating only on newer timestamps interferes
11541          * with our earlier PAWS tests, so this check should be solely
11542          * predicated on the sequence space of this segment. 3) That we
11543          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11544          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11545          * SEG.Len.  This modified check allows us to overcome RFC1323's
11546          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11547          * p.869. In such cases, we can still calculate the RTT correctly
11548          * when RCV.NXT == Last.ACK.Sent.
11549          */
11550         if ((to->to_flags & TOF_TS) != 0 &&
11551             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11552             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11553             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11554                 tp->ts_recent_age = tcp_ts_getticks();
11555                 tp->ts_recent = to->to_tsval;
11556         }
11557         tp->snd_wnd = tiwin;
11558         rack_validate_fo_sendwin_up(tp, rack);
11559         /*
11560          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11561          * is on (half-synchronized state), then queue data for later
11562          * processing; else drop segment and return.
11563          */
11564         if ((thflags & TH_ACK) == 0) {
11565                 if (IS_FASTOPEN(tp->t_flags)) {
11566                         rack_cc_conn_init(tp);
11567                 }
11568                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11569                     tiwin, thflags, nxt_pkt));
11570         }
11571         KMOD_TCPSTAT_INC(tcps_connects);
11572         soisconnected(so);
11573         /* Do window scaling? */
11574         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11575             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11576                 tp->rcv_scale = tp->request_r_scale;
11577         }
11578         /*
11579          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
11580          * FIN-WAIT-1
11581          */
11582         tp->t_starttime = ticks;
11583         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
11584                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
11585                 tp->t_tfo_pending = NULL;
11586         }
11587         if (tp->t_flags & TF_NEEDFIN) {
11588                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
11589                 tp->t_flags &= ~TF_NEEDFIN;
11590         } else {
11591                 tcp_state_change(tp, TCPS_ESTABLISHED);
11592                 TCP_PROBE5(accept__established, NULL, tp,
11593                     mtod(m, const char *), tp, th);
11594                 /*
11595                  * TFO connections call cc_conn_init() during SYN
11596                  * processing.  Calling it again here for such connections
11597                  * is not harmless as it would undo the snd_cwnd reduction
11598                  * that occurs when a TFO SYN|ACK is retransmitted.
11599                  */
11600                 if (!IS_FASTOPEN(tp->t_flags))
11601                         rack_cc_conn_init(tp);
11602         }
11603         /*
11604          * Account for the ACK of our SYN prior to
11605          * regular ACK processing below, except for
11606          * simultaneous SYN, which is handled later.
11607          */
11608         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
11609                 tp->snd_una++;
11610         /*
11611          * If segment contains data or FIN, will call tcp_reass() later; if
11612          * not, do so now to pass queued data to user.
11613          */
11614         if (tlen == 0 && (thflags & TH_FIN) == 0) {
11615                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
11616                     (struct mbuf *)0);
11617                 if (tp->t_flags & TF_WAKESOR) {
11618                         tp->t_flags &= ~TF_WAKESOR;
11619                         /* NB: sorwakeup_locked() does an implicit unlock. */
11620                         sorwakeup_locked(so);
11621                 }
11622         }
11623         tp->snd_wl1 = th->th_seq - 1;
11624         /* For syn-recv we need to possibly update the rtt */
11625         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11626                 uint32_t t, mcts;
11627
11628                 mcts = tcp_ts_getticks();
11629                 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
11630                 if (!tp->t_rttlow || tp->t_rttlow > t)
11631                         tp->t_rttlow = t;
11632                 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
11633                 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11634                 tcp_rack_xmit_timer_commit(rack, tp);
11635         }
11636         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
11637                 return (ret_val);
11638         }
11639         if (tp->t_state == TCPS_FIN_WAIT_1) {
11640                 /* We could have gone to FIN_WAIT_1 (or EST) above */
11641                 /*
11642                  * In FIN_WAIT_1 STATE in addition to the processing for the
11643                  * ESTABLISHED state if our FIN is now acknowledged then
11644                  * enter FIN_WAIT_2.
11645                  */
11646                 if (ourfinisacked) {
11647                         /*
11648                          * If we can't receive any more data, then closing
11649                          * user can proceed. Starting the timer is contrary
11650                          * to the specification, but if we don't get a FIN
11651                          * we'll hang forever.
11652                          *
11653                          * XXXjl: we should release the tp also, and use a
11654                          * compressed state.
11655                          */
11656                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11657                                 soisdisconnected(so);
11658                                 tcp_timer_activate(tp, TT_2MSL,
11659                                     (tcp_fast_finwait2_recycle ?
11660                                     tcp_finwait2_timeout :
11661                                     TP_MAXIDLE(tp)));
11662                         }
11663                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
11664                 }
11665         }
11666         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11667             tiwin, thflags, nxt_pkt));
11668 }
11669
11670 /*
11671  * Return value of 1: the TCB is unlocked and most
11672  * likely gone.  Return value of 0: the TCP is still
11673  * locked.
11674  */
11675 static int
11676 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
11677     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11678     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11679 {
11680         int32_t ret_val = 0;
11681         struct tcp_rack *rack;
11682
11683         /*
11684          * Header prediction: check for the two common cases of a
11685          * uni-directional data xfer.  If the packet has no control flags,
11686          * is in-sequence, the window didn't change and we're not
11687          * retransmitting, it's a candidate.  If the length is zero and the
11688          * ack moved forward, we're the sender side of the xfer.  Just free
11689          * the data acked & wake any higher level process that was blocked
11690          * waiting for space.  If the length is non-zero and the ack didn't
11691          * move, we're the receiver side.  If we're getting packets in-order
11692                          * (the reassembly queue is empty), add the data to the socket
11693          * buffer and note that we need a delayed ack. Make sure that the
11694          * hidden state-flags are also off. Since we check for
11695                          * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
11696          */
11697         rack = (struct tcp_rack *)tp->t_fb_ptr;
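        /*
         * Fast-path dispatch: a zero-length in-order segment goes to
         * rack_fastack() (we are the sending side and only the ACK moved),
         * while an in-order data segment with an empty reassembly queue
         * goes to rack_do_fastnewdata() (we are the receiving side).  A
         * non-zero return from either helper means the segment was fully
         * handled and we are done with it.
         */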
11698         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
11699             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
11700             __predict_true(SEGQ_EMPTY(tp)) &&
11701             __predict_true(th->th_seq == tp->rcv_nxt)) {
11702                 if (tlen == 0) {
11703                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
11704                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
11705                                 return (0);
11706                         }
11707                 } else {
11708                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
11709                             tiwin, nxt_pkt, iptos)) {
11710                                 return (0);
11711                         }
11712                 }
11713         }
11714         ctf_calc_rwin(so, tp);
11715
11716         if ((thflags & TH_RST) ||
11717             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11718                 return (ctf_process_rst(m, th, so, tp));
11719
11720         /*
11721          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11722          * synchronized state.
11723          */
11724         if (thflags & TH_SYN) {
11725                 ctf_challenge_ack(m, th, tp, &ret_val);
11726                 return (ret_val);
11727         }
11728         /*
11729          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11730          * it's less than ts_recent, drop it.
11731          */
11732         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11733             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11734                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11735                         return (ret_val);
11736         }
11737         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11738                               &rack->r_ctl.challenge_ack_ts,
11739                               &rack->r_ctl.challenge_ack_cnt)) {
11740                 return (ret_val);
11741         }
11742         /*
11743          * If last ACK falls within this segment's sequence numbers, record
11744          * its timestamp. NOTE: 1) That the test incorporates suggestions
11745          * from the latest proposal of the tcplw@cray.com list (Braden
11746          * 1993/04/26). 2) That updating only on newer timestamps interferes
11747          * with our earlier PAWS tests, so this check should be solely
11748          * predicated on the sequence space of this segment. 3) That we
11749          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11750          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11751          * SEG.Len.  This modified check allows us to overcome RFC1323's
11752          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11753          * p.869. In such cases, we can still calculate the RTT correctly
11754          * when RCV.NXT == Last.ACK.Sent.
11755          */
11756         if ((to->to_flags & TOF_TS) != 0 &&
11757             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11758             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11759             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11760                 tp->ts_recent_age = tcp_ts_getticks();
11761                 tp->ts_recent = to->to_tsval;
11762         }
11763         /*
11764          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11765          * is on (half-synchronized state), then queue data for later
11766          * processing; else drop segment and return.
11767          */
11768         if ((thflags & TH_ACK) == 0) {
11769                 if (tp->t_flags & TF_NEEDSYN) {
11770                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11771                             tiwin, thflags, nxt_pkt));
11772
11773                 } else if (tp->t_flags & TF_ACKNOW) {
11774                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11775                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11776                         return (ret_val);
11777                 } else {
11778                         ctf_do_drop(m, NULL);
11779                         return (0);
11780                 }
11781         }
11782         /*
11783          * Ack processing.
11784          */
11785         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11786                 return (ret_val);
11787         }
11788         if (sbavail(&so->so_snd)) {
11789                 if (ctf_progress_timeout_check(tp, true)) {
11790                         rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
11791                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
11792                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11793                         return (1);
11794                 }
11795         }
11796         /* State changes only happen in rack_process_data() */
11797         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11798             tiwin, thflags, nxt_pkt));
11799 }
11800
11801 /*
11802  * Return value of 1: the TCB is unlocked and most
11803  * likely gone.  Return value of 0: the TCP is still
11804  * locked.
11805  */
11806 static int
11807 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
11808     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11809     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11810 {
11811         int32_t ret_val = 0;
11812         struct tcp_rack *rack;
11813
11814         rack = (struct tcp_rack *)tp->t_fb_ptr;
11815         ctf_calc_rwin(so, tp);
11816         if ((thflags & TH_RST) ||
11817             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11818                 return (ctf_process_rst(m, th, so, tp));
11819         /*
11820          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11821          * synchronized state.
11822          */
11823         if (thflags & TH_SYN) {
11824                 ctf_challenge_ack(m, th, tp, &ret_val);
11825                 return (ret_val);
11826         }
11827         /*
11828          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11829          * it's less than ts_recent, drop it.
11830          */
11831         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11832             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11833                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11834                         return (ret_val);
11835         }
11836         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11837                               &rack->r_ctl.challenge_ack_ts,
11838                               &rack->r_ctl.challenge_ack_cnt)) {
11839                 return (ret_val);
11840         }
11841         /*
11842          * If last ACK falls within this segment's sequence numbers, record
11843          * its timestamp. NOTE: 1) That the test incorporates suggestions
11844          * from the latest proposal of the tcplw@cray.com list (Braden
11845          * 1993/04/26). 2) That updating only on newer timestamps interferes
11846          * with our earlier PAWS tests, so this check should be solely
11847          * predicated on the sequence space of this segment. 3) That we
11848          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11849          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11850          * SEG.Len.  This modified check allows us to overcome RFC1323's
11851          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11852          * p.869. In such cases, we can still calculate the RTT correctly
11853          * when RCV.NXT == Last.ACK.Sent.
11854          */
11855         if ((to->to_flags & TOF_TS) != 0 &&
11856             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11857             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11858             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11859                 tp->ts_recent_age = tcp_ts_getticks();
11860                 tp->ts_recent = to->to_tsval;
11861         }
11862         /*
11863          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11864          * is on (half-synchronized state), then queue data for later
11865          * processing; else drop segment and return.
11866          */
11867         if ((thflags & TH_ACK) == 0) {
11868                 if (tp->t_flags & TF_NEEDSYN) {
11869                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11870                             tiwin, thflags, nxt_pkt));
11871
11872                 } else if (tp->t_flags & TF_ACKNOW) {
11873                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11874                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11875                         return (ret_val);
11876                 } else {
11877                         ctf_do_drop(m, NULL);
11878                         return (0);
11879                 }
11880         }
11881         /*
11882          * Ack processing.
11883          */
11884         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11885                 return (ret_val);
11886         }
11887         if (sbavail(&so->so_snd)) {
11888                 if (ctf_progress_timeout_check(tp, true)) {
11889                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
11890                                                 tp, tick, PROGRESS_DROP, __LINE__);
11891                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
11892                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11893                         return (1);
11894                 }
11895         }
11896         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11897             tiwin, thflags, nxt_pkt));
11898 }
11899
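/*
 * Handle data arriving after the socket has been closed.  If
 * rc_allow_data_af_clo is not set (it is seeded from
 * rack_ignore_data_after_close in rack_init()), or nothing is left in the
 * send buffer, the connection is closed and reset and 1 is returned (the
 * TCB is gone).  Otherwise the data is ignored (rcv_nxt is advanced past
 * it), TF2_DROP_AF_DATA is set so a follow-up reset will be generated,
 * and 0 is returned with the TCB still locked.
 */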
11900 static int
11901 rack_check_data_after_close(struct mbuf *m,
11902     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
11903 {
11904         struct tcp_rack *rack;
11905
11906         rack = (struct tcp_rack *)tp->t_fb_ptr;
11907         if (rack->rc_allow_data_af_clo == 0) {
11908         close_now:
11909                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11910                 /* tcp_close will kill the inp pre-log the Reset */
11911                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
11912                 tp = tcp_close(tp);
11913                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
11914                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
11915                 return (1);
11916         }
11917         if (sbavail(&so->so_snd) == 0)
11918                 goto close_now;
11919         /* Ok we allow data that is ignored and a followup reset */
11920         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11921         tp->rcv_nxt = th->th_seq + *tlen;
11922         tp->t_flags2 |= TF2_DROP_AF_DATA;
11923         rack->r_wanted_output = 1;
11924         *tlen = 0;
11925         return (0);
11926 }
11927
11928 /*
11929  * Return value of 1: the TCB is unlocked and most
11930  * likely gone.  Return value of 0: the TCP is still
11931  * locked.
11932  */
11933 static int
11934 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
11935     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11936     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11937 {
11938         int32_t ret_val = 0;
11939         int32_t ourfinisacked = 0;
11940         struct tcp_rack *rack;
11941
11942         rack = (struct tcp_rack *)tp->t_fb_ptr;
11943         ctf_calc_rwin(so, tp);
11944
11945         if ((thflags & TH_RST) ||
11946             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11947                 return (ctf_process_rst(m, th, so, tp));
11948         /*
11949          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11950          * synchronized state.
11951          */
11952         if (thflags & TH_SYN) {
11953                 ctf_challenge_ack(m, th, tp, &ret_val);
11954                 return (ret_val);
11955         }
11956         /*
11957          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11958          * it's less than ts_recent, drop it.
11959          */
11960         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11961             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11962                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11963                         return (ret_val);
11964         }
11965         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11966                               &rack->r_ctl.challenge_ack_ts,
11967                               &rack->r_ctl.challenge_ack_cnt)) {
11968                 return (ret_val);
11969         }
11970         /*
11971          * If new data are received on a connection after the user processes
11972          * are gone, then RST the other end.
11973          */
11974         if ((so->so_state & SS_NOFDREF) && tlen) {
11975                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
11976                         return (1);
11977         }
11978         /*
11979          * If last ACK falls within this segment's sequence numbers, record
11980          * its timestamp. NOTE: 1) That the test incorporates suggestions
11981          * from the latest proposal of the tcplw@cray.com list (Braden
11982          * 1993/04/26). 2) That updating only on newer timestamps interferes
11983          * with our earlier PAWS tests, so this check should be solely
11984          * predicated on the sequence space of this segment. 3) That we
11985          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11986          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11987          * SEG.Len.  This modified check allows us to overcome RFC1323's
11988          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11989          * p.869. In such cases, we can still calculate the RTT correctly
11990          * when RCV.NXT == Last.ACK.Sent.
11991          */
11992         if ((to->to_flags & TOF_TS) != 0 &&
11993             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11994             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11995             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11996                 tp->ts_recent_age = tcp_ts_getticks();
11997                 tp->ts_recent = to->to_tsval;
11998         }
11999         /*
12000          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12001          * is on (half-synchronized state), then queue data for later
12002          * processing; else drop segment and return.
12003          */
12004         if ((thflags & TH_ACK) == 0) {
12005                 if (tp->t_flags & TF_NEEDSYN) {
12006                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12007                             tiwin, thflags, nxt_pkt));
12008                 } else if (tp->t_flags & TF_ACKNOW) {
12009                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12010                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12011                         return (ret_val);
12012                 } else {
12013                         ctf_do_drop(m, NULL);
12014                         return (0);
12015                 }
12016         }
12017         /*
12018          * Ack processing.
12019          */
12020         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12021                 return (ret_val);
12022         }
12023         if (ourfinisacked) {
12024                 /*
12025                  * If we can't receive any more data, then closing user can
12026                  * proceed. Starting the timer is contrary to the
12027                  * specification, but if we don't get a FIN we'll hang
12028                  * forever.
12029                  *
12030                  * XXXjl: we should release the tp also, and use a
12031                  * compressed state.
12032                  */
12033                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12034                         soisdisconnected(so);
12035                         tcp_timer_activate(tp, TT_2MSL,
12036                             (tcp_fast_finwait2_recycle ?
12037                             tcp_finwait2_timeout :
12038                             TP_MAXIDLE(tp)));
12039                 }
12040                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
12041         }
12042         if (sbavail(&so->so_snd)) {
12043                 if (ctf_progress_timeout_check(tp, true)) {
12044                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12045                                                 tp, tick, PROGRESS_DROP, __LINE__);
12046                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12047                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12048                         return (1);
12049                 }
12050         }
12051         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12052             tiwin, thflags, nxt_pkt));
12053 }
12054
12055 /*
12056  * Return value of 1: the TCB is unlocked and most
12057  * likely gone.  Return value of 0: the TCP is still
12058  * locked.
12059  */
12060 static int
12061 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
12062     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12063     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12064 {
12065         int32_t ret_val = 0;
12066         int32_t ourfinisacked = 0;
12067         struct tcp_rack *rack;
12068
12069         rack = (struct tcp_rack *)tp->t_fb_ptr;
12070         ctf_calc_rwin(so, tp);
12071
12072         if ((thflags & TH_RST) ||
12073             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12074                 return (ctf_process_rst(m, th, so, tp));
12075         /*
12076          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12077          * synchronized state.
12078          */
12079         if (thflags & TH_SYN) {
12080                 ctf_challenge_ack(m, th, tp, &ret_val);
12081                 return (ret_val);
12082         }
12083         /*
12084          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12085          * it's less than ts_recent, drop it.
12086          */
12087         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12088             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12089                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12090                         return (ret_val);
12091         }
12092         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12093                               &rack->r_ctl.challenge_ack_ts,
12094                               &rack->r_ctl.challenge_ack_cnt)) {
12095                 return (ret_val);
12096         }
12097         /*
12098          * If new data are received on a connection after the user processes
12099          * are gone, then RST the other end.
12100          */
12101         if ((so->so_state & SS_NOFDREF) && tlen) {
12102                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12103                         return (1);
12104         }
12105         /*
12106          * If last ACK falls within this segment's sequence numbers, record
12107          * its timestamp. NOTE: 1) That the test incorporates suggestions
12108          * from the latest proposal of the tcplw@cray.com list (Braden
12109          * 1993/04/26). 2) That updating only on newer timestamps interferes
12110          * with our earlier PAWS tests, so this check should be solely
12111          * predicated on the sequence space of this segment. 3) That we
12112          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12113          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12114          * SEG.Len.  This modified check allows us to overcome RFC1323's
12115          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12116          * p.869. In such cases, we can still calculate the RTT correctly
12117          * when RCV.NXT == Last.ACK.Sent.
12118          */
12119         if ((to->to_flags & TOF_TS) != 0 &&
12120             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12121             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12122             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12123                 tp->ts_recent_age = tcp_ts_getticks();
12124                 tp->ts_recent = to->to_tsval;
12125         }
12126         /*
12127          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12128          * is on (half-synchronized state), then queue data for later
12129          * processing; else drop segment and return.
12130          */
12131         if ((thflags & TH_ACK) == 0) {
12132                 if (tp->t_flags & TF_NEEDSYN) {
12133                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12134                             tiwin, thflags, nxt_pkt));
12135                 } else if (tp->t_flags & TF_ACKNOW) {
12136                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12137                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12138                         return (ret_val);
12139                 } else {
12140                         ctf_do_drop(m, NULL);
12141                         return (0);
12142                 }
12143         }
12144         /*
12145          * Ack processing.
12146          */
12147         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12148                 return (ret_val);
12149         }
12150         if (ourfinisacked) {
12151                 tcp_twstart(tp);
12152                 m_freem(m);
12153                 return (1);
12154         }
12155         if (sbavail(&so->so_snd)) {
12156                 if (ctf_progress_timeout_check(tp, true)) {
12157                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12158                                                 tp, tick, PROGRESS_DROP, __LINE__);
12159                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12160                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12161                         return (1);
12162                 }
12163         }
12164         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12165             tiwin, thflags, nxt_pkt));
12166 }
12167
12168 /*
12169  * Return value of 1: the TCB is unlocked and most
12170  * likely gone.  Return value of 0: the TCP is still
12171  * locked.
12172  */
12173 static int
12174 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12175     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12176     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12177 {
12178         int32_t ret_val = 0;
12179         int32_t ourfinisacked = 0;
12180         struct tcp_rack *rack;
12181
12182         rack = (struct tcp_rack *)tp->t_fb_ptr;
12183         ctf_calc_rwin(so, tp);
12184
12185         if ((thflags & TH_RST) ||
12186             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12187                 return (ctf_process_rst(m, th, so, tp));
12188         /*
12189          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12190          * synchronized state.
12191          */
12192         if (thflags & TH_SYN) {
12193                 ctf_challenge_ack(m, th, tp, &ret_val);
12194                 return (ret_val);
12195         }
12196         /*
12197          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12198          * it's less than ts_recent, drop it.
12199          */
12200         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12201             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12202                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12203                         return (ret_val);
12204         }
12205         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12206                               &rack->r_ctl.challenge_ack_ts,
12207                               &rack->r_ctl.challenge_ack_cnt)) {
12208                 return (ret_val);
12209         }
12210         /*
12211          * If new data are received on a connection after the user processes
12212          * are gone, then RST the other end.
12213          */
12214         if ((so->so_state & SS_NOFDREF) && tlen) {
12215                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12216                         return (1);
12217         }
12218         /*
12219          * If last ACK falls within this segment's sequence numbers, record
12220          * its timestamp. NOTE: 1) That the test incorporates suggestions
12221          * from the latest proposal of the tcplw@cray.com list (Braden
12222          * 1993/04/26). 2) That updating only on newer timestamps interferes
12223          * with our earlier PAWS tests, so this check should be solely
12224          * predicated on the sequence space of this segment. 3) That we
12225          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12226          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12227          * SEG.Len.  This modified check allows us to overcome RFC1323's
12228          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12229          * p.869. In such cases, we can still calculate the RTT correctly
12230          * when RCV.NXT == Last.ACK.Sent.
12231          */
12232         if ((to->to_flags & TOF_TS) != 0 &&
12233             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12234             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12235             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12236                 tp->ts_recent_age = tcp_ts_getticks();
12237                 tp->ts_recent = to->to_tsval;
12238         }
12239         /*
12240          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12241          * is on (half-synchronized state), then queue data for later
12242          * processing; else drop segment and return.
12243          */
12244         if ((thflags & TH_ACK) == 0) {
12245                 if (tp->t_flags & TF_NEEDSYN) {
12246                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12247                             tiwin, thflags, nxt_pkt));
12248                 } else if (tp->t_flags & TF_ACKNOW) {
12249                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12250                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12251                         return (ret_val);
12252                 } else {
12253                         ctf_do_drop(m, NULL);
12254                         return (0);
12255                 }
12256         }
12257         /*
12258          * case TCPS_LAST_ACK: Ack processing.
12259          */
12260         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12261                 return (ret_val);
12262         }
12263         if (ourfinisacked) {
12264                 tp = tcp_close(tp);
12265                 ctf_do_drop(m, tp);
12266                 return (1);
12267         }
12268         if (sbavail(&so->so_snd)) {
12269                 if (ctf_progress_timeout_check(tp, true)) {
12270                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12271                                                 tp, tick, PROGRESS_DROP, __LINE__);
12272                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12273                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12274                         return (1);
12275                 }
12276         }
12277         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12278             tiwin, thflags, nxt_pkt));
12279 }
12280
12281 /*
12282  * Return value of 1: the TCB is unlocked and most
12283  * likely gone.  Return value of 0: the TCP is still
12284  * locked.
12285  */
12286 static int
12287 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
12288     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12289     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12290 {
12291         int32_t ret_val = 0;
12292         int32_t ourfinisacked = 0;
12293         struct tcp_rack *rack;
12294
12295         rack = (struct tcp_rack *)tp->t_fb_ptr;
12296         ctf_calc_rwin(so, tp);
12297
12298         /* Reset receive buffer auto scaling when not in bulk receive mode. */
12299         if ((thflags & TH_RST) ||
12300             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12301                 return (ctf_process_rst(m, th, so, tp));
12302         /*
12303          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12304          * synchronized state.
12305          */
12306         if (thflags & TH_SYN) {
12307                 ctf_challenge_ack(m, th, tp, &ret_val);
12308                 return (ret_val);
12309         }
12310         /*
12311          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12312          * it's less than ts_recent, drop it.
12313          */
12314         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12315             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12316                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12317                         return (ret_val);
12318         }
12319         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12320                               &rack->r_ctl.challenge_ack_ts,
12321                               &rack->r_ctl.challenge_ack_cnt)) {
12322                 return (ret_val);
12323         }
12324         /*
12325          * If new data are received on a connection after the user processes
12326          * are gone, then RST the other end.
12327          */
12328         if ((so->so_state & SS_NOFDREF) &&
12329             tlen) {
12330                 if (rack_check_data_after_close(m, tp, &tlen, th, so))
12331                         return (1);
12332         }
12333         /*
12334          * If last ACK falls within this segment's sequence numbers, record
12335          * its timestamp. NOTE: 1) That the test incorporates suggestions
12336          * from the latest proposal of the tcplw@cray.com list (Braden
12337          * 1993/04/26). 2) That updating only on newer timestamps interferes
12338          * with our earlier PAWS tests, so this check should be solely
12339          * predicated on the sequence space of this segment. 3) That we
12340          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12341          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12342          * SEG.Len.  This modified check allows us to overcome RFC1323's
12343          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12344          * p.869. In such cases, we can still calculate the RTT correctly
12345          * when RCV.NXT == Last.ACK.Sent.
12346          */
12347         if ((to->to_flags & TOF_TS) != 0 &&
12348             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12349             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12350             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12351                 tp->ts_recent_age = tcp_ts_getticks();
12352                 tp->ts_recent = to->to_tsval;
12353         }
12354         /*
12355          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12356          * is on (half-synchronized state), then queue data for later
12357          * processing; else drop segment and return.
12358          */
12359         if ((thflags & TH_ACK) == 0) {
12360                 if (tp->t_flags & TF_NEEDSYN) {
12361                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12362                             tiwin, thflags, nxt_pkt));
12363                 } else if (tp->t_flags & TF_ACKNOW) {
12364                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12365                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12366                         return (ret_val);
12367                 } else {
12368                         ctf_do_drop(m, NULL);
12369                         return (0);
12370                 }
12371         }
12372         /*
12373          * Ack processing.
12374          */
12375         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12376                 return (ret_val);
12377         }
12378         if (sbavail(&so->so_snd)) {
12379                 if (ctf_progress_timeout_check(tp, true)) {
12380                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12381                                                 tp, tick, PROGRESS_DROP, __LINE__);
12382                         tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
12383                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12384                         return (1);
12385                 }
12386         }
12387         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12388             tiwin, thflags, nxt_pkt));
12389 }
12390
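/*
 * Reset the current RACK RTT measurement state so a fresh set of samples
 * can be collected: mark the sample empty and zero the count and total.
 */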
12391 static void inline
12392 rack_clear_rate_sample(struct tcp_rack *rack)
12393 {
12394         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
12395         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
12396         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
12397 }
12398
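/*
 * Recompute the pacing segment bounds.  The minimum is always the fixed
 * maxseg for the connection.  The maximum depends on the pacing mode: a
 * forced segment count uses the user-set maximum; a fixed rate uses the
 * user-set maximum unless we are pacing right at the hardware rate, in
 * which case the hardware burst size is used; always-pace derives it from
 * the estimated bandwidth (or the fill_override rate) via
 * rack_get_pacing_len(), falling back to an initial-window sized burst
 * when no rate information exists yet.  The result is capped at
 * PACE_MAX_IP_BYTES and any change is logged.
 */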
12399 static void
12400 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
12401 {
12402         uint64_t bw_est, rate_wanted;
12403         int chged = 0;
12404         uint32_t user_max, orig_min, orig_max;
12405
12406         orig_min = rack->r_ctl.rc_pace_min_segs;
12407         orig_max = rack->r_ctl.rc_pace_max_segs;
12408         user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
12409         if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
12410                 chged = 1;
12411         rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
12412         if (rack->use_fixed_rate || rack->rc_force_max_seg) {
12413                 if (user_max != rack->r_ctl.rc_pace_max_segs)
12414                         chged = 1;
12415         }
12416         if (rack->rc_force_max_seg) {
12417                 rack->r_ctl.rc_pace_max_segs = user_max;
12418         } else if (rack->use_fixed_rate) {
12419                 bw_est = rack_get_bw(rack);
12420                 if ((rack->r_ctl.crte == NULL) ||
12421                     (bw_est != rack->r_ctl.crte->rate)) {
12422                         rack->r_ctl.rc_pace_max_segs = user_max;
12423                 } else {
12424                         /* We are pacing right at the hardware rate */
12425                         uint32_t segsiz;
12426
12427                         segsiz = min(ctf_fixed_maxseg(tp),
12428                                      rack->r_ctl.rc_pace_min_segs);
12429                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
12430                                                            tp, bw_est, segsiz, 0,
12431                                                            rack->r_ctl.crte, NULL);
12432                 }
12433         } else if (rack->rc_always_pace) {
12434                 if (rack->r_ctl.gp_bw ||
12435 #ifdef NETFLIX_PEAKRATE
12436                     rack->rc_tp->t_maxpeakrate ||
12437 #endif
12438                     rack->r_ctl.init_rate) {
12439                         /* We have a rate of some sort set */
12440                         uint32_t  orig;
12441
12442                         bw_est = rack_get_bw(rack);
12443                         orig = rack->r_ctl.rc_pace_max_segs;
12444                         if (fill_override)
12445                                 rate_wanted = *fill_override;
12446                         else
12447                                 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
12448                         if (rate_wanted) {
12449                                 /* We have something */
12450                                 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
12451                                                                                    rate_wanted,
12452                                                                                    ctf_fixed_maxseg(rack->rc_tp));
12453                         } else
12454                                 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
12455                         if (orig != rack->r_ctl.rc_pace_max_segs)
12456                                 chged = 1;
12457                 } else if ((rack->r_ctl.gp_bw == 0) &&
12458                            (rack->r_ctl.rc_pace_max_segs == 0)) {
12459                         /*
12460                          * If we have nothing, limit us to bursting
12461                          * out IW-sized pieces.
12462                          */
12463                         chged = 1;
12464                         rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
12465                 }
12466         }
12467         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
12468                 chged = 1;
12469                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
12470         }
12471         if (chged)
12472                 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
12473 }
12474
12475
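/*
 * Pre-build the fast send block (fsb) header template: an IPv6 or IPv4
 * header, then a UDP header when the connection is UDP-tunneled
 * (tp->t_port is set), then the TCP header.  tcpip_fillheaders() fills in
 * the static header fields so the output fast path can copy this template
 * instead of rebuilding the headers for every segment.  The template
 * cannot be used when IP options are present.
 */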
12476 static void
12477 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
12478 {
12479 #ifdef INET6
12480         struct ip6_hdr *ip6 = NULL;
12481 #endif
12482 #ifdef INET
12483         struct ip *ip = NULL;
12484 #endif
12485         struct udphdr *udp = NULL;
12486
12487         /* Ok, let's fill in the fast block; it can only be used with no IP options! */
12488 #ifdef INET6
12489         if (rack->r_is_v6) {
12490                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
12491                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
12492                 if (tp->t_port) {
12493                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12494                         udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
12495                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12496                         udp->uh_dport = tp->t_port;
12497                         rack->r_ctl.fsb.udp = udp;
12498                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12499                 } else {
12501                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
12502                         rack->r_ctl.fsb.udp = NULL;
12503                 }
12504                 tcpip_fillheaders(rack->rc_inp,
12505                                   tp->t_port,
12506                                   ip6, rack->r_ctl.fsb.th);
12507         } else
12508 #endif                          /* INET6 */
12509         {
12510                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
12511                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
12512                 if (tp->t_port) {
12513                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12514                         udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
12515                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12516                         udp->uh_dport = tp->t_port;
12517                         rack->r_ctl.fsb.udp = udp;
12518                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12519                 } else {
12521                         rack->r_ctl.fsb.udp = NULL;
12522                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
12523                 }
12524                 tcpip_fillheaders(rack->rc_inp,
12525                                   tp->t_port,
12526                                   ip, rack->r_ctl.fsb.th);
12527         }
12528         rack->r_fsb_inited = 1;
12529 }
12530
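/*
 * Allocate the buffer that backs the fast send block header template,
 * sized for the largest possible header combination.  Returns ENOMEM on
 * allocation failure; the template itself is filled in later by
 * rack_init_fsb_block().
 */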
12531 static int
12532 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
12533 {
12534         /*
12535          * Allocate the larger of the two header spaces: V6 if available,
12536          * else just V4, and include a udphdr (overbook).
12537          */
12538 #ifdef INET6
12539         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
12540 #else
12541         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
12542 #endif
12543         rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
12544                                             M_TCPFSB, M_NOWAIT|M_ZERO);
12545         if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
12546                 return (ENOMEM);
12547         }
12548         rack->r_fsb_inited = 0;
12549         return (0);
12550 }
12551
12552 static int
12553 rack_init(struct tcpcb *tp)
12554 {
12555         struct tcp_rack *rack = NULL;
12556         struct rack_sendmap *insret;
12557         uint32_t iwin, snt, us_cts;
12558         int err;
12559
12560         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
12561         if (tp->t_fb_ptr == NULL) {
12562                 /*
12563                  * We need to allocate memory but can't. The INP and
12564                  * INP_INFO locks are held and they are recursive
12565                  * (this happens during setup), so a scheme to drop
12566                  * the locks fails :(
12567                  */
12568                 return (ENOMEM);
12569         }
12570         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
12571
12572         rack = (struct tcp_rack *)tp->t_fb_ptr;
12573         RB_INIT(&rack->r_ctl.rc_mtree);
12574         TAILQ_INIT(&rack->r_ctl.rc_free);
12575         TAILQ_INIT(&rack->r_ctl.rc_tmap);
12576         rack->rc_tp = tp;
12577         rack->rc_inp = tp->t_inpcb;
12578         /* Set the flag */
12579         rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
12580         /* Probably not needed but let's be sure */
12581         rack_clear_rate_sample(rack);
12582         /*
12583          * Save off the default values; socket options will poke
12584          * at these if pacing is not on or we have not yet
12585          * reached the point where pacing is on (gp_ready/fixed enabled).
12586          * When they get set into the CC module (when gp_ready
12587          * is enabled or we enable fixed) then we will set these
12588          * values into the CC and place the old values in here
12589          * so we can restore them later. Then we will set the flag
12590          * rc_pacing_cc_set. That way whenever we turn off pacing
12591          * or switch off this stack, we will know to go restore
12592          * the saved values.
12593          */
12594         rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
12595         rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
12596         /* We want abe like behavior as well */
12597         rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
12598         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
12599         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
12600         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
12601         rack->r_ctl.roundends = tp->snd_max;
12602         if (use_rack_rr)
12603                 rack->use_rack_rr = 1;
12604         if (V_tcp_delack_enabled)
12605                 tp->t_delayed_ack = 1;
12606         else
12607                 tp->t_delayed_ack = 0;
12608 #ifdef TCP_ACCOUNTING
12609         if (rack_tcp_accounting) {
12610                 tp->t_flags2 |= TF2_TCP_ACCOUNTING;
12611         }
12612 #endif
12613         if (rack_enable_shared_cwnd)
12614                 rack->rack_enable_scwnd = 1;
12615         rack->rc_user_set_max_segs = rack_hptsi_segments;
12616         rack->rc_force_max_seg = 0;
12617         if (rack_use_imac_dack)
12618                 rack->rc_dack_mode = 1;
12619         TAILQ_INIT(&rack->r_ctl.opt_list);
12620         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
12621         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
12622         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
12623         rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
12624         rack->r_ctl.rc_highest_us_rtt = 0;
12625         rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
12626         rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
12627         if (rack_use_cmp_acks)
12628                 rack->r_use_cmp_ack = 1;
12629         if (rack_disable_prr)
12630                 rack->rack_no_prr = 1;
12631         if (rack_gp_no_rec_chg)
12632                 rack->rc_gp_no_rec_chg = 1;
12633         if (rack_pace_every_seg && tcp_can_enable_pacing()) {
12634                 rack->rc_always_pace = 1;
12635                 if (rack->use_fixed_rate || rack->gp_ready)
12636                         rack_set_cc_pacing(rack);
12637         } else
12638                 rack->rc_always_pace = 0;
12639         if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
12640                 rack->r_mbuf_queue = 1;
12641         else
12642                 rack->r_mbuf_queue = 0;
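              /*
               * Whether the LRO/IP input path may queue mbufs to this
               * connection (for deferred processing by hpts) depends on the
               * features chosen above: pacing and compressed acks both rely
               * on mbuf queueing.
               */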
12643         if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
12644                 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
12645         else
12646                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
12647         rack_set_pace_segments(tp, rack, __LINE__, NULL);
12648         if (rack_limits_scwnd)
12649                 rack->r_limit_scw = 1;
12650         else
12651                 rack->r_limit_scw = 0;
12652         rack->rc_labc = V_tcp_abc_l_var;
12653         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
12654         rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
12655         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
12656         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
12657         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
12658         rack->r_ctl.rc_min_to = rack_min_to;
12659         microuptime(&rack->r_ctl.act_rcv_time);
12660         rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
12661         rack->r_running_late = 0;
12662         rack->r_running_early = 0;
12663         rack->rc_init_win = rack_default_init_window;
12664         rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
12665         if (rack_hw_up_only)
12666                 rack->r_up_only = 1;
12667         if (rack_do_dyn_mul) {
12668                 /* When dynamic adjustment is on CA needs to start at 100% */
12669                 rack->rc_gp_dyn_mul = 1;
12670                 if (rack_do_dyn_mul >= 100)
12671                         rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
12672         } else
12673                 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
12674         rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
12675         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
12676         rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
12677         setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
12678                                 rack_probertt_filter_life);
12679         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
12680         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
12681         rack->r_ctl.rc_time_of_last_probertt = us_cts;
12682         rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
12683         rack->r_ctl.rc_time_probertt_starts = 0;
12684         if (rack_dsack_std_based & 0x1) {
12685                 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
12686                 rack->rc_rack_tmr_std_based = 1;
12687         }
12688         if (rack_dsack_std_based & 0x2) {
12689                 /* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
12690                 rack->rc_rack_use_dsack = 1;
12691         }
12692         /* We require at least one measurement, even if the sysctl is 0 */
12693         if (rack_req_measurements)
12694                 rack->r_ctl.req_measurements = rack_req_measurements;
12695         else
12696                 rack->r_ctl.req_measurements = 1;
12697         if (rack_enable_hw_pacing)
12698                 rack->rack_hdw_pace_ena = 1;
12699         if (rack_hw_rate_caps)
12700                 rack->r_rack_hw_rate_caps = 1;
12701         /* Do we force on detection? */
12702 #ifdef NETFLIX_EXP_DETECTION
12703         if (tcp_force_detection)
12704                 rack->do_detection = 1;
12705         else
12706 #endif
12707                 rack->do_detection = 0;
12708         if (rack_non_rxt_use_cr)
12709                 rack->rack_rec_nonrxt_use_cr = 1;
12710         err = rack_init_fsb(tp, rack);
12711         if (err) {
12712                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12713                 tp->t_fb_ptr = NULL;
12714                 return (err);
12715         }
12716         if (tp->snd_una != tp->snd_max) {
12717                 /* Create a send map for the current outstanding data */
12718                 struct rack_sendmap *rsm;
12719
12720                 rsm = rack_alloc(rack);
12721                 if (rsm == NULL) {
12722                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12723                         tp->t_fb_ptr = NULL;
12724                         return (ENOMEM);
12725                 }
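                      /*
                       * This rsm covers data sent before this stack took over,
                       * so its true transmit time is unknown; mark it so it is
                       * not used for RTT samples.
                       */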
12726                 rsm->r_no_rtt_allowed = 1;
12727                 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
12728                 rsm->r_rtr_cnt = 1;
12729                 rsm->r_rtr_bytes = 0;
12730                 if (tp->t_flags & TF_SENTFIN) {
12731                         rsm->r_end = tp->snd_max - 1;
12732                         rsm->r_flags |= RACK_HAS_FIN;
12733                 } else {
12734                         rsm->r_end = tp->snd_max;
12735                 }
12736                 if (tp->snd_una == tp->iss) {
12737                         /* The data space is one beyond snd_una */
12738                         rsm->r_flags |= RACK_HAS_SYN;
12739                         rsm->r_start = tp->iss;
12740                         rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
12741                 } else
12742                         rsm->r_start = tp->snd_una;
12743                 rsm->r_dupack = 0;
12744                 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
12745                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
12746                         if (rsm->m)
12747                                 rsm->orig_m_len = rsm->m->m_len;
12748                         else
12749                                 rsm->orig_m_len = 0;
12750                 } else {
12751                         /*
12752                          * This can happen if we have a stand-alone FIN or
12753                          *  SYN.
12754                          */
12755                         rsm->m = NULL;
12756                         rsm->orig_m_len = 0;
12757                         rsm->soff = 0;
12758                 }
12759                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12760 #ifdef INVARIANTS
12761                 if (insret != NULL) {
12762                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
12763                               insret, rack, rsm);
12764                 }
12765 #endif
12766                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
12767                 rsm->r_in_tmap = 1;
12768         }
12769         /*
12770          * Timers in Rack are kept in microseconds so let's
12771          * convert any initial incoming variables
12772          * from ticks into usecs. Note that we
12773          * also change the values of t_srtt and t_rttvar, if
12774          * they are non-zero. They are kept with a 5
12775          * bit decimal so we have to carefully convert
12776          * these to get the full precision.
12777          */
12778         rack_convert_rtts(tp);
12779         tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
12780         if (rack_do_hystart) {
12781                 struct sockopt sopt;
12782                 struct cc_newreno_opts opt;
12783
12784                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
12785                 sopt.sopt_dir = SOPT_SET;
12786                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
12787                 opt.val = rack_do_hystart;
12788                 if (CC_ALGO(tp)->ctl_output != NULL)
12789                         (void)CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
12790         }
12791         if (rack_def_profile)
12792                 rack_set_profile(rack, rack_def_profile);
12793         /* Cancel the GP measurement in progress */
12794         tp->t_flags &= ~TF_GPUTINPROG;
12795         if (SEQ_GT(tp->snd_max, tp->iss))
12796                 snt = tp->snd_max - tp->iss;
12797         else
12798                 snt = 0;
12799         iwin = rc_init_window(rack);
12800         if (snt < iwin) {
12801                 /* We are not past the initial window
12802                  * so we need to make sure cwnd is
12803                  * correct.
12804                  */
12805                 if (tp->snd_cwnd < iwin)
12806                         tp->snd_cwnd = iwin;
12807                 /*
12808                  * If we are within the initial window
12809                  * we want ssthresh to be unlimited. Setting
12810                  * it to the rwnd (which the default stack does
12811                  * and older racks) is not really a good idea
12812                  * since we want to be in SS and grow both the
12813                  * cwnd and the rwnd (via dynamic rwnd growth). If
12814                  * we set it to the rwnd then as the peer grows its
12815                  * rwnd we will be stuck in CA and never hit SS.
12816                  *
12817                  * It's far better to raise it up high (this takes the
12818                  * risk that there has been a loss already, probably
12819                  * we should have an indicator of loss in all stacks
12820                  * but we don't), but considering the normal use this
12821                  * is a risk worth taking. The consequences of not
12822                  * hitting SS are far worse than going one more time
12823                  * into it early on (before we have sent even an IW).
12824                  * It is highly unlikely that we will have had a loss
12825                  * before getting the IW out.
12826                  */
12827                 tp->snd_ssthresh = 0xffffffff;
12828         }
12829         rack_stop_all_timers(tp);
12830         /* Start the hpts timer */
12831         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
12832         rack_log_rtt_shrinks(rack,  us_cts,  tp->t_rxtcur,
12833                              __LINE__, RACK_RTTS_INIT);
12834         return (0);
12835 }
12836
12837 static int
12838 rack_handoff_ok(struct tcpcb *tp)
12839 {
12840         if ((tp->t_state == TCPS_CLOSED) ||
12841             (tp->t_state == TCPS_LISTEN)) {
12842                 /* Sure no problem though it may not stick */
12843                 return (0);
12844         }
12845         if ((tp->t_state == TCPS_SYN_SENT) ||
12846             (tp->t_state == TCPS_SYN_RECEIVED)) {
12847                 /*
12848                  * We really don't know if you support sack,
12849                  * you have to get to ESTAB or beyond to tell.
12850                  */
12851                 return (EAGAIN);
12852         }
12853         if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
12854                 /*
12855                  * Rack will only send a FIN after all data is acknowledged.
12856                  * So in this case we have more data outstanding. We can't
12857                  * switch stacks until either all data and only the FIN
12858                  * is left (in which case rack_init() now knows how
12859                  * to deal with that) <or> all is acknowledged and we
12860                  * are only left with incoming data, though why you
12861                  * would want to switch to rack after all data is acknowledged
12862                  * I have no idea (rrs)!
12863                  */
12864                 return (EAGAIN);
12865         }
12866         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
12867                 return (0);
12868         }
12869         /*
12870          * If we reach here we don't do SACK on this connection so we can
12871          * never do rack.
12872          */
12873         return (EINVAL);
12874 }
12875
12876
12877 static void
12878 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
12879 {
12880         int ack_cmp = 0;
12881
12882         if (tp->t_fb_ptr) {
12883                 struct tcp_rack *rack;
12884                 struct rack_sendmap *rsm, *nrsm, *rm;
12885
12886                 rack = (struct tcp_rack *)tp->t_fb_ptr;
12887                 if (tp->t_in_pkt) {
12888                         /*
12889                          * It is unsafe to process the packets since a
12890                          * reset may be lurking in them (it's rare but it
12891                          * can occur). If we were to find a RST, then we
12892                          * would end up dropping the connection and the
12893                          * INP lock, so when we return the caller (tcp_usrreq)
12894                          * will blow up when it tries to unlock the inp.
12895                          */
12896                         struct mbuf *save, *m;
12897
12898                         m = tp->t_in_pkt;
12899                         tp->t_in_pkt = NULL;
12900                         tp->t_tail_pkt = NULL;
12901                         while (m) {
12902                                 save = m->m_nextpkt;
12903                                 m->m_nextpkt = NULL;
12904                                 m_freem(m);
12905                                 m = save;
12906                         }
12907                         if ((tp->t_inpcb) &&
12908                             (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP))
12909                                 ack_cmp = 1;
12910                         if (ack_cmp) {
12911                                 /* Count whether we used large or small mbufs (since ack-cmp was used). */
12912                                 if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS)
12913                                         counter_u64_add(rack_large_ackcmp, 1);
12914                                 else
12915                                         counter_u64_add(rack_small_ackcmp, 1);
12916                         }
12917                 }
12918                 tp->t_flags &= ~TF_FORCEDATA;   /* rack does not use force data but other stacks may clear it */
12919 #ifdef NETFLIX_SHARED_CWND
12920                 if (rack->r_ctl.rc_scw) {
12921                         uint32_t limit;
12922
12923                         if (rack->r_limit_scw)
12924                                 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
12925                         else
12926                                 limit = 0;
12927                         tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
12928                                                   rack->r_ctl.rc_scw_index,
12929                                                   limit);
12930                         rack->r_ctl.rc_scw = NULL;
12931                 }
12932 #endif
12933                 if (rack->r_ctl.fsb.tcp_ip_hdr) {
12934                         free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
12935                         rack->r_ctl.fsb.tcp_ip_hdr = NULL;
12936                         rack->r_ctl.fsb.th = NULL;
12937                 }
12938                 /* Convert back to ticks, preserving the fractional part */
12939                 if (tp->t_srtt > 1) {
12940                         uint32_t val, frac;
12941
12942                         val = USEC_2_TICKS(tp->t_srtt);
12943                         frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
12944                         tp->t_srtt = val << TCP_RTT_SHIFT;
12945                         /*
12946                          * frac is the fractional part that is left
12947                          * over from converting to hz and shifting.
12948                          * We need to convert this to the 5 bit
12949                          * remainder.
12950                          */
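                              /*
                               * Worked example (assuming hz = 1000): a t_srtt of
                               * 30500 usec gives val = 30 ticks and frac = 500 usec,
                               * so frac becomes 500 * 32 / 1000 = 16 and t_srtt ends
                               * up as (30 << 5) + 16 = 976, i.e. 30.5 ticks in the
                               * 5 bit fixed point representation.
                               */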
12951                         if (frac) {
12952                                 if (hz == 1000) {
12953                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
12954                                 } else {
12955                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
12956                                 }
12957                                 tp->t_srtt += frac;
12958                         }
12959                 }
12960                 if (tp->t_rttvar) {
12961                         uint32_t val, frac;
12962
12963                         val = USEC_2_TICKS(tp->t_rttvar);
12964                         frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
12965                         tp->t_rttvar = val <<  TCP_RTTVAR_SHIFT;
12966                         /*
12967                          * frac is the fractional part that is left
12968                          * over from converting to hz and shifting.
12969                          * We need to convert this to the 5 bit
12970                          * remainder.
12971                          */
12972                         if (frac) {
12973                                 if (hz == 1000) {
12974                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
12975                                 } else {
12976                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
12977                                 }
12978                                 tp->t_rttvar += frac;
12979                         }
12980                 }
12981                 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
12982                 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
12983                 if (rack->rc_always_pace) {
12984                         tcp_decrement_paced_conn();
12985                         rack_undo_cc_pacing(rack);
12986                         rack->rc_always_pace = 0;
12987                 }
12988                 /* Clean up any options if they were not applied */
12989                 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
12990                         struct deferred_opt_list *dol;
12991
12992                         dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
12993                         TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
12994                         free(dol, M_TCPDO);
12995                 }
12996                 /* Release any hardware pacing rate we may still hold */
12997                 if (rack->r_ctl.crte != NULL) {
12998                         tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
12999                         rack->rack_hdrw_pacing = 0;
13000                         rack->r_ctl.crte = NULL;
13001                 }
13002 #ifdef TCP_BLACKBOX
13003                 tcp_log_flowend(tp);
13004 #endif
13005                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
13006                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
13007 #ifdef INVARIANTS
13008                         if (rm != rsm) {
13009                                 panic("At fini, rack:%p rsm:%p rm:%p",
13010                                       rack, rsm, rm);
13011                         }
13012 #endif
13013                         uma_zfree(rack_zone, rsm);
13014                 }
13015                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
13016                 while (rsm) {
13017                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
13018                         uma_zfree(rack_zone, rsm);
13019                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
13020                 }
13021                 rack->rc_free_cnt = 0;
13022                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
13023                 tp->t_fb_ptr = NULL;
13024         }
13025         if (tp->t_inpcb) {
13026                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
13027                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
13028                 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
13029                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP;
13030                 /* Cancel the GP measurement in progress */
13031                 tp->t_flags &= ~TF_GPUTINPROG;
13032                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS;
13033         }
13034         /* Make sure snd_nxt is correctly set */
13035         tp->snd_nxt = tp->snd_max;
13036 }
13037
13038 static void
13039 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
13040 {
13041         if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
13042                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
13043         }
13044         switch (tp->t_state) {
13045         case TCPS_SYN_SENT:
13046                 rack->r_state = TCPS_SYN_SENT;
13047                 rack->r_substate = rack_do_syn_sent;
13048                 break;
13049         case TCPS_SYN_RECEIVED:
13050                 rack->r_state = TCPS_SYN_RECEIVED;
13051                 rack->r_substate = rack_do_syn_recv;
13052                 break;
13053         case TCPS_ESTABLISHED:
13054                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13055                 rack->r_state = TCPS_ESTABLISHED;
13056                 rack->r_substate = rack_do_established;
13057                 break;
13058         case TCPS_CLOSE_WAIT:
13059                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13060                 rack->r_state = TCPS_CLOSE_WAIT;
13061                 rack->r_substate = rack_do_close_wait;
13062                 break;
13063         case TCPS_FIN_WAIT_1:
13064                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13065                 rack->r_state = TCPS_FIN_WAIT_1;
13066                 rack->r_substate = rack_do_fin_wait_1;
13067                 break;
13068         case TCPS_CLOSING:
13069                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13070                 rack->r_state = TCPS_CLOSING;
13071                 rack->r_substate = rack_do_closing;
13072                 break;
13073         case TCPS_LAST_ACK:
13074                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13075                 rack->r_state = TCPS_LAST_ACK;
13076                 rack->r_substate = rack_do_lastack;
13077                 break;
13078         case TCPS_FIN_WAIT_2:
13079                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
13080                 rack->r_state = TCPS_FIN_WAIT_2;
13081                 rack->r_substate = rack_do_fin_wait_2;
13082                 break;
13083         case TCPS_LISTEN:
13084         case TCPS_CLOSED:
13085         case TCPS_TIME_WAIT:
13086         default:
13087                 break;
13088         };
13089         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
13090                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
13091
13092 }
13093
13094 static void
13095 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
13096 {
13097         /*
13098          * We received an ack, and then did not
13099          * call send, or were bounced out because the
13100          * hpts was running. Now a timer is up as well;
13101          * is it the right timer?
13102          */
13103         struct rack_sendmap *rsm;
13104         int tmr_up;
13105
13106         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
13107         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
13108                 return;
13109         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
13110         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
13111             (tmr_up == PACE_TMR_RXT)) {
13112                 /* Should be an RXT */
13113                 return;
13114         }
13115         if (rsm == NULL) {
13116                 /* Nothing outstanding? */
13117                 if (tp->t_flags & TF_DELACK) {
13118                         if (tmr_up == PACE_TMR_DELACK)
13119                                 /* We are supposed to have delayed ack up and we do */
13120                                 return;
13121                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
13122                         /*
13123                          * If we hit ENOBUFS then we would expect the possibility
13124                          * of nothing outstanding and the RXT up (and the hptsi timer).
13125                          */
13126                         return;
13127                 } else if (((V_tcp_always_keepalive ||
13128                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
13129                             (tp->t_state <= TCPS_CLOSING)) &&
13130                            (tmr_up == PACE_TMR_KEEP) &&
13131                            (tp->snd_max == tp->snd_una)) {
13132                         /* We should have keep alive up and we do */
13133                         return;
13134                 }
13135         }
13136         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
13137                    ((tmr_up == PACE_TMR_TLP) ||
13138                     (tmr_up == PACE_TMR_RACK) ||
13139                     (tmr_up == PACE_TMR_RXT))) {
13140                 /*
13141                  * Either a Rack, TLP or RXT is fine if  we
13142                  * have outstanding data.
13143                  */
13144                 return;
13145         } else if (tmr_up == PACE_TMR_DELACK) {
13146                 /*
13147                  * If the delayed ack was going to go off
13148                  * before the rtx/tlp/rack timer were going to
13149                  * expire, then that would be the timer in control.
13150                  * Note we don't check the time here trusting the
13151                  * code is correct.
13152                  */
13153                 return;
13154         }
13155         /*
13156          * Ok the timer originally started is not what we want now.
13157          * We will force the hpts to be stopped if any, and restart
13158          * with the slot set to what was in the saved slot.
13159          */
13160         if (rack->rc_inp->inp_in_hpts) {
13161                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
13162                         uint32_t us_cts;
13163
13164                         us_cts = tcp_get_usecs(NULL);
13165                         if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
13166                                 rack->r_early = 1;
13167                                 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
13168                         }
13169                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
13170                 }
13171                 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
13172         }
13173         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13174         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
13175 }
13176
13177
13178 static void
13179 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
13180 {
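              /*
               * Apply the standard window update rules: take the advertised
               * window when the segment is newer (by snd_wl1/snd_wl2) or when
               * it grows the window, accept a shrink on the same ack, and
               * then check for a collapsed window and for entering or leaving
               * persist state.
               */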
13181         if ((SEQ_LT(tp->snd_wl1, seq) ||
13182             (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
13183             (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
13184                 /* keep track of pure window updates */
13185                 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
13186                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
13187                 tp->snd_wnd = tiwin;
13188                 rack_validate_fo_sendwin_up(tp, rack);
13189                 tp->snd_wl1 = seq;
13190                 tp->snd_wl2 = ack;
13191                 if (tp->snd_wnd > tp->max_sndwnd)
13192                         tp->max_sndwnd = tp->snd_wnd;
13193                 rack->r_wanted_output = 1;
13194         } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
13195                 tp->snd_wnd = tiwin;
13196                 rack_validate_fo_sendwin_up(tp, rack);
13197                 tp->snd_wl1 = seq;
13198                 tp->snd_wl2 = ack;
13199         } else {
13200                 /* Not a valid win update */
13201                 return;
13202         }
13203         if (tp->snd_wnd > tp->max_sndwnd)
13204                 tp->max_sndwnd = tp->snd_wnd;
13205         if (tp->snd_wnd < (tp->snd_max - high_seq)) {
13206                 /* The peer collapsed the window */
13207                 rack_collapsed_window(rack);
13208         } else if (rack->rc_has_collapsed)
13209                 rack_un_collapse_window(rack);
13210         /* Do we exit persists? */
13211         if ((rack->rc_in_persist != 0) &&
13212             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
13213                                 rack->r_ctl.rc_pace_min_segs))) {
13214                 rack_exit_persist(tp, rack, cts);
13215         }
13216         /* Do we enter persists? */
13217         if ((rack->rc_in_persist == 0) &&
13218             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
13219             TCPS_HAVEESTABLISHED(tp->t_state) &&
13220             (tp->snd_max == tp->snd_una) &&
13221             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
13222             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
13223                 /*
13224                  * Here the rwnd is less than
13225                  * the pacing size, we are established,
13226                  * nothing is outstanding, and there is
13227                  * data to send. Enter persists.
13228                  */
13229                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
13230         }
13231 }
13232
13233 static void
13234 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
13235 {
13236
13237         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13238                 union tcp_log_stackspecific log;
13239                 struct timeval ltv;
13240                 char tcp_hdr_buf[60];
13241                 struct tcphdr *th;
13242                 struct timespec ts;
13243                 uint32_t orig_snd_una;
13244                 uint8_t xx = 0;
13245
13246 #ifdef NETFLIX_HTTP_LOGGING
13247                 struct http_sendfile_track *http_req;
13248
13249                 if (SEQ_GT(ae->ack, tp->snd_una)) {
13250                         http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
13251                 } else {
13252                         http_req = tcp_http_find_req_for_seq(tp, ae->ack);
13253                 }
13254 #endif
13255                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13256                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
13257                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
13258                 if (rack->rack_no_prr == 0)
13259                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
13260                 else
13261                         log.u_bbr.flex1 = 0;
13262                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
13263                 log.u_bbr.use_lt_bw <<= 1;
13264                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
13265                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
13266                 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
13267                 log.u_bbr.pkts_out = tp->t_maxseg;
13268                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
13269                 log.u_bbr.flex7 = 1;
13270                 log.u_bbr.lost = ae->flags;
13271                 log.u_bbr.cwnd_gain = ackval;
13272                 log.u_bbr.pacing_gain = 0x2;
13273                 if (ae->flags & TSTMP_HDWR) {
13274                         /* Record the hardware timestamp if present */
13275                         log.u_bbr.flex3 = M_TSTMP;
13276                         ts.tv_sec = ae->timestamp / 1000000000;
13277                         ts.tv_nsec = ae->timestamp % 1000000000;
13278                         ltv.tv_sec = ts.tv_sec;
13279                         ltv.tv_usec = ts.tv_nsec / 1000;
13280                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
13281                 } else if (ae->flags & TSTMP_LRO) {
13282                         /* Record the LRO arrival timestamp */
13283                         log.u_bbr.flex3 = M_TSTMP_LRO;
13284                         ts.tv_sec = ae->timestamp / 1000000000;
13285                         ts.tv_nsec = ae->timestamp % 1000000000;
13286                         ltv.tv_sec = ts.tv_sec;
13287                         ltv.tv_usec = ts.tv_nsec / 1000;
13288                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
13289                 }
13290                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
13291                 /* Log the rcv time */
13292                 log.u_bbr.delRate = ae->timestamp;
13293 #ifdef NETFLIX_HTTP_LOGGING
13294                 log.u_bbr.applimited = tp->t_http_closed;
13295                 log.u_bbr.applimited <<= 8;
13296                 log.u_bbr.applimited |= tp->t_http_open;
13297                 log.u_bbr.applimited <<= 8;
13298                 log.u_bbr.applimited |= tp->t_http_req;
13299                 if (http_req) {
13300                         /* Copy out any client req info */
13301                         /* seconds */
13302                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
13303                         /* useconds */
13304                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
13305                         log.u_bbr.rttProp = http_req->timestamp;
13306                         log.u_bbr.cur_del_rate = http_req->start;
13307                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
13308                                 log.u_bbr.flex8 |= 1;
13309                         } else {
13310                                 log.u_bbr.flex8 |= 2;
13311                                 log.u_bbr.bw_inuse = http_req->end;
13312                         }
13313                         log.u_bbr.flex6 = http_req->start_seq;
13314                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
13315                                 log.u_bbr.flex8 |= 4;
13316                                 log.u_bbr.epoch = http_req->end_seq;
13317                         }
13318                 }
13319 #endif
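                      /*
                       * Build a throw-away TCP header from the ack entry so the
                       * normal TCP_LOG_EVENTP path can log this compressed ack
                       * as if it were a regular segment.
                       */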
13320                 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
13321                 th = (struct tcphdr *)tcp_hdr_buf;
13322                 th->th_seq = ae->seq;
13323                 th->th_ack = ae->ack;
13324                 th->th_win = ae->win;
13325                 /* Now fill in the ports */
13326                 th->th_sport = tp->t_inpcb->inp_fport;
13327                 th->th_dport = tp->t_inpcb->inp_lport;
13328                 th->th_flags = ae->flags & 0xff;
13329                 /* Now do we have a timestamp option? */
13330                 if (ae->flags & HAS_TSTMP) {
13331                         u_char *cp;
13332                         uint32_t val;
13333
13334                         th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
13335                         cp = (u_char *)(th + 1);
13336                         *cp = TCPOPT_NOP;
13337                         cp++;
13338                         *cp = TCPOPT_NOP;
13339                         cp++;
13340                         *cp = TCPOPT_TIMESTAMP;
13341                         cp++;
13342                         *cp = TCPOLEN_TIMESTAMP;
13343                         cp++;
13344                         val = htonl(ae->ts_value);
13345                         bcopy((char *)&val,
13346                               (char *)cp, sizeof(uint32_t));
13347                         val = htonl(ae->ts_echo);
13348                         bcopy((char *)&val,
13349                               (char *)(cp + 4), sizeof(uint32_t));
13350                 } else
13351                         th->th_off = (sizeof(struct tcphdr) >> 2);
13352
13353                 /*
13354                  * For sane logging we need to play a little trick.
13355                  * If the ack were fully processed we would have moved
13356                  * snd_una to high_seq, but since compressed acks are
13357                  * processed in two phases, at this point (logging) snd_una
13358                  * won't be advanced. So we would see multiple acks showing
13359                  * the advancement. We can prevent that by "pretending" that
13360                  * snd_una was advanced and then un-advancing it so that the
13361                  * logging code has the right value for tlb_snd_una.
13362                  */
13363                 if (tp->snd_una != high_seq) {
13364                         orig_snd_una = tp->snd_una;
13365                         tp->snd_una = high_seq;
13366                         xx = 1;
13367                 } else
13368                         xx = 0;
13369                 TCP_LOG_EVENTP(tp, th,
13370                                &tp->t_inpcb->inp_socket->so_rcv,
13371                                &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
13372                                0, &log, true, &ltv);
13373                 if (xx) {
13374                         tp->snd_una = orig_snd_una;
13375                 }
13376         }
13377
13378 }
13379
13380 static int
13381 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
13382 {
13383         /*
13384          * Handle a "special" compressed ack mbuf. Each incoming
13385          * ack has only four possible dispositions:
13386          *
13387          * A) It moves the cum-ack forward
13388          * B) It is behind the cum-ack.
13389          * C) It is a window-update ack.
13390          * D) It is a dup-ack.
13391          *
13392          * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
13393          * in the incoming mbuf. We also need to still pay attention
13394          * to nxt_pkt since there may be another packet after this
13395          * one.
13396          */
13397 #ifdef TCP_ACCOUNTING
13398         uint64_t ts_val;
13399         uint64_t rdstc;
13400 #endif
13401         int segsiz;
13402         struct timespec ts;
13403         struct tcp_rack *rack;
13404         struct tcp_ackent *ae;
13405         uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
13406         int cnt, i, did_out, ourfinisacked = 0;
13407         struct tcpopt to_holder, *to = NULL;
13408         int win_up_req = 0;
13409         int nsegs = 0;
13410         int under_pacing = 1;
13411         int recovery = 0;
13412         int idx;
13413 #ifdef TCP_ACCOUNTING
13414         sched_pin();
13415 #endif
13416         rack = (struct tcp_rack *)tp->t_fb_ptr;
13417         if (rack->gp_ready &&
13418             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
13419                 under_pacing = 0;
13420         else
13421                 under_pacing = 1;
13422
13423         if (rack->r_state != tp->t_state)
13424                 rack_set_state(tp, rack);
13425         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13426             (tp->t_flags & TF_GPUTINPROG)) {
13427                 /*
13428                  * We have a goodput in progress
13429                  * and we have entered a late state.
13430                  * Do we have enough data in the sb
13431                  * to handle the GPUT request?
13432                  */
13433                 uint32_t bytes;
13434
13435                 bytes = tp->gput_ack - tp->gput_seq;
13436                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
13437                         bytes += tp->gput_seq - tp->snd_una;
13438                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
13439                         /*
13440                          * There are not enough bytes in the socket
13441                          * buffer that have been sent to cover this
13442                          * measurement. Cancel it.
13443                          */
13444                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
13445                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
13446                                                    tp->gput_seq,
13447                                                    0, 0, 18, __LINE__, NULL, 0);
13448                         tp->t_flags &= ~TF_GPUTINPROG;
13449                 }
13450         }
13451         to = &to_holder;
13452         to->to_flags = 0;
13453         KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
13454                 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
13455         cnt = m->m_len / sizeof(struct tcp_ackent);
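              /*
               * Bucket the number of ack entries (in groups of five) for the
               * compressed-ack histogram counters below.
               */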
13456         idx = cnt / 5;
13457         if (idx >= MAX_NUM_OF_CNTS)
13458                 idx = MAX_NUM_OF_CNTS - 1;
13459         counter_u64_add(rack_proc_comp_ack[idx], 1);
13460         counter_u64_add(rack_multi_single_eq, cnt);
13461         high_seq = tp->snd_una;
13462         the_win = tp->snd_wnd;
13463         win_seq = tp->snd_wl1;
13464         win_upd_ack = tp->snd_wl2;
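              /*
               * Snapshot the current state. high_seq tracks the furthest
               * cum-ack seen while walking the entries (snd_una itself is only
               * advanced after the loop), and the window variables remember
               * the last window update we applied.
               */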
13465         cts = tcp_tv_to_usectick(tv);
13466         ms_cts = tcp_tv_to_mssectick(tv);
13467         segsiz = ctf_fixed_maxseg(tp);
13468         if ((rack->rc_gp_dyn_mul) &&
13469             (rack->use_fixed_rate == 0) &&
13470             (rack->rc_always_pace)) {
13471                 /* Check in on probertt */
13472                 rack_check_probe_rtt(rack, cts);
13473         }
13474         for (i = 0; i < cnt; i++) {
13475 #ifdef TCP_ACCOUNTING
13476                 ts_val = get_cyclecount();
13477 #endif
13478                 rack_clear_rate_sample(rack);
13479                 ae = ((mtod(m, struct tcp_ackent *)) + i);
13480                 /* Setup the window */
13481                 tiwin = ae->win << tp->snd_scale;
13482                 /* figure out the type of ack */
13483                 if (SEQ_LT(ae->ack, high_seq)) {
13484                         /* Case B*/
13485                         ae->ack_val_set = ACK_BEHIND;
13486                 } else if (SEQ_GT(ae->ack, high_seq)) {
13487                         /* Case A */
13488                         ae->ack_val_set = ACK_CUMACK;
13489                 } else if (tiwin == the_win) {
13490                         /* Case D */
13491                         ae->ack_val_set = ACK_DUPACK;
13492                 } else {
13493                         /* Case C */
13494                         ae->ack_val_set = ACK_RWND;
13495                 }
13496                 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
13497                 /* Validate timestamp */
13498                 if (ae->flags & HAS_TSTMP) {
13499                         /* Setup for a timestamp */
13500                         to->to_flags = TOF_TS;
13501                         ae->ts_echo -= tp->ts_offset;
13502                         to->to_tsecr = ae->ts_echo;
13503                         to->to_tsval = ae->ts_value;
13504                         /*
13505                          * If echoed timestamp is later than the current time, fall back to
13506                          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
13507                          * were used when this connection was established.
13508                          */
13509                         if (TSTMP_GT(ae->ts_echo, ms_cts))
13510                                 to->to_tsecr = 0;
13511                         if (tp->ts_recent &&
13512                             TSTMP_LT(ae->ts_value, tp->ts_recent)) {
13513                                 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
13514 #ifdef TCP_ACCOUNTING
13515                                         rdstc = get_cyclecount();
13516                                         if (rdstc > ts_val) {
13517                                                 counter_u64_add(tcp_proc_time[ae->ack_val_set] ,
13518                                                                 (rdstc - ts_val));
13519                                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13520                                                         tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13521                                                 }
13522                                         }
13523 #endif
13524                                         continue;
13525                                 }
13526                         }
13527                         if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
13528                             SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
13529                                 tp->ts_recent_age = tcp_ts_getticks();
13530                                 tp->ts_recent = ae->ts_value;
13531                         }
13532                 } else {
13533                         /* Setup for no options */
13534                         to->to_flags = 0;
13535                 }
13536                 /* Update the rcv time and perform idle reduction possibly */
13537                 if  (tp->t_idle_reduce &&
13538                      (tp->snd_max == tp->snd_una) &&
13539                      ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
13540                         counter_u64_add(rack_input_idle_reduces, 1);
13541                         rack_cc_after_idle(rack, tp);
13542                 }
13543                 tp->t_rcvtime = ticks;
13544                 /* Now what about ECN? */
13545                 if (tp->t_flags2 & TF2_ECN_PERMIT) {
13546                         if (ae->flags & TH_CWR) {
13547                                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
13548                                 tp->t_flags |= TF_ACKNOW;
13549                         }
13550                         switch (ae->codepoint & IPTOS_ECN_MASK) {
13551                         case IPTOS_ECN_CE:
13552                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
13553                                 KMOD_TCPSTAT_INC(tcps_ecn_ce);
13554                                 break;
13555                         case IPTOS_ECN_ECT0:
13556                                 KMOD_TCPSTAT_INC(tcps_ecn_ect0);
13557                                 break;
13558                         case IPTOS_ECN_ECT1:
13559                                 KMOD_TCPSTAT_INC(tcps_ecn_ect1);
13560                                 break;
13561                         }
13562
13563                         /* Process a packet differently from RFC3168. */
13564                         cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint);
13565                         /* Congestion experienced. */
13566                         if (ae->flags & TH_ECE) {
13567                                 rack_cong_signal(tp,  CC_ECN, ae->ack);
13568                         }
13569                 }
13570 #ifdef TCP_ACCOUNTING
13571                 /* Count for the specific type of ack in */
13572                 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1);
13573                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13574                         tp->tcp_cnt_counters[ae->ack_val_set]++;
13575                 }
13576 #endif
13577                 /*
13578                  * Note how we could move these up into the determination
13579                  * above, but we don't, so that the timestamp checks (and ECN)
13580                  * are done first, before we do any processing on the ACK.
13581                  * The non-compressed path through the code has this
13582                  * weakness (noted by @jtl) that it actually does some
13583                  * processing before verifying the timestamp information.
13584                  * We don't take that path here, which is why we set
13585                  * the ack_val_set first, do the timestamp and ECN
13586                  * processing, and then look at what we have set up.
13587                  */
13588                 if (ae->ack_val_set == ACK_BEHIND) {
13589                         /*
13590                          * Case B: flag reordering if the window is not closed;
13591                          * otherwise it could be a keep-alive or persists probe.
13592                          */
13593                         if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
13594                                 counter_u64_add(rack_reorder_seen, 1);
13595                                 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
13596                         }
13597                 } else if (ae->ack_val_set == ACK_DUPACK) {
13598                         /* Case D */
13599                         rack_strike_dupack(rack);
13600                 } else if (ae->ack_val_set == ACK_RWND) {
13601                         /* Case C */
13602                         win_up_req = 1;
13603                         win_upd_ack = ae->ack;
13604                         win_seq = ae->seq;
13605                         the_win = tiwin;
13606                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13607                 } else {
13608                         /* Case A */
13609                         if (SEQ_GT(ae->ack, tp->snd_max)) {
13610                                 /*
13611                                  * We just send an ack since the incoming
13612                                  * ack is beyond the largest seq we sent.
13613                                  */
13614                                 if ((tp->t_flags & TF_ACKNOW) == 0) {
13615                                         ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
13616                                         if (tp->t_flags & TF_ACKNOW)
13617                                                 rack->r_wanted_output = 1;
13618                                 }
13619                         } else {
13620                                 nsegs++;
13621                                 /* If the window changed setup to update */
13622                                 if (tiwin != tp->snd_wnd) {
13623                                         win_upd_ack = ae->ack;
13624                                         win_seq = ae->seq;
13625                                         the_win = tiwin;
13626                                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13627                                 }
13628 #ifdef TCP_ACCOUNTING
13629                                 /* Account for the acks */
13630                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13631                                         tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
13632                                 }
13633                                 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
13634                                                 (((ae->ack - high_seq) + segsiz - 1) / segsiz));
13635 #endif
13636                                 high_seq = ae->ack;
13637                                 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends)) {
13638                                         rack->r_ctl.current_round++;
13639                                         rack->r_ctl.roundends = tp->snd_max;
13640                                         if (CC_ALGO(tp)->newround != NULL) {
13641                                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
13642                                         }
13643                                 }
13644                                 /* Setup our act_rcv_time */
13645                                 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13646                                         ts.tv_sec = ae->timestamp / 1000000000;
13647                                         ts.tv_nsec = ae->timestamp % 1000000000;
13648                                         rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13649                                         rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13650                                 } else {
13651                                         rack->r_ctl.act_rcv_time = *tv;
13652                                 }
13653                                 rack_process_to_cumack(tp, rack, ae->ack, cts, to);
13654                                 if (rack->rc_dsack_round_seen) {
13655                                         /* Is the dsack round over? */
13656                                         if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
13657                                                 /* Yes it is */
13658                                                 rack->rc_dsack_round_seen = 0;
13659                                                 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
13660                                         }
13661                                 }
13662                         }
13663                 }
13664                 /* And lets be sure to commit the rtt measurements for this ack */
13665                 tcp_rack_xmit_timer_commit(rack, tp);
13666 #ifdef TCP_ACCOUNTING
13667                 rdstc = get_cyclecount();
13668                 if (rdstc > ts_val) {
13669                         counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val));
13670                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13671                                 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13672                                 if (ae->ack_val_set == ACK_CUMACK)
13673                                         tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
13674                         }
13675                 }
13676 #endif
13677         }
13678 #ifdef TCP_ACCOUNTING
13679         ts_val = get_cyclecount();
13680 #endif
13681         acked_amount = acked = (high_seq - tp->snd_una);
13682         if (acked) {
13683                 if (rack->sack_attack_disable == 0)
13684                         rack_do_decay(rack);
13685                 if (acked >= segsiz) {
13686                         /*
13687                          * You only get credit for
13688                          * MSS and greater (and you get extra
13689                          * credit for larger cum-ack moves).
13690                          */
13691                         int ac;
13692
13693                         ac = acked / segsiz;
13694                         rack->r_ctl.ack_count += ac;
13695                         counter_u64_add(rack_ack_total, ac);
13696                 }
13697                 if (rack->r_ctl.ack_count > 0xfff00000) {
13698                         /*
13699                          * reduce the number to keep us under
13700                          * a uint32_t.
13701                          */
13702                         rack->r_ctl.ack_count /= 2;
13703                         rack->r_ctl.sack_count /= 2;
13704                 }
13705                 if (tp->t_flags & TF_NEEDSYN) {
13706                         /*
13707                          * T/TCP: Connection was half-synchronized, and our SYN has
13708                          * been ACK'd (so connection is now fully synchronized).  Go
13709                          * to non-starred state, increment snd_una for ACK of SYN,
13710                          * and check if we can do window scaling.
13711                          */
13712                         tp->t_flags &= ~TF_NEEDSYN;
13713                         tp->snd_una++;
13714                         acked_amount = acked = (high_seq - tp->snd_una);
13715                 }
13716                 if (acked > sbavail(&so->so_snd))
13717                         acked_amount = sbavail(&so->so_snd);
13718 #ifdef NETFLIX_EXP_DETECTION
13719                 /*
13720                  * We only care about a cum-ack move if we are in a sack-disabled
13721                  * state. We have already added to the ack_count, and we never
13722                  * would disable on a cum-ack move, so we only care to do the
13723                  * detection if it may "undo" it, i.e. we were already disabled.
13724                  */
13725                 if (rack->sack_attack_disable)
13726                         rack_do_detection(tp, rack, acked_amount, segsiz);
13727 #endif
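                /*
                 * While in fast recovery (and PRR is not disabled) let PRR
                 * meter how much the output path may release for this cum-ack.
                 */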
13728                 if (IN_FASTRECOVERY(tp->t_flags) &&
13729                     (rack->rack_no_prr == 0))
13730                         rack_update_prr(tp, rack, acked_amount, high_seq);
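                /*
                 * A cum-ack short of snd_recover (and snd_max) is a partial
                 * ack; anything at or beyond it takes us out of recovery.
                 */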
13731                 if (IN_RECOVERY(tp->t_flags)) {
13732                         if (SEQ_LT(high_seq, tp->snd_recover) &&
13733                             (SEQ_LT(high_seq, tp->snd_max))) {
13734                                 tcp_rack_partialack(tp);
13735                         } else {
13736                                 rack_post_recovery(tp, high_seq);
13737                                 recovery = 1;
13738                         }
13739                 }
13740                 /* Handle the rack-log-ack part (sendmap) */
13741                 if ((sbused(&so->so_snd) == 0) &&
13742                     (acked > acked_amount) &&
13743                     (tp->t_state >= TCPS_FIN_WAIT_1) &&
13744                     (tp->t_flags & TF_SENTFIN)) {
13745                         /*
13746                          * We must be sure our fin
13747                          * was sent and acked (we can be
13748                          * in FIN_WAIT_1 without having
13749                          * sent the fin).
13750                          */
13751                         ourfinisacked = 1;
13752                         /*
13753                          * Lets make sure snd_una is updated
13754                          * since most likely acked_amount = 0 (it
13755                          * should be).
13756                          */
13757                         tp->snd_una = high_seq;
13758                 }
13759                 /* Did we make a RTO error? */
13760                 if ((tp->t_flags & TF_PREVVALID) &&
13761                     ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
13762                         tp->t_flags &= ~TF_PREVVALID;
13763                         if (tp->t_rxtshift == 1 &&
13764                             (int)(ticks - tp->t_badrxtwin) < 0)
13765                                 rack_cong_signal(tp, CC_RTO_ERR, high_seq);
13766                 }
13767                 /* Handle the data in the socket buffer */
13768                 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
13769                 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
13770                 if (acked_amount > 0) {
13771                         struct mbuf *mfree;
13772
13773                         rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
13774                         SOCKBUF_LOCK(&so->so_snd);
13775                         mfree = sbcut_locked(&so->so_snd, acked_amount);
13776                         tp->snd_una = high_seq;
13777                         /* Note we want to hold the sb lock through the sendmap adjust */
13778                         rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
13779                         /* Wake up the socket if we have room to write more */
13780                         rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
13781                         sowwakeup_locked(so);
13782                         m_freem(mfree);
13783                 }
13784                 /* update progress */
13785                 tp->t_acktime = ticks;
13786                 rack_log_progress_event(rack, tp, tp->t_acktime,
13787                                         PROGRESS_UPDATE, __LINE__);
13788                 /* Clear out shifts and such */
13789                 tp->t_rxtshift = 0;
13790                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
13791                                    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
13792                 rack->rc_tlp_in_progress = 0;
13793                 rack->r_ctl.rc_tlp_cnt_out = 0;
13794                 /* Send recover and snd_nxt must be dragged along */
13795                 if (SEQ_GT(tp->snd_una, tp->snd_recover))
13796                         tp->snd_recover = tp->snd_una;
13797                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
13798                         tp->snd_nxt = tp->snd_una;
13799                 /*
13800                  * If the RXT timer is running we want to
13801                  * stop it, so we can restart a TLP (or new RXT).
13802                  */
13803                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
13804                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13805 #ifdef NETFLIX_HTTP_LOGGING
13806                 tcp_http_check_for_comp(rack->rc_tp, high_seq);
13807 #endif
13808                 tp->snd_wl2 = high_seq;
13809                 tp->t_dupacks = 0;
13810                 if (under_pacing &&
13811                     (rack->use_fixed_rate == 0) &&
13812                     (rack->in_probe_rtt == 0) &&
13813                     rack->rc_gp_dyn_mul &&
13814                     rack->rc_always_pace) {
13815                         /* Check if we are dragging bottom */
13816                         rack_check_bottom_drag(tp, rack, so, acked);
13817                 }
13818                 if (tp->snd_una == tp->snd_max) {
13819                         tp->t_flags &= ~TF_PREVVALID;
13820                         rack->r_ctl.retran_during_recovery = 0;
13821                         rack->r_ctl.dsack_byte_cnt = 0;
13822                         rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
13823                         if (rack->r_ctl.rc_went_idle_time == 0)
13824                                 rack->r_ctl.rc_went_idle_time = 1;
13825                         rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
13826                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
13827                                 tp->t_acktime = 0;
13828                         /* Set so we might enter persists... */
13829                         rack->r_wanted_output = 1;
13830                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13831                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
13832                         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13833                             (sbavail(&so->so_snd) == 0) &&
13834                             (tp->t_flags2 & TF2_DROP_AF_DATA)) {
13835                                 /*
13836                                  * The socket was gone and the
13837                                  * peer sent data (not now, in the past), time to
13838                                  * reset him.
13839                                  */
13840                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13841                                 /* tcp_close will kill the inp pre-log the Reset */
13842                                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13843 #ifdef TCP_ACCOUNTING
13844                                 rdstc = get_cyclecount();
13845                                 if (rdstc > ts_val) {
13846                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13847                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13848                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13849                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13850                                         }
13851                                 }
13852 #endif
13853                                 m_freem(m);
13854                                 tp = tcp_close(tp);
13855                                 if (tp == NULL) {
13856 #ifdef TCP_ACCOUNTING
13857                                         sched_unpin();
13858 #endif
13859                                         return (1);
13860                                 }
13861                                 /*
13862                                  * We would normally do drop-with-reset which would
13863                                  * send back a reset. We can't since we don't have
13864                                  * all the needed bits. Instead lets arrange for
13865                                  * a call to tcp_output(). That way since we
13866                                  * are in the closed state we will generate a reset.
13867                                  *
13868                                  * Note if tcp_accounting is on we don't unpin since
13869                                  * we do that after the goto label.
13870                                  */
13871                                 goto send_out_a_rst;
13872                         }
13873                         if ((sbused(&so->so_snd) == 0) &&
13874                             (tp->t_state >= TCPS_FIN_WAIT_1) &&
13875                             (tp->t_flags & TF_SENTFIN)) {
13876                                 /*
13877                                  * If we can't receive any more data, then closing user can
13878                                  * proceed. Starting the timer is contrary to the
13879                                  * specification, but if we don't get a FIN we'll hang
13880                                  * forever.
13881                                  *
13882                                  */
13883                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13884                                         soisdisconnected(so);
13885                                         tcp_timer_activate(tp, TT_2MSL,
13886                                                            (tcp_fast_finwait2_recycle ?
13887                                                             tcp_finwait2_timeout :
13888                                                             TP_MAXIDLE(tp)));
13889                                 }
13890                                 if (ourfinisacked == 0) {
13891                                         /*
13892                                          * We only move to fin-wait-2 here if our fin is not yet acked;
13893                                          * if it is acked we are probably in TCPS_CLOSING (handled below).
13894                                          */
13895                                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
13896                                 }
13897                         }
13898                 }
13899                 /* Wake up the socket if we have room to write more */
13900                 if (sbavail(&so->so_snd)) {
13901                         rack->r_wanted_output = 1;
13902                         if (ctf_progress_timeout_check(tp, true)) {
13903                                 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13904                                                         tp, tick, PROGRESS_DROP, __LINE__);
13905                                 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
13906                                 /*
13907                                  * We cheat here and don't send a RST, we should send one
13908                                  * when the pacer drops the connection.
13909                                  */
13910 #ifdef TCP_ACCOUNTING
13911                                 rdstc = get_cyclecount();
13912                                 if (rdstc > ts_val) {
13913                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13914                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13915                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13916                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13917                                         }
13918                                 }
13919                                 sched_unpin();
13920 #endif
13921                                 INP_WUNLOCK(rack->rc_inp);
13922                                 m_freem(m);
13923                                 return (1);
13924                         }
13925                 }
13926                 if (ourfinisacked) {
13927                         switch(tp->t_state) {
13928                         case TCPS_CLOSING:
13929 #ifdef TCP_ACCOUNTING
13930                                 rdstc = get_cyclecount();
13931                                 if (rdstc > ts_val) {
13932                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13933                                                         (rdstc - ts_val));
13934                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13935                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13936                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13937                                         }
13938                                 }
13939                                 sched_unpin();
13940 #endif
13941                                 tcp_twstart(tp);
13942                                 m_freem(m);
13943                                 return (1);
13944                                 break;
13945                         case TCPS_LAST_ACK:
13946 #ifdef TCP_ACCOUNTING
13947                                 rdstc = get_cyclecount();
13948                                 if (rdstc > ts_val) {
13949                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13950                                                         (rdstc - ts_val));
13951                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13952                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13953                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13954                                         }
13955                                 }
13956                                 sched_unpin();
13957 #endif
13958                                 tp = tcp_close(tp);
13959                                 ctf_do_drop(m, tp);
13960                                 return (1);
13961                                 break;
13962                         case TCPS_FIN_WAIT_1:
13963 #ifdef TCP_ACCOUNTING
13964                                 rdstc = get_cyclecount();
13965                                 if (rdstc > ts_val) {
13966                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13967                                                         (rdstc - ts_val));
13968                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13969                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13970                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13971                                         }
13972                                 }
13973 #endif
13974                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13975                                         soisdisconnected(so);
13976                                         tcp_timer_activate(tp, TT_2MSL,
13977                                                            (tcp_fast_finwait2_recycle ?
13978                                                             tcp_finwait2_timeout :
13979                                                             TP_MAXIDLE(tp)));
13980                                 }
13981                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13982                                 break;
13983                         default:
13984                                 break;
13985                         }
13986                 }
13987                 if (rack->r_fast_output) {
13988                         /*
13989                          * We are doing fast output. Can we expand that?
13990                          */
13991                         rack_gain_for_fastoutput(rack, tp, so, acked_amount);
13992                 }
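                /*
                 * Note: the "else if (win_up_req)" accounting branch below only
                 * exists when TCP_ACCOUNTING is compiled in; without it the whole
                 * region is preprocessed away and the if (acked) block is closed
                 * by the brace following the #endif.
                 */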
13993 #ifdef TCP_ACCOUNTING
13994                 rdstc = get_cyclecount();
13995                 if (rdstc > ts_val) {
13996                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13997                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13998                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13999                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
14000                         }
14001                 }
14002
14003         } else if (win_up_req) {
14004                 rdstc = get_cyclecount();
14005                 if (rdstc > ts_val) {
14006                         counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val));
14007                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14008                                 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
14009                         }
14010                 }
14011 #endif
14012         }
14013         /* Now is there a next packet, if so we are done */
14014         m_freem(m);
14015         did_out = 0;
14016         if (nxt_pkt) {
14017 #ifdef TCP_ACCOUNTING
14018                 sched_unpin();
14019 #endif
14020                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
14021                 return (0);
14022         }
14023         rack_handle_might_revert(tp, rack);
14024         ctf_calc_rwin(so, tp);
14025         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14026         send_out_a_rst:
14027                 (void)tp->t_fb->tfb_tcp_output(tp);
14028                 did_out = 1;
14029         }
14030         rack_free_trim(rack);
14031 #ifdef TCP_ACCOUNTING
14032         sched_unpin();
14033 #endif
14034         rack_timer_audit(tp, rack, &so->so_snd);
14035         rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
14036         return (0);
14037 }
14038
14039
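/*
 * Main input path for a received segment; compressed ack entries (M_ACKCMP)
 * are diverted to rack_do_compressed_ack_processing() at the top. Called with
 * the inp write-locked; a return of 0 means the caller still holds the lock,
 * while a return of 1 means the lock was released (and the tcb may be gone,
 * e.g. after a close or drop).
 */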
14040 static int
14041 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
14042     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
14043     int32_t nxt_pkt, struct timeval *tv)
14044 {
14045 #ifdef TCP_ACCOUNTING
14046         uint64_t ts_val;
14047 #endif
14048         int32_t thflags, retval, did_out = 0;
14049         int32_t way_out = 0;
14050         /*
14051          * cts - is the current time from tv (caller gets ts) in microseconds.
14052          * ms_cts - is the current time from tv in milliseconds.
14053          * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
14054          */
14055         uint32_t cts, us_cts, ms_cts;
14056         uint32_t tiwin;
14057         struct timespec ts;
14058         struct tcpopt to;
14059         struct tcp_rack *rack;
14060         struct rack_sendmap *rsm;
14061         int32_t prev_state = 0;
14062 #ifdef TCP_ACCOUNTING
14063         int ack_val_set = 0xf;
14064 #endif
14065         int nsegs;
14066         /*
14067          * tv passed from common code is from either M_TSTMP_LRO or
14068          * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
14069          */
14070         rack = (struct tcp_rack *)tp->t_fb_ptr;
14071         if (m->m_flags & M_ACKCMP) {
14072                 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
14073         }
14074         if (m->m_flags & M_ACKCMP) {
14075                 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
14076         }
14077         cts = tcp_tv_to_usectick(tv);
14078         ms_cts = tcp_tv_to_mssectick(tv);
14079         nsegs = m->m_pkthdr.lro_nsegs;
14080         counter_u64_add(rack_proc_non_comp_ack, 1);
14081         thflags = th->th_flags;
14082 #ifdef TCP_ACCOUNTING
14083         sched_pin();
14084         if (thflags & TH_ACK)
14085                 ts_val = get_cyclecount();
14086 #endif
14087         if ((m->m_flags & M_TSTMP) ||
14088             (m->m_flags & M_TSTMP_LRO)) {
14089                 mbuf_tstmp2timespec(m, &ts);
14090                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
14091                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
14092         } else
14093                 rack->r_ctl.act_rcv_time = *tv;
14094         kern_prefetch(rack, &prev_state);
14095         prev_state = 0;
14096         /*
14097          * Unscale the window into a 32-bit value. For the SYN_SENT state
14098          * the scale is zero.
14099          */
14100         tiwin = th->th_win << tp->snd_scale;
14101 #ifdef TCP_ACCOUNTING
14102         if (thflags & TH_ACK) {
14103                 /*
14104                  * We have a tradeoff here. We can either do what we are
14105                  * doing i.e. pinning to this CPU and then doing the accounting
14106                  * <or> we could do a critical enter, setup the rdtsc and cpu
14107                  * as in below, and then validate we are on the same CPU on
14108          * exit. I have chosen to not do the critical enter since
14109          * that often will gain you a context switch, and instead lock
14110          * us (line above this if) to the same CPU with sched_pin(). This
14111          * means we may be context switched out for a higher priority
14112          * interrupt but we won't be moved to another CPU.
14113          *
14114          * If this occurs (which it won't very often since we most likely
14115          * are running this code in interrupt context and only a higher
14116          * priority will bump us ... clock?) we will falsely add
14117          * in the interrupt processing time plus the ack processing
14118          * time. This is ok since it's a rare event.
14119                  */
14120                 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
14121                                                     ctf_fixed_maxseg(tp));
14122         }
14123 #endif
14124         /*
14125          * Parse options on any incoming segment.
14126          */
14127         memset(&to, 0, sizeof(to));
14128         tcp_dooptions(&to, (u_char *)(th + 1),
14129             (th->th_off << 2) - sizeof(struct tcphdr),
14130             (thflags & TH_SYN) ? TO_SYN : 0);
14131         NET_EPOCH_ASSERT();
14132         INP_WLOCK_ASSERT(tp->t_inpcb);
14133         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
14134             __func__));
14135         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
14136             __func__));
14137         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
14138             (tp->t_flags & TF_GPUTINPROG)) {
14139                 /*
14140                  * We have a goodput in progress
14141                  * and we have entered a late state.
14142                  * Do we have enough data in the sb
14143                  * to handle the GPUT request?
14144                  */
14145                 uint32_t bytes;
14146
14147                 bytes = tp->gput_ack - tp->gput_seq;
14148                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
14149                         bytes += tp->gput_seq - tp->snd_una;
14150                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
14151                         /*
14152                          * There are not enough bytes in the socket
14153                          * buffer that have been sent to cover this
14154                          * measurement. Cancel it.
14155                          */
14156                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
14157                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
14158                                                    tp->gput_seq,
14159                                                    0, 0, 18, __LINE__, NULL, 0);
14160                         tp->t_flags &= ~TF_GPUTINPROG;
14161                 }
14162         }
14163         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
14164                 union tcp_log_stackspecific log;
14165                 struct timeval ltv;
14166 #ifdef NETFLIX_HTTP_LOGGING
14167                 struct http_sendfile_track *http_req;
14168
14169                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
14170                         http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
14171                 } else {
14172                         http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
14173                 }
14174 #endif
14175                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14176                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
14177                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
14178                 if (rack->rack_no_prr == 0)
14179                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
14180                 else
14181                         log.u_bbr.flex1 = 0;
14182                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
14183                 log.u_bbr.use_lt_bw <<= 1;
14184                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
14185                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
14186                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14187                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
14188                 log.u_bbr.flex3 = m->m_flags;
14189                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
14190                 log.u_bbr.lost = thflags;
14191                 log.u_bbr.pacing_gain = 0x1;
14192 #ifdef TCP_ACCOUNTING
14193                 log.u_bbr.cwnd_gain = ack_val_set;
14194 #endif
14195                 log.u_bbr.flex7 = 2;
14196                 if (m->m_flags & M_TSTMP) {
14197                         /* Record the hardware timestamp if present */
14198                         mbuf_tstmp2timespec(m, &ts);
14199                         ltv.tv_sec = ts.tv_sec;
14200                         ltv.tv_usec = ts.tv_nsec / 1000;
14201                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
14202                 } else if (m->m_flags & M_TSTMP_LRO) {
14203                         /* Record the LRO arrival timestamp */
14204                         mbuf_tstmp2timespec(m, &ts);
14205                         ltv.tv_sec = ts.tv_sec;
14206                         ltv.tv_usec = ts.tv_nsec / 1000;
14207                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
14208                 }
14209                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
14210                 /* Log the rcv time */
14211                 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
14212 #ifdef NETFLIX_HTTP_LOGGING
14213                 log.u_bbr.applimited = tp->t_http_closed;
14214                 log.u_bbr.applimited <<= 8;
14215                 log.u_bbr.applimited |= tp->t_http_open;
14216                 log.u_bbr.applimited <<= 8;
14217                 log.u_bbr.applimited |= tp->t_http_req;
14218                 if (http_req) {
14219                         /* Copy out any client req info */
14220                         /* seconds */
14221                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
14222                         /* useconds */
14223                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
14224                         log.u_bbr.rttProp = http_req->timestamp;
14225                         log.u_bbr.cur_del_rate = http_req->start;
14226                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
14227                                 log.u_bbr.flex8 |= 1;
14228                         } else {
14229                                 log.u_bbr.flex8 |= 2;
14230                                 log.u_bbr.bw_inuse = http_req->end;
14231                         }
14232                         log.u_bbr.flex6 = http_req->start_seq;
14233                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
14234                                 log.u_bbr.flex8 |= 4;
14235                                 log.u_bbr.epoch = http_req->end_seq;
14236                         }
14237                 }
14238 #endif
14239                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
14240                     tlen, &log, true, &ltv);
14241         }
14242         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
14243                 way_out = 4;
14244                 retval = 0;
14245                 m_freem(m);
14246                 goto done_with_input;
14247         }
14248         /*
14249          * If a segment with the ACK-bit set arrives in the SYN-SENT state
14250          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
14251          */
14252         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
14253             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
14254                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
14255                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
14256 #ifdef TCP_ACCOUNTING
14257                 sched_unpin();
14258 #endif
14259                 return (1);
14260         }
14261         /*
14262          * If timestamps were negotiated during SYN/ACK and a
14263          * segment without a timestamp is received, silently drop
14264          * the segment, unless it is a RST segment or missing timestamps are
14265          * tolerated.
14266          * See section 3.2 of RFC 7323.
14267          */
14268         if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
14269             ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
14270                 way_out = 5;
14271                 retval = 0;
14272                 m_freem(m);
14273                 goto done_with_input;
14274         }
14275
14276         /*
14277          * Segment received on connection. Reset idle time and keep-alive
14278          * timer. XXX: This should be done after segment validation to
14279          * ignore broken/spoofed segs.
14280          */
14281         if  (tp->t_idle_reduce &&
14282              (tp->snd_max == tp->snd_una) &&
14283              ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
14284                 counter_u64_add(rack_input_idle_reduces, 1);
14285                 rack_cc_after_idle(rack, tp);
14286         }
14287         tp->t_rcvtime = ticks;
14288 #ifdef STATS
14289         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
14290 #endif
14291         if (tiwin > rack->r_ctl.rc_high_rwnd)
14292                 rack->r_ctl.rc_high_rwnd = tiwin;
14293         /*
14294          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
14295          * this to occur after we've validated the segment.
14296          */
14297         if (tp->t_flags2 & TF2_ECN_PERMIT) {
14298                 if (thflags & TH_CWR) {
14299                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
14300                         tp->t_flags |= TF_ACKNOW;
14301                 }
14302                 switch (iptos & IPTOS_ECN_MASK) {
14303                 case IPTOS_ECN_CE:
14304                         tp->t_flags2 |= TF2_ECN_SND_ECE;
14305                         KMOD_TCPSTAT_INC(tcps_ecn_ce);
14306                         break;
14307                 case IPTOS_ECN_ECT0:
14308                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
14309                         break;
14310                 case IPTOS_ECN_ECT1:
14311                         KMOD_TCPSTAT_INC(tcps_ecn_ect1);
14312                         break;
14313                 }
14314
14315                 /* Process a packet differently from RFC3168. */
14316                 cc_ecnpkt_handler(tp, th, iptos);
14317
14318                 /* Congestion experienced. */
14319                 if (thflags & TH_ECE) {
14320                         rack_cong_signal(tp, CC_ECN, th->th_ack);
14321                 }
14322         }
14323
14324         /*
14325          * If echoed timestamp is later than the current time, fall back to
14326          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
14327          * were used when this connection was established.
14328          */
14329         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
14330                 to.to_tsecr -= tp->ts_offset;
14331                 if (TSTMP_GT(to.to_tsecr, ms_cts))
14332                         to.to_tsecr = 0;
14333         }
14334
14335         /*
14336          * If its the first time in we need to take care of options and
14337          * verify we can do SACK for rack!
14338          */
14339         if (rack->r_state == 0) {
14340                 /* Should be init'd by rack_init() */
14341                 KASSERT(rack->rc_inp != NULL,
14342                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
14343                 if (rack->rc_inp == NULL) {
14344                         rack->rc_inp = tp->t_inpcb;
14345                 }
14346
14347                 /*
14348                  * Process options only when we get SYN/ACK back. The SYN
14349                  * case for incoming connections is handled in tcp_syncache.
14350                  * According to RFC1323 the window field in a SYN (i.e., a
14351                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
14352                  * this is traditional behavior, may need to be cleaned up.
14353                  */
14354                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
14355                         /* Handle parallel SYN for ECN */
14356                         if (!(thflags & TH_ACK) &&
14357                             ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) &&
14358                             ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) {
14359                                 tp->t_flags2 |= TF2_ECN_PERMIT;
14360                                 tp->t_flags2 |= TF2_ECN_SND_ECE;
14361                                 TCPSTAT_INC(tcps_ecn_shs);
14362                         }
14363                         if ((to.to_flags & TOF_SCALE) &&
14364                             (tp->t_flags & TF_REQ_SCALE)) {
14365                                 tp->t_flags |= TF_RCVD_SCALE;
14366                                 tp->snd_scale = to.to_wscale;
14367                         } else
14368                                 tp->t_flags &= ~TF_REQ_SCALE;
14369                         /*
14370                          * Initial send window.  It will be updated with the
14371                          * next incoming segment to the scaled value.
14372                          */
14373                         tp->snd_wnd = th->th_win;
14374                         rack_validate_fo_sendwin_up(tp, rack);
14375                         if ((to.to_flags & TOF_TS) &&
14376                             (tp->t_flags & TF_REQ_TSTMP)) {
14377                                 tp->t_flags |= TF_RCVD_TSTMP;
14378                                 tp->ts_recent = to.to_tsval;
14379                                 tp->ts_recent_age = cts;
14380                         } else
14381                                 tp->t_flags &= ~TF_REQ_TSTMP;
14382                         if (to.to_flags & TOF_MSS) {
14383                                 tcp_mss(tp, to.to_mss);
14384                         }
14385                         if ((tp->t_flags & TF_SACK_PERMIT) &&
14386                             (to.to_flags & TOF_SACKPERM) == 0)
14387                                 tp->t_flags &= ~TF_SACK_PERMIT;
14388                         if (IS_FASTOPEN(tp->t_flags)) {
14389                                 if (to.to_flags & TOF_FASTOPEN) {
14390                                         uint16_t mss;
14391
14392                                         if (to.to_flags & TOF_MSS)
14393                                                 mss = to.to_mss;
14394                                         else
14395                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
14396                                                         mss = TCP6_MSS;
14397                                                 else
14398                                                         mss = TCP_MSS;
14399                                         tcp_fastopen_update_cache(tp, mss,
14400                                             to.to_tfo_len, to.to_tfo_cookie);
14401                                 } else
14402                                         tcp_fastopen_disable_path(tp);
14403                         }
14404                 }
14405                 /*
14406                  * At this point we are at the initial call. Here we decide
14407                  * if we are doing RACK or not. We do this by seeing if
14408                  * TF_SACK_PERMIT is set and the sack-not-required is clear.
14409                  * The code now does do dup-ack counting so if you don't
14410                  * switch back you won't get rack & TLP, but you will still
14411                  * get this stack.
14412                  */
14413
14414                 if ((rack_sack_not_required == 0) &&
14415                     ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
14416                         tcp_switch_back_to_default(tp);
14417                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
14418                             tlen, iptos);
14419 #ifdef TCP_ACCOUNTING
14420                         sched_unpin();
14421 #endif
14422                         return (1);
14423                 }
14424                 tcp_set_hpts(tp->t_inpcb);
14425                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
14426         }
14427         if (thflags & TH_FIN)
14428                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
14429         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
14430         if ((rack->rc_gp_dyn_mul) &&
14431             (rack->use_fixed_rate == 0) &&
14432             (rack->rc_always_pace)) {
14433                 /* Check in on probertt */
14434                 rack_check_probe_rtt(rack, us_cts);
14435         }
14436         rack_clear_rate_sample(rack);
14437         if (rack->forced_ack) {
14438                 uint32_t us_rtt;
14439
14440                 /*
14441                  * A persist or keep-alive was forced out, update our
14442                  * min rtt time. Note we do not worry about lost
14443                  * retransmissions since KEEP-ALIVES and persists
14444                  * are usually sent at very long intervals (though
14445                  * if we were really paranoid or worried we could
14446                  * at least use timestamps if available to validate).
14447                  */
14448                 rack->forced_ack = 0;
14449                 if (tiwin == tp->snd_wnd) {
14450                         /*
14451                          * Only apply the RTT update if this is
14452                          * a response to our window probe. And that
14453                          * means the rwnd sent must match the current
14454                          * snd_wnd. If it does not, then we got a
14455                          * window update ack instead.
14456                          */
14457                         us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
14458                         if (us_rtt == 0)
14459                                 us_rtt = 1;
14460                         rack_apply_updated_usrtt(rack, us_rtt, us_cts);
14461                         tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
14462                 }
14463         }
14464         /*
14465          * This is the one exception case where we set the rack state
14466          * always. All other times (timers etc) we must have a rack-state
14467          * set (so we assure we have done the checks above for SACK).
14468          */
14469         rack->r_ctl.rc_rcvtime = cts;
14470         if (rack->r_state != tp->t_state)
14471                 rack_set_state(tp, rack);
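        /*
         * If this ack advances snd_una, prefetch the lowest sendmap entry so
         * the cum-ack walk in the sub-state handler finds it already cached.
         */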
14472         if (SEQ_GT(th->th_ack, tp->snd_una) &&
14473             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
14474                 kern_prefetch(rsm, &prev_state);
14475         prev_state = rack->r_state;
14476         retval = (*rack->r_substate) (m, th, so,
14477             tp, &to, drop_hdrlen,
14478             tlen, tiwin, thflags, nxt_pkt, iptos);
14479 #ifdef INVARIANTS
14480         if ((retval == 0) &&
14481             (tp->t_inpcb == NULL)) {
14482                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
14483                     retval, tp, prev_state);
14484         }
14485 #endif
14486         if (retval == 0) {
14487                 /*
14488                  * If retval is 1 the tcb is unlocked and most likely the tp
14489                  * is gone.
14490                  */
14491                 INP_WLOCK_ASSERT(tp->t_inpcb);
14492                 if ((rack->rc_gp_dyn_mul) &&
14493                     (rack->rc_always_pace) &&
14494                     (rack->use_fixed_rate == 0) &&
14495                     rack->in_probe_rtt &&
14496                     (rack->r_ctl.rc_time_probertt_starts == 0)) {
14497                         /*
14498                          * If we are going for target, lets recheck before
14499                          * we output.
14500                          */
14501                         rack_check_probe_rtt(rack, us_cts);
14502                 }
14503                 if (rack->set_pacing_done_a_iw == 0) {
14504                         /* How much has been acked? */
14505                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
14506                                 /* We have enough to set in the pacing segment size */
14507                                 rack->set_pacing_done_a_iw = 1;
14508                                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14509                         }
14510                 }
14511                 tcp_rack_xmit_timer_commit(rack, tp);
14512 #ifdef TCP_ACCOUNTING
14513                 /*
14514                  * If we set ack_val_set to what ack processing we are doing
14515                  * we also want to track how many cycles we burned. Note
14516                  * the bits after tcp_output we let be "free". This is because
14517                  * we are also tracking the tcp_output times as well. Note the
14518                  * use of 0xf here since we only have 11 counters (0 - 0xa) and
14519                  * 0xf cannot be returned; it is what we initialize ack_val_set to,
14520                  * to indicate we are not doing the tabulations.
14521                  */
14522                 if (ack_val_set != 0xf) {
14523                         uint64_t crtsc;
14524
14525                         crtsc = get_cyclecount();
14526                         counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
14527                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14528                                 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
14529                         }
14530                 }
14531 #endif
14532                 if (nxt_pkt == 0) {
14533                         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14534 do_output_now:
14535                                 did_out = 1;
14536                                 (void)tp->t_fb->tfb_tcp_output(tp);
14537                         }
14538                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
14539                         rack_free_trim(rack);
14540                 }
14541                 /* Update any rounds needed */
14542                 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends)) {
14543                         rack->r_ctl.current_round++;
14544                         rack->r_ctl.roundends = tp->snd_max;
14545                         if (CC_ALGO(tp)->newround != NULL) {
14546                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
14547                         }
14548                 }
14549                 if ((nxt_pkt == 0) &&
14550                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
14551                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
14552                      (tp->t_flags & TF_DELACK) ||
14553                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
14554                       (tp->t_state <= TCPS_CLOSING)))) {
14555                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
14556                         if ((tp->snd_max == tp->snd_una) &&
14557                             ((tp->t_flags & TF_DELACK) == 0) &&
14558                             (rack->rc_inp->inp_in_hpts) &&
14559                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
14560                                 /* keep-alive not needed if we are still queued in hpts for output */
14561                                 ;
14562                         } else {
14563                                 int late = 0;
14564                                 if (rack->rc_inp->inp_in_hpts) {
14565                                         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14566                                                 us_cts = tcp_get_usecs(NULL);
14567                                                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
14568                                                         rack->r_early = 1;
14569                                                         rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
14570                                                 } else
14571                                                         late = 1;
14572                                                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
14573                                         }
14574                                         tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
14575                                 }
14576                                 if (late && (did_out == 0)) {
14577                                         /*
14578                                          * We are late in the sending
14579                                          * and we did not call the output
14580                                          * (this probably should not happen).
14581                                          */
14582                                         goto do_output_now;
14583                                 }
14584                                 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
14585                         }
14586                         way_out = 1;
14587                 } else if (nxt_pkt == 0) {
14588                         /* Do we have the correct timer running? */
14589                         rack_timer_audit(tp, rack, &so->so_snd);
14590                         way_out = 2;
14591                 }
14592         done_with_input:
14593                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
14594                 if (did_out)
14595                         rack->r_wanted_output = 0;
14596 #ifdef INVARIANTS
14597                 if (tp->t_inpcb == NULL) {
14598                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
14599                               did_out,
14600                               retval, tp, prev_state);
14601                 }
14602 #endif
14603 #ifdef TCP_ACCOUNTING
14604         } else {
14605                 /*
14606                  * Track the time (see above).
14607                  */
14608                 if (ack_val_set != 0xf) {
14609                         uint64_t crtsc;
14610
14611                         crtsc = get_cyclecount();
14612                         counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
14613                         /*
14614                          * Note we *DO NOT* increment the per-tcb counters since
14615                          * in the else the TP may be gone!!
14616                          */
14617                 }
14618 #endif
14619         }
14620 #ifdef TCP_ACCOUNTING
14621         sched_unpin();
14622 #endif
14623         return (retval);
14624 }
14625
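/*
 * tfb_tcp_do_segment entry point: drain any previously queued (LRO) packets
 * first, derive the arrival time from the mbuf timestamp when one is present,
 * and unlock the inp unless the nounlock path already released it.
 */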
14626 void
14627 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
14628     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
14629 {
14630         struct timeval tv;
14631
14632         /* First lets see if we have old packets */
14633         if (tp->t_in_pkt) {
14634                 if (ctf_do_queued_segments(so, tp, 1)) {
14635                         m_freem(m);
14636                         return;
14637                 }
14638         }
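        /* Convert the nanosecond LRO arrival stamp in the mbuf into a timeval. */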
14639         if (m->m_flags & M_TSTMP_LRO) {
14640                 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
14641                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
14642         } else {
14643                 /* Should not happen; should we KASSERT instead? */
14644                 tcp_get_usecs(&tv);
14645         }
14646         if (rack_do_segment_nounlock(m, th, so, tp,
14647                                      drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
14648                 INP_WUNLOCK(tp->t_inpcb);
14649         }
14650 }
14651
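/*
 * Return the next sendmap entry that is due for a RACK retransmission at
 * time tsused, or NULL when nothing is eligible yet.
 */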
14652 struct rack_sendmap *
14653 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
14654 {
14655         struct rack_sendmap *rsm = NULL;
14656         int32_t idx;
14657         uint32_t srtt = 0, thresh = 0, ts_low = 0;
14658
14659         /* Return the next guy to be re-transmitted */
14660         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
14661                 return (NULL);
14662         }
14663         if (tp->t_flags & TF_SENTFIN) {
14664                 /* retran the end FIN? */
14665                 return (NULL);
14666         }
14667         /* ok lets look at this one */
14668         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
14669         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
14670                 goto check_it;
14671         }
14672         rsm = rack_find_lowest_rsm(rack);
14673         if (rsm == NULL) {
14674                 return (NULL);
14675         }
14676 check_it:
14677         if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
14678             (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
14679                 /*
14680                  * No sack so we automatically do the 3 strikes and
14681                  * retransmit (no rack timer would be started).
14682                  */
14683
14684                 return (rsm);
14685         }
14686         if (rsm->r_flags & RACK_ACKED) {
14687                 return (NULL);
14688         }
14689         if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
14690             (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
14691                 /* Its not yet ready */
14692                 return (NULL);
14693         }
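        /*
         * Compare the time since this entry's last (re)transmission against
         * the RACK reordering threshold computed from the current srtt.
         */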
14694         srtt = rack_grab_rtt(tp, rack);
14695         idx = rsm->r_rtr_cnt - 1;
14696         ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
14697         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
14698         if ((tsused == ts_low) ||
14699             (TSTMP_LT(tsused, ts_low))) {
14700                 /* No time since sending */
14701                 return (NULL);
14702         }
14703         if ((tsused - ts_low) < thresh) {
14704                 /* It has not been long enough yet */
14705                 return (NULL);
14706         }
14707         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
14708             ((rsm->r_flags & RACK_SACK_PASSED) &&
14709              (rack->sack_attack_disable == 0))) {
14710                 /*
14711                  * We have passed the dup-ack threshold <or>
14712                  * a SACK has indicated this is missing.
14713                  * Note that if you are a declared attacker
14714                  * it is only the dup-ack threshold that
14715                  * will cause retransmits.
14716                  */
14717                 /* log retransmit reason */
14718                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
14719                 rack->r_fast_output = 0;
14720                 return (rsm);
14721         }
14722         return (NULL);
14723 }
14724
14725 static void
14726 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
14727                            uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
14728                            int line, struct rack_sendmap *rsm, uint8_t quality)
14729 {
14730         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
14731                 union tcp_log_stackspecific log;
14732                 struct timeval tv;
14733
14734                 memset(&log, 0, sizeof(log));
14735                 log.u_bbr.flex1 = slot;
14736                 log.u_bbr.flex2 = len;
14737                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
14738                 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
14739                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
14740                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
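                      /*
                       * Pack eight boolean state flags into use_lt_bw, one bit
                       * each, most significant bit first: rc_ack_can_sendout_data,
                       * r_late, r_early, app_limited_needs_set, rc_gp_filled,
                       * measure_saw_probe_rtt, in_probe_rtt and gp_ready.
                       */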
14741                 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
14742                 log.u_bbr.use_lt_bw <<= 1;
14743                 log.u_bbr.use_lt_bw |= rack->r_late;
14744                 log.u_bbr.use_lt_bw <<= 1;
14745                 log.u_bbr.use_lt_bw |= rack->r_early;
14746                 log.u_bbr.use_lt_bw <<= 1;
14747                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
14748                 log.u_bbr.use_lt_bw <<= 1;
14749                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
14750                 log.u_bbr.use_lt_bw <<= 1;
14751                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
14752                 log.u_bbr.use_lt_bw <<= 1;
14753                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
14754                 log.u_bbr.use_lt_bw <<= 1;
14755                 log.u_bbr.use_lt_bw |= rack->gp_ready;
14756                 log.u_bbr.pkt_epoch = line;
14757                 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
14758                 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
14759                 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
14760                 log.u_bbr.bw_inuse = bw_est;
14761                 log.u_bbr.delRate = bw;
14762                 if (rack->r_ctl.gp_bw == 0)
14763                         log.u_bbr.cur_del_rate = 0;
14764                 else
14765                         log.u_bbr.cur_del_rate = rack_get_bw(rack);
14766                 log.u_bbr.rttProp = len_time;
14767                 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
14768                 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
14769                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
14770                 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
14771                         /* We are in slow start */
14772                         log.u_bbr.flex7 = 1;
14773                 } else {
14774                         /* We are in congestion avoidance */
14775                         log.u_bbr.flex7 = 0;
14776                 }
14777                 log.u_bbr.flex8 = method;
14778                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14779                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
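                      /*
                       * Pack three booleans into cwnd_gain: bit 2 is
                       * rc_gp_saw_rec, bit 1 is rc_gp_saw_ss and bit 0 is
                       * rc_gp_saw_ca.
                       */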
14780                 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
14781                 log.u_bbr.cwnd_gain <<= 1;
14782                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
14783                 log.u_bbr.cwnd_gain <<= 1;
14784                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
14785                 log.u_bbr.bbr_substate = quality;
14786                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
14787                     &rack->rc_inp->inp_socket->so_rcv,
14788                     &rack->rc_inp->inp_socket->so_snd,
14789                     BBR_LOG_HPTSI_CALC, 0,
14790                     0, &log, false, &tv);
14791         }
14792 }
14793
14794 static uint32_t
14795 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
14796 {
14797         uint32_t new_tso, user_max;
14798
14799         user_max = rack->rc_user_set_max_segs * mss;
14800         if (rack->rc_force_max_seg) {
14801                 return (user_max);
14802         }
14803         if (rack->use_fixed_rate &&
14804             ((rack->r_ctl.crte == NULL) ||
14805              (bw != rack->r_ctl.crte->rate))) {
14806                 /* Use the user mss since we are not exactly matched */
14807                 return (user_max);
14808         }
14809         new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
14810         if (new_tso > user_max)
14811                 new_tso = user_max;
14812         return (new_tso);
14813 }
14814
14815 static int32_t
14816 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
14817 {
14818         uint64_t lentim, fill_bw;
14819
14820         /* Let's first see if we are full; if so, continue with the normal rate */
14821         rack->r_via_fill_cw = 0;
14822         if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
14823                 return (slot);
14824         if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
14825                 return (slot);
14826         if (rack->r_ctl.rc_last_us_rtt == 0)
14827                 return (slot);
14828         if (rack->rc_pace_fill_if_rttin_range &&
14829             (rack->r_ctl.rc_last_us_rtt >=
14830              (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
14831                 /* The RTT is huge (N * smallest); let's not fill */
14832                 return (slot);
14833         }
14834         /*
14835          * First let's calculate the b/w based on the last us-rtt
14836          * and the sndwnd.
14837          */
14838         fill_bw = rack->r_ctl.cwnd_to_use;
14839         /* Take the rwnd if it's smaller */
14840         if (fill_bw > rack->rc_tp->snd_wnd)
14841                 fill_bw = rack->rc_tp->snd_wnd;
14842         if (rack->r_fill_less_agg) {
14843                 /*
14844                  * Now take away the inflight (this reduces our
14845                  * aggressiveness; even if we get that much out in one RTT,
14846                  * acks will have come back and we will still be behind).
14847                  */
14848                 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14849         }
14850         /* Now lets make it into a b/w */
14851         fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
14852         fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
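              /*
               * fill_bw is now (usable window in bytes) * HPTS_USEC_IN_SEC
               * divided by the last RTT in usecs, i.e. the bytes-per-second
               * rate that would empty that window in one RTT.  Illustrative
               * example (hypothetical values): a 1,000,000 byte window over a
               * 20,000 usec RTT gives 50,000,000 bytes/sec (roughly 400 Mbit/s).
               */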
14853         /* We are below the min b/w */
14854         if (non_paced)
14855                 *rate_wanted = fill_bw;
14856         if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
14857                 return (slot);
14858         if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
14859                 fill_bw = rack->r_ctl.bw_rate_cap;
14860         rack->r_via_fill_cw = 1;
14861         if (rack->r_rack_hw_rate_caps &&
14862             (rack->r_ctl.crte != NULL)) {
14863                 uint64_t high_rate;
14864
14865                 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
14866                 if (fill_bw > high_rate) {
14867                         /* We are capping bw at the highest rate table entry */
14868                         if (*rate_wanted > high_rate) {
14869                                 /* The original rate was also capped */
14870                                 rack->r_via_fill_cw = 0;
14871                         }
14872                         rack_log_hdwr_pacing(rack,
14873                                              fill_bw, high_rate, __LINE__,
14874                                              0, 3);
14875                         fill_bw = high_rate;
14876                         if (capped)
14877                                 *capped = 1;
14878                 }
14879         } else if ((rack->r_ctl.crte == NULL) &&
14880                    (rack->rack_hdrw_pacing == 0) &&
14881                    (rack->rack_hdw_pace_ena) &&
14882                    rack->r_rack_hw_rate_caps &&
14883                    (rack->rack_attempt_hdwr_pace == 0) &&
14884                    (rack->rc_inp->inp_route.ro_nh != NULL) &&
14885                    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
14886                 /*
14887                  * OK, we may have a first attempt that is greater than our top rate;
14888                  * let's check.
14889                  */
14890                 uint64_t high_rate;
14891
14892                 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
14893                 if (high_rate) {
14894                         if (fill_bw > high_rate) {
14895                                 fill_bw = high_rate;
14896                                 if (capped)
14897                                         *capped = 1;
14898                         }
14899                 }
14900         }
14901         /*
14902          * OK, fill_bw holds our mythical b/w to fill the cwnd
14903          * in an RTT; what does that equate to time-wise?
14904          */
14905         lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
14906         lentim /= fill_bw;
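              /*
               * lentim is the time in usecs that len bytes take at fill_bw.
               * Illustrative example (hypothetical values): len = 100,000 bytes
               * at fill_bw = 50,000,000 bytes/sec gives lentim = 2,000 usecs.
               * It is used below only when non_paced is set or it is shorter
               * than the caller's slot.
               */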
14907         *rate_wanted = fill_bw;
14908         if (non_paced || (lentim < slot)) {
14909                 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
14910                                            0, lentim, 12, __LINE__, NULL, 0);
14911                 return ((int32_t)lentim);
14912         } else
14913                 return (slot);
14914 }
14915
14916 static int32_t
14917 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
14918 {
14919         uint64_t srtt;
14920         int32_t slot = 0;
14921         int can_start_hw_pacing = 1;
14922         int err;
14923
14924         if (rack->rc_always_pace == 0) {
14925                 /*
14926                  * We use the most optimistic possible cwnd/srtt for
14927                  * sending calculations. This will make our
14928                  * calculation anticipate getting more through
14929                  * quicker than possible. But that's OK; we don't want
14930                  * the peer to have a gap in data sending.
14931                  */
14932                 uint64_t cwnd, tr_perms = 0;
14933                 int32_t reduce = 0;
14934
14935         old_method:
14936                 /*
14937                  * We keep no precise pacing with the old method;
14938                  * instead we use the pacer to mitigate bursts.
14939                  */
14940                 if (rack->r_ctl.rc_rack_min_rtt)
14941                         srtt = rack->r_ctl.rc_rack_min_rtt;
14942                 else
14943                         srtt = max(tp->t_srtt, 1);
14944                 if (rack->r_ctl.rc_rack_largest_cwnd)
14945                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
14946                 else
14947                         cwnd = rack->r_ctl.cwnd_to_use;
14948                 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
14949                 tr_perms = (cwnd * 1000) / srtt;
14950                 if (tr_perms == 0) {
14951                         tr_perms = ctf_fixed_maxseg(tp);
14952                 }
14953                 /*
14954                  * Calculate how long this will take to drain. If
14955                  * the calculation comes out to zero, that's OK; we
14956                  * will use send_a_lot to possibly spin around for
14957                  * more, increasing tot_len_this_send to the point
14958                  * that it is going to require a pace, or we hit the
14959                  * cwnd, in which case we are just waiting for
14960                  * an ACK.
14961                  */
14962                 slot = len / tr_perms;
14963                 /* Now do we reduce the time so we don't run dry? */
14964                 if (slot && rack_slot_reduction) {
14965                         reduce = (slot / rack_slot_reduction);
14966                         if (reduce < slot) {
14967                                 slot -= reduce;
14968                         } else
14969                                 slot = 0;
14970                 }
14971                 slot *= HPTS_USEC_IN_MSEC;
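                      /*
                       * slot is now in usecs.  Illustrative example (hypothetical
                       * values): cwnd = 100,000 bytes and srtt = 50,000 usecs give
                       * tr_perms = 2,000 bytes per msec, so len = 10,000 bytes
                       * yields a 5 msec (5,000 usec) pace time, ignoring the
                       * slot-reduction step above.
                       */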
14972                 if (rack->rc_pace_to_cwnd) {
14973                         uint64_t rate_wanted = 0;
14974
14975                         slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
14976                         rack->rc_ack_can_sendout_data = 1;
14977                         rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
14978                 } else
14979                         rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
14980         } else {
14981                 uint64_t bw_est, res, lentim, rate_wanted;
14982                 uint32_t orig_val, segs, oh;
14983                 int capped = 0;
14984                 int prev_fill;
14985
14986                 if ((rack->r_rr_config == 1) && rsm) {
14987                         return (rack->r_ctl.rc_min_to);
14988                 }
14989                 if (rack->use_fixed_rate) {
14990                         rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
14991                 } else if ((rack->r_ctl.init_rate == 0) &&
14992 #ifdef NETFLIX_PEAKRATE
14993                            (rack->rc_tp->t_maxpeakrate == 0) &&
14994 #endif
14995                            (rack->r_ctl.gp_bw == 0)) {
14996                         /* no way yet to do an estimate */
14997                         bw_est = rate_wanted = 0;
14998                 } else {
14999                         bw_est = rack_get_bw(rack);
15000                         rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
15001                 }
15002                 if ((bw_est == 0) || (rate_wanted == 0) ||
15003                     ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
15004                         /*
15005                          * No way yet to make a b/w estimate or
15006                          * our rate is set incorrectly.
15007                          */
15008                         goto old_method;
15009                 }
15010                 /* We need to account for all the overheads */
15011                 segs = (len + segsiz - 1) / segsiz;
15012                 /*
15013                  * We need the diff between 1514 bytes (e-mtu with e-hdr)
15014                  * and how much data we put in each packet. Yes this
15015                  * means we may be off if we are larger than 1500 bytes
15016                  * or smaller. But this just makes us more conservative.
15017                  */
15018                 if (rack_hw_rate_min &&
15019                     (bw_est < rack_hw_rate_min))
15020                         can_start_hw_pacing = 0;
15021                 if (ETHERNET_SEGMENT_SIZE > segsiz)
15022                         oh = ETHERNET_SEGMENT_SIZE - segsiz;
15023                 else
15024                         oh = 0;
15025                 segs *= oh;
15026                 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
15027                 res = lentim / rate_wanted;
15028                 slot = (uint32_t)res;
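                      /*
                       * slot is the time needed to emit len plus per-packet
                       * Ethernet header overhead at rate_wanted.  Illustrative
                       * example (hypothetical values): len = 14,600 bytes at
                       * segsiz = 1,460 is 10 segments; with 54 bytes of overhead
                       * each (1,514 - 1,460) and rate_wanted = 50,000,000
                       * bytes/sec, slot = (14,600 + 540) * 1,000,000 / 50,000,000,
                       * about 302 usecs.
                       */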
15029                 orig_val = rack->r_ctl.rc_pace_max_segs;
15030                 if (rack->r_ctl.crte == NULL) {
15031                         /*
15032                          * Only do this if we are not hardware pacing
15033                          * since if we are doing hw-pacing we will make
15034                          * this call below after setting up or changing
15035                          * the rate.
15036                          */
15037                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
15038                 } else if (rack->rc_inp->inp_snd_tag == NULL) {
15039                         /*
15040                          * We lost our rate somehow, this can happen
15041                          * if the interface changed underneath us.
15042                          */
15043                         tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
15044                         rack->r_ctl.crte = NULL;
15045                         /* Let's re-allow attempting to set up pacing */
15046                         rack->rack_hdrw_pacing = 0;
15047                         rack->rack_attempt_hdwr_pace = 0;
15048                         rack_log_hdwr_pacing(rack,
15049                                              rate_wanted, bw_est, __LINE__,
15050                                              0, 6);
15051                 }
15052                 /* Did we change the TSO size, if so log it */
15053                 if (rack->r_ctl.rc_pace_max_segs != orig_val)
15054                         rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0);
15055                 prev_fill = rack->r_via_fill_cw;
15056                 if ((rack->rc_pace_to_cwnd) &&
15057                     (capped == 0) &&
15058                     (rack->use_fixed_rate == 0) &&
15059                     (rack->in_probe_rtt == 0) &&
15060                     (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
15061                         /*
15062                          * We want to pace at our rate *or* faster to
15063                          * fill the cwnd to the max if it's not full.
15064                          */
15065                         slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
15066                 }
15067                 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
15068                     (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
15069                         if ((rack->rack_hdw_pace_ena) &&
15070                             (can_start_hw_pacing > 0) &&
15071                             (rack->rack_hdrw_pacing == 0) &&
15072                             (rack->rack_attempt_hdwr_pace == 0)) {
15073                                 /*
15074                                  * Let's attempt to turn on hardware pacing
15075                                  * if we can.
15076                                  */
15077                                 rack->rack_attempt_hdwr_pace = 1;
15078                                 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
15079                                                                        rack->rc_inp->inp_route.ro_nh->nh_ifp,
15080                                                                        rate_wanted,
15081                                                                        RS_PACING_GEQ,
15082                                                                        &err, &rack->r_ctl.crte_prev_rate);
15083                                 if (rack->r_ctl.crte) {
15084                                         rack->rack_hdrw_pacing = 1;
15085                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
15086                                                                                                  0, rack->r_ctl.crte,
15087                                                                                                  NULL);
15088                                         rack_log_hdwr_pacing(rack,
15089                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15090                                                              err, 0);
15091                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15092                                 } else {
15093                                         counter_u64_add(rack_hw_pace_init_fail, 1);
15094                                 }
15095                         } else if (rack->rack_hdrw_pacing &&
15096                                    (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
15097                                 /* Do we need to adjust our rate? */
15098                                 const struct tcp_hwrate_limit_table *nrte;
15099
15100                                 if (rack->r_up_only &&
15101                                     (rate_wanted < rack->r_ctl.crte->rate)) {
15102                                         /**
15103                                          * We have four possible states here
15104                                          * having to do with the previous time
15105                                          * and this time.
15106                                          *   previous  |  this-time
15107                                          * A)     0      |     0   -- fill_cw not in the picture
15108                                          * B)     1      |     0   -- we were doing a fill-cw but now are not
15109                                          * C)     1      |     1   -- all rates from fill_cw
15110                                          * D)     0      |     1   -- we were doing non-fill and now we are filling
15111                                          *
15112                                          * For cases A, C and D we don't allow a drop. But for
15113                                          * case B, where we are now on our steady rate, we do
15114                                          * allow a drop.
15115                                          *
15116                                          */
15117                                         if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
15118                                                 goto done_w_hdwr;
15119                                 }
15120                                 if ((rate_wanted > rack->r_ctl.crte->rate) ||
15121                                     (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
15122                                         if (rack_hw_rate_to_low &&
15123                                             (bw_est < rack_hw_rate_to_low)) {
15124                                                 /*
15125                                                  * The pacing rate is too low for hardware, but
15126                                                  * do allow hardware pacing to be restarted.
15127                                                  */
15128                                                 rack_log_hdwr_pacing(rack,
15129                                                              bw_est, rack->r_ctl.crte->rate, __LINE__,
15130                                                              0, 5);
15131                                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
15132                                                 rack->r_ctl.crte = NULL;
15133                                                 rack->rack_attempt_hdwr_pace = 0;
15134                                                 rack->rack_hdrw_pacing = 0;
15135                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15136                                                 goto done_w_hdwr;
15137                                         }
15138                                         nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
15139                                                                    rack->rc_tp,
15140                                                                    rack->rc_inp->inp_route.ro_nh->nh_ifp,
15141                                                                    rate_wanted,
15142                                                                    RS_PACING_GEQ,
15143                                                                    &err, &rack->r_ctl.crte_prev_rate);
15144                                         if (nrte == NULL) {
15145                                                 /* Lost the rate */
15146                                                 rack->rack_hdrw_pacing = 0;
15147                                                 rack->r_ctl.crte = NULL;
15148                                                 rack_log_hdwr_pacing(rack,
15149                                                                      rate_wanted, 0, __LINE__,
15150                                                                      err, 1);
15151                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15152                                                 counter_u64_add(rack_hw_pace_lost, 1);
15153                                         } else if (nrte != rack->r_ctl.crte) {
15154                                                 rack->r_ctl.crte = nrte;
15155                                                 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
15156                                                                                                          segsiz, 0,
15157                                                                                                          rack->r_ctl.crte,
15158                                                                                                          NULL);
15159                                                 rack_log_hdwr_pacing(rack,
15160                                                                      rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15161                                                                      err, 2);
15162                                                 rack->r_ctl.last_hw_bw_req = rate_wanted;
15163                                         }
15164                                 } else {
15165                                         /* We just need to adjust the segment size */
15166                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15167                                         rack_log_hdwr_pacing(rack,
15168                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15169                                                              0, 4);
15170                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15171                                 }
15172                         }
15173                 }
15174                 if ((rack->r_ctl.crte != NULL) &&
15175                     (rack->r_ctl.crte->rate == rate_wanted)) {
15176                         /*
15177                          * We need to add an extra delay if the rates
15178                          * are exactly matched. The idea is that
15179                          * we want the software to make sure the
15180                          * queue is empty before adding more; this
15181                          * gives us N MSS extra pace times, where
15182                          * N is our sysctl.
15183                          */
15184                         slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
15185                 }
15186 done_w_hdwr:
15187                 if (rack_limit_time_with_srtt &&
15188                     (rack->use_fixed_rate == 0) &&
15189 #ifdef NETFLIX_PEAKRATE
15190                     (rack->rc_tp->t_maxpeakrate == 0) &&
15191 #endif
15192                     (rack->rack_hdrw_pacing == 0)) {
15193                         /*
15194                          * Sanity check, we do not allow the pacing delay
15195                          * to be longer than the SRTT of the path. If it is
15196                          * a slow path, then adding a packet should increase
15197                          * the RTT and compensate for this i.e. the srtt will
15198                          * be greater so the allowed pacing time will be greater.
15199                          *
15200                          * Note this restriction does not apply where a peak rate
15201                          * is set, or we are doing fixed pacing or hardware pacing.
15202                          */
15203                         if (rack->rc_tp->t_srtt)
15204                                 srtt = rack->rc_tp->t_srtt;
15205                         else
15206                         srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;    /* it's in ms, convert */
15207                         if (srtt < (uint64_t)slot) {
15208                                 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
15209                                 slot = srtt;
15210                         }
15211                 }
15212                 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
15213         }
15214         if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
15215                 /*
15216                  * If this rate is seeing enobufs when it
15217                  * goes to send then either the NIC is out
15218                  * of gas or we are mis-estimating the time
15219                  * somehow and not letting the queue empty
15220                  * completely. Let's add to the pacing time.
15221                  */
15222                 int hw_boost_delay;
15223
15224                 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
15225                 if (hw_boost_delay > rack_enobuf_hw_max)
15226                         hw_boost_delay = rack_enobuf_hw_max;
15227                 else if (hw_boost_delay < rack_enobuf_hw_min)
15228                         hw_boost_delay = rack_enobuf_hw_min;
15229                 slot += hw_boost_delay;
15230         }
15231         if (slot)
15232                 counter_u64_add(rack_calc_nonzero, 1);
15233         else
15234                 counter_u64_add(rack_calc_zero, 1);
15235         return (slot);
15236 }
15237
15238 static void
15239 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
15240     tcp_seq startseq, uint32_t sb_offset)
15241 {
15242         struct rack_sendmap *my_rsm = NULL;
15243         struct rack_sendmap fe;
15244
15245         if (tp->t_state < TCPS_ESTABLISHED) {
15246                 /*
15247                  * We don't start any measurements if we are
15248                  * not at least established.
15249                  */
15250                 return;
15251         }
15252         if (tp->t_state >= TCPS_FIN_WAIT_1) {
15253                 /*
15254                  * We will get no more data into the SB;
15255                  * this means we need to have the data available
15256                  * before we start a measurement.
15257                  */
15258
15259                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) <
15260                     max(rc_init_window(rack),
15261                         (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
15262                         /* Nope not enough data */
15263                         return;
15264                 }
15265         }
15266         tp->t_flags |= TF_GPUTINPROG;
15267         rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
15268         rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
15269         tp->gput_seq = startseq;
15270         rack->app_limited_needs_set = 0;
15271         if (rack->in_probe_rtt)
15272                 rack->measure_saw_probe_rtt = 1;
15273         else if ((rack->measure_saw_probe_rtt) &&
15274                  (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
15275                 rack->measure_saw_probe_rtt = 0;
15276         if (rack->rc_gp_filled)
15277                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
15278         else {
15279                 /* Special case initial measurement */
15280                 struct timeval tv;
15281
15282                 tp->gput_ts = tcp_get_usecs(&tv);
15283                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15284         }
15285         /*
15286          * We take a guess out into the future:
15287          * if we have no measurement and no
15288          * initial rate, we measure the first
15289          * initial-window's worth of data to
15290          * speed up getting some GP measurement and
15291          * thus start pacing.
15292          */
15293         if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
15294                 rack->app_limited_needs_set = 1;
15295                 tp->gput_ack = startseq + max(rc_init_window(rack),
15296                                               (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
15297                 rack_log_pacing_delay_calc(rack,
15298                                            tp->gput_seq,
15299                                            tp->gput_ack,
15300                                            0,
15301                                            tp->gput_ts,
15302                                            rack->r_ctl.rc_app_limited_cnt,
15303                                            9,
15304                                            __LINE__, NULL, 0);
15305                 return;
15306         }
15307         if (sb_offset) {
15308                 /*
15309                  * We are out somewhere in the sb;
15310                  * can we use the already outstanding data?
15311                  */
15312                 if (rack->r_ctl.rc_app_limited_cnt == 0) {
15313                         /*
15314                          * Yes first one is good and in this case
15315                          * the tp->gput_ts is correctly set based on
15316                          * the last ack that arrived (no need to
15317                          * set things up when an ack comes in).
15318                          */
15319                         my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
15320                         if ((my_rsm == NULL) ||
15321                             (my_rsm->r_rtr_cnt != 1)) {
15322                                 /* retransmission? */
15323                                 goto use_latest;
15324                         }
15325                 } else {
15326                         if (rack->r_ctl.rc_first_appl == NULL) {
15327                                 /*
15328                                  * If rc_first_appl is NULL
15329                                  * then the cnt should be 0.
15330                                  * This is probably an error, maybe
15331                                  * a KASSERT would be appropriate.
15332                                  */
15333                                 goto use_latest;
15334                         }
15335                         /*
15336                          * If we have a marker pointer to the last one that is
15337                          * app limited we can use that, but we need to set
15338                          * things up so that when it gets ack'ed we record
15339                          * the ack time (if it's not already acked).
15340                          */
15341                         rack->app_limited_needs_set = 1;
15342                         /*
15343                          * We want to get to the rsm that is either
15344                          * next with space i.e. over 1 MSS or the one
15345                          * after that (after the app-limited).
15346                          */
15347                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15348                                          rack->r_ctl.rc_first_appl);
15349                         if (my_rsm) {
15350                                 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
15351                                         /* Have to use the next one */
15352                                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15353                                                          my_rsm);
15354                                 else {
15355                                         /* Use after the first MSS of it is acked */
15356                                         tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
15357                                         goto start_set;
15358                                 }
15359                         }
15360                         if ((my_rsm == NULL) ||
15361                             (my_rsm->r_rtr_cnt != 1)) {
15362                                 /*
15363                                  * Either it's a retransmit or
15364                                  * the last is the app-limited one.
15365                                  */
15366                                 goto use_latest;
15367                         }
15368                 }
15369                 tp->gput_seq = my_rsm->r_start;
15370 start_set:
15371                 if (my_rsm->r_flags & RACK_ACKED) {
15372                         /*
15373                          * This one has been acked use the arrival ack time
15374                          */
15375                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15376                         rack->app_limited_needs_set = 0;
15377                 }
15378                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15379                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
15380                 rack_log_pacing_delay_calc(rack,
15381                                            tp->gput_seq,
15382                                            tp->gput_ack,
15383                                            (uint64_t)my_rsm,
15384                                            tp->gput_ts,
15385                                            rack->r_ctl.rc_app_limited_cnt,
15386                                            9,
15387                                            __LINE__, NULL, 0);
15388                 return;
15389         }
15390
15391 use_latest:
15392         /*
15393          * We don't know how long we may have been
15394          * idle or if this is the first send. Let's
15395          * set up the flag so we will trim off
15396          * the first ack'd data so we get a true
15397          * measurement.
15398          */
15399         rack->app_limited_needs_set = 1;
15400         tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
15401         /* Find this guy so we can pull the send time */
15402         fe.r_start = startseq;
15403         my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
15404         if (my_rsm) {
15405                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15406                 if (my_rsm->r_flags & RACK_ACKED) {
15407                         /*
15408                          * Unlikely since it's probably what was
15409                          * just transmitted (but I am paranoid).
15410                          */
15411                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15412                         rack->app_limited_needs_set = 0;
15413                 }
15414                 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
15415                         /* This also is unlikely */
15416                         tp->gput_seq = my_rsm->r_start;
15417                 }
15418         } else {
15419                 /*
15420                  * TSNH unless we have some send-map limit,
15421                  * and even at that it should not be hitting
15422                  * that limit (we should have stopped sending).
15423                  */
15424                 struct timeval tv;
15425
15426                 microuptime(&tv);
15427                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15428         }
15429         rack_log_pacing_delay_calc(rack,
15430                                    tp->gput_seq,
15431                                    tp->gput_ack,
15432                                    (uint64_t)my_rsm,
15433                                    tp->gput_ts,
15434                                    rack->r_ctl.rc_app_limited_cnt,
15435                                    9, __LINE__, NULL, 0);
15436 }
15437
15438 static inline uint32_t
15439 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
15440     uint32_t avail, int32_t sb_offset)
15441 {
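              /*
               * Compute how much new data we may send: limited by the smaller
               * of cwnd and the peer's receive window, less what is already in
               * flight, and by how much data is actually left in the socket
               * buffer beyond sb_offset.
               */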
15442         uint32_t len;
15443         uint32_t sendwin;
15444
15445         if (tp->snd_wnd > cwnd_to_use)
15446                 sendwin = cwnd_to_use;
15447         else
15448                 sendwin = tp->snd_wnd;
15449         if (ctf_outstanding(tp) >= tp->snd_wnd) {
15450                 /* We never want to go over our peer's rcv-window */
15451                 len = 0;
15452         } else {
15453                 uint32_t flight;
15454
15455                 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
15456                 if (flight >= sendwin) {
15457                         /*
15458                          * We have in flight what we are allowed by cwnd (if
15459                          * it was rwnd blocking, it would have hit the
15460                          * >= tp->snd_wnd check above).
15461                          */
15462                         return (0);
15463                 }
15464                 len = sendwin - flight;
15465                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
15466                         /* We would send too much (beyond the rwnd) */
15467                         len = tp->snd_wnd - ctf_outstanding(tp);
15468                 }
15469                 if ((len + sb_offset) > avail) {
15470                         /*
15471                          * We don't have that much in the SB, how much is
15472                          * there?
15473                          */
15474                         len = avail - sb_offset;
15475                 }
15476         }
15477         return (len);
15478 }
15479
15480 static void
15481 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
15482              unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
15483              int rsm_is_null, int optlen, int line, uint16_t mode)
15484 {
15485         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
15486                 union tcp_log_stackspecific log;
15487                 struct timeval tv;
15488
15489                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15490                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
15491                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
15492                 log.u_bbr.flex1 = error;
15493                 log.u_bbr.flex2 = flags;
15494                 log.u_bbr.flex3 = rsm_is_null;
15495                 log.u_bbr.flex4 = ipoptlen;
15496                 log.u_bbr.flex5 = tp->rcv_numsacks;
15497                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
15498                 log.u_bbr.flex7 = optlen;
15499                 log.u_bbr.flex8 = rack->r_fsb_inited;
15500                 log.u_bbr.applimited = rack->r_fast_output;
15501                 log.u_bbr.bw_inuse = rack_get_bw(rack);
15502                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
15503                 log.u_bbr.cwnd_gain = mode;
15504                 log.u_bbr.pkts_out = orig_len;
15505                 log.u_bbr.lt_epoch = len;
15506                 log.u_bbr.delivered = line;
15507                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15508                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15509                 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
15510                                len, &log, false, NULL, NULL, 0, &tv);
15511         }
15512 }
15513
15514
15515 static struct mbuf *
15516 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
15517                    struct rack_fast_send_blk *fsb,
15518                    int32_t seglimit, int32_t segsize, int hw_tls)
15519 {
15520 #ifdef KERN_TLS
15521         struct ktls_session *tls, *ntls;
15522         struct mbuf *start;
15523 #endif
15524         struct mbuf *m, *n, **np, *smb;
15525         struct mbuf *top;
15526         int32_t off, soff;
15527         int32_t len = *plen;
15528         int32_t fragsize;
15529         int32_t len_cp = 0;
15530         uint32_t mlen, frags;
15531
15532         soff = off = the_off;
15533         smb = m = the_m;
15534         np = &top;
15535         top = NULL;
15536 #ifdef KERN_TLS
15537         if (hw_tls && (m->m_flags & M_EXTPG))
15538                 tls = m->m_epg_tls;
15539         else
15540                 tls = NULL;
15541         start = m;
15542 #endif
15543         while (len > 0) {
15544                 if (m == NULL) {
15545                         *plen = len_cp;
15546                         break;
15547                 }
15548 #ifdef KERN_TLS
15549                 if (hw_tls) {
15550                         if (m->m_flags & M_EXTPG)
15551                                 ntls = m->m_epg_tls;
15552                         else
15553                                 ntls = NULL;
15554
15555                         /*
15556                          * Avoid mixing TLS records with handshake
15557                          * data or TLS records from different
15558                          * sessions.
15559                          */
15560                         if (tls != ntls) {
15561                                 MPASS(m != start);
15562                                 *plen = len_cp;
15563                                 break;
15564                         }
15565                 }
15566 #endif
15567                 mlen = min(len, m->m_len - off);
15568                 if (seglimit) {
15569                         /*
15570                          * For M_EXTPG mbufs, add 3 segments
15571                          * + 1 in case we are crossing page boundaries
15572                          * + 2 in case the TLS hdr/trailer are used
15573                          * It is cheaper to just add the segments
15574                          * than it is to take the cache miss to look
15575                          * at the mbuf ext_pgs state in detail.
15576                          */
15577                         if (m->m_flags & M_EXTPG) {
15578                                 fragsize = min(segsize, PAGE_SIZE);
15579                                 frags = 3;
15580                         } else {
15581                                 fragsize = segsize;
15582                                 frags = 0;
15583                         }
15584
15585                         /* Break if we really can't fit anymore. */
15586                         if ((frags + 1) >= seglimit) {
15587                                 *plen = len_cp;
15588                                 break;
15589                         }
15590
15591                         /*
15592                          * Reduce size if you can't copy the whole
15593                          * mbuf. If we can't copy the whole mbuf, also
15594                          * adjust len so the loop will end after this
15595                          * mbuf.
15596                          */
15597                         if ((frags + howmany(mlen, fragsize)) >= seglimit) {
15598                                 mlen = (seglimit - frags - 1) * fragsize;
15599                                 len = mlen;
15600                                 *plen = len_cp + len;
15601                         }
15602                         frags += howmany(mlen, fragsize);
15603                         if (frags == 0)
15604                                 frags++;
15605                         seglimit -= frags;
15606                         KASSERT(seglimit > 0,
15607                             ("%s: seglimit went too low", __func__));
15608                 }
15609                 n = m_get(M_NOWAIT, m->m_type);
15610                 *np = n;
15611                 if (n == NULL)
15612                         goto nospace;
15613                 n->m_len = mlen;
15614                 soff += mlen;
15615                 len_cp += n->m_len;
15616                 if (m->m_flags & (M_EXT|M_EXTPG)) {
15617                         n->m_data = m->m_data + off;
15618                         mb_dupcl(n, m);
15619                 } else {
15620                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
15621                             (u_int)n->m_len);
15622                 }
15623                 len -= n->m_len;
15624                 off = 0;
15625                 m = m->m_next;
15626                 np = &n->m_next;
15627                 if (len || (soff == smb->m_len)) {
15628                         /*
15629                          * We have more, so we move forward, or
15630                          * we have consumed the entire mbuf and
15631                          * len has fallen to 0.
15632                          */
15633                         soff = 0;
15634                         smb = m;
15635                 }
15636
15637         }
15638         if (fsb != NULL) {
15639                 fsb->m = smb;
15640                 fsb->off = soff;
15641                 if (smb) {
15642                         /*
15643                          * Save off the size of the mbuf. We do
15644                          * this so that we can recognize when it
15645                          * has been trimmed by sbcut() as acks
15646                          * come in.
15647                          */
15648                         fsb->o_m_len = smb->m_len;
15649                 } else {
15650                         /*
15651                          * This is the case where the next mbuf went to NULL. This
15652                          * means with this copy we have sent everything in the sb.
15653                          * In theory we could clear the fast_output flag, but let's
15654                          * not, since it's possible that we could get more added
15655                          * and acks that call the extend function which would let
15656                          * us send more.
15657                          */
15658                         fsb->o_m_len = 0;
15659                 }
15660         }
15661         return (top);
15662 nospace:
15663         if (top)
15664                 m_freem(top);
15665         return (NULL);
15666
15667 }
15668
15669 /*
15670  * This is a copy of m_copym(), taking the TSO segment size/limit
15671  * constraints into account, and advancing the sndptr as it goes.
15672  */
15673 static struct mbuf *
15674 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
15675                 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
15676 {
15677         struct mbuf *m, *n;
15678         int32_t soff;
15679
15680         soff = rack->r_ctl.fsb.off;
15681         m = rack->r_ctl.fsb.m;
15682         if (rack->r_ctl.fsb.o_m_len > m->m_len) {
15683                 /*
15684                  * The mbuf had the front of it chopped off by an ack;
15685                  * we need to adjust the soff/off by that difference.
15686                  */
15687                 uint32_t delta;
15688
15689                 delta = rack->r_ctl.fsb.o_m_len - m->m_len;
15690                 soff -= delta;
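                      /*
                       * Illustrative example (hypothetical values): if o_m_len was
                       * 4,096 and 1,024 bytes were acked and trimmed by sbcut(),
                       * m->m_len is now 3,072, so soff is pulled back by
                       * delta = 1,024 and still points at the same unsent byte.
                       */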
15691         } else if (rack->r_ctl.fsb.o_m_len < m->m_len) {
15692                 /*
15693                  * The mbuf was expanded probably by
15694                  * a m_compress. Just update o_m_len.
15695                  */
15696                 rack->r_ctl.fsb.o_m_len = m->m_len;
15697         }
15698         KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
15699         KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
15700         KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
15701                                  __FUNCTION__,
15702                                  rack, *plen, m, m->m_len));
15703         /* Save off the right location before we copy and advance */
15704         *s_soff = soff;
15705         *s_mb = rack->r_ctl.fsb.m;
15706         n = rack_fo_base_copym(m, soff, plen,
15707                                &rack->r_ctl.fsb,
15708                                seglimit, segsize, rack->r_ctl.fsb.hw_tls);
15709         return (n);
15710 }
15711
15712 static int
15713 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
15714                      uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
15715 {
15716         /*
15717          * Enter the fast retransmit path. We are given that a sched_pin is
15718          * in place (if accounting is compiled in) and the cycle count taken
15719          * at the entry is in ts_val. The concept here is that the rsm
15720          * now holds the mbuf offsets and such so we can directly transmit
15721          * without a lot of overhead; the len field is already set for
15722          * us to prohibit us from sending too much (usually it's 1 MSS).
15723          */
15724         struct ip *ip = NULL;
15725         struct udphdr *udp = NULL;
15726         struct tcphdr *th = NULL;
15727         struct mbuf *m = NULL;
15728         struct inpcb *inp;
15729         uint8_t *cpto;
15730         struct tcp_log_buffer *lgb;
15731 #ifdef TCP_ACCOUNTING
15732         uint64_t crtsc;
15733         int cnt_thru = 1;
15734 #endif
15735         struct tcpopt to;
15736         u_char opt[TCP_MAXOLEN];
15737         uint32_t hdrlen, optlen;
15738         int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0;
15739         uint32_t us_cts;
15740         uint32_t if_hw_tsomaxsegcount = 0, startseq;
15741         uint32_t if_hw_tsomaxsegsize;
15742
15743 #ifdef INET6
15744         struct ip6_hdr *ip6 = NULL;
15745
15746         if (rack->r_is_v6) {
15747                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
15748                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
15749         } else
15750 #endif                          /* INET6 */
15751         {
15752                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
15753                 hdrlen = sizeof(struct tcpiphdr);
15754         }
15755         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
15756                 goto failed;
15757         }
15758         if (doing_tlp) {
15759                 /* It's a TLP; add the flag. It may already be there, but be sure. */
15760                 rsm->r_flags |= RACK_TLP;
15761         } else {
15762                 /* If it was a TLP, it is not on this retransmit */
15763                 rsm->r_flags &= ~RACK_TLP;
15764         }
15765         startseq = rsm->r_start;
15766         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
15767         inp = rack->rc_inp;
15768         to.to_flags = 0;
15769         flags = tcp_outflags[tp->t_state];
15770         if (flags & (TH_SYN|TH_RST)) {
15771                 goto failed;
15772         }
15773         if (rsm->r_flags & RACK_HAS_FIN) {
15774                 /* We can't send a FIN here */
15775                 goto failed;
15776         }
15777         if (flags & TH_FIN) {
15778                 /* We never send a FIN */
15779                 flags &= ~TH_FIN;
15780         }
15781         if (tp->t_flags & TF_RCVD_TSTMP) {
15782                 to.to_tsval = ms_cts + tp->ts_offset;
15783                 to.to_tsecr = tp->ts_recent;
15784                 to.to_flags = TOF_TS;
15785         }
15786         optlen = tcp_addoptions(&to, opt);
15787         hdrlen += optlen;
15788         udp = rack->r_ctl.fsb.udp;
15789         if (udp)
15790                 hdrlen += sizeof(struct udphdr);
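              /*
               * max_val caps how much we may emit in this single transmit: the
               * pacer's burst size if one is set, else the user-configured
               * segment cap, else the requested len.
               */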
15791         if (rack->r_ctl.rc_pace_max_segs)
15792                 max_val = rack->r_ctl.rc_pace_max_segs;
15793         else if (rack->rc_user_set_max_segs)
15794                 max_val = rack->rc_user_set_max_segs * segsiz;
15795         else
15796                 max_val = len;
15797         if ((tp->t_flags & TF_TSO) &&
15798             V_tcp_do_tso &&
15799             (len > segsiz) &&
15800             (tp->t_port == 0))
15801                 tso = 1;
15802 #ifdef INET6
15803         if (MHLEN < hdrlen + max_linkhdr)
15804                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
15805         else
15806 #endif
15807                 m = m_gethdr(M_NOWAIT, MT_DATA);
15808         if (m == NULL)
15809                 goto failed;
15810         m->m_data += max_linkhdr;
15811         m->m_len = hdrlen;
15812         th = rack->r_ctl.fsb.th;
15813         /* Establish the len to send */
15814         if (len > max_val)
15815                 len = max_val;
15816         if ((tso) && (len + optlen > tp->t_maxseg)) {
15817                 uint32_t if_hw_tsomax;
15818                 int32_t max_len;
15819
15820                 /* extract TSO information */
15821                 if_hw_tsomax = tp->t_tsomax;
15822                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
15823                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
15824                 /*
15825                  * Check if we should limit by maximum payload
15826                  * length:
15827                  */
15828                 if (if_hw_tsomax != 0) {
15829                         /* compute maximum TSO length */
15830                         max_len = (if_hw_tsomax - hdrlen -
15831                                    max_linkhdr);
15832                         if (max_len <= 0) {
15833                                 goto failed;
15834                         } else if (len > max_len) {
15835                                 len = max_len;
15836                         }
15837                 }
15838                 if (len <= segsiz) {
15839                         /*
15840                          * In case there are too many small fragments don't
15841                          * use TSO:
15842                          */
15843                         tso = 0;
15844                 }
15845         } else {
15846                 tso = 0;
15847         }
15848         if ((tso == 0) && (len > segsiz))
15849                 len = segsiz;
15850         us_cts = tcp_get_usecs(tv);
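              /*
               * Zero-length sends, or ones small enough to have fit in the
               * header mbuf, are not handled by this fast path; bail out and
               * the caller is expected to fall back to the regular output path.
               */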
15851         if ((len == 0) ||
15852             (len <= MHLEN - hdrlen - max_linkhdr)) {
15853                 goto failed;
15854         }
15855         th->th_seq = htonl(rsm->r_start);
15856         th->th_ack = htonl(tp->rcv_nxt);
15857         /*
15858          * The PUSH bit should only be applied
15859          * if the full retransmission is made. If
15860          * we are sending less than that, this is the
15861          * left-hand edge and it should not have
15862          * the PUSH bit.
15863          */
15864         if ((rsm->r_flags & RACK_HAD_PUSH) &&
15865             (len == (rsm->r_end - rsm->r_start)))
15866                 flags |= TH_PUSH;
15867         th->th_flags = flags;
15868         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
15869         if (th->th_win == 0) {
15870                 tp->t_sndzerowin++;
15871                 tp->t_flags |= TF_RXWIN0SENT;
15872         } else
15873                 tp->t_flags &= ~TF_RXWIN0SENT;
15874         if (rsm->r_flags & RACK_TLP) {
15875                 /*
15876                  * A TLP should not count in the retransmit count, but
15877                  * in its own bin
15878                  */
15879                 counter_u64_add(rack_tlp_retran, 1);
15880                 counter_u64_add(rack_tlp_retran_bytes, len);
15881         } else {
15882                 tp->t_sndrexmitpack++;
15883                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
15884                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
15885         }
15886 #ifdef STATS
15887         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
15888                                  len);
15889 #endif
15890         if (rsm->m == NULL)
15891                 goto failed;
15892         if (rsm->orig_m_len != rsm->m->m_len) {
15893                 /* Fix up the orig_m_len and possibly the mbuf offset */
15894                 rack_adjust_orig_mlen(rsm);
15895         }
15896         m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
15897         if (len <= segsiz) {
15898                 /*
15899                  * Must have run out of mbufs for the copy;
15900                  * shorten it so we no longer need TSO. Let's
15901                  * not put on sendalot since we are low on
15902                  * mbufs.
15903                  */
15904                 tso = 0;
15905         }
15906         if ((m->m_next == NULL) || (len <= 0)){
15907                 goto failed;
15908         }
15909         if (udp) {
15910                 if (rack->r_is_v6)
15911                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
15912                 else
15913                         ulen = hdrlen + len - sizeof(struct ip);
15914                 udp->uh_ulen = htons(ulen);
15915         }
15916         m->m_pkthdr.rcvif = (struct ifnet *)0;
15917         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
15918 #ifdef INET6
15919         if (rack->r_is_v6) {
15920                 if (tp->t_port) {
15921                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
15922                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
15923                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
15924                         th->th_sum = htons(0);
15925                         UDPSTAT_INC(udps_opackets);
15926                 } else {
15927                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
15928                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
15929                         th->th_sum = in6_cksum_pseudo(ip6,
15930                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
15931                                                       0);
15932                 }
15933         }
15934 #endif
15935 #if defined(INET6) && defined(INET)
15936         else
15937 #endif
15938 #ifdef INET
15939         {
15940                 if (tp->t_port) {
15941                         m->m_pkthdr.csum_flags = CSUM_UDP;
15942                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
15943                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
15944                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
15945                         th->th_sum = htons(0);
15946                         UDPSTAT_INC(udps_opackets);
15947                 } else {
15948                         m->m_pkthdr.csum_flags = CSUM_TCP;
15949                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
15950                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
15951                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
15952                                                                         IPPROTO_TCP + len + optlen));
15953                 }
15954                 /* IP version must be set here for ipv4/ipv6 checking later */
15955                 KASSERT(ip->ip_v == IPVERSION,
15956                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
15957         }
15958 #endif
15959         if (tso) {
15960                 KASSERT(len > tp->t_maxseg - optlen,
15961                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
15962                 m->m_pkthdr.csum_flags |= CSUM_TSO;
15963                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
15964         }
15965 #ifdef INET6
15966         if (rack->r_is_v6) {
15967                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
15968                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
15969                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
15970                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
15971                 else
15972                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
15973         }
15974 #endif
15975 #if defined(INET) && defined(INET6)
15976         else
15977 #endif
15978 #ifdef INET
15979         {
15980                 ip->ip_len = htons(m->m_pkthdr.len);
15981                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
15982                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
15983                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
15984                         if (tp->t_port == 0 || len < V_tcp_minmss) {
15985                                 ip->ip_off |= htons(IP_DF);
15986                         }
15987                 } else {
15988                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
15989                 }
15990         }
15991 #endif
15992         /* Time to copy in our header */
15993         cpto = mtod(m, uint8_t *);
15994         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
15995         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
15996         if (optlen) {
15997                 bcopy(opt, th + 1, optlen);
15998                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
15999         } else {
16000                 th->th_off = sizeof(struct tcphdr) >> 2;
16001         }
16002         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
16003                 union tcp_log_stackspecific log;
16004
16005                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16006                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
16007                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
16008                 if (rack->rack_no_prr)
16009                         log.u_bbr.flex1 = 0;
16010                 else
16011                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16012                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
16013                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
16014                 log.u_bbr.flex4 = max_val;
16015                 log.u_bbr.flex5 = 0;
16016                 /* Save off the early/late values */
16017                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
16018                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
16019                 log.u_bbr.bw_inuse = rack_get_bw(rack);
16020                 if (doing_tlp == 0)
16021                         log.u_bbr.flex8 = 1;
16022                 else
16023                         log.u_bbr.flex8 = 2;
16024                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
16025                 log.u_bbr.flex7 = 55;
16026                 log.u_bbr.pkts_out = tp->t_maxseg;
16027                 log.u_bbr.timeStamp = cts;
16028                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16029                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
16030                 log.u_bbr.delivered = 0;
16031                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
16032                                      len, &log, false, NULL, NULL, 0, tv);
16033         } else
16034                 lgb = NULL;
16035 #ifdef INET6
16036         if (rack->r_is_v6) {
16037                 error = ip6_output(m, NULL,
16038                                    &inp->inp_route6,
16039                                    0, NULL, NULL, inp);
16040         }
16041 #endif
16042 #if defined(INET) && defined(INET6)
16043         else
16044 #endif
16045 #ifdef INET
16046         {
16047                 error = ip_output(m, NULL,
16048                                   &inp->inp_route,
16049                                   0, 0, inp);
16050         }
16051 #endif
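              /*
               * ip_output()/ip6_output() consume the mbuf chain whether or not
               * the send succeeded, so drop our reference to avoid a double
               * free in the failed path.
               */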
16052         m = NULL;
16053         if (lgb) {
16054                 lgb->tlb_errno = error;
16055                 lgb = NULL;
16056         }
16057         if (error) {
16058                 goto failed;
16059         }
16060         rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
16061                         rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls);
16062         if (doing_tlp && (rack->fast_rsm_hack == 0)) {
16063                 rack->rc_tlp_in_progress = 1;
16064                 rack->r_ctl.rc_tlp_cnt_out++;
16065         }
16066         if (error == 0) {
16067                 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
16068                 if (doing_tlp) {
16069                         rack->rc_last_sent_tlp_past_cumack = 0;
16070                         rack->rc_last_sent_tlp_seq_valid = 1;
16071                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
16072                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
16073                 }
16074         }
16075         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16076         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16077         if (IN_FASTRECOVERY(tp->t_flags) && rsm)
16078                 rack->r_ctl.retran_during_recovery += len;
16079         {
16080                 int idx;
16081
16082                 idx = (len / segsiz) + 3;
16083                 if (idx >= TCP_MSS_ACCT_ATIMER)
16084                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16085                 else
16086                         counter_u64_add(rack_out_size[idx], 1);
16087         }
16088         if (tp->t_rtttime == 0) {
16089                 tp->t_rtttime = ticks;
16090                 tp->t_rtseq = startseq;
16091                 KMOD_TCPSTAT_INC(tcps_segstimed);
16092         }
16093         counter_u64_add(rack_fto_rsm_send, 1);
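              /*
               * If the send hit ENOBUFS, back off: pace the next attempt out at
               * least 10ms, scaling with the number of consecutive ENOBUFS
               * events. Otherwise ask the pacer for the normal delay.
               */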
16094         if (error && (error == ENOBUFS)) {
16095                 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
16096                 if (rack->rc_enobuf < 0x7f)
16097                         rack->rc_enobuf++;
16098                 if (slot < (10 * HPTS_USEC_IN_MSEC))
16099                         slot = 10 * HPTS_USEC_IN_MSEC;
16100         } else
16101                 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
16102         if ((slot == 0) ||
16103             (rack->rc_always_pace == 0) ||
16104             (rack->r_rr_config == 1)) {
16105                 /*
16106                  * We have no pacing set, or we
16107                  * are using old-style rack, or
16108                  * we are overridden to use the old 1ms pacing.
16109                  */
16110                 slot = rack->r_ctl.rc_min_to;
16111         }
16112         rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
16113         if (rack->r_must_retran) {
16114                 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
16115                 if ((SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) ||
16116                     ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
16117                         /*
16118                          * We have retransmitted all we need. If
16119                          * RACK_MUST_RXT is not set then we must
16120                          * not retransmit this one.
16121                          */
16122                         rack->r_must_retran = 0;
16123                         rack->r_ctl.rc_out_at_rto = 0;
16124                         if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
16125                                 /* Not one we should rxt */
16126                                 goto failed;
16127                         } else {
16128                                 /* Clear the flag */
16129                                 rsm->r_flags &= ~RACK_MUST_RXT;
16130                         }
16131                 } else {
16132                         /* Remove the flag */
16133                         rsm->r_flags &= ~RACK_MUST_RXT;
16134                 }
16135         }
16136 #ifdef TCP_ACCOUNTING
16137         crtsc = get_cyclecount();
16138         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16139                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16140         }
16141         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16142         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16143                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16144         }
16145         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16146         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16147                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
16148         }
16149         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
16150         sched_unpin();
16151 #endif
16152         return (0);
16153 failed:
16154         if (m)
16155                 m_free(m);
16156         return (-1);
16157 }
16158
16159 static void
16160 rack_sndbuf_autoscale(struct tcp_rack *rack)
16161 {
16162         /*
16163          * Automatic sizing of send socket buffer.  Often the send buffer
16164          * size is not optimally adjusted to the actual network conditions
16165          * at hand (delay bandwidth product).  Setting the buffer size too
16166          * small limits throughput on links with high bandwidth and high
16167          * delay (e.g. trans-continental/oceanic links).  Setting the
16168          * buffer size too big consumes too much real kernel memory,
16169          * especially with many connections on busy servers.
16170          *
16171          * The criteria to step up the send buffer one notch are:
16172          *  1. receive window of remote host is larger than send buffer
16173          *     (with a fudge factor of 5/4th);
16174          *  2. send buffer is filled to 7/8th with data (so we actually
16175          *     have data to make use of it);
16176          *  3. send buffer fill has not hit maximal automatic size;
16177          *  4. our send window (slow start and congestion controlled) is
16178          *     larger than sent but unacknowledged data in send buffer.
16179          *
16180          * Note that the rack version moves things much faster since
16181          * we want to avoid hitting cache lines in the rack_fast_output()
16182          * path so this is called much less often and thus moves
16183          * the SB forward by a percentage.
16184          */
16185         struct socket *so;
16186         struct tcpcb *tp;
16187         uint32_t sendwin, scaleup;
16188
16189         tp = rack->rc_tp;
16190         so = rack->rc_inp->inp_socket;
16191         sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
16192         if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
16193                 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
16194                     sbused(&so->so_snd) >=
16195                     (so->so_snd.sb_hiwat / 8 * 7) &&
16196                     sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
16197                     sendwin >= (sbused(&so->so_snd) -
16198                     (tp->snd_nxt - tp->snd_una))) {
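                              /*
                               * Grow the buffer by either the rack percentage of
                               * the current high water mark or at least the
                               * stack-wide increment, clamping the result at the
                               * auto-sizing maximum.
                               */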
16199                         if (rack_autosndbuf_inc)
16200                                 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
16201                         else
16202                                 scaleup = V_tcp_autosndbuf_inc;
16203                         if (scaleup < V_tcp_autosndbuf_inc)
16204                                 scaleup = V_tcp_autosndbuf_inc;
16205                         scaleup += so->so_snd.sb_hiwat;
16206                         if (scaleup > V_tcp_autosndbuf_max)
16207                                 scaleup = V_tcp_autosndbuf_max;
16208                         if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread))
16209                                 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
16210                 }
16211         }
16212 }
16213
16214 static int
16215 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
16216                  uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
16217 {
16218         /*
16219          * Enter to do fast output. We are given that the sched_pin is
16220          * in place (if accounting is compiled in) and the cycle count taken
16221          * at entry is in ts_val. The idea here is that
16222          * we know how many more bytes need to be sent (presumably either
16223          * during pacing or to fill the cwnd, and that was greater than
16224          * the max-burst). We have how much to send and all the info we
16225          * need to just send.
16226          */
16227         struct ip *ip = NULL;
16228         struct udphdr *udp = NULL;
16229         struct tcphdr *th = NULL;
16230         struct mbuf *m, *s_mb;
16231         struct inpcb *inp;
16232         uint8_t *cpto;
16233         struct tcp_log_buffer *lgb;
16234 #ifdef TCP_ACCOUNTING
16235         uint64_t crtsc;
16236 #endif
16237         struct tcpopt to;
16238         u_char opt[TCP_MAXOLEN];
16239         uint32_t hdrlen, optlen;
16240         int cnt_thru = 1;
16241         int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0;
16242         uint32_t us_cts, s_soff;
16243         uint32_t if_hw_tsomaxsegcount = 0, startseq;
16244         uint32_t if_hw_tsomaxsegsize;
16245         uint16_t add_flag = RACK_SENT_FP;
16246 #ifdef INET6
16247         struct ip6_hdr *ip6 = NULL;
16248
16249         if (rack->r_is_v6) {
16250                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
16251                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
16252         } else
16253 #endif                          /* INET6 */
16254         {
16255                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
16256                 hdrlen = sizeof(struct tcpiphdr);
16257         }
16258         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
16259                 m = NULL;
16260                 goto failed;
16261         }
16262         startseq = tp->snd_max;
16263         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
16264         inp = rack->rc_inp;
16265         len = rack->r_ctl.fsb.left_to_send;
16266         to.to_flags = 0;
16267         flags = rack->r_ctl.fsb.tcp_flags;
16268         if (tp->t_flags & TF_RCVD_TSTMP) {
16269                 to.to_tsval = ms_cts + tp->ts_offset;
16270                 to.to_tsecr = tp->ts_recent;
16271                 to.to_flags = TOF_TS;
16272         }
16273         optlen = tcp_addoptions(&to, opt);
16274         hdrlen += optlen;
16275         udp = rack->r_ctl.fsb.udp;
16276         if (udp)
16277                 hdrlen += sizeof(struct udphdr);
16278         if (rack->r_ctl.rc_pace_max_segs)
16279                 max_val = rack->r_ctl.rc_pace_max_segs;
16280         else if (rack->rc_user_set_max_segs)
16281                 max_val = rack->rc_user_set_max_segs * segsiz;
16282         else
16283                 max_val = len;
16284         if ((tp->t_flags & TF_TSO) &&
16285             V_tcp_do_tso &&
16286             (len > segsiz) &&
16287             (tp->t_port == 0))
16288                 tso = 1;
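              /*
               * When not doing TSO we may loop back to "again" below and emit
               * several segments in one call, up to the pacing burst limit.
               */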
16289 again:
16290 #ifdef INET6
16291         if (MHLEN < hdrlen + max_linkhdr)
16292                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
16293         else
16294 #endif
16295                 m = m_gethdr(M_NOWAIT, MT_DATA);
16296         if (m == NULL)
16297                 goto failed;
16298         m->m_data += max_linkhdr;
16299         m->m_len = hdrlen;
16300         th = rack->r_ctl.fsb.th;
16301         /* Establish the len to send */
16302         if (len > max_val)
16303                 len = max_val;
16304         if ((tso) && (len + optlen > tp->t_maxseg)) {
16305                 uint32_t if_hw_tsomax;
16306                 int32_t max_len;
16307
16308                 /* extract TSO information */
16309                 if_hw_tsomax = tp->t_tsomax;
16310                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
16311                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
16312                 /*
16313                  * Check if we should limit by maximum payload
16314                  * length:
16315                  */
16316                 if (if_hw_tsomax != 0) {
16317                         /* compute maximum TSO length */
16318                         max_len = (if_hw_tsomax - hdrlen -
16319                                    max_linkhdr);
16320                         if (max_len <= 0) {
16321                                 goto failed;
16322                         } else if (len > max_len) {
16323                                 len = max_len;
16324                         }
16325                 }
16326                 if (len <= segsiz) {
16327                         /*
16328                          * In case there are too many small fragments don't
16329                          * use TSO:
16330                          */
16331                         tso = 0;
16332                 }
16333         } else {
16334                 tso = 0;
16335         }
16336         if ((tso == 0) && (len > segsiz))
16337                 len = segsiz;
16338         us_cts = tcp_get_usecs(tv);
16339         if ((len == 0) ||
16340             (len <= MHLEN - hdrlen - max_linkhdr)) {
16341                 goto failed;
16342         }
16343         sb_offset = tp->snd_max - tp->snd_una;
16344         th->th_seq = htonl(tp->snd_max);
16345         th->th_ack = htonl(tp->rcv_nxt);
16346         th->th_flags = flags;
16347         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
16348         if (th->th_win == 0) {
16349                 tp->t_sndzerowin++;
16350                 tp->t_flags |= TF_RXWIN0SENT;
16351         } else
16352                 tp->t_flags &= ~TF_RXWIN0SENT;
16353         tp->snd_up = tp->snd_una;       /* drag it along, it's deprecated */
16354         KMOD_TCPSTAT_INC(tcps_sndpack);
16355         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
16356 #ifdef STATS
16357         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
16358                                  len);
16359 #endif
16360         if (rack->r_ctl.fsb.m == NULL)
16361                 goto failed;
16362
16363         /* s_mb and s_soff are saved for rack_log_output */
16364         m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
16365                                     &s_mb, &s_soff);
16366         if (len <= segsiz) {
16367                 /*
16368                  * Must have run out of mbufs for the copy;
16369                  * shorten it so we no longer need TSO. Let's
16370                  * not put on sendalot since we are low on
16371                  * mbufs.
16372                  */
16373                 tso = 0;
16374         }
16375         if (rack->r_ctl.fsb.rfo_apply_push &&
16376             (len == rack->r_ctl.fsb.left_to_send)) {
16377                 th->th_flags |= TH_PUSH;
16378                 add_flag |= RACK_HAD_PUSH;
16379         }
16380         if ((m->m_next == NULL) || (len <= 0)){
16381                 goto failed;
16382         }
16383         if (udp) {
16384                 if (rack->r_is_v6)
16385                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
16386                 else
16387                         ulen = hdrlen + len - sizeof(struct ip);
16388                 udp->uh_ulen = htons(ulen);
16389         }
16390         m->m_pkthdr.rcvif = (struct ifnet *)0;
16391         if (tp->t_state == TCPS_ESTABLISHED &&
16392             (tp->t_flags2 & TF2_ECN_PERMIT)) {
16393                 /*
16394                  * If the peer has ECN, mark data packets with ECN capable
16395                  * transmission (ECT). Ignore pure ack packets,
16396                  * retransmissions.
16397                  */
16398                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
16399 #ifdef INET6
16400                         if (rack->r_is_v6)
16401                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
16402                         else
16403 #endif
16404                                 ip->ip_tos |= IPTOS_ECN_ECT0;
16405                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
16406                         /*
16407                          * Reply with proper ECN notifications.
16408                          * Only set CWR on new data segments.
16409                          */
16410                         if (tp->t_flags2 & TF2_ECN_SND_CWR) {
16411                                 flags |= TH_CWR;
16412                                 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
16413                         }
16414                 }
16415                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
16416                         flags |= TH_ECE;
16417         }
16418         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
16419 #ifdef INET6
16420         if (rack->r_is_v6) {
16421                 if (tp->t_port) {
16422                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
16423                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16424                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
16425                         th->th_sum = htons(0);
16426                         UDPSTAT_INC(udps_opackets);
16427                 } else {
16428                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
16429                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16430                         th->th_sum = in6_cksum_pseudo(ip6,
16431                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
16432                                                       0);
16433                 }
16434         }
16435 #endif
16436 #if defined(INET6) && defined(INET)
16437         else
16438 #endif
16439 #ifdef INET
16440         {
16441                 if (tp->t_port) {
16442                         m->m_pkthdr.csum_flags = CSUM_UDP;
16443                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16444                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
16445                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
16446                         th->th_sum = htons(0);
16447                         UDPSTAT_INC(udps_opackets);
16448                 } else {
16449                         m->m_pkthdr.csum_flags = CSUM_TCP;
16450                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16451                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
16452                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
16453                                                                         IPPROTO_TCP + len + optlen));
16454                 }
16455                 /* IP version must be set here for ipv4/ipv6 checking later */
16456                 KASSERT(ip->ip_v == IPVERSION,
16457                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
16458         }
16459 #endif
16460         if (tso) {
16461                 KASSERT(len > tp->t_maxseg - optlen,
16462                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
16463                 m->m_pkthdr.csum_flags |= CSUM_TSO;
16464                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
16465         }
16466 #ifdef INET6
16467         if (rack->r_is_v6) {
16468                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
16469                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
16470                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
16471                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16472                 else
16473                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16474         }
16475 #endif
16476 #if defined(INET) && defined(INET6)
16477         else
16478 #endif
16479 #ifdef INET
16480         {
16481                 ip->ip_len = htons(m->m_pkthdr.len);
16482                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
16483                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
16484                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16485                         if (tp->t_port == 0 || len < V_tcp_minmss) {
16486                                 ip->ip_off |= htons(IP_DF);
16487                         }
16488                 } else {
16489                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16490                 }
16491         }
16492 #endif
16493         /* Time to copy in our header */
16494         cpto = mtod(m, uint8_t *);
16495         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
16496         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
16497         if (optlen) {
16498                 bcopy(opt, th + 1, optlen);
16499                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
16500         } else {
16501                 th->th_off = sizeof(struct tcphdr) >> 2;
16502         }
16503         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
16504                 union tcp_log_stackspecific log;
16505
16506                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16507                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
16508                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
16509                 if (rack->rack_no_prr)
16510                         log.u_bbr.flex1 = 0;
16511                 else
16512                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16513                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
16514                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
16515                 log.u_bbr.flex4 = max_val;
16516                 log.u_bbr.flex5 = 0;
16517                 /* Save off the early/late values */
16518                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
16519                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
16520                 log.u_bbr.bw_inuse = rack_get_bw(rack);
16521                 log.u_bbr.flex8 = 0;
16522                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
16523                 log.u_bbr.flex7 = 44;
16524                 log.u_bbr.pkts_out = tp->t_maxseg;
16525                 log.u_bbr.timeStamp = cts;
16526                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16527                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
16528                 log.u_bbr.delivered = 0;
16529                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
16530                                      len, &log, false, NULL, NULL, 0, tv);
16531         } else
16532                 lgb = NULL;
16533 #ifdef INET6
16534         if (rack->r_is_v6) {
16535                 error = ip6_output(m, NULL,
16536                                    &inp->inp_route6,
16537                                    0, NULL, NULL, inp);
16538         }
16539 #endif
16540 #if defined(INET) && defined(INET6)
16541         else
16542 #endif
16543 #ifdef INET
16544         {
16545                 error = ip_output(m, NULL,
16546                                   &inp->inp_route,
16547                                   0, 0, inp);
16548         }
16549 #endif
16550         if (lgb) {
16551                 lgb->tlb_errno = error;
16552                 lgb = NULL;
16553         }
16554         if (error) {
16555                 *send_err = error;
16556                 m = NULL;
16557                 goto failed;
16558         }
16559         rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
16560                         NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls);
16561         m = NULL;
16562         if (tp->snd_una == tp->snd_max) {
16563                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
16564                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
16565                 tp->t_acktime = ticks;
16566         }
16567         if (error == 0)
16568                 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
16569
16570         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16571         tot_len += len;
16572         if ((tp->t_flags & TF_GPUTINPROG) == 0)
16573                 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
16574         tp->snd_max += len;
16575         tp->snd_nxt = tp->snd_max;
16576         {
16577                 int idx;
16578
16579                 idx = (len / segsiz) + 3;
16580                 if (idx >= TCP_MSS_ACCT_ATIMER)
16581                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16582                 else
16583                         counter_u64_add(rack_out_size[idx], 1);
16584         }
16585         if (len <= rack->r_ctl.fsb.left_to_send)
16586                 rack->r_ctl.fsb.left_to_send -= len;
16587         else
16588                 rack->r_ctl.fsb.left_to_send = 0;
16589         if (rack->r_ctl.fsb.left_to_send < segsiz) {
16590                 rack->r_fast_output = 0;
16591                 rack->r_ctl.fsb.left_to_send = 0;
16592                 /* At the end of fast_output scale up the sb */
16593                 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
16594                 rack_sndbuf_autoscale(rack);
16595                 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
16596         }
16597         if (tp->t_rtttime == 0) {
16598                 tp->t_rtttime = ticks;
16599                 tp->t_rtseq = startseq;
16600                 KMOD_TCPSTAT_INC(tcps_segstimed);
16601         }
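              /*
               * If at least a full segment is still left to send, we have room
               * under max_val and we are not doing TSO, loop back and build
               * the next segment.
               */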
16602         if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
16603             (max_val > len) &&
16604             (tso == 0)) {
16605                 max_val -= len;
16606                 len = segsiz;
16607                 th = rack->r_ctl.fsb.th;
16608                 cnt_thru++;
16609                 goto again;
16610         }
16611         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16612         counter_u64_add(rack_fto_send, 1);
16613         slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
16614         rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
16615 #ifdef TCP_ACCOUNTING
16616         crtsc = get_cyclecount();
16617         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16618                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16619         }
16620         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16621         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16622                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16623         }
16624         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16625         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16626                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
16627         }
16628         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz));
16629         sched_unpin();
16630 #endif
16631         return (0);
16632 failed:
16633         if (m)
16634                 m_free(m);
16635         rack->r_fast_output = 0;
16636         return (-1);
16637 }
16638
16639 static int
16640 rack_output(struct tcpcb *tp)
16641 {
16642         struct socket *so;
16643         uint32_t recwin;
16644         uint32_t sb_offset, s_moff = 0;
16645         int32_t len, flags, error = 0;
16646         struct mbuf *m, *s_mb = NULL;
16647         struct mbuf *mb;
16648         uint32_t if_hw_tsomaxsegcount = 0;
16649         uint32_t if_hw_tsomaxsegsize;
16650         int32_t segsiz, minseg;
16651         long tot_len_this_send = 0;
16652 #ifdef INET
16653         struct ip *ip = NULL;
16654 #endif
16655 #ifdef TCPDEBUG
16656         struct ipovly *ipov = NULL;
16657 #endif
16658         struct udphdr *udp = NULL;
16659         struct tcp_rack *rack;
16660         struct tcphdr *th;
16661         uint8_t pass = 0;
16662         uint8_t mark = 0;
16663         uint8_t wanted_cookie = 0;
16664         u_char opt[TCP_MAXOLEN];
16665         unsigned ipoptlen, optlen, hdrlen, ulen=0;
16666         uint32_t rack_seq;
16667
16668 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
16669         unsigned ipsec_optlen = 0;
16670
16671 #endif
16672         int32_t idle, sendalot;
16673         int32_t sub_from_prr = 0;
16674         volatile int32_t sack_rxmit;
16675         struct rack_sendmap *rsm = NULL;
16676         int32_t tso, mtu;
16677         struct tcpopt to;
16678         int32_t slot = 0;
16679         int32_t sup_rack = 0;
16680         uint32_t cts, ms_cts, delayed, early;
16681         uint16_t add_flag = RACK_SENT_SP;
16682         /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
16683         uint8_t hpts_calling,  doing_tlp = 0;
16684         uint32_t cwnd_to_use, pace_max_seg;
16685         int32_t do_a_prefetch = 0;
16686         int32_t prefetch_rsm = 0;
16687         int32_t orig_len = 0;
16688         struct timeval tv;
16689         int32_t prefetch_so_done = 0;
16690         struct tcp_log_buffer *lgb;
16691         struct inpcb *inp;
16692         struct sockbuf *sb;
16693         uint64_t ts_val = 0;
16694 #ifdef TCP_ACCOUNTING
16695         uint64_t crtsc;
16696 #endif
16697 #ifdef INET6
16698         struct ip6_hdr *ip6 = NULL;
16699         int32_t isipv6;
16700 #endif
16701         uint8_t filled_all = 0;
16702         bool hw_tls = false;
16703
16704         /* setup and take the cache hits here */
16705         rack = (struct tcp_rack *)tp->t_fb_ptr;
16706 #ifdef TCP_ACCOUNTING
16707         sched_pin();
16708         ts_val = get_cyclecount();
16709 #endif
16710         hpts_calling = rack->rc_inp->inp_hpts_calls;
16711         NET_EPOCH_ASSERT();
16712         INP_WLOCK_ASSERT(rack->rc_inp);
16713 #ifdef TCP_OFFLOAD
16714         if (tp->t_flags & TF_TOE) {
16715 #ifdef TCP_ACCOUNTING
16716                 sched_unpin();
16717 #endif
16718                 return (tcp_offload_output(tp));
16719         }
16720 #endif
16721         /*
16722          * For TFO connections in SYN_RECEIVED, only allow the initial
16723          * SYN|ACK and those sent by the retransmit timer.
16724          */
16725         if (IS_FASTOPEN(tp->t_flags) &&
16726             (tp->t_state == TCPS_SYN_RECEIVED) &&
16727             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
16728             (rack->r_ctl.rc_resend == NULL)) {         /* not a retransmit */
16729 #ifdef TCP_ACCOUNTING
16730                 sched_unpin();
16731 #endif
16732                 return (0);
16733         }
16734 #ifdef INET6
16735         if (rack->r_state) {
16736                 /* Use the cache line loaded if possible */
16737                 isipv6 = rack->r_is_v6;
16738         } else {
16739                 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
16740         }
16741 #endif
16742         early = 0;
16743         cts = tcp_get_usecs(&tv);
16744         ms_cts = tcp_tv_to_mssectick(&tv);
16745         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
16746             rack->rc_inp->inp_in_hpts) {
16747                 /*
16748                  * We are on the hpts for some timer but not hptsi output.
16749                  * Remove from the hpts unconditionally.
16750                  */
16751                 rack_timer_cancel(tp, rack, cts, __LINE__);
16752         }
16753         /* Are we pacing and late? */
16754         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16755             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
16756                 /* We are delayed */
16757                 delayed = cts - rack->r_ctl.rc_last_output_to;
16758         } else {
16759                 delayed = 0;
16760         }
16761         /* Do the timers, which may override the pacer */
16762         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
16763                 if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) {
16764                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
16765 #ifdef TCP_ACCOUNTING
16766                         sched_unpin();
16767 #endif
16768                         return (0);
16769                 }
16770         }
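              /*
               * In persist mode just make sure some timer is running on the
               * hpts and return; output resumes when the timer fires.
               */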
16771         if (rack->rc_in_persist) {
16772                 if (rack->rc_inp->inp_in_hpts == 0) {
16773                         /* Timer is not running */
16774                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16775                 }
16776 #ifdef TCP_ACCOUNTING
16777                 sched_unpin();
16778 #endif
16779                 return (0);
16780         }
16781         if ((rack->r_timer_override) ||
16782             (rack->rc_ack_can_sendout_data) ||
16783             (delayed) ||
16784             (tp->t_state < TCPS_ESTABLISHED)) {
16785                 rack->rc_ack_can_sendout_data = 0;
16786                 if (rack->rc_inp->inp_in_hpts)
16787                         tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
16788         } else if (rack->rc_inp->inp_in_hpts) {
16789                 /*
16790                  * On the hpts you can't pass even if ACKNOW is on; we will
16791                  * send when the hpts fires.
16792                  */
16793 #ifdef TCP_ACCOUNTING
16794                 crtsc = get_cyclecount();
16795                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16796                         tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
16797                 }
16798                 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val));
16799                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16800                         tp->tcp_cnt_counters[SND_BLOCKED]++;
16801                 }
16802                 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1);
16803                 sched_unpin();
16804 #endif
16805                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
16806                 return (0);
16807         }
16808         rack->rc_inp->inp_hpts_calls = 0;
16809         /* Finish out both pacing early and late accounting */
16810         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16811             TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
16812                 early = rack->r_ctl.rc_last_output_to - cts;
16813         } else
16814                 early = 0;
16815         if (delayed) {
16816                 rack->r_ctl.rc_agg_delayed += delayed;
16817                 rack->r_late = 1;
16818         } else if (early) {
16819                 rack->r_ctl.rc_agg_early += early;
16820                 rack->r_early = 1;
16821         }
16822         /* Now that early/late accounting is done turn off the flag */
16823         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
16824         rack->r_wanted_output = 0;
16825         rack->r_timer_override = 0;
16826         if ((tp->t_state != rack->r_state) &&
16827             TCPS_HAVEESTABLISHED(tp->t_state)) {
16828                 rack_set_state(tp, rack);
16829         }
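              /*
               * Try the fast output path first; it is only usable when it has
               * been armed (r_fast_output), we are not sending a TLP and the
               * peer is not currently reporting SACK blocks.
               */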
16830         if ((rack->r_fast_output) &&
16831             (doing_tlp == 0) &&
16832             (tp->rcv_numsacks == 0)) {
16833                 int ret;
16834
16835                 error = 0;
16836                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
16837                 if (ret >= 0)
16838                         return(ret);
16839                 else if (error) {
16840                         inp = rack->rc_inp;
16841                         so = inp->inp_socket;
16842                         sb = &so->so_snd;
16843                         goto nomore;
16844                 }
16845         }
16846         inp = rack->rc_inp;
16847         /*
16848          * For TFO connections in SYN_SENT or SYN_RECEIVED,
16849          * only allow the initial SYN or SYN|ACK and those sent
16850          * by the retransmit timer.
16851          */
16852         if (IS_FASTOPEN(tp->t_flags) &&
16853             ((tp->t_state == TCPS_SYN_RECEIVED) ||
16854              (tp->t_state == TCPS_SYN_SENT)) &&
16855             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
16856             (tp->t_rxtshift == 0)) {              /* not a retransmit */
16857                 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
16858                 so = inp->inp_socket;
16859                 sb = &so->so_snd;
16860                 goto just_return_nolock;
16861         }
16862         /*
16863          * Determine length of data that should be transmitted, and flags
16864          * that will be used. If there is some data or critical controls
16865          * (SYN, RST) to send, then transmit; otherwise, investigate
16866          * further.
16867          */
16868         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
16869         if (tp->t_idle_reduce) {
16870                 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
16871                         rack_cc_after_idle(rack, tp);
16872         }
16873         tp->t_flags &= ~TF_LASTIDLE;
16874         if (idle) {
16875                 if (tp->t_flags & TF_MORETOCOME) {
16876                         tp->t_flags |= TF_LASTIDLE;
16877                         idle = 0;
16878                 }
16879         }
16880         if ((tp->snd_una == tp->snd_max) &&
16881             rack->r_ctl.rc_went_idle_time &&
16882             TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
16883                 idle = cts - rack->r_ctl.rc_went_idle_time;
16884                 if (idle > rack_min_probertt_hold) {
16885                         /* Count as a probe rtt */
16886                         if (rack->in_probe_rtt == 0) {
16887                                 rack->r_ctl.rc_lower_rtt_us_cts = cts;
16888                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
16889                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
16890                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
16891                         } else {
16892                                 rack_exit_probertt(rack, cts);
16893                         }
16894                 }
16895                 idle = 0;
16896         }
16897         if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED))
16898                 rack_init_fsb_block(tp, rack);
16899 again:
16900         /*
16901          * If we've recently taken a timeout, snd_max will be greater than
16902          * snd_nxt.  There may be SACK information that allows us to avoid
16903          * resending already delivered data.  Adjust snd_nxt accordingly.
16904          */
16905         sendalot = 0;
16906         cts = tcp_get_usecs(&tv);
16907         ms_cts = tcp_tv_to_mssectick(&tv);
16908         tso = 0;
16909         mtu = 0;
16910         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
16911         minseg = segsiz;
16912         if (rack->r_ctl.rc_pace_max_segs == 0)
16913                 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
16914         else
16915                 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
16916         sb_offset = tp->snd_max - tp->snd_una;
16917         cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
16918         flags = tcp_outflags[tp->t_state];
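              /*
               * Top up the free cache of sendmap entries before committing to
               * send; if an allocation fails, give up for now (and, when
               * called from the hpts, retry in a millisecond).
               */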
16919         while (rack->rc_free_cnt < rack_free_cache) {
16920                 rsm = rack_alloc(rack);
16921                 if (rsm == NULL) {
16922                         if (inp->inp_hpts_calls)
16923                                 /* Retry in a ms */
16924                                 slot = (1 * HPTS_USEC_IN_MSEC);
16925                         so = inp->inp_socket;
16926                         sb = &so->so_snd;
16927                         goto just_return_nolock;
16928                 }
16929                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
16930                 rack->rc_free_cnt++;
16931                 rsm = NULL;
16932         }
16933         if (inp->inp_hpts_calls)
16934                 inp->inp_hpts_calls = 0;
16935         sack_rxmit = 0;
16936         len = 0;
16937         rsm = NULL;
16938         if (flags & TH_RST) {
16939                 SOCKBUF_LOCK(&inp->inp_socket->so_snd);
16940                 so = inp->inp_socket;
16941                 sb = &so->so_snd;
16942                 goto send;
16943         }
16944         if (rack->r_ctl.rc_resend) {
16945                 /* Retransmit timer */
16946                 rsm = rack->r_ctl.rc_resend;
16947                 rack->r_ctl.rc_resend = NULL;
16948                 len = rsm->r_end - rsm->r_start;
16949                 sack_rxmit = 1;
16950                 sendalot = 0;
16951                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
16952                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
16953                          __func__, __LINE__,
16954                          rsm->r_start, tp->snd_una, tp, rack, rsm));
16955                 sb_offset = rsm->r_start - tp->snd_una;
16956                 if (len >= segsiz)
16957                         len = segsiz;
16958         } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
16959                 /* We have a retransmit that takes precedence */
16960                 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
16961                     ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
16962                         /* Enter recovery if not induced by a time-out */
16963                         rack->r_ctl.rc_rsm_start = rsm->r_start;
16964                         rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
16965                         rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
16966                         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una);
16967                 }
16968 #ifdef INVARIANTS
16969                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
16970                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
16971                               tp, rack, rsm, rsm->r_start, tp->snd_una);
16972                 }
16973 #endif
16974                 len = rsm->r_end - rsm->r_start;
16975                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
16976                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
16977                          __func__, __LINE__,
16978                          rsm->r_start, tp->snd_una, tp, rack, rsm));
16979                 sb_offset = rsm->r_start - tp->snd_una;
16980                 sendalot = 0;
16981                 if (len >= segsiz)
16982                         len = segsiz;
16983                 if (len > 0) {
16984                         sack_rxmit = 1;
16985                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
16986                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
16987                             min(len, segsiz));
16988                         counter_u64_add(rack_rtm_prr_retran, 1);
16989                 }
16990         } else if (rack->r_ctl.rc_tlpsend) {
16991                 /* Tail loss probe */
16992                 long cwin;
16993                 long tlen;
16994
16995                 /*
16996                  * Check if we can do a TLP with a RACK'd packet;
16997                  * this can happen if we are not doing the rack
16998                  * cheat and we skipped to a TLP and it
16999                  * went off.
17000                  */
17001                 rsm = rack->r_ctl.rc_tlpsend;
17002                 /* We are doing a TLP, make sure the flag is present */
17003                 rsm->r_flags |= RACK_TLP;
17004                 rack->r_ctl.rc_tlpsend = NULL;
17005                 sack_rxmit = 1;
17006                 tlen = rsm->r_end - rsm->r_start;
17007                 if (tlen > segsiz)
17008                         tlen = segsiz;
17009                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
17010                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
17011                          __func__, __LINE__,
17012                          rsm->r_start, tp->snd_una, tp, rack, rsm));
17013                 sb_offset = rsm->r_start - tp->snd_una;
17014                 cwin = min(tp->snd_wnd, tlen);
17015                 len = cwin;
17016         }
17017         if (rack->r_must_retran &&
17018             (rsm == NULL)) {
17019                 /*
17020                  * Non-SACK and we had an RTO, or SACK/non-SACK and an
17021                  * MTU change; we need to retransmit until we reach
17022                  * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
17023                  */
17024                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
17025                         int sendwin, flight;
17026
17027                         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
17028                         flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
17029                         if (flight >= sendwin) {
17030                                 so = inp->inp_socket;
17031                                 sb = &so->so_snd;
17032                                 goto just_return_nolock;
17033                         }
17034                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
17035                         if (rsm == NULL) {
17036                                 /* TSNH */
17037                                 rack->r_must_retran = 0;
17038                                 rack->r_ctl.rc_out_at_rto = 0;
17040                                 so = inp->inp_socket;
17041                                 sb = &so->so_snd;
17042                                 goto just_return_nolock;
17043                         }
17044                         if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
17045                                 /* It does not have the flag, we are done */
17046                                 rack->r_must_retran = 0;
17047                                 rack->r_ctl.rc_out_at_rto = 0;
17048                         } else {
17049                                 sack_rxmit = 1;
17050                                 len = rsm->r_end - rsm->r_start;
17051                                 sendalot = 0;
17052                                 sb_offset = rsm->r_start - tp->snd_una;
17053                                 if (len >= segsiz)
17054                                         len = segsiz;
17055                                 /* 
17056                                  * Delay removing the flag RACK_MUST_RXT so
17057                                  * that the fastpath for retransmit will
17058                                  * work with this rsm.
17059                                  */
17060
17061                         }
17062                 } else {
17063                         /* We must be done if there is nothing outstanding */
17064                         rack->r_must_retran = 0;
17065                         rack->r_ctl.rc_out_at_rto = 0;
17066                 }
17067         }
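        /*
         * Sketch of the r_must_retran path above: while
         * ctf_flight_size(tp, rc_out_at_rto) stays below
         * min(snd_wnd, snd_cwnd), each pass resends at most one segment
         * (len clamped to segsiz) from the head of rc_tmap; r_must_retran
         * and rc_out_at_rto are cleared once the head rsm no longer
         * carries RACK_MUST_RXT or nothing is left outstanding.
         */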
17068         /*
17069          * Enforce a connection sendmap count limit if set
17070          * as long as we are not retransmiting.
17071          * as long as we are not retransmitting.
17072         if ((rsm == NULL) &&
17073             (rack->do_detection == 0) &&
17074             (V_tcp_map_entries_limit > 0) &&
17075             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
17076                 counter_u64_add(rack_to_alloc_limited, 1);
17077                 if (!rack->alloc_limit_reported) {
17078                         rack->alloc_limit_reported = 1;
17079                         counter_u64_add(rack_alloc_limited_conns, 1);
17080                 }
17081                 so = inp->inp_socket;
17082                 sb = &so->so_snd;
17083                 goto just_return_nolock;
17084         }
17085         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
17086                 /* we are retransmitting the fin */
17087                 len--;
17088                 if (len) {
17089                         /*
17090                          * When retransmitting data do *not* include the
17091                          * FIN. This could happen from a TLP probe.
17092                          */
17093                         flags &= ~TH_FIN;
17094                 }
17095         }
17096 #ifdef INVARIANTS
17097         /* For debugging */
17098         rack->r_ctl.rc_rsm_at_retran = rsm;
17099 #endif
17100         if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
17101             ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
17102                 int ret;
17103
17104                 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
17105                 if (ret == 0)
17106                         return (0);
17107         }
17108         if (rsm && (rsm->r_flags & RACK_MUST_RXT)) {
17109                 /* 
17110                  * Clear the flag in prep for the send;
17111                  * note that if we can't get an mbuf
17112                  * and fail, we won't retransmit this
17113                  * rsm, but that should be ok (it's rare).
17114                  */
17115                 rsm->r_flags &= ~RACK_MUST_RXT;
17116         }
17117         so = inp->inp_socket;
17118         sb = &so->so_snd;
17119         if (do_a_prefetch == 0) {
17120                 kern_prefetch(sb, &do_a_prefetch);
17121                 do_a_prefetch = 1;
17122         }
17123 #ifdef NETFLIX_SHARED_CWND
17124         if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
17125             rack->rack_enable_scwnd) {
17126                 /* We are doing cwnd sharing */
17127                 if (rack->gp_ready &&
17128                     (rack->rack_attempted_scwnd == 0) &&
17129                     (rack->r_ctl.rc_scw == NULL) &&
17130                     tp->t_lib) {
17131                         /* The pcbid is in, let's make an attempt */
17132                         counter_u64_add(rack_try_scwnd, 1);
17133                         rack->rack_attempted_scwnd = 1;
17134                         rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
17135                                                                    &rack->r_ctl.rc_scw_index,
17136                                                                    segsiz);
17137                 }
17138                 if (rack->r_ctl.rc_scw &&
17139                     (rack->rack_scwnd_is_idle == 1) &&
17140                     sbavail(&so->so_snd)) {
17141                         /* we are no longer out of data */
17142                         tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
17143                         rack->rack_scwnd_is_idle = 0;
17144                 }
17145                 if (rack->r_ctl.rc_scw) {
17146                         /* First lets update and get the cwnd */
17147                         rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
17148                                                                     rack->r_ctl.rc_scw_index,
17149                                                                     tp->snd_cwnd, tp->snd_wnd, segsiz);
17150                 }
17151         }
17152 #endif
17153         /*
17154          * Get standard flags, and add SYN or FIN if requested by 'hidden'
17155          * state flags.
17156          */
17157         if (tp->t_flags & TF_NEEDFIN)
17158                 flags |= TH_FIN;
17159         if (tp->t_flags & TF_NEEDSYN)
17160                 flags |= TH_SYN;
17161         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
17162                 void *end_rsm;
17163                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
17164                 if (end_rsm)
17165                         kern_prefetch(end_rsm, &prefetch_rsm);
17166                 prefetch_rsm = 1;
17167         }
17168         SOCKBUF_LOCK(sb);
17169         /*
17170          * If snd_nxt == snd_max and we have transmitted a FIN, the
17171          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
17172          * negative length.  This can also occur when TCP opens up its
17173          * congestion window while receiving additional duplicate acks after
17174          * fast-retransmit because TCP will reset snd_nxt to snd_max after
17175          * the fast-retransmit.
17176          *
17177          * In the normal retransmit-FIN-only case, however, snd_nxt will be
17178          * set to snd_una, the sb_offset will be 0, and the length may wind
17179          * up 0.
17180          *
17181          * If sack_rxmit is true we are retransmitting from the scoreboard
17182          * in which case len is already set.
17183          */
17184         if ((sack_rxmit == 0) &&
17185             (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
17186                 uint32_t avail;
17187
17188                 avail = sbavail(sb);
17189                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
17190                         sb_offset = tp->snd_nxt - tp->snd_una;
17191                 else
17192                         sb_offset = 0;
17193                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
17194                         if (rack->r_ctl.rc_tlp_new_data) {
17195                                 /* TLP is forcing out new data */
17196                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
17197                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
17198                                 }
17199                                 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
17200                                         if (tp->snd_wnd > sb_offset)
17201                                                 len = tp->snd_wnd - sb_offset;
17202                                         else
17203                                                 len = 0;
17204                                 } else {
17205                                         len = rack->r_ctl.rc_tlp_new_data;
17206                                 }
17207                         }  else {
17208                                 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
17209                         }
17210                         if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
17211                                 /*
17212                                  * For prr=off, we need to send only 1 MSS
17213                                  * at a time. We do this because another sack could
17214                                  * be arriving that causes us to send retransmits and
17215                                  * we don't want to be on a long pace due to a larger send
17216                                  * that keeps us from sending out the retransmit.
17217                                  */
17218                                 len = segsiz;
17219                         }
17220                 } else {
17221                         uint32_t outstanding;
17222                         /*
17223                          * We are inside a fast recovery episode; this
17224                          * is caused by a SACK or 3 dup acks. At this point
17225                          * we have sent all the retransmissions and we rely
17226                          * on PRR to dictate what we will send in the form of
17227                          * new data.
17228                          */
17229
17230                         outstanding = tp->snd_max - tp->snd_una;
17231                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
17232                                 if (tp->snd_wnd > outstanding) {
17233                                         len = tp->snd_wnd - outstanding;
17234                                         /* Check to see if we have the data */
17235                                         if ((sb_offset + len) > avail) {
17236                                                 /* It does not all fit */
17237                                                 if (avail > sb_offset)
17238                                                         len = avail - sb_offset;
17239                                                 else
17240                                                         len = 0;
17241                                         }
17242                                 } else {
17243                                         len = 0;
17244                                 }
17245                         } else if (avail > sb_offset) {
17246                                 len = avail - sb_offset;
17247                         } else {
17248                                 len = 0;
17249                         }
17250                         if (len > 0) {
17251                                 if (len > rack->r_ctl.rc_prr_sndcnt) {
17252                                         len = rack->r_ctl.rc_prr_sndcnt;
17253                                 }
17254                                 if (len > 0) {
17255                                         sub_from_prr = 1;
17256                                         counter_u64_add(rack_rtm_prr_newdata, 1);
17257                                 }
17258                         }
17259                         if (len > segsiz) {
17260                                 /*
17261                                  * We should never send more than one MSS when
17262                                  * retransmitting or sending new data in prr
17263                                  * mode unless the override flag is on. Most
17264                                  * likely the PRR algorithm is not going to
17265                                  * let us send a lot as well :-)
17266                                  */
17267                                 if (rack->r_ctl.rc_prr_sendalot == 0) {
17268                                         len = segsiz;
17269                                 }
17270                         } else if (len < segsiz) {
17271                                 /*
17272                                  * Do we send any? The idea here is if the
17273                                  * send empties the socket buffer we want to
17274                                  * do it. However, if not, then let's just wait
17275                                  * for our prr_sndcnt to get bigger.
17276                                  */
17277                                 long leftinsb;
17278
17279                                 leftinsb = sbavail(sb) - sb_offset;
17280                                 if (leftinsb > len) {
17281                                         /* This send does not empty the sb */
17282                                         len = 0;
17283                                 }
17284                         }
17285                 }
17286         } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
17287                 /*
17288                  * If the connection is not established
17289                  * and we are not doing fast open,
17290                  * send no data.
17291                  */
17292                 if ((sack_rxmit == 0) &&
17293                     (!IS_FASTOPEN(tp->t_flags))){
17294                         len = 0;
17295                         sb_offset = 0;
17296                 }
17297         }
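        /*
         * Worked example for the PRR (fast recovery) branch above, using
         * hypothetical values: snd_wnd = 20000, outstanding = 15000,
         * avail - sb_offset = 5000 and rc_prr_sndcnt = 800.  The window and
         * socket buffer would allow 5000 new bytes, but len is clipped to
         * the 800 bytes of PRR credit; since 800 < segsiz and the send
         * would not drain the socket buffer, len is zeroed and we wait for
         * rc_prr_sndcnt to grow before releasing new data.
         */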
17298         if (prefetch_so_done == 0) {
17299                 kern_prefetch(so, &prefetch_so_done);
17300                 prefetch_so_done = 1;
17301         }
17302         /*
17303          * Lop off SYN bit if it has already been sent.  However, if this is
17304          * SYN-SENT state and if segment contains data and if we don't know
17305          * that foreign host supports TAO, suppress sending segment.
17306          */
17307         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
17308             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
17309                 /*
17310                  * When sending additional segments following a TFO SYN|ACK,
17311                  * do not include the SYN bit.
17312                  */
17313                 if (IS_FASTOPEN(tp->t_flags) &&
17314                     (tp->t_state == TCPS_SYN_RECEIVED))
17315                         flags &= ~TH_SYN;
17316         }
17317         /*
17318          * Be careful not to send data and/or FIN on SYN segments. This
17319          * measure is needed to prevent interoperability problems with not
17320          * fully conformant TCP implementations.
17321          */
17322         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
17323                 len = 0;
17324                 flags &= ~TH_FIN;
17325         }
17326         /*
17327          * On TFO sockets, ensure no data is sent in the following cases:
17328          *
17329          *  - When retransmitting SYN|ACK on a passively-created socket
17330          *
17331          *  - When retransmitting SYN on an actively created socket
17332          *
17333          *  - When sending a zero-length cookie (cookie request) on an
17334          *    actively created socket
17335          *
17336          *  - When the socket is in the CLOSED state (RST is being sent)
17337          */
17338         if (IS_FASTOPEN(tp->t_flags) &&
17339             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
17340              ((tp->t_state == TCPS_SYN_SENT) &&
17341               (tp->t_tfo_client_cookie_len == 0)) ||
17342              (flags & TH_RST))) {
17343                 sack_rxmit = 0;
17344                 len = 0;
17345         }
17346         /* Without fast-open there should never be data sent on a SYN */
17347         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
17348                 tp->snd_nxt = tp->iss;
17349                 len = 0;
17350         }
17351         if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
17352                 /* We only send 1 MSS if we have a DSACK block */
17353                 add_flag |= RACK_SENT_W_DSACK;
17354                 len = segsiz;
17355         }
17356         orig_len = len;
17357         if (len <= 0) {
17358                 /*
17359                  * If FIN has been sent but not acked, but we haven't been
17360                  * called to retransmit, len will be < 0.  Otherwise, window
17361                  * shrank after we sent into it.  If window shrank to 0,
17362                  * cancel pending retransmit, pull snd_nxt back to (closed)
17363                  * window, and set the persist timer if it isn't already
17364                  * going.  If the window didn't close completely, just wait
17365                  * for an ACK.
17366                  *
17367                  * We also do a general check here to ensure that we will
17368                  * set the persist timer when we have data to send, but a
17369                  * 0-byte window. This makes sure the persist timer is set
17370                  * even if the packet hits one of the "goto send" lines
17371                  * below.
17372                  */
17373                 len = 0;
17374                 if ((tp->snd_wnd == 0) &&
17375                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17376                     (tp->snd_una == tp->snd_max) &&
17377                     (sb_offset < (int)sbavail(sb))) {
17378                         rack_enter_persist(tp, rack, cts);
17379                 }
17380         } else if ((rsm == NULL) &&
17381                    (doing_tlp == 0) &&
17382                    (len < pace_max_seg)) {
17383                 /*
17384                  * We are not sending a maximum sized segment for
17385                  * some reason. Should we not send anything (think
17386                  * sws or persists)?
17387                  */
17388                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17389                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17390                     (len < minseg) &&
17391                     (len < (int)(sbavail(sb) - sb_offset))) {
17392                         /*
17393                          * Here the rwnd is less than
17394                          * the minimum pacing size; this is not a retransmit,
17395                          * we are established, and
17396                          * the send is not the last in the socket buffer,
17397                          * so we send nothing, and we may enter persists
17398                          * if nothing is outstanding.
17399                          */
17400                         len = 0;
17401                         if (tp->snd_max == tp->snd_una) {
17402                                 /*
17403                                  * Nothing is outstanding; we can
17404                                  * go into persists.
17405                                  */
17406                                 rack_enter_persist(tp, rack, cts);
17407                         }
17408                 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
17409                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17410                            (len < (int)(sbavail(sb) - sb_offset)) &&
17411                            (len < minseg)) {
17412                         /*
17413                          * Here we are not retransmitting, and
17414                          * the cwnd is not so small that we could
17415                          * not send at least a min size (rxt timer
17416                          * not having gone off), we have 2 segments or
17417                          * more already in flight, it's not the tail end
17418                          * of the socket buffer and the cwnd is blocking
17419                          * us from sending out a minimum pacing segment size.
17420                          * Let's not send anything.
17421                          */
17422                         len = 0;
17423                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
17424                             min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17425                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17426                            (len < (int)(sbavail(sb) - sb_offset)) &&
17427                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
17428                         /*
17429                          * Here we have a send window but we have
17430                          * filled it up and we can't send another pacing segment.
17431                          * and we are not completing the sb, i.e. we allow
17432                          * the last bytes of the sb to go out even if
17433                          * it's not a full pacing segment.
17434                          * its not a full pacing segment.
17435                          */
17436                         len = 0;
17437                 } else if ((rack->r_ctl.crte != NULL) &&
17438                            (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
17439                            (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
17440                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
17441                            (len < (int)(sbavail(sb) - sb_offset))) {
17442                         /*
17443                          * Here we are doing hardware pacing, this is not a TLP,
17444                          * we are not sending a pace max segment size, there is rwnd
17445                          * room to send at least N pace_max_seg, the cwnd is greater
17446                          * than or equal to a full pacing segment plus 4 MSS, we have 2 or
17447                          * more segments in flight and it's not the tail of the socket buffer.
17448                          *
17449                          * We don't want to send; instead we need to get more ACKs in to
17450                          * allow us to send a full pacing segment. Normally, if we are pacing
17451                          * at about the right speed, we should have finished our pacing
17452                          * send as most of the acks have come back if we are at the
17453                          * right rate. This is a bit fuzzy since return path delay
17454                          * can delay the acks, which is why we want to make sure we
17455                          * have cwnd space for a bit more than a max pace segment in flight.
17456                          *
17457                          * If we have not gotten our acks back we are pacing at too high a
17458                          * rate; delaying will not hurt and will bring our GP estimate down by
17459                          * injecting the delay. If we don't do this we will send
17460                          * 2 MSS out in response to the acks being clocked in, which
17461                          * defeats the point of hw-pacing (i.e. to help us get
17462                          * larger TSOs out).
17463                          */
17464                         len = 0;
17465
17466                 }
17467
17468         }
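        /*
         * Example of the sub-MSS hold-back above (hypothetical values):
         * with rc_high_rwnd = 64000, minseg = 1460 and a peer window of
         * snd_wnd = 1000, a non-TLP, non-retransmit send that is smaller
         * than minseg and does not finish the socket buffer is suppressed
         * (len = 0); if nothing is outstanding we drop into persist mode
         * via rack_enter_persist() instead of dribbling out tiny segments.
         */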
17469         /* len will be >= 0 after this point. */
17470         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
17471         rack_sndbuf_autoscale(rack);
17472         /*
17473          * Decide if we can use TCP Segmentation Offloading (if supported by
17474          * hardware).
17475          *
17476          * TSO may only be used if we are in a pure bulk sending state.  The
17477          * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
17478          * options prevent using TSO.  With TSO the TCP header is the same
17479          * (except for the sequence number) for all generated packets.  This
17480          * makes it impossible to transmit any options which vary per
17481          * generated segment or packet.
17482          *
17483          * IPv4 handling has a clear separation of ip options and ip header
17484          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
17485          * the right thing below to provide length of just ip options and thus
17486          * checking for ipoptlen is enough to decide if ip options are present.
17487          */
17488         ipoptlen = 0;
17489 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17490         /*
17491          * Pre-calculate here as we save another lookup into the darknesses
17492          * of IPsec that way and can actually decide if TSO is ok.
17493          */
17494 #ifdef INET6
17495         if (isipv6 && IPSEC_ENABLED(ipv6))
17496                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
17497 #ifdef INET
17498         else
17499 #endif
17500 #endif                          /* INET6 */
17501 #ifdef INET
17502                 if (IPSEC_ENABLED(ipv4))
17503                         ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
17504 #endif                          /* INET */
17505 #endif
17506
17507 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17508         ipoptlen += ipsec_optlen;
17509 #endif
17510         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
17511             (tp->t_port == 0) &&
17512             ((tp->t_flags & TF_SIGNATURE) == 0) &&
17513             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
17514             ipoptlen == 0)
17515                 tso = 1;
17516         {
17517                 uint32_t outstanding;
17518
17519                 outstanding = tp->snd_max - tp->snd_una;
17520                 if (tp->t_flags & TF_SENTFIN) {
17521                         /*
17522                          * If we sent a fin, snd_max is 1 higher than
17523                          * snd_una
17524                          */
17525                         outstanding--;
17526                 }
17527                 if (sack_rxmit) {
17528                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
17529                                 flags &= ~TH_FIN;
17530                 } else {
17531                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
17532                                    sbused(sb)))
17533                                 flags &= ~TH_FIN;
17534                 }
17535         }
17536         recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
17537             (long)TCP_MAXWIN << tp->rcv_scale);
17538
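        /*
         * recwin sketch (hypothetical values): with 256KB of space in the
         * receive buffer and rcv_scale = 7 the cap is TCP_MAXWIN << 7
         * (roughly 8MB), so recwin = 262144; a negative sbspace() result
         * (possible when the buffer is overcommitted) is clamped to 0
         * first.
         */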
17539         /*
17540          * Sender silly window avoidance.   We transmit under the following
17541          * conditions when len is non-zero:
17542          *
17543          * - We have a full segment (or more with TSO) - This is the last
17544          * buffer in a write()/send() and we are either idle or running
17545          * NODELAY - we've timed out (e.g. persist timer) - we have more
17546          * than 1/2 the maximum send window's worth of data (receiver may be
17547          * limiting the window size) - we need to retransmit
17548          */
17549         if (len) {
17550                 if (len >= segsiz) {
17551                         goto send;
17552                 }
17553                 /*
17554                  * NOTE! on localhost connections an 'ack' from the remote
17555                  * end may occur synchronously with the output and cause us
17556                  * to flush a buffer queued with moretocome.  XXX
17557                  *
17558                  */
17559                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
17560                     (idle || (tp->t_flags & TF_NODELAY)) &&
17561                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17562                     (tp->t_flags & TF_NOPUSH) == 0) {
17563                         pass = 2;
17564                         goto send;
17565                 }
17566                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
17567                         pass = 22;
17568                         goto send;
17569                 }
17570                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
17571                         pass = 4;
17572                         goto send;
17573                 }
17574                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
17575                         pass = 5;
17576                         goto send;
17577                 }
17578                 if (sack_rxmit) {
17579                         pass = 6;
17580                         goto send;
17581                 }
17582                 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
17583                     (ctf_outstanding(tp) < (segsiz * 2))) {
17584                         /*
17585                          * We have less than two MSS outstanding (delayed ack)
17586                          * and our rwnd will not let us send a full-sized
17587                          * MSS. Let's go ahead and let this small segment
17588                          * out because we want to try to have at least two
17589                          * packets in flight so we are not caught by delayed ack.
17590                          */
17591                         pass = 12;
17592                         goto send;
17593                 }
17594         }
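        /*
         * Example of the "pass = 12" case above (hypothetical values):
         * with segsiz = 1460, ctf_outstanding(tp) = 1460 and
         * snd_wnd = 2000, the window leaves only 540 bytes of room
         * (< segsiz) while fewer than two segments are outstanding, so the
         * small segment is sent anyway to keep two packets in flight and
         * avoid a delayed-ACK stall.
         */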
17595         /*
17596          * Sending of standalone window updates.
17597          *
17598          * Window updates are important when we close our window due to a
17599          * full socket buffer and are opening it again after the application
17600          * reads data from it.  Once the window has opened again and the
17601          * remote end starts to send again the ACK clock takes over and
17602          * provides the most current window information.
17603          *
17604          * We must avoid the silly window syndrome whereby every read from
17605          * the receive buffer, no matter how small, causes a window update
17606          * to be sent.  We also should avoid sending a flurry of window
17607          * updates when the socket buffer had queued a lot of data and the
17608          * application is doing small reads.
17609          *
17610          * Prevent a flurry of pointless window updates by only sending an
17611          * update when we can increase the advertised window by more than
17612          * 1/4th of the socket buffer capacity.  When the buffer is getting
17613          * full or is very small be more aggressive and send an update
17614          * whenever we can increase by two mss sized segments. In all other
17615          * situations the ACK's to new incoming data will carry further
17616          * window increases.
17617          *
17618          * Don't send an independent window update if a delayed ACK is
17619          * pending (it will get piggy-backed on it) or the remote side
17620          * already has done a half-close and won't send more data.  Skip
17621          * this if the connection is in T/TCP half-open state.
17622          */
17623         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
17624             !(tp->t_flags & TF_DELACK) &&
17625             !TCPS_HAVERCVDFIN(tp->t_state)) {
17626                 /*
17627                  * "adv" is the amount we could increase the window, taking
17628                  * into account that we are limited by TCP_MAXWIN <<
17629                  * tp->rcv_scale.
17630                  */
17631                 int32_t adv;
17632                 int oldwin;
17633
17634                 adv = recwin;
17635                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
17636                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
17637                         if (adv > oldwin)
17638                             adv -= oldwin;
17639                         else {
17640                                 /* We can't increase the window */
17641                                 adv = 0;
17642                         }
17643                 } else
17644                         oldwin = 0;
17645
17646                 /*
17647                  * If the new window size ends up being the same as or less
17648                  * than the old size when it is scaled, then don't force
17649                  * a window update.
17650                  */
17651                 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
17652                         goto dontupdate;
17653
17654                 if (adv >= (int32_t)(2 * segsiz) &&
17655                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
17656                      recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
17657                      so->so_rcv.sb_hiwat <= 8 * segsiz)) {
17658                         pass = 7;
17659                         goto send;
17660                 }
17661                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
17662                         pass = 23;
17663                         goto send;
17664                 }
17665         }
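        /*
         * Window-update example (hypothetical values): with a 64KB receive
         * buffer (sb_hiwat = 65536), segsiz = 1460, recwin = 40000 and an
         * already-advertised window of oldwin = 20000, adv becomes 20000;
         * that is >= 2 * segsiz and >= sb_hiwat / 4 (16384), so a
         * standalone window update is sent (pass = 7).  Smaller openings
         * wait to be carried on the ACKs for new incoming data instead.
         */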
17666 dontupdate:
17667
17668         /*
17669          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
17670          * is also a catch-all for the retransmit timer timeout case.
17671          */
17672         if (tp->t_flags & TF_ACKNOW) {
17673                 pass = 8;
17674                 goto send;
17675         }
17676         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
17677                 pass = 9;
17678                 goto send;
17679         }
17680         /*
17681          * If our state indicates that FIN should be sent and we have not
17682          * yet done so, then we need to send.
17683          */
17684         if ((flags & TH_FIN) &&
17685             (tp->snd_nxt == tp->snd_una)) {
17686                 pass = 11;
17687                 goto send;
17688         }
17689         /*
17690          * No reason to send a segment, just return.
17691          */
17692 just_return:
17693         SOCKBUF_UNLOCK(sb);
17694 just_return_nolock:
17695         {
17696                 int app_limited = CTF_JR_SENT_DATA;
17697
17698                 if (tot_len_this_send > 0) {
17699                         /* Record the receive window for the fast send block */
17700                         rack->r_ctl.fsb.recwin = recwin;
17701                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
17702                         if ((error == 0) &&
17703                             rack_use_rfo &&
17704                             ((flags & (TH_SYN|TH_FIN)) == 0) &&
17705                             (ipoptlen == 0) &&
17706                             (tp->snd_nxt == tp->snd_max) &&
17707                             (tp->rcv_numsacks == 0) &&
17708                             rack->r_fsb_inited &&
17709                             TCPS_HAVEESTABLISHED(tp->t_state) &&
17710                             (rack->r_must_retran == 0) &&
17711                             ((tp->t_flags & TF_NEEDFIN) == 0) &&
17712                             (len > 0) && (orig_len > 0) &&
17713                             (orig_len > len) &&
17714                             ((orig_len - len) >= segsiz) &&
17715                             ((optlen == 0) ||
17716                              ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
17717                                 /* We can send at least one more MSS using our fsb */
17718
17719                                 rack->r_fast_output = 1;
17720                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
17721                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
17722                                 rack->r_ctl.fsb.tcp_flags = flags;
17723                                 rack->r_ctl.fsb.left_to_send = orig_len - len;
17724                                 if (hw_tls)
17725                                         rack->r_ctl.fsb.hw_tls = 1;
17726                                 else
17727                                         rack->r_ctl.fsb.hw_tls = 0;
17728                                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
17729                                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
17730                                         rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
17731                                          (tp->snd_max - tp->snd_una)));
17732                                 if (rack->r_ctl.fsb.left_to_send < segsiz)
17733                                         rack->r_fast_output = 0;
17734                                 else {
17735                                         if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
17736                                                 rack->r_ctl.fsb.rfo_apply_push = 1;
17737                                         else
17738                                                 rack->r_ctl.fsb.rfo_apply_push = 0;
17739                                 }
17740                         } else
17741                                 rack->r_fast_output = 0;
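                        /*
                         * Fast-output arming sketch (hypothetical values):
                         * if this pass was limited to len = 1460 out of
                         * orig_len = 4380, left_to_send = 2920 (>= segsiz),
                         * so r_fast_output stays set and the remainder can
                         * go out via the fsb on the next send opportunity;
                         * rfo_apply_push is set only when that remainder
                         * finishes the socket buffer.
                         */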
17742
17743
17744                         rack_log_fsb(rack, tp, so, flags,
17745                                      ipoptlen, orig_len, len, 0,
17746                                      1, optlen, __LINE__, 1);
17747                         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
17748                                 tp->snd_nxt = tp->snd_max;
17749                 } else {
17750                         int end_window = 0;
17751                         uint32_t seq = tp->gput_ack;
17752
17753                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17754                         if (rsm) {
17755                                 /*
17756                                  * Mark the last rsm sent as a just-return (hinting
17757                                  * that delayed ack may play a role in any rtt measurement).
17758                                  */
17759                                 rsm->r_just_ret = 1;
17760                         }
17761                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
17762                         rack->r_ctl.rc_agg_delayed = 0;
17763                         rack->r_early = 0;
17764                         rack->r_late = 0;
17765                         rack->r_ctl.rc_agg_early = 0;
17766                         if ((ctf_outstanding(tp) +
17767                              min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
17768                                  minseg)) >= tp->snd_wnd) {
17769                                 /* We are limited by the rwnd */
17770                                 app_limited = CTF_JR_RWND_LIMITED;
17771                                 if (IN_FASTRECOVERY(tp->t_flags))
17772                                     rack->r_ctl.rc_prr_sndcnt = 0;
17773                         } else if (ctf_outstanding(tp) >= sbavail(sb)) {
17774                                 /* We are limited by whats available -- app limited */
17775                                 app_limited = CTF_JR_APP_LIMITED;
17776                                 if (IN_FASTRECOVERY(tp->t_flags))
17777                                     rack->r_ctl.rc_prr_sndcnt = 0;
17778                         } else if ((idle == 0) &&
17779                                    ((tp->t_flags & TF_NODELAY) == 0) &&
17780                                    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17781                                    (len < segsiz)) {
17782                                 /*
17783                                  * NODELAY is not on and the
17784                                  * user is sending less than 1 MSS. This
17785                                  * brings out SWS avoidance so we
17786                                  * don't send. Another app-limited case.
17787                                  */
17788                                 app_limited = CTF_JR_APP_LIMITED;
17789                         } else if (tp->t_flags & TF_NOPUSH) {
17790                                 /*
17791                                  * The user has requested no push of
17792                                  * the last segment and we are
17793                                  * at the last segment. Another app
17794                                  * limited case.
17795                                  */
17796                                 app_limited = CTF_JR_APP_LIMITED;
17797                         } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
17798                                 /* Its the cwnd */
17799                                 app_limited = CTF_JR_CWND_LIMITED;
17800                         } else if (IN_FASTRECOVERY(tp->t_flags) &&
17801                                    (rack->rack_no_prr == 0) &&
17802                                    (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
17803                                 app_limited = CTF_JR_PRR;
17804                         } else {
17805                                 /* Now, why are we not sending here? */
17806 #ifdef NOW
17807 #ifdef INVARIANTS
17808                                 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
17809 #endif
17810 #endif
17811                                 app_limited = CTF_JR_ASSESSING;
17812                         }
17813                         /*
17814                          * App limited in some fashion; for our pacing GP
17815                          * measurements we don't want any gap (even cwnd).
17816                          * Close down the measurement window.
17817                          */
17818                         if (rack_cwnd_block_ends_measure &&
17819                             ((app_limited == CTF_JR_CWND_LIMITED) ||
17820                              (app_limited == CTF_JR_PRR))) {
17821                                 /*
17822                                  * The reason we are not sending is
17823                                  * the cwnd (or prr). We have been configured
17824                                  * to end the measurement window in
17825                                  * this case.
17826                                  */
17827                                 end_window = 1;
17828                         } else if (rack_rwnd_block_ends_measure &&
17829                                    (app_limited == CTF_JR_RWND_LIMITED)) {
17830                                 /*
17831                                  * We are rwnd limited and have been
17832                                  * configured to end the measurement
17833                                  * window in this case.
17834                                  */
17835                                 end_window = 1;
17836                         } else if (app_limited == CTF_JR_APP_LIMITED) {
17837                                 /*
17838                                  * A true application limited period, we have
17839                                  * run out of data.
17840                                  */
17841                                 end_window = 1;
17842                         } else if (app_limited == CTF_JR_ASSESSING) {
17843                                 /*
17844                                  * In the assessing case we hit the end of
17845                                  * the if/else and had no known reason.
17846                                  * This will panic us under invariants.
17847                                  *
17848                                  * If we get this out in logs we need to
17849                                  * investigate which reason we missed.
17850                                  */
17851                                 end_window = 1;
17852                         }
17853                         if (end_window) {
17854                                 uint8_t log = 0;
17855
17856                                 /* Adjust the Gput measurement */
17857                                 if ((tp->t_flags & TF_GPUTINPROG) &&
17858                                     SEQ_GT(tp->gput_ack, tp->snd_max)) {
17859                                         tp->gput_ack = tp->snd_max;
17860                                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
17861                                                 /*
17862                                                  * There is not enough to measure.
17863                                                  */
17864                                                 tp->t_flags &= ~TF_GPUTINPROG;
17865                                                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
17866                                                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
17867                                                                            tp->gput_seq,
17868                                                                            0, 0, 18, __LINE__, NULL, 0);
17869                                         } else
17870                                                 log = 1;
17871                                 }
17872                                 /* Mark the last packet as app limited */
17873                                 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17874                                 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
17875                                         if (rack->r_ctl.rc_app_limited_cnt == 0)
17876                                                 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
17877                                         else {
17878                                                 /*
17879                                                  * Go out to the end app limited and mark
17880                                                  * this new one as next and move the end_appl up
17881                                                  * to this guy.
17882                                                  */
17883                                                 if (rack->r_ctl.rc_end_appl)
17884                                                         rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
17885                                                 rack->r_ctl.rc_end_appl = rsm;
17886                                         }
17887                                         rsm->r_flags |= RACK_APP_LIMITED;
17888                                         rack->r_ctl.rc_app_limited_cnt++;
17889                                 }
17890                                 if (log)
17891                                         rack_log_pacing_delay_calc(rack,
17892                                                                    rack->r_ctl.rc_app_limited_cnt, seq,
17893                                                                    tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
17894                         }
17895                 }
17896                 if (slot) {
17897                         /* set the rack tcb into the slot N */
17898                         counter_u64_add(rack_paced_segments, 1);
17899                 } else if (tot_len_this_send) {
17900                         counter_u64_add(rack_unpaced_segments, 1);
17901                 }
17902                 /* Check if we need to go into persists or not */
17903                 if ((tp->snd_max == tp->snd_una) &&
17904                     TCPS_HAVEESTABLISHED(tp->t_state) &&
17905                     sbavail(sb) &&
17906                     (sbavail(sb) > tp->snd_wnd) &&
17907                     (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
17908                         /* Yes, let's make sure to move to persist before timer-start */
17909                         rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
17910                 }
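                /*
                 * Persist example (hypothetical values): nothing is
                 * outstanding (snd_max == snd_una), 8000 bytes sit in the
                 * socket buffer, but the peer advertises snd_wnd = 500,
                 * which is below min(rc_high_rwnd / 2, minseg); rather
                 * than arming a normal pacing/retransmit timer we move to
                 * the persist timer first so the tiny window gets probed.
                 */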
17911                 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
17912                 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
17913         }
17914 #ifdef NETFLIX_SHARED_CWND
17915         if ((sbavail(sb) == 0) &&
17916             rack->r_ctl.rc_scw) {
17917                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
17918                 rack->rack_scwnd_is_idle = 1;
17919         }
17920 #endif
17921 #ifdef TCP_ACCOUNTING
17922         if (tot_len_this_send > 0) {
17923                 crtsc = get_cyclecount();
17924                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17925                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
17926                 }
17927                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
17928                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17929                         tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
17930                 }
17931                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
17932                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17933                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
17934                 }
17935                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz));
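                /*
                 * The (tot_len_this_send + segsiz - 1) / segsiz above is a
                 * ceiling division: e.g. (hypothetically) 4000 bytes sent
                 * with a 1460 byte segsiz is accounted as 3 MSS-sized
                 * sends.
                 */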
17936         } else {
17937                 crtsc = get_cyclecount();
17938                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17939                         tp->tcp_cnt_counters[SND_LIMITED]++;
17940                 }
17941                 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1);
17942                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17943                         tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
17944                 }
17945                 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val));
17946         }
17947         sched_unpin();
17948 #endif
17949         return (0);
17950
17951 send:
17952         if (rsm || sack_rxmit)
17953                 counter_u64_add(rack_nfto_resend, 1);
17954         else
17955                 counter_u64_add(rack_non_fto_send, 1);
17956         if ((flags & TH_FIN) &&
17957             sbavail(sb)) {
17958                 /*
17959                  * We do not transmit a FIN
17960                  * with data outstanding. We
17961                  * need to make it so all data
17962                  * is acked first.
17963                  */
17964                 flags &= ~TH_FIN;
17965         }
17966         /* Enforce stack imposed max seg size if we have one */
17967         if (rack->r_ctl.rc_pace_max_segs &&
17968             (len > rack->r_ctl.rc_pace_max_segs)) {
17969                 mark = 1;
17970                 len = rack->r_ctl.rc_pace_max_segs;
17971         }
17972         SOCKBUF_LOCK_ASSERT(sb);
17973         if (len > 0) {
17974                 if (len >= segsiz)
17975                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
17976                 else
17977                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
17978         }
17979         /*
17980          * Before ESTABLISHED, force sending of initial options unless TCP
17981          * set not to do any options. NOTE: we assume that the IP/TCP header
17982          * plus TCP options always fit in a single mbuf, leaving room for a
17983          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
17984          * + optlen <= MCLBYTES
17985          */
17986         optlen = 0;
17987 #ifdef INET6
17988         if (isipv6)
17989                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
17990         else
17991 #endif
17992                 hdrlen = sizeof(struct tcpiphdr);
17993
17994         /*
17995          * Compute options for segment. We only have to care about SYN and
17996          * established connection segments.  Options for SYN-ACK segments
17997          * are handled in TCP syncache.
17998          */
17999         to.to_flags = 0;
18000         if ((tp->t_flags & TF_NOOPT) == 0) {
18001                 /* Maximum segment size. */
18002                 if (flags & TH_SYN) {
18003                         tp->snd_nxt = tp->iss;
18004                         to.to_mss = tcp_mssopt(&inp->inp_inc);
18005                         if (tp->t_port)
18006                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
18007                         to.to_flags |= TOF_MSS;
18008
18009                         /*
18010                          * On SYN or SYN|ACK transmits on TFO connections,
18011                          * only include the TFO option if it is not a
18012                          * retransmit, as the presence of the TFO option may
18013                          * have caused the original SYN or SYN|ACK to have
18014                          * been dropped by a middlebox.
18015                          */
18016                         if (IS_FASTOPEN(tp->t_flags) &&
18017                             (tp->t_rxtshift == 0)) {
18018                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
18019                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
18020                                         to.to_tfo_cookie =
18021                                                 (u_int8_t *)&tp->t_tfo_cookie.server;
18022                                         to.to_flags |= TOF_FASTOPEN;
18023                                         wanted_cookie = 1;
18024                                 } else if (tp->t_state == TCPS_SYN_SENT) {
18025                                         to.to_tfo_len =
18026                                                 tp->t_tfo_client_cookie_len;
18027                                         to.to_tfo_cookie =
18028                                                 tp->t_tfo_cookie.client;
18029                                         to.to_flags |= TOF_FASTOPEN;
18030                                         wanted_cookie = 1;
18031                                         /*
18032                                          * If we wind up having more data to
18033                                          * send with the SYN than can fit in
18034                                          * one segment, don't send any more
18035                                          * until the SYN|ACK comes back from
18036                                          * the other end.
18037                                          */
18038                                         sendalot = 0;
18039                                 }
18040                         }
18041                 }
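                /*
                 * Example (hypothetical values): over a UDP tunnel
                 * (tp->t_port set), with tcp_mssopt() returning 1460 and a
                 * V_tcp_udp_tunneling_overhead of 8, the advertised MSS
                 * option becomes 1452 so encapsulated segments still fit
                 * the path MTU.
                 */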
18042                 /* Window scaling. */
18043                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
18044                         to.to_wscale = tp->request_r_scale;
18045                         to.to_flags |= TOF_SCALE;
18046                 }
18047                 /* Timestamps. */
18048                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
18049                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
18050                         to.to_tsval = ms_cts + tp->ts_offset;
18051                         to.to_tsecr = tp->ts_recent;
18052                         to.to_flags |= TOF_TS;
18053                 }
18054                 /* Set receive buffer autosizing timestamp. */
18055                 if (tp->rfbuf_ts == 0 &&
18056                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
18057                         tp->rfbuf_ts = tcp_ts_getticks();
18058                 /* Selective ACK's. */
18059                 if (tp->t_flags & TF_SACK_PERMIT) {
18060                         if (flags & TH_SYN)
18061                                 to.to_flags |= TOF_SACKPERM;
18062                         else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18063                                  tp->rcv_numsacks > 0) {
18064                                 to.to_flags |= TOF_SACK;
18065                                 to.to_nsacks = tp->rcv_numsacks;
18066                                 to.to_sacks = (u_char *)tp->sackblks;
18067                         }
18068                 }
18069 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18070                 /* TCP-MD5 (RFC2385). */
18071                 if (tp->t_flags & TF_SIGNATURE)
18072                         to.to_flags |= TOF_SIGNATURE;
18073 #endif                          /* TCP_SIGNATURE */
18074
18075                 /* Processing the options. */
18076                 hdrlen += optlen = tcp_addoptions(&to, opt);
18077                 /*
18078                  * If we wanted a TFO option to be added but it did not
18079                  * fit, ensure no data is sent.
18080                  */
18081                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
18082                     !(to.to_flags & TOF_FASTOPEN))
18083                         len = 0;
18084         }
18085         if (tp->t_port) {
18086                 if (V_tcp_udp_tunneling_port == 0) {
18087                         /* The port was removed?? */
18088                         SOCKBUF_UNLOCK(&so->so_snd);
18089 #ifdef TCP_ACCOUNTING
18090                         crtsc = get_cyclecount();
18091                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18092                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
18093                         }
18094                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
18095                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18096                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
18097                         }
18098                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
18099                         sched_unpin();
18100 #endif
18101                         return (EHOSTUNREACH);
18102                 }
18103                 hdrlen += sizeof(struct udphdr);
18104         }
18105 #ifdef INET6
18106         if (isipv6)
18107                 ipoptlen = ip6_optlen(tp->t_inpcb);
18108         else
18109 #endif
18110                 if (tp->t_inpcb->inp_options)
18111                         ipoptlen = tp->t_inpcb->inp_options->m_len -
18112                                 offsetof(struct ipoption, ipopt_list);
18113                 else
18114                         ipoptlen = 0;
18115 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18116         ipoptlen += ipsec_optlen;
18117 #endif
18118
18119         /*
18120          * Adjust data length if insertion of options will bump the packet
18121          * length beyond the t_maxseg length. Clear the FIN bit because we
18122          * cut off the tail of the segment.
18123          */
18124         if (len + optlen + ipoptlen > tp->t_maxseg) {
18125                 if (tso) {
18126                         uint32_t if_hw_tsomax;
18127                         uint32_t moff;
18128                         int32_t max_len;
18129
18130                         /* extract TSO information */
18131                         if_hw_tsomax = tp->t_tsomax;
18132                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18133                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18134                         KASSERT(ipoptlen == 0,
18135                                 ("%s: TSO can't do IP options", __func__));
18136
18137                         /*
18138                          * Check if we should limit by maximum payload
18139                          * length:
18140                          */
18141                         if (if_hw_tsomax != 0) {
18142                                 /* compute maximum TSO length */
18143                                 max_len = (if_hw_tsomax - hdrlen -
18144                                            max_linkhdr);
18145                                 if (max_len <= 0) {
18146                                         len = 0;
18147                                 } else if (len > max_len) {
18148                                         sendalot = 1;
18149                                         len = max_len;
18150                                         mark = 2;
18151                                 }
18152                         }
18153                         /*
18154                          * Prevent the last segment from being fractional
18155                          * unless the send sockbuf can be emptied:
18156                          */
18157                         max_len = (tp->t_maxseg - optlen);
18158                         if ((sb_offset + len) < sbavail(sb)) {
18159                                 moff = len % (u_int)max_len;
18160                                 if (moff != 0) {
18161                                         mark = 3;
18162                                         len -= moff;
18163                                 }
18164                         }
18165                         /*
18166                          * In case there are too many small fragments don't
18167                          * use TSO:
18168                          */
18169                         if (len <= segsiz) {
18170                                 mark = 4;
18171                                 tso = 0;
18172                         }
18173                         /*
18174                          * Send the FIN in a separate segment after the bulk
18175                          * sending is done. We don't trust the TSO
18176                          * implementations to clear the FIN flag on all but
18177                          * the last segment.
18178                          */
18179                         if (tp->t_flags & TF_NEEDFIN) {
18180                                 sendalot = 4;
18181                         }
18182                 } else {
18183                         mark = 5;
18184                         if (optlen + ipoptlen >= tp->t_maxseg) {
18185                                 /*
18186                                  * Since we don't have enough space to put
18187                                  * the IP header chain and the TCP header in
18188                                  * one packet as required by RFC 7112, don't
18189                                  * send it. Also ensure that at least one
18190                                  * byte of the payload can be put into the
18191                                  * TCP segment.
18192                                  */
18193                                 SOCKBUF_UNLOCK(&so->so_snd);
18194                                 error = EMSGSIZE;
18195                                 sack_rxmit = 0;
18196                                 goto out;
18197                         }
18198                         len = tp->t_maxseg - optlen - ipoptlen;
18199                         sendalot = 5;
18200                 }
18201         } else {
18202                 tso = 0;
18203                 mark = 6;
18204         }
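        /*
         * Illustrative sketch only, not compiled into the stack: the length
         * trimming performed by the TSO branch above, written as a standalone
         * helper with assumed parameter names.  sb_left is assumed to be the
         * number of bytes available at and beyond sb_offset in the send
         * socket buffer.
         */
#if 0
#include <stdint.h>

static uint32_t
example_trim_tso_len(uint32_t len, uint32_t if_hw_tsomax, uint32_t hdrlen,
    uint32_t max_linkhdr, uint32_t maxseg, uint32_t optlen, uint32_t sb_left)
{
        uint32_t max_len, moff;

        if (if_hw_tsomax != 0) {
                /* Never build a burst larger than the NIC can segment. */
                if (if_hw_tsomax <= hdrlen + max_linkhdr)
                        return (0);
                max_len = if_hw_tsomax - hdrlen - max_linkhdr;
                if (len > max_len)
                        len = max_len;
        }
        /* Keep the last segment full-sized unless this send drains the sockbuf. */
        max_len = maxseg - optlen;
        if (len < sb_left) {
                moff = len % max_len;
                len -= moff;
        }
        /* The caller additionally turns TSO off when len fits in one segment. */
        return (len);
}
#endif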
18205         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
18206                 ("%s: len > IP_MAXPACKET", __func__));
18207 #ifdef DIAGNOSTIC
18208 #ifdef INET6
18209         if (max_linkhdr + hdrlen > MCLBYTES)
18210 #else
18211                 if (max_linkhdr + hdrlen > MHLEN)
18212 #endif
18213                         panic("tcphdr too big");
18214 #endif
18215
18216         /*
18217          * This KASSERT is here to catch edge cases at a well defined place.
18218          * Before, those had triggered (random) panic conditions further
18219          * down.
18220          */
18221         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
18222         if ((len == 0) &&
18223             (flags & TH_FIN) &&
18224             (sbused(sb))) {
18225                 /*
18226                  * We have outstanding data; don't send a FIN by itself.
18227                  */
18228                 goto just_return;
18229         }
18230         /*
18231          * Grab a header mbuf, attaching a copy of data to be transmitted,
18232          * and initialize the header from the template for sends on this
18233          * connection.
18234          */
18235         hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
18236         if (len) {
18237                 uint32_t max_val;
18238                 uint32_t moff;
18239
18240                 if (rack->r_ctl.rc_pace_max_segs)
18241                         max_val = rack->r_ctl.rc_pace_max_segs;
18242                 else if (rack->rc_user_set_max_segs)
18243                         max_val = rack->rc_user_set_max_segs * segsiz;
18244                 else
18245                         max_val = len;
18246                 /*
18247                  * We allow a limit on sending with hptsi.
18248                  */
18249                 if (len > max_val) {
18250                         mark = 7;
18251                         len = max_val;
18252                 }
18253 #ifdef INET6
18254                 if (MHLEN < hdrlen + max_linkhdr)
18255                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18256                 else
18257 #endif
18258                         m = m_gethdr(M_NOWAIT, MT_DATA);
18259
18260                 if (m == NULL) {
18261                         SOCKBUF_UNLOCK(sb);
18262                         error = ENOBUFS;
18263                         sack_rxmit = 0;
18264                         goto out;
18265                 }
18266                 m->m_data += max_linkhdr;
18267                 m->m_len = hdrlen;
18268
18269                 /*
18270                  * Start the m_copy functions from the closest mbuf to the
18271                  * sb_offset in the socket buffer chain.
18272                  */
18273                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
18274                 s_mb = mb;
18275                 s_moff = moff;
18276                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
18277                         m_copydata(mb, moff, (int)len,
18278                                    mtod(m, caddr_t)+hdrlen);
18279                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18280                                 sbsndptr_adv(sb, mb, len);
18281                         m->m_len += len;
18282                 } else {
18283                         struct sockbuf *msb;
18284
18285                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18286                                 msb = NULL;
18287                         else
18288                                 msb = sb;
18289                         m->m_next = tcp_m_copym(
18290                                 mb, moff, &len,
18291                                 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
18292                                 ((rsm == NULL) ? hw_tls : 0)
18293 #ifdef NETFLIX_COPY_ARGS
18294                                 , &filled_all
18295 #endif
18296                                 );
18297                         if (len <= (tp->t_maxseg - optlen)) {
18298                                 /*
18299                          * We must have run out of mbufs for the copy,
18300                          * shortening the length so TSO is no longer
18301                          * needed. Don't set sendalot since we are
18302                          * low on mbufs.
18303                                  */
18304                                 tso = 0;
18305                         }
18306                         if (m->m_next == NULL) {
18307                                 SOCKBUF_UNLOCK(sb);
18308                                 (void)m_free(m);
18309                                 error = ENOBUFS;
18310                                 sack_rxmit = 0;
18311                                 goto out;
18312                         }
18313                 }
18314                 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
18315                         if (rsm && (rsm->r_flags & RACK_TLP)) {
18316                                 /*
18317                                  * TLP should not count in retran count, but
18318                                  * in its own bin
18319                                  */
18320                                 counter_u64_add(rack_tlp_retran, 1);
18321                                 counter_u64_add(rack_tlp_retran_bytes, len);
18322                         } else {
18323                                 tp->t_sndrexmitpack++;
18324                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18325                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18326                         }
18327 #ifdef STATS
18328                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18329                                                  len);
18330 #endif
18331                 } else {
18332                         KMOD_TCPSTAT_INC(tcps_sndpack);
18333                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
18334 #ifdef STATS
18335                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
18336                                                  len);
18337 #endif
18338                 }
18339                 /*
18340                  * If we're sending everything we've got, set PUSH. (This
18341                  * will keep happy those implementations which only give
18342                  * data to the user when a buffer fills or a PUSH comes in.)
18343                  */
18344                 if (sb_offset + len == sbused(sb) &&
18345                     sbused(sb) &&
18346                     !(flags & TH_SYN)) {
18347                         flags |= TH_PUSH;
18348                         add_flag |= RACK_HAD_PUSH;
18349                 }
18350
18351                 SOCKBUF_UNLOCK(sb);
18352         } else {
18353                 SOCKBUF_UNLOCK(sb);
18354                 if (tp->t_flags & TF_ACKNOW)
18355                         KMOD_TCPSTAT_INC(tcps_sndacks);
18356                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
18357                         KMOD_TCPSTAT_INC(tcps_sndctrl);
18358                 else
18359                         KMOD_TCPSTAT_INC(tcps_sndwinup);
18360
18361                 m = m_gethdr(M_NOWAIT, MT_DATA);
18362                 if (m == NULL) {
18363                         error = ENOBUFS;
18364                         sack_rxmit = 0;
18365                         goto out;
18366                 }
18367 #ifdef INET6
18368                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
18369                     MHLEN >= hdrlen) {
18370                         M_ALIGN(m, hdrlen);
18371                 } else
18372 #endif
18373                         m->m_data += max_linkhdr;
18374                 m->m_len = hdrlen;
18375         }
18376         SOCKBUF_UNLOCK_ASSERT(sb);
18377         m->m_pkthdr.rcvif = (struct ifnet *)0;
18378 #ifdef MAC
18379         mac_inpcb_create_mbuf(inp, m);
18380 #endif
18381         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) &&  rack->r_fsb_inited) {
18382 #ifdef INET6
18383                 if (isipv6)
18384                         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18385                 else
18386 #endif                          /* INET6 */
18387                         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18388                 th = rack->r_ctl.fsb.th;
18389                 udp = rack->r_ctl.fsb.udp;
18390                 if (udp) {
18391 #ifdef INET6
18392                         if (isipv6)
18393                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18394                         else
18395 #endif                          /* INET6 */
18396                                 ulen = hdrlen + len - sizeof(struct ip);
18397                         udp->uh_ulen = htons(ulen);
18398                 }
18399         } else {
18400 #ifdef INET6
18401                 if (isipv6) {
18402                         ip6 = mtod(m, struct ip6_hdr *);
18403                         if (tp->t_port) {
18404                                 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
18405                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18406                                 udp->uh_dport = tp->t_port;
18407                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18408                                 udp->uh_ulen = htons(ulen);
18409                                 th = (struct tcphdr *)(udp + 1);
18410                         } else
18411                                 th = (struct tcphdr *)(ip6 + 1);
18412                         tcpip_fillheaders(inp, tp->t_port, ip6, th);
18413                 } else
18414 #endif                          /* INET6 */
18415                 {
18416                         ip = mtod(m, struct ip *);
18417 #ifdef TCPDEBUG
18418                         ipov = (struct ipovly *)ip;
18419 #endif
18420                         if (tp->t_port) {
18421                                 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
18422                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18423                                 udp->uh_dport = tp->t_port;
18424                                 ulen = hdrlen + len - sizeof(struct ip);
18425                                 udp->uh_ulen = htons(ulen);
18426                                 th = (struct tcphdr *)(udp + 1);
18427                         } else
18428                                 th = (struct tcphdr *)(ip + 1);
18429                         tcpip_fillheaders(inp, tp->t_port, ip, th);
18430                 }
18431         }
18432         /*
18433          * Fill in fields, remembering maximum advertised window for use in
18434          * delaying messages about window sizes. If resending a FIN, be sure
18435          * not to use a new sequence number.
18436          */
18437         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
18438             tp->snd_nxt == tp->snd_max)
18439                 tp->snd_nxt--;
18440         /*
18441          * If we are starting a connection, send ECN setup SYN packet. If we
18442          * are on a retransmit, we may resend those bits a number of times
18443          * as per RFC 3168.
18444          */
18445         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
18446                 if (tp->t_rxtshift >= 1) {
18447                         if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
18448                                 flags |= TH_ECE | TH_CWR;
18449                 } else
18450                         flags |= TH_ECE | TH_CWR;
18451         }
18452         /* Handle parallel SYN for ECN */
18453         if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18454             (tp->t_flags2 & TF2_ECN_SND_ECE)) {
18455                 flags |= TH_ECE;
18456                 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18457         }
18458         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18459             (tp->t_flags2 & TF2_ECN_PERMIT)) {
18460                 /*
18461                  * If the peer has ECN, mark data packets with ECN capable
18462                  * transmission (ECT). Ignore pure ACK packets and
18463                  * retransmissions.
18464                  */
18465                 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
18466                     (sack_rxmit == 0)) {
18467 #ifdef INET6
18468                         if (isipv6)
18469                                 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
18470                         else
18471 #endif
18472                                 ip->ip_tos |= IPTOS_ECN_ECT0;
18473                         KMOD_TCPSTAT_INC(tcps_ecn_ect0);
18474                         /*
18475                          * Reply with proper ECN notifications.
18476                          * Only set CWR on new data segments.
18477                          */
18478                         if (tp->t_flags2 & TF2_ECN_SND_CWR) {
18479                                 flags |= TH_CWR;
18480                                 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
18481                         }
18482                 }
18483                 if (tp->t_flags2 & TF2_ECN_SND_ECE)
18484                         flags |= TH_ECE;
18485         }
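        /*
         * Illustrative sketch only: the decision made above for marking a
         * data segment ECT(0).  Parameter names are assumptions of this
         * sketch; peer_ecn_ok corresponds to TF2_ECN_PERMIT being set.
         */
#if 0
static int
example_mark_ect0(int len, int is_retransmission, int peer_ecn_ok)
{
        /* Only new data segments are marked; pure ACKs and rexmits are not. */
        return (peer_ecn_ok && len > 0 && !is_retransmission);
}
#endif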
18486         /*
18487          * If we are doing retransmissions, then snd_nxt will not reflect
18488          * the first unsent octet.  For ACK only packets, we do not want the
18489          * sequence number of the retransmitted packet, we want the sequence
18490          * number of the next unsent octet.  So, if there is no data (and no
18491          * SYN or FIN), use snd_max instead of snd_nxt when filling in
18492          * ti_seq.  But if we are in persist state, snd_max might reflect
18493          * one byte beyond the right edge of the window, so use snd_nxt in
18494          * that case, since we know we aren't doing a retransmission.
18495          * (retransmit and persist are mutually exclusive...)
18496          */
18497         if (sack_rxmit == 0) {
18498                 if (len || (flags & (TH_SYN | TH_FIN))) {
18499                         th->th_seq = htonl(tp->snd_nxt);
18500                         rack_seq = tp->snd_nxt;
18501                 } else {
18502                         th->th_seq = htonl(tp->snd_max);
18503                         rack_seq = tp->snd_max;
18504                 }
18505         } else {
18506                 th->th_seq = htonl(rsm->r_start);
18507                 rack_seq = rsm->r_start;
18508         }
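        /*
         * Illustrative sketch only: the sequence-number choice described in
         * the comment above, condensed into a standalone helper with assumed
         * parameter names.
         */
#if 0
#include <stdint.h>

static uint32_t
example_pick_seq(int sack_rxmit, uint32_t rsm_start, int len, int syn_or_fin,
    uint32_t snd_nxt, uint32_t snd_max)
{
        if (sack_rxmit)
                return (rsm_start);     /* retransmitting from a SACK hole */
        if (len || syn_or_fin)
                return (snd_nxt);       /* data (or SYN/FIN) goes at snd_nxt */
        return (snd_max);               /* pure ACK: first unsent octet */
}
#endif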
18509         th->th_ack = htonl(tp->rcv_nxt);
18510         th->th_flags = flags;
18511         /*
18512          * Calculate receive window.  Don't shrink window, but avoid silly
18513          * window syndrome.
18514          * If a RST segment is sent, advertise a window of zero.
18515          */
18516         if (flags & TH_RST) {
18517                 recwin = 0;
18518         } else {
18519                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
18520                     recwin < (long)segsiz) {
18521                         recwin = 0;
18522                 }
18523                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
18524                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
18525                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
18526         }
18527
18528         /*
18529          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
18530          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
18531          * handled in syncache.
18532          */
18533         if (flags & TH_SYN)
18534                 th->th_win = htons((u_short)
18535                                    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
18536         else {
18537                 /* Avoid shrinking window with window scaling. */
18538                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
18539                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
18540         }
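        /*
         * Illustrative sketch only: how the 16-bit window field above is
         * derived from recwin once the silly-window and "don't shrink"
         * checks have run.  Host byte order is used for clarity; the code
         * above additionally converts with htons().
         */
#if 0
#include <stdint.h>

static uint16_t
example_window_field(long recwin, unsigned int rcv_scale)
{
        /* Round up to a multiple of the scale so scaling never shrinks it. */
        long unit = 1L << rcv_scale;

        recwin = ((recwin + unit - 1) / unit) * unit;
        return ((uint16_t)(recwin >> rcv_scale));
}
#endif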
18541         /*
18542          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
18543          * window.  This may cause the remote transmitter to stall.  This
18544          * flag tells soreceive() to disable delayed acknowledgements when
18545          * draining the buffer.  This can occur if the receiver is
18546          * attempting to read more data than can be buffered prior to
18547          * transmitting on the connection.
18548          */
18549         if (th->th_win == 0) {
18550                 tp->t_sndzerowin++;
18551                 tp->t_flags |= TF_RXWIN0SENT;
18552         } else
18553                 tp->t_flags &= ~TF_RXWIN0SENT;
18554         tp->snd_up = tp->snd_una;       /* drag it along, it's deprecated */
18555         /* Now, are we using the fsb? If so, copy the template data to the mbuf. */
18556         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
18557                 uint8_t *cpto;
18558
18559                 cpto = mtod(m, uint8_t *);
18560                 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18561                 /*
18562                  * We have just copied in:
18563                  * IP/IP6
18564                  * <optional udphdr>
18565                  * tcphdr (no options)
18566                  *
18567                  * We need to grab the correct pointers into the mbuf
18568                  * for both the tcp header, and possibly the udp header (if tunneling).
18569                  * We do this by using the offset in the copy buffer and adding it
18570                  * to the mbuf base pointer (cpto).
18571                  */
18572 #ifdef INET6
18573                 if (isipv6)
18574                         ip6 = mtod(m, struct ip6_hdr *);
18575                 else
18576 #endif                          /* INET6 */
18577                         ip = mtod(m, struct ip *);
18578                 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18579                 /* If we have a udp header let's set it into the mbuf as well */
18580                 if (udp)
18581                         udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
18582         }
18583 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18584         if (to.to_flags & TOF_SIGNATURE) {
18585                 /*
18586                  * Calculate MD5 signature and put it into the place
18587                  * determined before.
18588                  * NOTE: since TCP options buffer doesn't point into
18589                  * mbuf's data, calculate offset and use it.
18590                  */
18591                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
18592                                                        (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
18593                         /*
18594                          * Do not send segment if the calculation of MD5
18595                          * digest has failed.
18596                          */
18597                         goto out;
18598                 }
18599         }
18600 #endif
18601         if (optlen) {
18602                 bcopy(opt, th + 1, optlen);
18603                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18604         }
18605         /*
18606          * Put TCP length in extended header, and then checksum extended
18607          * header and data.
18608          */
18609         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18610 #ifdef INET6
18611         if (isipv6) {
18612                 /*
18613                  * ip6_plen need not be filled in now; it will be filled
18614                  * in ip6_output.
18615                  */
18616                 if (tp->t_port) {
18617                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18618                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18619                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18620                         th->th_sum = htons(0);
18621                         UDPSTAT_INC(udps_opackets);
18622                 } else {
18623                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18624                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18625                         th->th_sum = in6_cksum_pseudo(ip6,
18626                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18627                                                       0);
18628                 }
18629         }
18630 #endif
18631 #if defined(INET6) && defined(INET)
18632         else
18633 #endif
18634 #ifdef INET
18635         {
18636                 if (tp->t_port) {
18637                         m->m_pkthdr.csum_flags = CSUM_UDP;
18638                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18639                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18640                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18641                         th->th_sum = htons(0);
18642                         UDPSTAT_INC(udps_opackets);
18643                 } else {
18644                         m->m_pkthdr.csum_flags = CSUM_TCP;
18645                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18646                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
18647                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18648                                                                         IPPROTO_TCP + len + optlen));
18649                 }
18650                 /* IP version must be set here for ipv4/ipv6 checking later */
18651                 KASSERT(ip->ip_v == IPVERSION,
18652                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18653         }
18654 #endif
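        /*
         * Illustrative sketch only: the pseudo-header partial sum that the
         * in_pseudo() calls above seed for checksum offload, written in host
         * byte order for clarity.  The NIC (or software fallback) then adds
         * the TCP/UDP header, options and payload on top of this seed.
         * Names are assumptions of this sketch.
         */
#if 0
#include <stdint.h>

static uint16_t
example_tcp_pseudo_sum(uint32_t saddr, uint32_t daddr, uint32_t tcp_len)
{
        uint32_t sum;

        /* Sum the pseudo-header fields as 16-bit words. */
        sum  = (saddr >> 16) + (saddr & 0xffff);
        sum += (daddr >> 16) + (daddr & 0xffff);
        sum += 6;               /* IPPROTO_TCP */
        sum += (tcp_len >> 16) + (tcp_len & 0xffff);
        /* Fold carries back in, ones-complement style. */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return ((uint16_t)sum);
}
#endif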
18655         /*
18656          * Enable TSO and specify the size of the segments. The TCP pseudo
18657          * header checksum is always provided. XXX: Fixme: This is currently
18658          * not the case for IPv6.
18659          */
18660         if (tso) {
18661                 KASSERT(len > tp->t_maxseg - optlen,
18662                         ("%s: len <= tso_segsz", __func__));
18663                 m->m_pkthdr.csum_flags |= CSUM_TSO;
18664                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
18665         }
18666         KASSERT(len + hdrlen == m_length(m, NULL),
18667                 ("%s: mbuf chain different than expected: %d + %u != %u",
18668                  __func__, len, hdrlen, m_length(m, NULL)));
18669
18670 #ifdef TCP_HHOOK
18671         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
18672         hhook_run_tcp_est_out(tp, th, &to, len, tso);
18673 #endif
18674         /* We're getting ready to send; log now. */
18675         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
18676                 union tcp_log_stackspecific log;
18677
18678                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18679                 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
18680                 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
18681                 if (rack->rack_no_prr)
18682                         log.u_bbr.flex1 = 0;
18683                 else
18684                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18685                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18686                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18687                 log.u_bbr.flex4 = orig_len;
18688                 if (filled_all)
18689                         log.u_bbr.flex5 = 0x80000000;
18690                 else
18691                         log.u_bbr.flex5 = 0;
18692                 /* Save off the early/late values */
18693                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18694                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18695                 log.u_bbr.bw_inuse = rack_get_bw(rack);
18696                 if (rsm || sack_rxmit) {
18697                         if (doing_tlp)
18698                                 log.u_bbr.flex8 = 2;
18699                         else
18700                                 log.u_bbr.flex8 = 1;
18701                 } else {
18702                         if (doing_tlp)
18703                                 log.u_bbr.flex8 = 3;
18704                         else
18705                                 log.u_bbr.flex8 = 0;
18706                 }
18707                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
18708                 log.u_bbr.flex7 = mark;
18709                 log.u_bbr.flex7 <<= 8;
18710                 log.u_bbr.flex7 |= pass;
18711                 log.u_bbr.pkts_out = tp->t_maxseg;
18712                 log.u_bbr.timeStamp = cts;
18713                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18714                 log.u_bbr.lt_epoch = cwnd_to_use;
18715                 log.u_bbr.delivered = sendalot;
18716                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
18717                                      len, &log, false, NULL, NULL, 0, &tv);
18718         } else
18719                 lgb = NULL;
18720
18721         /*
18722          * Fill in IP length and desired time to live and send to IP level.
18723          * There should be a better way to handle ttl and tos; we could keep
18724          * them in the template, but need a way to checksum without them.
18725          */
18726         /*
18727          * m->m_pkthdr.len should have been set before the checksum calculation,
18728          * because in6_cksum() needs it.
18729          */
18730 #ifdef INET6
18731         if (isipv6) {
18732                 /*
18733                  * we separately set hoplimit for every segment, since the
18734                  * user might want to change the value via setsockopt. Also,
18735                  * desired default hop limit might be changed via Neighbor
18736                  * Discovery.
18737                  */
18738                 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
18739
18740                 /*
18741                  * Set the packet size here for the benefit of DTrace
18742                  * probes. ip6_output() will set it properly; it's supposed
18743                  * to include the option header lengths as well.
18744                  */
18745                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18746
18747                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18748                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18749                 else
18750                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18751
18752                 if (tp->t_state == TCPS_SYN_SENT)
18753                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
18754
18755                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
18756                 /* TODO: IPv6 IP6TOS_ECT bit on */
18757                 error = ip6_output(m,
18758 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18759                                    inp->in6p_outputopts,
18760 #else
18761                                    NULL,
18762 #endif
18763                                    &inp->inp_route6,
18764                                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
18765                                    NULL, NULL, inp);
18766
18767                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
18768                         mtu = inp->inp_route6.ro_nh->nh_mtu;
18769         }
18770 #endif                          /* INET6 */
18771 #if defined(INET) && defined(INET6)
18772         else
18773 #endif
18774 #ifdef INET
18775         {
18776                 ip->ip_len = htons(m->m_pkthdr.len);
18777 #ifdef INET6
18778                 if (inp->inp_vflag & INP_IPV6PROTO)
18779                         ip->ip_ttl = in6_selecthlim(inp, NULL);
18780 #endif                          /* INET6 */
18781                 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
18782                 /*
18783                  * If we do path MTU discovery, then we set DF on every
18784                  * packet. This might not be the best thing to do according
18785                  * to RFC3390 Section 2. However the tcp hostcache mitigates
18786                  * the problem so it affects only the first tcp connection
18787                  * with a host.
18788                  *
18789                  * NB: Don't set DF on small MTU/MSS to have a safe
18790                  * fallback.
18791                  */
18792                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18793                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18794                         if (tp->t_port == 0 || len < V_tcp_minmss) {
18795                                 ip->ip_off |= htons(IP_DF);
18796                         }
18797                 } else {
18798                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18799                 }
18800
18801                 if (tp->t_state == TCPS_SYN_SENT)
18802                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
18803
18804                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
18805
18806                 error = ip_output(m,
18807 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18808                                   inp->inp_options,
18809 #else
18810                                   NULL,
18811 #endif
18812                                   &inp->inp_route,
18813                                   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
18814                                   inp);
18815                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
18816                         mtu = inp->inp_route.ro_nh->nh_mtu;
18817         }
18818 #endif                          /* INET */
18819
18820 out:
18821         if (lgb) {
18822                 lgb->tlb_errno = error;
18823                 lgb = NULL;
18824         }
18825         /*
18826          * In transmit state, time the transmission and arrange for the
18827          * retransmit.  In persist state, just set snd_max.
18828          */
18829         if (error == 0) {
18830                 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
18831                 if (rsm && doing_tlp) {
18832                         rack->rc_last_sent_tlp_past_cumack = 0;
18833                         rack->rc_last_sent_tlp_seq_valid = 1;
18834                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18835                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18836                 }
18837                 rack->forced_ack = 0;   /* If we send something zap the FA flag */
18838                 if (rsm && (doing_tlp == 0)) {
18839                         /* Set we retransmitted */
18840                         rack->rc_gp_saw_rec = 1;
18841                 } else {
18842                         if (cwnd_to_use > tp->snd_ssthresh) {
18843                                 /* Set we sent in CA */
18844                                 rack->rc_gp_saw_ca = 1;
18845                         } else {
18846                                 /* Set we sent in SS */
18847                                 rack->rc_gp_saw_ss = 1;
18848                         }
18849                 }
18850                 if (doing_tlp && (rsm == NULL)) {
18851                         /* Make sure new data TLP cnt is clear */
18852                         rack->r_ctl.rc_tlp_new_data = 0;
18853                 }
18854                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18855                     (tp->t_flags & TF_SACK_PERMIT) &&
18856                     tp->rcv_numsacks > 0)
18857                         tcp_clean_dsack_blocks(tp);
18858                 tot_len_this_send += len;
18859                 if (len == 0)
18860                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
18861                 else if (len == 1) {
18862                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
18863                 } else if (len > 1) {
18864                         int idx;
18865
18866                         idx = (len / segsiz) + 3;
18867                         if (idx >= TCP_MSS_ACCT_ATIMER)
18868                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18869                         else
18870                                 counter_u64_add(rack_out_size[idx], 1);
18871                 }
18872         }
18873         if ((rack->rack_no_prr == 0) &&
18874             sub_from_prr &&
18875             (error == 0)) {
18876                 if (rack->r_ctl.rc_prr_sndcnt >= len)
18877                         rack->r_ctl.rc_prr_sndcnt -= len;
18878                 else
18879                         rack->r_ctl.rc_prr_sndcnt = 0;
18880         }
18881         sub_from_prr = 0;
18882         if (doing_tlp) {
18883                 /* Make sure the TLP is added */
18884                 add_flag |= RACK_TLP;
18885         } else if (rsm) {
18886                 /* If its a resend without TLP then it must not have the flag */
18887                 /* If it's a resend without TLP then it must not have the flag */
18888         }
18889         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
18890                         rack_to_usec_ts(&tv),
18891                         rsm, add_flag, s_mb, s_moff, hw_tls);
18892
18893
18894         if ((error == 0) &&
18895             (len > 0) &&
18896             (tp->snd_una == tp->snd_max))
18897                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
18898         {
18899                 tcp_seq startseq = tp->snd_nxt;
18900
18901                 /* Track our lost count */
18902                 if (rsm && (doing_tlp == 0))
18903                         rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
18904                 /*
18905                  * Advance snd_nxt over sequence space of this segment.
18906                  */
18907                 if (error)
18908                         /* We don't log or do anything with errors */
18909                         goto nomore;
18910                 if (doing_tlp == 0) {
18911                         if (rsm == NULL) {
18912                                 /*
18913                                  * Not a retransmission of some
18914                                  * sort, new data is going out so
18915                                  * clear our TLP count and flag.
18916                                  */
18917                                 rack->rc_tlp_in_progress = 0;
18918                                 rack->r_ctl.rc_tlp_cnt_out = 0;
18919                         }
18920                 } else {
18921                         /*
18922                          * We have just sent a TLP, mark that it is true
18923                          * and make sure our in progress is set so we
18924                          * continue to check the count.
18925                          */
18926                         rack->rc_tlp_in_progress = 1;
18927                         rack->r_ctl.rc_tlp_cnt_out++;
18928                 }
18929                 if (flags & (TH_SYN | TH_FIN)) {
18930                         if (flags & TH_SYN)
18931                                 tp->snd_nxt++;
18932                         if (flags & TH_FIN) {
18933                                 tp->snd_nxt++;
18934                                 tp->t_flags |= TF_SENTFIN;
18935                         }
18936                 }
18937                 /* In the ENOBUFS case we do *not* update snd_max */
18938                 if (sack_rxmit)
18939                         goto nomore;
18940
18941                 tp->snd_nxt += len;
18942                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
18943                         if (tp->snd_una == tp->snd_max) {
18944                                 /*
18945                                  * Update the time we just added data since
18946                                  * none was outstanding.
18947                                  */
18948                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
18949                                 tp->t_acktime = ticks;
18950                         }
18951                         tp->snd_max = tp->snd_nxt;
18952                         /*
18953                          * Time this transmission if not a retransmission and
18954                          * not currently timing anything.
18955                          * This is only relevant in case of switching back to
18956                          * the base stack.
18957                          */
18958                         if (tp->t_rtttime == 0) {
18959                                 tp->t_rtttime = ticks;
18960                                 tp->t_rtseq = startseq;
18961                                 KMOD_TCPSTAT_INC(tcps_segstimed);
18962                         }
18963                         if (len &&
18964                             ((tp->t_flags & TF_GPUTINPROG) == 0))
18965                                 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
18966                 }
18967                 /*
18968                  * If we are doing FO we need to update the mbuf position and
18969                  * subtract what was sent. This happens when the peer sends us
18970                  * duplicate information and we thus want to send a DSACK.
18971                  *
18972                  * XXXRRS: This raises a question: when we send a DSACK block, is TSO
18973                  * turned off? If not then we are going to echo multiple DSACK blocks
18974                  * out (with the TSO), which we should not be doing.
18975                  */
18976                 if (rack->r_fast_output && len) {
18977                         if (rack->r_ctl.fsb.left_to_send > len)
18978                                 rack->r_ctl.fsb.left_to_send -= len;
18979                         else
18980                                 rack->r_ctl.fsb.left_to_send = 0;
18981                         if (rack->r_ctl.fsb.left_to_send < segsiz)
18982                                 rack->r_fast_output = 0;
18983                         if (rack->r_fast_output) {
18984                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
18985                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
18986                         }
18987                 }
18988         }
18989 nomore:
18990         if (error) {
18991                 rack->r_ctl.rc_agg_delayed = 0;
18992                 rack->r_early = 0;
18993                 rack->r_late = 0;
18994                 rack->r_ctl.rc_agg_early = 0;
18995                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
18996                 /*
18997                  * Failures do not advance the seq counter above. For the
18998                  * case of ENOBUFS we will fall out and retry in 1ms with
18999                  * the hpts. Everything else will just have to retransmit
19000                  * with the timer.
19001                  *
19002                  * In any case, we do not want to loop around for another
19003                  * send without a good reason.
19004                  */
19005                 sendalot = 0;
19006                 switch (error) {
19007                 case EPERM:
19008                         tp->t_softerror = error;
19009 #ifdef TCP_ACCOUNTING
19010                         crtsc = get_cyclecount();
19011                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19012                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19013                         }
19014                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19015                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19016                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19017                         }
19018                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19019                         sched_unpin();
19020 #endif
19021                         return (error);
19022                 case ENOBUFS:
19023                         /*
19024                          * Pace us right away to retry after a
19025                          * short time.
19026                          */
19027                         slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
19028                         if (rack->rc_enobuf < 0x7f)
19029                                 rack->rc_enobuf++;
19030                         if (slot < (10 * HPTS_USEC_IN_MSEC))
19031                                 slot = 10 * HPTS_USEC_IN_MSEC;
19032                         if (rack->r_ctl.crte != NULL) {
19033                                 counter_u64_add(rack_saw_enobuf_hw, 1);
19034                                 tcp_rl_log_enobuf(rack->r_ctl.crte);
19035                         }
19036                         counter_u64_add(rack_saw_enobuf, 1);
19037                         goto enobufs;
19038                 case EMSGSIZE:
19039                         /*
19040                          * For some reason the interface we used initially
19041                          * to send segments changed to another or lowered
19042                          * its MTU. If TSO was active we either got an
19043                          * interface without TSO capabilities or TSO was
19044                          * turned off. If we obtained mtu from ip_output()
19045                          * then update it and try again.
19046                          */
19047                         if (tso)
19048                                 tp->t_flags &= ~TF_TSO;
19049                         if (mtu != 0) {
19050                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
19051                                 goto again;
19052                         }
19053                         slot = 10 * HPTS_USEC_IN_MSEC;
19054                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
19055 #ifdef TCP_ACCOUNTING
19056                         crtsc = get_cyclecount();
19057                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19058                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19059                         }
19060                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19061                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19062                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19063                         }
19064                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19065                         sched_unpin();
19066 #endif
19067                         return (error);
19068                 case ENETUNREACH:
19069                         counter_u64_add(rack_saw_enetunreach, 1);
19070                 case EHOSTDOWN:
19071                 case EHOSTUNREACH:
19072                 case ENETDOWN:
19073                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
19074                                 tp->t_softerror = error;
19075                         }
19076                         /* FALLTHROUGH */
19077                 default:
19078                         slot = 10 * HPTS_USEC_IN_MSEC;
19079                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
19080 #ifdef TCP_ACCOUNTING
19081                         crtsc = get_cyclecount();
19082                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19083                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
19084                         }
19085                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
19086                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19087                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
19088                         }
19089                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
19090                         sched_unpin();
19091 #endif
19092                         return (error);
19093                 }
19094         } else {
19095                 rack->rc_enobuf = 0;
19096                 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
19097                         rack->r_ctl.retran_during_recovery += len;
19098         }
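        /*
         * Illustrative sketch only: the ENOBUFS pacing backoff applied in the
         * error handling above, as a standalone helper.  The usec-per-msec
         * factor is written out instead of using HPTS_USEC_IN_MSEC; the helper
         * name is an assumption of this sketch.
         */
#if 0
#include <stdint.h>

static uint32_t
example_enobuf_slot(uint8_t *enobuf_cnt)
{
        uint32_t slot;

        /* One extra millisecond per consecutive ENOBUFS, saturating at 0x7f. */
        slot = (1 + *enobuf_cnt) * 1000;
        if (*enobuf_cnt < 0x7f)
                (*enobuf_cnt)++;
        /* Never retry sooner than 10ms after an ENOBUFS. */
        if (slot < 10 * 1000)
                slot = 10 * 1000;
        return (slot);
}
#endif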
19099         KMOD_TCPSTAT_INC(tcps_sndtotal);
19100
19101         /*
19102          * Data sent (as far as we can tell). If this advertises a larger
19103          * window than any other segment, then remember the size of the
19104          * advertised window. Any pending ACK has now been sent.
19105          */
19106         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
19107                 tp->rcv_adv = tp->rcv_nxt + recwin;
19108
19109         tp->last_ack_sent = tp->rcv_nxt;
19110         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
19111 enobufs:
19112         if (sendalot) {
19113                 /* Do we need to turn off sendalot? */
19114                 if (rack->r_ctl.rc_pace_max_segs &&
19115                     (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
19116                         /* We hit our max. */
19117                         sendalot = 0;
19118                 } else if ((rack->rc_user_set_max_segs) &&
19119                            (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
19120                         /* We hit the user defined max */
19121                         sendalot = 0;
19122                 }
19123         }
19124         if ((error == 0) && (flags & TH_FIN))
19125                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
19126         if (flags & TH_RST) {
19127                 /*
19128                  * We don't send again after sending a RST.
19129                  */
19130                 slot = 0;
19131                 sendalot = 0;
19132                 if (error == 0)
19133                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
19134         } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
19135                 /*
19136                  * Get our pacing rate; if an error
19137                  * occurred in sending (ENOBUFS) we would
19138                  * hit the else if with slot preset. Other
19139                  * errors return.
19140                  */
19141                 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
19142         }
19143         if (rsm &&
19144             (rsm->r_flags & RACK_HAS_SYN) == 0 &&
19145             rack->use_rack_rr) {
19146                 /* It's a retransmit and we are using the rack cheat */
19147                 if ((slot == 0) ||
19148                     (rack->rc_always_pace == 0) ||
19149                     (rack->r_rr_config == 1)) {
19150                         /*
19151                          * We have no pacing set or we
19152                          * are using old-style rack or
19153                          * we are overridden to use the old 1ms pacing.
19154                          */
19155                         slot = rack->r_ctl.rc_min_to;
19156                 }
19157         }
19158         /* We have sent, clear the flag */
19159         rack->r_ent_rec_ns = 0;
19160         if (rack->r_must_retran) {
19161                 if (rsm) {
19162                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
19163                         if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
19164                                 /*
19165                                  * We have retransmitted all.
19166                                  */
19167                                 rack->r_must_retran = 0;
19168                                 rack->r_ctl.rc_out_at_rto = 0;
19169                         }
19170                 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19171                         /*
19172                          * Sending new data will also kill
19173                          * the loop.
19174                          */
19175                         rack->r_must_retran = 0;
19176                         rack->r_ctl.rc_out_at_rto = 0;
19177                 }
19178         }
19179         rack->r_ctl.fsb.recwin = recwin;
19180         if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
19181             SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19182                 /*
19183                  * We hit an RTO and have now passed the snd_max set at the
19184                  * RTO; clear all the WAS flags.
19185                  */
19186                 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
19187         }
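              /*
               * A non-zero slot means we are pacing this send; in that case
               * we also consider arming the fast-send block (fsb) so the next
               * segment can go out through rack_fast_output() without another
               * full rack_output() pass.
               */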
19188         if (slot) {
19189                 /* set the rack tcb into the slot N */
19190                 counter_u64_add(rack_paced_segments, 1);
19191                 if ((error == 0) &&
19192                     rack_use_rfo &&
19193                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19194                     (rsm == NULL) &&
19195                     (tp->snd_nxt == tp->snd_max) &&
19196                     (ipoptlen == 0) &&
19197                     (tp->rcv_numsacks == 0) &&
19198                     rack->r_fsb_inited &&
19199                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19200                     (rack->r_must_retran == 0) &&
19201                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19202                     (len > 0) && (orig_len > 0) &&
19203                     (orig_len > len) &&
19204                     ((orig_len - len) >= segsiz) &&
19205                     ((optlen == 0) ||
19206                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19207                         /* We can send at least one more MSS using our fsb */
19208
19209                         rack->r_fast_output = 1;
19210                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19211                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19212                         rack->r_ctl.fsb.tcp_flags = flags;
19213                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19214                         if (hw_tls)
19215                                 rack->r_ctl.fsb.hw_tls = 1;
19216                         else
19217                                 rack->r_ctl.fsb.hw_tls = 0;
19218                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19219                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19220                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19221                                  (tp->snd_max - tp->snd_una)));
19222                         if (rack->r_ctl.fsb.left_to_send < segsiz)
19223                                 rack->r_fast_output = 0;
19224                         else {
19225                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19226                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19227                                 else
19228                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19229                         }
19230                 } else
19231                         rack->r_fast_output = 0;
19232                 rack_log_fsb(rack, tp, so, flags,
19233                              ipoptlen, orig_len, len, error,
19234                              (rsm == NULL), optlen, __LINE__, 2);
19235         } else if (sendalot) {
19236                 int ret;
19237
19238                 if (len)
19239                         counter_u64_add(rack_unpaced_segments, 1);
19240                 sack_rxmit = 0;
19241                 if ((error == 0) &&
19242                     rack_use_rfo &&
19243                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19244                     (rsm == NULL) &&
19245                     (ipoptlen == 0) &&
19246                     (tp->rcv_numsacks == 0) &&
19247                     (tp->snd_nxt == tp->snd_max) &&
19248                     (rack->r_must_retran == 0) &&
19249                     rack->r_fsb_inited &&
19250                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19251                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19252                     (len > 0) && (orig_len > 0) &&
19253                     (orig_len > len) &&
19254                     ((orig_len - len) >= segsiz) &&
19255                     ((optlen == 0) ||
19256                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19257                         /* we can use fast_output for more */
19258
19259                         rack->r_fast_output = 1;
19260                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19261                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19262                         rack->r_ctl.fsb.tcp_flags = flags;
19263                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19264                         if (hw_tls)
19265                                 rack->r_ctl.fsb.hw_tls = 1;
19266                         else
19267                                 rack->r_ctl.fsb.hw_tls = 0;
19268                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19269                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19270                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19271                                  (tp->snd_max - tp->snd_una)));
19272                         if (rack->r_ctl.fsb.left_to_send < segsiz) {
19273                                 rack->r_fast_output = 0;
19274                         }
19275                         if (rack->r_fast_output) {
19276                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19277                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19278                                 else
19279                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19280                                 rack_log_fsb(rack, tp, so, flags,
19281                                              ipoptlen, orig_len, len, error,
19282                                              (rsm == NULL), optlen, __LINE__, 3);
19283                                 error = 0;
19284                                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
19285                                 if (ret >= 0)
19286                                         return (ret);
19287                                 else if (error)
19288                                         goto nomore;
19289
19290                         }
19291                 }
19292                 goto again;
19293         } else if (len) {
19294                 counter_u64_add(rack_unpaced_segments, 1);
19295         }
19296         /* Assure when we leave that snd_nxt will point to top */
19297         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
19298                 tp->snd_nxt = tp->snd_max;
19299         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
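              /*
               * Under TCP_ACCOUNTING, charge the cycles spent in this call
               * (and the number of MSS worth of data sent) to either the
               * data-send or the pure-ack buckets.
               */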
19300 #ifdef TCP_ACCOUNTING
19301         crtsc = get_cyclecount() - ts_val;
19302         if (tot_len_this_send) {
19303                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19304                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
19305                 }
19306                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
19307                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19308                         tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
19309                 }
19310                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc);
19311                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19312                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
19313                 }
19314                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz));
19315         } else {
19316                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19317                         tp->tcp_cnt_counters[SND_OUT_ACK]++;
19318                 }
19319                 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1);
19320                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19321                         tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
19322                 }
19323                 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc);
19324         }
19325         sched_unpin();
19326 #endif
19327         if (error == ENOBUFS)
19328                 error = 0;
19329         return (error);
19330 }
19331
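      /*
       * Re-derive the pacing segment size and log the change if it moved.
       */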
19332 static void
19333 rack_update_seg(struct tcp_rack *rack)
19334 {
19335         uint32_t orig_val;
19336
19337         orig_val = rack->r_ctl.rc_pace_max_segs;
19338         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
19339         if (orig_val != rack->r_ctl.rc_pace_max_segs)
19340                 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
19341 }
19342
19343 static void
19344 rack_mtu_change(struct tcpcb *tp)
19345 {
19346         /*
19347          * The MSS may have changed
19348          */
19349         struct tcp_rack *rack;
19350         struct rack_sendmap *rsm;
19351
19352         rack = (struct tcp_rack *)tp->t_fb_ptr;
19353         if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
19354                 /*
19355                  * The MTU has changed; we need to resend everything
19356                  * since all we have sent is lost. We first fix
19357                  * up the mtu though.
19358                  */
19359                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
19360                 /* We treat this like a full retransmit timeout without the cwnd adjustment */
19361                 rack_remxt_tmr(tp);
19362                 rack->r_fast_output = 0;
19363                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
19364                                                 rack->r_ctl.rc_sacked);
19365                 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
19366                 rack->r_must_retran = 1;
19367                 /* Mark all inflight to needing to be rxt'd */
19368                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
19369                         rsm->r_flags |= RACK_MUST_RXT;
19370                 }
19371         }
19372         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
19373         /* We don't use snd_nxt to retransmit */
19374         tp->snd_nxt = tp->snd_max;
19375 }
19376
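      /*
       * Apply one of the canned rack configuration profiles (selected via the
       * TCP_RACK_PROFILE socket option):
       *   1 - pacing always on, compressed acks (if enabled globally), shared
       *       cwnd, dynamic GP multipliers, rrr_conf=3, npush=2, fillcw,
       *       no PRR, limited shared cwnd.
       *   3 - same as profile 1 except fill_cw uses the less aggressive setting.
       *   2 - roughly profile 1, but compressed acks are always on and the
       *       shared cwnd is not limited (lscwnd=0).
       *   0 - revert everything to the stack's sysctl defaults.
       * Returns 0 on success, EBUSY if pacing cannot be enabled, or EINVAL
       * for an unknown profile.
       */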
19377 static int
19378 rack_set_profile(struct tcp_rack *rack, int prof)
19379 {
19380         int err = EINVAL;
19381         if (prof == 1) {
19382                 /* pace_always=1 */
19383                 if (rack->rc_always_pace == 0) {
19384                         if (tcp_can_enable_pacing() == 0)
19385                                 return (EBUSY);
19386                 }
19387                 rack->rc_always_pace = 1;
19388                 if (rack->use_fixed_rate || rack->gp_ready)
19389                         rack_set_cc_pacing(rack);
19390                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19391                 rack->rack_attempt_hdwr_pace = 0;
19392                 /* cmpack=1 */
19393                 if (rack_use_cmp_acks)
19394                         rack->r_use_cmp_ack = 1;
19395                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19396                     rack->r_use_cmp_ack)
19397                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19398                 /* scwnd=1 */
19399                 rack->rack_enable_scwnd = 1;
19400                 /* dynamic=100 */
19401                 rack->rc_gp_dyn_mul = 1;
19402                 /* gp_inc_ca */
19403                 rack->r_ctl.rack_per_of_gp_ca = 100;
19404                 /* rrr_conf=3 */
19405                 rack->r_rr_config = 3;
19406                 /* npush=2 */
19407                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19408                 /* fillcw=1 */
19409                 rack->rc_pace_to_cwnd = 1;
19410                 rack->rc_pace_fill_if_rttin_range = 0;
19411                 rack->rtt_limit_mul = 0;
19412                 /* noprr=1 */
19413                 rack->rack_no_prr = 1;
19414                 /* lscwnd=1 */
19415                 rack->r_limit_scw = 1;
19416                 /* gp_inc_rec */
19417                 rack->r_ctl.rack_per_of_gp_rec = 90;
19418                 err = 0;
19419
19420         } else if (prof == 3) {
19421                 /* Same as profile one except fill_cw becomes 2 (less aggressive set) */
19422                 /* pace_always=1 */
19423                 if (rack->rc_always_pace == 0) {
19424                         if (tcp_can_enable_pacing() == 0)
19425                                 return (EBUSY);
19426                 }
19427                 rack->rc_always_pace = 1;
19428                 if (rack->use_fixed_rate || rack->gp_ready)
19429                         rack_set_cc_pacing(rack);
19430                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19431                 rack->rack_attempt_hdwr_pace = 0;
19432                 /* cmpack=1 */
19433                 if (rack_use_cmp_acks)
19434                         rack->r_use_cmp_ack = 1;
19435                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19436                     rack->r_use_cmp_ack)
19437                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19438                 /* scwnd=1 */
19439                 rack->rack_enable_scwnd = 1;
19440                 /* dynamic=100 */
19441                 rack->rc_gp_dyn_mul = 1;
19442                 /* gp_inc_ca */
19443                 rack->r_ctl.rack_per_of_gp_ca = 100;
19444                 /* rrr_conf=3 */
19445                 rack->r_rr_config = 3;
19446                 /* npush=2 */
19447                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19448                 /* fillcw=2 */
19449                 rack->rc_pace_to_cwnd = 1;
19450                 rack->r_fill_less_agg = 1;
19451                 rack->rc_pace_fill_if_rttin_range = 0;
19452                 rack->rtt_limit_mul = 0;
19453                 /* noprr=1 */
19454                 rack->rack_no_prr = 1;
19455                 /* lscwnd=1 */
19456                 rack->r_limit_scw = 1;
19457                 /* gp_inc_rec */
19458                 rack->r_ctl.rack_per_of_gp_rec = 90;
19459                 err = 0;
19460
19461
19462         } else if (prof == 2) {
19463                 /* cmpack=1 */
19464                 if (rack->rc_always_pace == 0) {
19465                         if (tcp_can_enable_pacing() == 0)
19466                                 return (EBUSY);
19467                 }
19468                 rack->rc_always_pace = 1;
19469                 if (rack->use_fixed_rate || rack->gp_ready)
19470                         rack_set_cc_pacing(rack);
19471                 rack->r_use_cmp_ack = 1;
19472                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19473                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19474                 /* pace_always=1 */
19475                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19476                 /* scwnd=1 */
19477                 rack->rack_enable_scwnd = 1;
19478                 /* dynamic=100 */
19479                 rack->rc_gp_dyn_mul = 1;
19480                 rack->r_ctl.rack_per_of_gp_ca = 100;
19481                 /* rrr_conf=3 */
19482                 rack->r_rr_config = 3;
19483                 /* npush=2 */
19484                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19485                 /* fillcw=1 */
19486                 rack->rc_pace_to_cwnd = 1;
19487                 rack->rc_pace_fill_if_rttin_range = 0;
19488                 rack->rtt_limit_mul = 0;
19489                 /* noprr=1 */
19490                 rack->rack_no_prr = 1;
19491                 /* lscwnd=0 */
19492                 rack->r_limit_scw = 0;
19493                 err = 0;
19494         } else if (prof == 0) {
19495                 /* This changes things back to the default settings */
19496                 err = 0;
19497                 if (rack->rc_always_pace) {
19498                         tcp_decrement_paced_conn();
19499                         rack_undo_cc_pacing(rack);
19500                         rack->rc_always_pace = 0;
19501                 }
19502                 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
19503                         rack->rc_always_pace = 1;
19504                         if (rack->use_fixed_rate || rack->gp_ready)
19505                                 rack_set_cc_pacing(rack);
19506                 } else
19507                         rack->rc_always_pace = 0;
19508                 if (rack_dsack_std_based & 0x1) {
19509                         /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
19510                         rack->rc_rack_tmr_std_based = 1;
19511                 }
19512                 if (rack_dsack_std_based & 0x2) {
19513                         /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
19514                         rack->rc_rack_use_dsack = 1;
19515                 }
19516                 if (rack_use_cmp_acks)
19517                         rack->r_use_cmp_ack = 1;
19518                 else
19519                         rack->r_use_cmp_ack = 0;
19520                 if (rack_disable_prr)
19521                         rack->rack_no_prr = 1;
19522                 else
19523                         rack->rack_no_prr = 0;
19524                 if (rack_gp_no_rec_chg)
19525                         rack->rc_gp_no_rec_chg = 1;
19526                 else
19527                         rack->rc_gp_no_rec_chg = 0;
19528                 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
19529                         rack->r_mbuf_queue = 1;
19530                         if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19531                                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19532                         rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19533                 } else {
19534                         rack->r_mbuf_queue = 0;
19535                         rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19536                 }
19537                 if (rack_enable_shared_cwnd)
19538                         rack->rack_enable_scwnd = 1;
19539                 else
19540                         rack->rack_enable_scwnd = 0;
19541                 if (rack_do_dyn_mul) {
19542                         /* When dynamic adjustment is on, CA needs to start at 100% */
19543                         rack->rc_gp_dyn_mul = 1;
19544                         if (rack_do_dyn_mul >= 100)
19545                                 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
19546                 } else {
19547                         rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
19548                         rack->rc_gp_dyn_mul = 0;
19549                 }
19550                 rack->r_rr_config = 0;
19551                 rack->r_ctl.rc_no_push_at_mrtt = 0;
19552                 rack->rc_pace_to_cwnd = 0;
19553                 rack->rc_pace_fill_if_rttin_range = 0;
19554                 rack->rtt_limit_mul = 0;
19555
19556                 if (rack_enable_hw_pacing)
19557                         rack->rack_hdw_pace_ena = 1;
19558                 else
19559                         rack->rack_hdw_pace_ena = 0;
19560                 if (rack_disable_prr)
19561                         rack->rack_no_prr = 1;
19562                 else
19563                         rack->rack_no_prr = 0;
19564                 if (rack_limits_scwnd)
19565                         rack->r_limit_scw  = 1;
19566                 else
19567                         rack->r_limit_scw  = 0;
19568                 err = 0;
19569         }
19570         return (err);
19571 }
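      /*
       * Illustrative sketch (not compiled here): a userspace application that
       * has already switched the connection to the rack stack could select
       * profile 1 with the TCP_RACK_PROFILE socket option, e.g.:
       *
       *      int prof = 1;
       *      if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
       *          &prof, sizeof(prof)) == -1)
       *              warn("TCP_RACK_PROFILE");
       *
       * The option value is routed through rack_process_option() into
       * rack_set_profile() above.
       */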
19572
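      /*
       * Queue a socket option to be applied later (see
       * rack_apply_deferred_options()).  The allocation is M_NOWAIT, so it
       * can fail; returns 1 on success and 0 if no memory was available.
       */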
19573 static int
19574 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
19575 {
19576         struct deferred_opt_list *dol;
19577
19578         dol = malloc(sizeof(struct deferred_opt_list),
19579                      M_TCPFSB, M_NOWAIT|M_ZERO);
19580         if (dol == NULL) {
19581                 /*
19582                  * No space yikes -- fail out..
19583                  */
19584                 return (0);
19585         }
19586         dol->optname = sopt_name;
19587         dol->optval = loptval;
19588         TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
19589         return (1);
19590 }
19591
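      /*
       * Apply a single rack/TCP socket option to this connection.  optval
       * carries the 32-bit value; loptval is used by the few options (such
       * as TCP_PACING_RATE_CAP) that take a 64-bit argument.  Returns 0 on
       * success or an errno value.
       */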
19592 static int
19593 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
19594                     uint32_t optval, uint64_t loptval)
19595 {
19596         struct epoch_tracker et;
19597         struct sockopt sopt;
19598         struct cc_newreno_opts opt;
19599         uint64_t val;
19600         int error = 0;
19601         uint16_t ca, ss;
19602
19603         switch (sopt_name) {
19604
19605         case TCP_RACK_DSACK_OPT:
19606                 RACK_OPTS_INC(tcp_rack_dsack_opt);
19607                 if (optval & 0x1) {
19608                         rack->rc_rack_tmr_std_based = 1;
19609                 } else {
19610                         rack->rc_rack_tmr_std_based = 0;
19611                 }
19612                 if (optval & 0x2) {
19613                         rack->rc_rack_use_dsack = 1;
19614                 } else {
19615                         rack->rc_rack_use_dsack = 0;
19616                 }
19617                 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
19618                 break;
19619         case TCP_RACK_PACING_BETA:
19620                 RACK_OPTS_INC(tcp_rack_beta);
19621                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19622                         /* This only works for newreno. */
19623                         error = EINVAL;
19624                         break;
19625                 }
19626                 if (rack->rc_pacing_cc_set) {
19627                         /*
19628                          * Set them into the real CC module
19629                          * what's in the rack pcb are the old values
19630                          * to be used on restoral.
19631                          */
19632                         sopt.sopt_dir = SOPT_SET;
19633                         opt.name = CC_NEWRENO_BETA;
19634                         opt.val = optval;
19635                         if (CC_ALGO(tp)->ctl_output != NULL)
19636                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19637                         else {
19638                                 error = ENOENT;
19639                                 break;
19640                         }
19641                 } else {
19642                         /*
19643                          * Not pacing yet so set it into our local
19644                          * rack pcb storage.
19645                          */
19646                         rack->r_ctl.rc_saved_beta.beta = optval;
19647                 }
19648                 break;
19649         case TCP_RACK_TIMER_SLOP:
19650                 RACK_OPTS_INC(tcp_rack_timer_slop);
19651                 rack->r_ctl.timer_slop = optval;
19652                 if (rack->rc_tp->t_srtt) {
19653                         /*
19654                          * If we have an SRTT let's update t_rxtcur
19655                          * to have the new slop.
19656                          */
19657                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
19658                                            rack_rto_min, rack_rto_max,
19659                                            rack->r_ctl.timer_slop);
19660                 }
19661                 break;
19662         case TCP_RACK_PACING_BETA_ECN:
19663                 RACK_OPTS_INC(tcp_rack_beta_ecn);
19664                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19665                         /* This only works for newreno. */
19666                         error = EINVAL;
19667                         break;
19668                 }
19669                 if (rack->rc_pacing_cc_set) {
19670                         /*
19671                          * Set them into the real CC module
19672                          * what's in the rack pcb are the old values
19673                          * to be used on restoral.
19674                          */
19675                         sopt.sopt_dir = SOPT_SET;
19676                         opt.name = CC_NEWRENO_BETA_ECN;
19677                         opt.val = optval;
19678                         if (CC_ALGO(tp)->ctl_output != NULL)
19679                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19680                         else
19681                                 error = ENOENT;
19682                 } else {
19683                         /*
19684                          * Not pacing yet so set it into our local
19685                          * rack pcb storage.
19686                          */
19687                         rack->r_ctl.rc_saved_beta.beta_ecn = optval;
19688                         rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED;
19689                 }
19690                 break;
19691         case TCP_DEFER_OPTIONS:
19692                 RACK_OPTS_INC(tcp_defer_opt);
19693                 if (optval) {
19694                         if (rack->gp_ready) {
19695                                 /* Too late */
19696                                 error = EINVAL;
19697                                 break;
19698                         }
19699                         rack->defer_options = 1;
19700                 } else
19701                         rack->defer_options = 0;
19702                 break;
19703         case TCP_RACK_MEASURE_CNT:
19704                 RACK_OPTS_INC(tcp_rack_measure_cnt);
19705                 if (optval && (optval <= 0xff)) {
19706                         rack->r_ctl.req_measurements = optval;
19707                 } else
19708                         error = EINVAL;
19709                 break;
19710         case TCP_REC_ABC_VAL:
19711                 RACK_OPTS_INC(tcp_rec_abc_val);
19712                 if (optval > 0)
19713                         rack->r_use_labc_for_rec = 1;
19714                 else
19715                         rack->r_use_labc_for_rec = 0;
19716                 break;
19717         case TCP_RACK_ABC_VAL:
19718                 RACK_OPTS_INC(tcp_rack_abc_val);
19719                 if ((optval > 0) && (optval < 255))
19720                         rack->rc_labc = optval;
19721                 else
19722                         error = EINVAL;
19723                 break;
19724         case TCP_HDWR_UP_ONLY:
19725                 RACK_OPTS_INC(tcp_pacing_up_only);
19726                 if (optval)
19727                         rack->r_up_only = 1;
19728                 else
19729                         rack->r_up_only = 0;
19730                 break;
19731         case TCP_PACING_RATE_CAP:
19732                 RACK_OPTS_INC(tcp_pacing_rate_cap);
19733                 rack->r_ctl.bw_rate_cap = loptval;
19734                 break;
19735         case TCP_RACK_PROFILE:
19736                 RACK_OPTS_INC(tcp_profile);
19737                 error = rack_set_profile(rack, optval);
19738                 break;
19739         case TCP_USE_CMP_ACKS:
19740                 RACK_OPTS_INC(tcp_use_cmp_acks);
19741                 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
19742                         /* You can't turn it off once it's on! */
19743                         error = EINVAL;
19744                 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
19745                         rack->r_use_cmp_ack = 1;
19746                         rack->r_mbuf_queue = 1;
19747                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19748                 }
19749                 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
19750                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19751                 break;
19752         case TCP_SHARED_CWND_TIME_LIMIT:
19753                 RACK_OPTS_INC(tcp_lscwnd);
19754                 if (optval)
19755                         rack->r_limit_scw = 1;
19756                 else
19757                         rack->r_limit_scw = 0;
19758                 break;
19759         case TCP_RACK_PACE_TO_FILL:
19760                 RACK_OPTS_INC(tcp_fillcw);
19761                 if (optval == 0)
19762                         rack->rc_pace_to_cwnd = 0;
19763                 else {
19764                         rack->rc_pace_to_cwnd = 1;
19765                         if (optval > 1)
19766                                 rack->r_fill_less_agg = 1;
19767                 }
19768                 if ((optval >= rack_gp_rtt_maxmul) &&
19769                     rack_gp_rtt_maxmul &&
19770                     (optval < 0xf)) {
19771                         rack->rc_pace_fill_if_rttin_range = 1;
19772                         rack->rtt_limit_mul = optval;
19773                 } else {
19774                         rack->rc_pace_fill_if_rttin_range = 0;
19775                         rack->rtt_limit_mul = 0;
19776                 }
19777                 break;
19778         case TCP_RACK_NO_PUSH_AT_MAX:
19779                 RACK_OPTS_INC(tcp_npush);
19780                 if (optval == 0)
19781                         rack->r_ctl.rc_no_push_at_mrtt = 0;
19782                 else if (optval < 0xff)
19783                         rack->r_ctl.rc_no_push_at_mrtt = optval;
19784                 else
19785                         error = EINVAL;
19786                 break;
19787         case TCP_SHARED_CWND_ENABLE:
19788                 RACK_OPTS_INC(tcp_rack_scwnd);
19789                 if (optval == 0)
19790                         rack->rack_enable_scwnd = 0;
19791                 else
19792                         rack->rack_enable_scwnd = 1;
19793                 break;
19794         case TCP_RACK_MBUF_QUEUE:
19795                 /* Do we use the LRO mbuf-queue feature? */
19796                 RACK_OPTS_INC(tcp_rack_mbufq);
19797                 if (optval || rack->r_use_cmp_ack)
19798                         rack->r_mbuf_queue = 1;
19799                 else
19800                         rack->r_mbuf_queue = 0;
19801                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19802                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19803                 else
19804                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19805                 break;
19806         case TCP_RACK_NONRXT_CFG_RATE:
19807                 RACK_OPTS_INC(tcp_rack_cfg_rate);
19808                 if (optval == 0)
19809                         rack->rack_rec_nonrxt_use_cr = 0;
19810                 else
19811                         rack->rack_rec_nonrxt_use_cr = 1;
19812                 break;
19813         case TCP_NO_PRR:
19814                 RACK_OPTS_INC(tcp_rack_noprr);
19815                 if (optval == 0)
19816                         rack->rack_no_prr = 0;
19817                 else if (optval == 1)
19818                         rack->rack_no_prr = 1;
19819                 else if (optval == 2)
19820                         rack->no_prr_addback = 1;
19821                 else
19822                         error = EINVAL;
19823                 break;
19824         case TCP_TIMELY_DYN_ADJ:
19825                 RACK_OPTS_INC(tcp_timely_dyn);
19826                 if (optval == 0)
19827                         rack->rc_gp_dyn_mul = 0;
19828                 else {
19829                         rack->rc_gp_dyn_mul = 1;
19830                         if (optval >= 100) {
19831                                 /*
19832                                  * If the user sets something 100 or more
19833                                  * it's the gp_ca value.
19834                                  */
19835                                 rack->r_ctl.rack_per_of_gp_ca  = optval;
19836                         }
19837                 }
19838                 break;
19839         case TCP_RACK_DO_DETECTION:
19840                 RACK_OPTS_INC(tcp_rack_do_detection);
19841                 if (optval == 0)
19842                         rack->do_detection = 0;
19843                 else
19844                         rack->do_detection = 1;
19845                 break;
19846         case TCP_RACK_TLP_USE:
19847                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
19848                         error = EINVAL;
19849                         break;
19850                 }
19851                 RACK_OPTS_INC(tcp_tlp_use);
19852                 rack->rack_tlp_threshold_use = optval;
19853                 break;
19854         case TCP_RACK_TLP_REDUCE:
19855                 /* RACK TLP cwnd reduction (bool) */
19856                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
19857                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
19858                 break;
19859         /*  Pacing related ones */
19860         case TCP_RACK_PACE_ALWAYS:
19861                 /*
19862                  * zero is old rack method, 1 is new
19863                  * method using a pacing rate.
19864                  */
19865                 RACK_OPTS_INC(tcp_rack_pace_always);
19866                 if (optval > 0) {
19867                         if (rack->rc_always_pace) {
19868                                 error = EALREADY;
19869                                 break;
19870                         } else if (tcp_can_enable_pacing()) {
19871                                 rack->rc_always_pace = 1;
19872                                 if (rack->use_fixed_rate || rack->gp_ready)
19873                                         rack_set_cc_pacing(rack);
19874                         }
19875                         else {
19876                                 error = ENOSPC;
19877                                 break;
19878                         }
19879                 } else {
19880                         if (rack->rc_always_pace) {
19881                                 tcp_decrement_paced_conn();
19882                                 rack->rc_always_pace = 0;
19883                                 rack_undo_cc_pacing(rack);
19884                         }
19885                 }
19886                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19887                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19888                 else
19889                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19890                 /* A rate may be set (irate or other); if so, set the seg size */
19891                 rack_update_seg(rack);
19892                 break;
19893         case TCP_BBR_RACK_INIT_RATE:
19894                 RACK_OPTS_INC(tcp_initial_rate);
19895                 val = optval;
19896                 /* Change from kbits per second to bytes per second */
19897                 val *= 1000;
19898                 val /= 8;
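                      /*
                       * For example, an optval of 5000 (5 Mbit/s) becomes
                       * 5000 * 1000 / 8 = 625000 bytes per second.
                       */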
19899                 rack->r_ctl.init_rate = val;
19900                 if (rack->rc_init_win != rack_default_init_window) {
19901                         uint32_t win, snt;
19902
19903                         /*
19904                          * Options don't always get applied
19905                          * in the order you think. So in order
19906                          * to ensure we update a cwnd we need
19907                          * to check and see if we are still
19908                          * where we should raise the cwnd.
19909                          */
19910                         win = rc_init_window(rack);
19911                         if (SEQ_GT(tp->snd_max, tp->iss))
19912                                 snt = tp->snd_max - tp->iss;
19913                         else
19914                                 snt = 0;
19915                         if ((snt < win) &&
19916                             (tp->snd_cwnd < win))
19917                                 tp->snd_cwnd = win;
19918                 }
19919                 if (rack->rc_always_pace)
19920                         rack_update_seg(rack);
19921                 break;
19922         case TCP_BBR_IWINTSO:
19923                 RACK_OPTS_INC(tcp_initial_win);
19924                 if (optval && (optval <= 0xff)) {
19925                         uint32_t win, snt;
19926
19927                         rack->rc_init_win = optval;
19928                         win = rc_init_window(rack);
19929                         if (SEQ_GT(tp->snd_max, tp->iss))
19930                                 snt = tp->snd_max - tp->iss;
19931                         else
19932                                 snt = 0;
19933                         if ((snt < win) &&
19934                             (tp->t_srtt |
19935 #ifdef NETFLIX_PEAKRATE
19936                              tp->t_maxpeakrate |
19937 #endif
19938                              rack->r_ctl.init_rate)) {
19939                                 /*
19940                                  * We are not past the initial window
19941                                  * and we have some bases for pacing,
19942                                  * so we need to possibly adjust up
19943                                  * the cwnd. Note even if we don't set
19944                                  * the cwnd, it's still ok to raise the rc_init_win
19945                                  * which can be used coming out of idle when we
19946                                  * would have a rate.
19947                                  */
19948                                 if (tp->snd_cwnd < win)
19949                                         tp->snd_cwnd = win;
19950                         }
19951                         if (rack->rc_always_pace)
19952                                 rack_update_seg(rack);
19953                 } else
19954                         error = EINVAL;
19955                 break;
19956         case TCP_RACK_FORCE_MSEG:
19957                 RACK_OPTS_INC(tcp_rack_force_max_seg);
19958                 if (optval)
19959                         rack->rc_force_max_seg = 1;
19960                 else
19961                         rack->rc_force_max_seg = 0;
19962                 break;
19963         case TCP_RACK_PACE_MAX_SEG:
19964                 /* Max segment size in a pace, in bytes */
19965                 RACK_OPTS_INC(tcp_rack_max_seg);
19966                 rack->rc_user_set_max_segs = optval;
19967                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
19968                 break;
19969         case TCP_RACK_PACE_RATE_REC:
19970                 /* Set the fixed pacing rate in Bytes per second for rec */
19971                 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
19972                 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
19973                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
19974                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
19975                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
19976                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
19977                 rack->use_fixed_rate = 1;
19978                 if (rack->rc_always_pace)
19979                         rack_set_cc_pacing(rack);
19980                 rack_log_pacing_delay_calc(rack,
19981                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
19982                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
19983                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
19984                                            __LINE__, NULL,0);
19985                 break;
19986
19987         case TCP_RACK_PACE_RATE_SS:
19988                 /* Set the fixed pacing rate in Bytes per second for ss */
19989                 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
19990                 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
19991                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
19992                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
19993                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
19994                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
19995                 rack->use_fixed_rate = 1;
19996                 if (rack->rc_always_pace)
19997                         rack_set_cc_pacing(rack);
19998                 rack_log_pacing_delay_calc(rack,
19999                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
20000                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
20001                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
20002                                            __LINE__, NULL, 0);
20003                 break;
20004
20005         case TCP_RACK_PACE_RATE_CA:
20006                 /* Set the fixed pacing rate in Bytes per second ca */
20007                 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
20008                 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
20009                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
20010                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
20011                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
20012                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
20013                 rack->use_fixed_rate = 1;
20014                 if (rack->rc_always_pace)
20015                         rack_set_cc_pacing(rack);
20016                 rack_log_pacing_delay_calc(rack,
20017                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
20018                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
20019                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
20020                                            __LINE__, NULL, 0);
20021                 break;
20022         case TCP_RACK_GP_INCREASE_REC:
20023                 RACK_OPTS_INC(tcp_gp_inc_rec);
20024                 rack->r_ctl.rack_per_of_gp_rec = optval;
20025                 rack_log_pacing_delay_calc(rack,
20026                                            rack->r_ctl.rack_per_of_gp_ss,
20027                                            rack->r_ctl.rack_per_of_gp_ca,
20028                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20029                                            __LINE__, NULL, 0);
20030                 break;
20031         case TCP_RACK_GP_INCREASE_CA:
20032                 RACK_OPTS_INC(tcp_gp_inc_ca);
20033                 ca = optval;
20034                 if (ca < 100) {
20035                         /*
20036                          * We don't allow any reduction
20037                          * over the GP b/w.
20038                          */
20039                         error = EINVAL;
20040                         break;
20041                 }
20042                 rack->r_ctl.rack_per_of_gp_ca = ca;
20043                 rack_log_pacing_delay_calc(rack,
20044                                            rack->r_ctl.rack_per_of_gp_ss,
20045                                            rack->r_ctl.rack_per_of_gp_ca,
20046                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20047                                            __LINE__, NULL, 0);
20048                 break;
20049         case TCP_RACK_GP_INCREASE_SS:
20050                 RACK_OPTS_INC(tcp_gp_inc_ss);
20051                 ss = optval;
20052                 if (ss < 100) {
20053                         /*
20054                          * We don't allow any reduction
20055                          * over the GP b/w.
20056                          */
20057                         error = EINVAL;
20058                         break;
20059                 }
20060                 rack->r_ctl.rack_per_of_gp_ss = ss;
20061                 rack_log_pacing_delay_calc(rack,
20062                                            rack->r_ctl.rack_per_of_gp_ss,
20063                                            rack->r_ctl.rack_per_of_gp_ca,
20064                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
20065                                            __LINE__, NULL, 0);
20066                 break;
20067         case TCP_RACK_RR_CONF:
20068                 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
20069                 if (optval && optval <= 3)
20070                         rack->r_rr_config = optval;
20071                 else
20072                         rack->r_rr_config = 0;
20073                 break;
20074         case TCP_HDWR_RATE_CAP:
20075                 RACK_OPTS_INC(tcp_hdwr_rate_cap);
20076                 if (optval) {
20077                         if (rack->r_rack_hw_rate_caps == 0)
20078                                 rack->r_rack_hw_rate_caps = 1;
20079                         else
20080                                 error = EALREADY;
20081                 } else {
20082                         rack->r_rack_hw_rate_caps = 0;
20083                 }
20084                 break;
20085         case TCP_BBR_HDWR_PACE:
20086                 RACK_OPTS_INC(tcp_hdwr_pacing);
20087                 if (optval){
20088                         if (rack->rack_hdrw_pacing == 0) {
20089                                 rack->rack_hdw_pace_ena = 1;
20090                                 rack->rack_attempt_hdwr_pace = 0;
20091                         } else
20092                                 error = EALREADY;
20093                 } else {
20094                         rack->rack_hdw_pace_ena = 0;
20095 #ifdef RATELIMIT
20096                         if (rack->r_ctl.crte != NULL) {
20097                                 rack->rack_hdrw_pacing = 0;
20098                                 rack->rack_attempt_hdwr_pace = 0;
20099                                 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
20100                                 rack->r_ctl.crte = NULL;
20101                         }
20102 #endif
20103                 }
20104                 break;
20105         /*  End Pacing related ones */
20106         case TCP_RACK_PRR_SENDALOT:
20107                 /* Allow PRR to send more than one seg */
20108                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
20109                 rack->r_ctl.rc_prr_sendalot = optval;
20110                 break;
20111         case TCP_RACK_MIN_TO:
20112                 /* Minimum time between rack t-o's in ms */
20113                 RACK_OPTS_INC(tcp_rack_min_to);
20114                 rack->r_ctl.rc_min_to = optval;
20115                 break;
20116         case TCP_RACK_EARLY_SEG:
20117                 /* Max segments to send in early recovery */
20118                 RACK_OPTS_INC(tcp_rack_early_seg);
20119                 rack->r_ctl.rc_early_recovery_segs = optval;
20120                 break;
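              /*
               * Hystart++ enable/disable is not handled inside rack itself;
               * the request is simply forwarded to the connection's
               * congestion-control module through its ctl_output handler
               * (the option is defined by newreno).
               */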
20121         case TCP_RACK_ENABLE_HYSTART:
20122         {
20123                 struct sockopt sopt;
20124                 struct cc_newreno_opts opt;
20125
20126                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
20127                 sopt.sopt_dir = SOPT_SET;
20128                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
20129                 opt.val = optval;
20130                 if (CC_ALGO(tp)->ctl_output != NULL)
20131                         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
20132                 else
20133                         error = EINVAL;
20134         }
20135         break;
20136         case TCP_RACK_REORD_THRESH:
20137                 /* RACK reorder threshold (shift amount) */
20138                 RACK_OPTS_INC(tcp_rack_reord_thresh);
20139                 if ((optval > 0) && (optval < 31))
20140                         rack->r_ctl.rc_reorder_shift = optval;
20141                 else
20142                         error = EINVAL;
20143                 break;
20144         case TCP_RACK_REORD_FADE:
20145                 /* Does reordering fade after this many ms */
20146                 RACK_OPTS_INC(tcp_rack_reord_fade);
20147                 rack->r_ctl.rc_reorder_fade = optval;
20148                 break;
20149         case TCP_RACK_TLP_THRESH:
20150                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
20151                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
20152                 if (optval)
20153                         rack->r_ctl.rc_tlp_threshold = optval;
20154                 else
20155                         error = EINVAL;
20156                 break;
20157         case TCP_BBR_USE_RACK_RR:
20158                 RACK_OPTS_INC(tcp_rack_rr);
20159                 if (optval)
20160                         rack->use_rack_rr = 1;
20161                 else
20162                         rack->use_rack_rr = 0;
20163                 break;
20164         case TCP_FAST_RSM_HACK:
20165                 RACK_OPTS_INC(tcp_rack_fastrsm_hack);
20166                 if (optval)
20167                         rack->fast_rsm_hack = 1;
20168                 else
20169                         rack->fast_rsm_hack = 0;
20170                 break;
20171         case TCP_RACK_PKT_DELAY:
20172                 /* RACK added ms i.e. rack-rtt + reord + N */
20173                 RACK_OPTS_INC(tcp_rack_pkt_delay);
20174                 rack->r_ctl.rc_pkt_delay = optval;
20175                 break;
20176         case TCP_DELACK:
20177                 RACK_OPTS_INC(tcp_rack_delayed_ack);
20178                 if (optval == 0)
20179                         tp->t_delayed_ack = 0;
20180                 else
20181                         tp->t_delayed_ack = 1;
20182                 if (tp->t_flags & TF_DELACK) {
20183                         tp->t_flags &= ~TF_DELACK;
20184                         tp->t_flags |= TF_ACKNOW;
20185                         NET_EPOCH_ENTER(et);
20186                         rack_output(tp);
20187                         NET_EPOCH_EXIT(et);
20188                 }
20189                 break;
20190
20191         case TCP_BBR_RACK_RTT_USE:
20192                 RACK_OPTS_INC(tcp_rack_rtt_use);
20193                 if ((optval != USE_RTT_HIGH) &&
20194                     (optval != USE_RTT_LOW) &&
20195                     (optval != USE_RTT_AVG))
20196                         error = EINVAL;
20197                 else
20198                         rack->r_ctl.rc_rate_sample_method = optval;
20199                 break;
20200         case TCP_DATA_AFTER_CLOSE:
20201                 RACK_OPTS_INC(tcp_data_after_close);
20202                 if (optval)
20203                         rack->rc_allow_data_af_clo = 1;
20204                 else
20205                         rack->rc_allow_data_af_clo = 0;
20206                 break;
20207         default:
20208                 break;
20209         }
20210 #ifdef NETFLIX_STATS
20211         tcp_log_socket_option(tp, sopt_name, optval, error);
20212 #endif
20213         return (error);
20214 }
20215
20216
20217 static void
20218 rack_apply_deferred_options(struct tcp_rack *rack)
20219 {
20220         struct deferred_opt_list *dol, *sdol;
20221         uint32_t s_optval;
20222
20223         TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
20224                 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
20225                 /* Disadvantage of deferral is you lose the error return */
20226                 s_optval = (uint32_t)dol->optval;
20227                 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
20228                 free(dol, M_TCPDO);
20229         }
20230 }
20231
20232 static void
20233 rack_hw_tls_change(struct tcpcb *tp, int chg)
20234 {
20235         /*
20236          * HW TLS state has changed; fix all
20237          * rsm's in flight.
20238          */
20239         struct tcp_rack *rack;
20240         struct rack_sendmap *rsm;
20241
20242         rack = (struct tcp_rack *)tp->t_fb_ptr;
20243         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
20244                 if (chg)
20245                         rsm->r_hw_tls = 1;
20246                 else
20247                         rsm->r_hw_tls = 0;
20248         }
20249         if (chg)
20250                 rack->r_ctl.fsb.hw_tls = 1;
20251         else
20252                 rack->r_ctl.fsb.hw_tls = 0;
20253 }
20254
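      /*
       * rack has no urgent/out-of-band data support; reject PRUS_OOB requests.
       */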
20255 static int
20256 rack_pru_options(struct tcpcb *tp, int flags)
20257 {
20258         if (flags & PRUS_OOB)
20259                 return (EOPNOTSUPP);
20260         return (0);
20261 }
20262
20263 static struct tcp_function_block __tcp_rack = {
20264         .tfb_tcp_block_name = __XSTRING(STACKNAME),
20265         .tfb_tcp_output = rack_output,
20266         .tfb_do_queued_segments = ctf_do_queued_segments,
20267         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
20268         .tfb_tcp_do_segment = rack_do_segment,
20269         .tfb_tcp_ctloutput = rack_ctloutput,
20270         .tfb_tcp_fb_init = rack_init,
20271         .tfb_tcp_fb_fini = rack_fini,
20272         .tfb_tcp_timer_stop_all = rack_stopall,
20273         .tfb_tcp_timer_activate = rack_timer_activate,
20274         .tfb_tcp_timer_active = rack_timer_active,
20275         .tfb_tcp_timer_stop = rack_timer_stop,
20276         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
20277         .tfb_tcp_handoff_ok = rack_handoff_ok,
20278         .tfb_tcp_mtu_chg = rack_mtu_change,
20279         .tfb_pru_options = rack_pru_options,
20280         .tfb_hwtls_change = rack_hw_tls_change,
20281 };
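      /*
       * The table above is the set of entry points the kernel's pluggable TCP
       * stack framework dispatches through once a connection has been handed
       * to the rack stack (e.g. via the TCP_FUNCTION_BLK socket option or the
       * net.inet.tcp.functions_default sysctl).
       */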
20282
20283 /*
20284  * rack_ctloutput() must drop the inpcb lock before performing copyin on
20285  * socket option arguments.  When it re-acquires the lock after the copy, it
20286  * has to revalidate that the connection is still valid for the socket
20287  * option.
20288  */
20289 static int
20290 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
20291     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
20292 {
20293 #ifdef INET6
20294         struct ip6_hdr *ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
20295 #endif
20296 #ifdef INET
20297         struct ip *ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
20298 #endif
20299         uint64_t loptval;
20300         int32_t error = 0, optval;
20301
20302         switch (sopt->sopt_level) {
20303 #ifdef INET6
20304         case IPPROTO_IPV6:
20305                 MPASS(inp->inp_vflag & INP_IPV6PROTO);
20306                 switch (sopt->sopt_name) {
20307                 case IPV6_USE_MIN_MTU:
20308                         tcp6_use_min_mtu(tp);
20309                         break;
20310                 case IPV6_TCLASS:
20311                         /*
20312                          * The DSCP codepoint has changed, update the fsb.
20313                          */
20314                         ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
20315                             (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
20316                         break;
20317                 }
20318                 INP_WUNLOCK(inp);
20319                 return (0);
20320 #endif
20321 #ifdef INET
20322         case IPPROTO_IP:
20323                 switch (sopt->sopt_name) {
20324                 case IP_TOS:
20325                         /*
20326                          * The DSCP codepoint has changed, update the fsb.
20327                          */
20328                         ip->ip_tos = rack->rc_inp->inp_ip_tos;
20329                         break;
20330                 case IP_TTL:
20331                         /*
20332                          * The TTL has changed, update the fsb.
20333                          */
20334                         ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
20335                         break;
20336                 }
20337                 INP_WUNLOCK(inp);
20338                 return (0);
20339 #endif
20340         }
20341
20342         switch (sopt->sopt_name) {
20343         case TCP_RACK_TLP_REDUCE:               /*  URL:tlp_reduce */
20344         /*  Pacing related ones */
20345         case TCP_RACK_PACE_ALWAYS:              /*  URL:pace_always */
20346         case TCP_BBR_RACK_INIT_RATE:            /*  URL:irate */
20347         case TCP_BBR_IWINTSO:                   /*  URL:tso_iwin */
20348         case TCP_RACK_PACE_MAX_SEG:             /*  URL:pace_max_seg */
20349         case TCP_RACK_FORCE_MSEG:               /*  URL:force_max_seg */
20350         case TCP_RACK_PACE_RATE_CA:             /*  URL:pr_ca */
20351         case TCP_RACK_PACE_RATE_SS:             /*  URL:pr_ss*/
20352         case TCP_RACK_PACE_RATE_REC:            /*  URL:pr_rec */
20353         case TCP_RACK_GP_INCREASE_CA:           /*  URL:gp_inc_ca */
20354         case TCP_RACK_GP_INCREASE_SS:           /*  URL:gp_inc_ss */
20355         case TCP_RACK_GP_INCREASE_REC:          /*  URL:gp_inc_rec */
20356         case TCP_RACK_RR_CONF:                  /*  URL:rrr_conf */
20357         case TCP_BBR_HDWR_PACE:                 /*  URL:hdwrpace */
20358         case TCP_HDWR_RATE_CAP:                 /*  URL:hdwrcap boolean */
20359         case TCP_PACING_RATE_CAP:               /*  URL:cap  -- used by side-channel */
20360         case TCP_HDWR_UP_ONLY:                  /*  URL:uponly -- hardware pacing  boolean */
20361         /* End pacing related */
20362         case TCP_FAST_RSM_HACK:                 /*  URL:frsm_hack */
20363         case TCP_DELACK:                        /*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
20364         case TCP_RACK_PRR_SENDALOT:             /*  URL:prr_sendalot */
20365         case TCP_RACK_MIN_TO:                   /*  URL:min_to */
20366         case TCP_RACK_EARLY_SEG:                /*  URL:early_seg */
20367         case TCP_RACK_REORD_THRESH:             /*  URL:reord_thresh */
20368         case TCP_RACK_REORD_FADE:               /*  URL:reord_fade */
20369         case TCP_RACK_TLP_THRESH:               /*  URL:tlp_thresh */
20370         case TCP_RACK_PKT_DELAY:                /*  URL:pkt_delay */
20371         case TCP_RACK_TLP_USE:                  /*  URL:tlp_use */
20372         case TCP_BBR_RACK_RTT_USE:              /*  URL:rttuse */
20373         case TCP_BBR_USE_RACK_RR:               /*  URL:rackrr */
20374         case TCP_RACK_DO_DETECTION:             /*  URL:detect */
20375         case TCP_NO_PRR:                        /*  URL:noprr */
20376         case TCP_TIMELY_DYN_ADJ:                /*  URL:dynamic */
20377         case TCP_DATA_AFTER_CLOSE:              /*  no URL */
20378         case TCP_RACK_NONRXT_CFG_RATE:          /*  URL:nonrxtcr */
20379         case TCP_SHARED_CWND_ENABLE:            /*  URL:scwnd */
20380         case TCP_RACK_MBUF_QUEUE:               /*  URL:mqueue */
20381         case TCP_RACK_NO_PUSH_AT_MAX:           /*  URL:npush */
20382         case TCP_RACK_PACE_TO_FILL:             /*  URL:fillcw */
20383         case TCP_SHARED_CWND_TIME_LIMIT:        /*  URL:lscwnd */
20384         case TCP_RACK_PROFILE:                  /*  URL:profile */
20385         case TCP_USE_CMP_ACKS:                  /*  URL:cmpack */
20386         case TCP_RACK_ABC_VAL:                  /*  URL:labc */
20387         case TCP_REC_ABC_VAL:                   /*  URL:reclabc */
20388         case TCP_RACK_MEASURE_CNT:              /*  URL:measurecnt */
20389         case TCP_DEFER_OPTIONS:                 /*  URL:defer */
20390         case TCP_RACK_DSACK_OPT:                /*  URL:dsack */
20391         case TCP_RACK_PACING_BETA:              /*  URL:pacing_beta */
20392         case TCP_RACK_PACING_BETA_ECN:          /*  URL:pacing_beta_ecn */
20393         case TCP_RACK_TIMER_SLOP:               /*  URL:timer_slop */
20394         case TCP_RACK_ENABLE_HYSTART:           /*  URL:hystart */
20395                 break;
20396         default:
20397                 /* Hand all unknown options off to the base stack */
20398                 return (tcp_default_ctloutput(so, sopt, inp, tp));
20399                 break;
20400         }
20401         INP_WUNLOCK(inp);
20402         if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
20403                 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
20404                 /*
20405                  * We truncate it down to 32 bits for the socket-option trace; this
20406                  * means rates > 34Gbps won't show correctly, but that's probably ok.
20407                  */
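                      /*
                       * (A rough sanity check on that figure, assuming the rate cap is
                       * carried as a byte rate: UINT32_MAX bytes/sec * 8 is roughly
                       * 34.4 Gbit/s, which is where the 34Gbps cutoff above comes from.)
                       */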
20408                 optval = (uint32_t)loptval;
20409         } else {
20410                 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
20411                 /* Save it in 64 bit form too */
20412                 loptval = optval;
20413         }
20414         if (error)
20415                 return (error);
20416         INP_WLOCK(inp);
20417         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
20418                 INP_WUNLOCK(inp);
20419                 return (ECONNRESET);
20420         }
20421         if (tp->t_fb != &__tcp_rack) {
20422                 INP_WUNLOCK(inp);
20423                 return (ENOPROTOOPT);
20424         }
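              /*
               * With TCP_DEFER_OPTIONS enabled, most options that arrive before
               * the first goodput measurement completes (gp_ready == 0) are
               * queued and applied later; the options excluded below take
               * effect immediately.
               */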
20425         if (rack->defer_options && (rack->gp_ready == 0) &&
20426             (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
20427             (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
20428             (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
20429             (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
20430                 /* Options are being deferred */
20431                 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
20432                         INP_WUNLOCK(inp);
20433                         return (0);
20434                 } else {
20435                         /* No memory to defer, fail */
20436                         INP_WUNLOCK(inp);
20437                         return (ENOMEM);
20438                 }
20439         }
20440         error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
20441         INP_WUNLOCK(inp);
20442         return (error);
20443 }
20444
20445 static void
20446 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
20447 {
20448
20449         INP_WLOCK_ASSERT(tp->t_inpcb);
20450         bzero(ti, sizeof(*ti));
20451
20452         ti->tcpi_state = tp->t_state;
20453         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
20454                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
20455         if (tp->t_flags & TF_SACK_PERMIT)
20456                 ti->tcpi_options |= TCPI_OPT_SACK;
20457         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
20458                 ti->tcpi_options |= TCPI_OPT_WSCALE;
20459                 ti->tcpi_snd_wscale = tp->snd_scale;
20460                 ti->tcpi_rcv_wscale = tp->rcv_scale;
20461         }
20462         if (tp->t_flags2 & TF2_ECN_PERMIT)
20463                 ti->tcpi_options |= TCPI_OPT_ECN;
20464         if (tp->t_flags & TF_FASTOPEN)
20465                 ti->tcpi_options |= TCPI_OPT_TFO;
20466         /* t_rcvtime is still kept in ticks */
20467         ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
20468         /* Since we hold everything in precise useconds this is easy */
20469         ti->tcpi_rtt = tp->t_srtt;
20470         ti->tcpi_rttvar = tp->t_rttvar;
20471         ti->tcpi_rto = tp->t_rxtcur;
20472         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
20473         ti->tcpi_snd_cwnd = tp->snd_cwnd;
20474         /*
20475          * FreeBSD-specific extension fields for tcp_info.
20476          */
20477         ti->tcpi_rcv_space = tp->rcv_wnd;
20478         ti->tcpi_rcv_nxt = tp->rcv_nxt;
20479         ti->tcpi_snd_wnd = tp->snd_wnd;
20480         ti->tcpi_snd_bwnd = 0;          /* Unused, kept for compat. */
20481         ti->tcpi_snd_nxt = tp->snd_nxt;
20482         ti->tcpi_snd_mss = tp->t_maxseg;
20483         ti->tcpi_rcv_mss = tp->t_maxseg;
20484         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
20485         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
20486         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
20487 #ifdef NETFLIX_STATS
20488         ti->tcpi_total_tlp = tp->t_sndtlppack;
20489         ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
20490         memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
20491 #endif
20492 #ifdef TCP_OFFLOAD
20493         if (tp->t_flags & TF_TOE) {
20494                 ti->tcpi_options |= TCPI_OPT_TOE;
20495                 tcp_offload_tcp_info(tp, ti);
20496         }
20497 #endif
20498 }
20499
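      /*
       * A userland-side sketch (not part of this file) of how the info filled
       * in above is typically consumed: TCP_INFO is fetched with getsockopt()
       * on a connected TCP socket "fd" (hypothetical here), e.g.
       *
       *      struct tcp_info ti;
       *      socklen_t len = sizeof(ti);
       *
       *      if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
       *              printf("srtt %u us, rto %u us\n", ti.tcpi_rtt, ti.tcpi_rto);
       *
       * rack_get_sockopt() below routes such a request to rack_fill_info().
       */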
20500 static int
20501 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
20502     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
20503 {
20504         int32_t error, optval;
20505         uint64_t val, loptval;
20506         struct  tcp_info ti;
20507         /*
20508          * Because all our options are either boolean or an int, we can just
20509          * pull everything into optval and then unlock and copy. If we ever
20510          * add an option that is not an int, then this will have quite an
20511          * impact on this routine.
20512          */
20513         error = 0;
20514         switch (sopt->sopt_name) {
20515         case TCP_INFO:
20516                 /* First get the info filled */
20517                 rack_fill_info(tp, &ti);
20518                 /* Fix up the rtt related fields if needed */
20519                 INP_WUNLOCK(inp);
20520                 error = sooptcopyout(sopt, &ti, sizeof ti);
20521                 return (error);
20522         /*
20523          * Beta is the congestion control value for NewReno that influences how
20524          * much of a backoff happens when loss is detected. It is normally set
20525          * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
20526          * when you exit recovery.
20527          */
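              /*
               * For example (just instantiating the statement above): with
               * beta = 50, a cwnd of 100 segments at the start of recovery
               * becomes 50 segments once recovery is exited.
               */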
20528         case TCP_RACK_PACING_BETA:
20529                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20530                         error = EINVAL;
20531                 else if (rack->rc_pacing_cc_set == 0)
20532                         optval = rack->r_ctl.rc_saved_beta.beta;
20533                 else {
20534                         /*
20535                          * Reach out into the CC data and report back what
20536                          * I have previously set. Yeah it looks hackish but
20537                          * we don't want to report the saved values.
20538                          */
20539                         if (tp->ccv->cc_data)
20540                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta;
20541                         else
20542                                 error = EINVAL;
20543                 }
20544                 break;
20545                 /*
20546                  * Beta_ecn is the congestion control value for NewReno that influences how
20547                  * much of a backoff happens when an ECN mark is detected. It is normally
20548                  * set to 80 for 80%, i.e. the cwnd is reduced to 80% of its previous
20549                  * value when you exit recovery. Note that classic ECN has a beta of 50;
20550                  * it is only ABE ECN that uses this milder value, but we do too with pacing :)
20551                  */
20552
20553         case TCP_RACK_PACING_BETA_ECN:
20554                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20555                         error = EINVAL;
20556                 else if (rack->rc_pacing_cc_set == 0)
20557                         optval = rack->r_ctl.rc_saved_beta.beta_ecn;
20558                 else {
20559                         /*
20560                          * Reach out into the CC data and report back what
20561                          * I have previously set. Yeah it looks hackish but
20562                          * we don't want to report the saved values.
20563                          */
20564                         if (tp->ccv->cc_data)
20565                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
20566                         else
20567                                 error = EINVAL;
20568                 }
20569                 break;
20570         case TCP_RACK_DSACK_OPT:
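                      /*
                       * Bit 0 reports the std-based RACK timer setting, bit 1
                       * reports whether rack makes use of DSACK information.
                       */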
20571                 optval = 0;
20572                 if (rack->rc_rack_tmr_std_based) {
20573                         optval |= 1;
20574                 }
20575                 if (rack->rc_rack_use_dsack) {
20576                         optval |= 2;
20577                 }
20578                 break;
20579         case TCP_RACK_ENABLE_HYSTART:
20580         {
20581                 struct sockopt sopt;
20582                 struct cc_newreno_opts opt;
20583
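                      /*
                       * Query the CC module's hystart setting through its
                       * ctl_output hook rather than caching a copy locally.
                       */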
20584                 sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
20585                 sopt.sopt_dir = SOPT_GET;
20586                 opt.name = CC_NEWRENO_ENABLE_HYSTART;
20587                 if (CC_ALGO(tp)->ctl_output != NULL)
20588                         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
20589                 else
20590                         error = EINVAL;
20591                 optval = opt.val;
20592         }
20593         break;
20594         case TCP_FAST_RSM_HACK:
20595                 optval = rack->fast_rsm_hack;
20596                 break;
20597         case TCP_DEFER_OPTIONS:
20598                 optval = rack->defer_options;
20599                 break;
20600         case TCP_RACK_MEASURE_CNT:
20601                 optval = rack->r_ctl.req_measurements;
20602                 break;
20603         case TCP_REC_ABC_VAL:
20604                 optval = rack->r_use_labc_for_rec;
20605                 break;
20606         case TCP_RACK_ABC_VAL:
20607                 optval = rack->rc_labc;
20608                 break;
20609         case TCP_HDWR_UP_ONLY:
20610                 optval = rack->r_up_only;
20611                 break;
20612         case TCP_PACING_RATE_CAP:
20613                 loptval = rack->r_ctl.bw_rate_cap;
20614                 break;
20615         case TCP_RACK_PROFILE:
20616                 /* You cannot retrieve a profile; it is write only */
20617                 error = EINVAL;
20618                 break;
20619         case TCP_USE_CMP_ACKS:
20620                 optval = rack->r_use_cmp_ack;
20621                 break;
20622         case TCP_RACK_PACE_TO_FILL:
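                      /*
                       * Reports 0 when fill-cw pacing is off, 1 when it is on,
                       * and 2 when it is on with the less aggressive variant.
                       */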
20623                 optval = rack->rc_pace_to_cwnd;
20624                 if (optval && rack->r_fill_less_agg)
20625                         optval++;
20626                 break;
20627         case TCP_RACK_NO_PUSH_AT_MAX:
20628                 optval = rack->r_ctl.rc_no_push_at_mrtt;
20629                 break;
20630         case TCP_SHARED_CWND_ENABLE:
20631                 optval = rack->rack_enable_scwnd;
20632                 break;
20633         case TCP_RACK_NONRXT_CFG_RATE:
20634                 optval = rack->rack_rec_nonrxt_use_cr;
20635                 break;
20636         case TCP_NO_PRR:
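                      /*
                       * Encoded result: 1 means PRR is disabled entirely, 2 means
                       * only the PRR add-back is disabled, 0 means PRR is fully on.
                       */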
20637                 if (rack->rack_no_prr == 1)
20638                         optval = 1;
20639                 else if (rack->no_prr_addback == 1)
20640                         optval = 2;
20641                 else
20642                         optval = 0;
20643                 break;
20644         case TCP_RACK_DO_DETECTION:
20645                 optval = rack->do_detection;
20646                 break;
20647         case TCP_RACK_MBUF_QUEUE:
20648                 /* Now do we use the LRO mbuf-queue feature */
20649                 optval = rack->r_mbuf_queue;
20650                 break;
20651         case TCP_TIMELY_DYN_ADJ:
20652                 optval = rack->rc_gp_dyn_mul;
20653                 break;
20654         case TCP_BBR_IWINTSO:
20655                 optval = rack->rc_init_win;
20656                 break;
20657         case TCP_RACK_TLP_REDUCE:
20658                 /* RACK TLP cwnd reduction (bool) */
20659                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
20660                 break;
20661         case TCP_BBR_RACK_INIT_RATE:
20662                 val = rack->r_ctl.init_rate;
20663                 /* convert to kbits per sec */
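                      /*
                       * (Assuming init_rate is held in bytes per second: an
                       * init_rate of 1250000 reports back as 10000, i.e. 10 Mbit/s.)
                       */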
20664                 val *= 8;
20665                 val /= 1000;
20666                 optval = (uint32_t)val;
20667                 break;
20668         case TCP_RACK_FORCE_MSEG:
20669                 optval = rack->rc_force_max_seg;
20670                 break;
20671         case TCP_RACK_PACE_MAX_SEG:
20672                 /* Max segments in a pace */
20673                 optval = rack->rc_user_set_max_segs;
20674                 break;
20675         case TCP_RACK_PACE_ALWAYS:
20676                 /* Use the always pace method */
20677                 optval = rack->rc_always_pace;
20678                 break;
20679         case TCP_RACK_PRR_SENDALOT:
20680                 /* Allow PRR to send more than one seg */
20681                 optval = rack->r_ctl.rc_prr_sendalot;
20682                 break;
20683         case TCP_RACK_MIN_TO:
20684                 /* Minimum time between rack t-o's in ms */
20685                 optval = rack->r_ctl.rc_min_to;
20686                 break;
20687         case TCP_RACK_EARLY_SEG:
20688                 /* Max segments to send in early recovery */
20689                 optval = rack->r_ctl.rc_early_recovery_segs;
20690                 break;
20691         case TCP_RACK_REORD_THRESH:
20692                 /* RACK reorder threshold (shift amount) */
20693                 optval = rack->r_ctl.rc_reorder_shift;
20694                 break;
20695         case TCP_RACK_REORD_FADE:
20696                 /* Does reordering fade after a period of time (ms) */
20697                 optval = rack->r_ctl.rc_reorder_fade;
20698                 break;
20699         case TCP_BBR_USE_RACK_RR:
20700                 /* Do we use the rack cheat for rxt */
20701                 optval = rack->use_rack_rr;
20702                 break;
20703         case TCP_RACK_RR_CONF:
20704                 optval = rack->r_rr_config;
20705                 break;
20706         case TCP_HDWR_RATE_CAP:
20707                 optval = rack->r_rack_hw_rate_caps;
20708                 break;
20709         case TCP_BBR_HDWR_PACE:
20710                 optval = rack->rack_hdw_pace_ena;
20711                 break;
20712         case TCP_RACK_TLP_THRESH:
20713                 /* RACK TLP threshold i.e. srtt+(srtt/N) */
20714                 optval = rack->r_ctl.rc_tlp_threshold;
20715                 break;
20716         case TCP_RACK_PKT_DELAY:
20717                 /* RACK added ms i.e. rack-rtt + reord + N */
20718                 optval = rack->r_ctl.rc_pkt_delay;
20719                 break;
20720         case TCP_RACK_TLP_USE:
20721                 optval = rack->rack_tlp_threshold_use;
20722                 break;
20723         case TCP_RACK_PACE_RATE_CA:
20724                 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
20725                 break;
20726         case TCP_RACK_PACE_RATE_SS:
20727                 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
20728                 break;
20729         case TCP_RACK_PACE_RATE_REC:
20730                 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
20731                 break;
20732         case TCP_RACK_GP_INCREASE_SS:
20733                 optval = rack->r_ctl.rack_per_of_gp_ss;
20734                 break;
20735         case TCP_RACK_GP_INCREASE_CA:
20736                 optval = rack->r_ctl.rack_per_of_gp_ca;
20737                 break;
20738         case TCP_BBR_RACK_RTT_USE:
20739                 optval = rack->r_ctl.rc_rate_sample_method;
20740                 break;
20741         case TCP_DELACK:
20742                 optval = tp->t_delayed_ack;
20743                 break;
20744         case TCP_DATA_AFTER_CLOSE:
20745                 optval = rack->rc_allow_data_af_clo;
20746                 break;
20747         case TCP_SHARED_CWND_TIME_LIMIT:
20748                 optval = rack->r_limit_scw;
20749                 break;
20750         case TCP_RACK_TIMER_SLOP:
20751                 optval = rack->r_ctl.timer_slop;
20752                 break;
20753         default:
20754                 return (tcp_default_ctloutput(so, sopt, inp, tp));
20755                 break;
20756         }
20757         INP_WUNLOCK(inp);
20758         if (error == 0) {
20759                 if (sopt->sopt_name == TCP_PACING_RATE_CAP)
20760                         error = sooptcopyout(sopt, &loptval, sizeof loptval);
20761                 else
20762                         error = sooptcopyout(sopt, &optval, sizeof optval);
20763         }
20764         return (error);
20765 }
20766
20767 static int
20768 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
20769 {
20770         int32_t error = EINVAL;
20771         struct tcp_rack *rack;
20772
20773         rack = (struct tcp_rack *)tp->t_fb_ptr;
20774         if (rack == NULL) {
20775                 /* Huh? */
20776                 goto out;
20777         }
20778         if (sopt->sopt_dir == SOPT_SET) {
20779                 return (rack_set_sockopt(so, sopt, inp, tp, rack));
20780         } else if (sopt->sopt_dir == SOPT_GET) {
20781                 return (rack_get_sockopt(so, sopt, inp, tp, rack));
20782         }
20783 out:
20784         INP_WUNLOCK(inp);
20785         return (error);
20786 }
20787
20788 static const char *rack_stack_names[] = {
20789         __XSTRING(STACKNAME),
20790 #ifdef STACKALIAS
20791         __XSTRING(STACKALIAS),
20792 #endif
20793 };
20794
20795 static int
20796 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
20797 {
20798         memset(mem, 0, size);
20799         return (0);
20800 }
20801
20802 static void
20803 rack_dtor(void *mem, int32_t size, void *arg)
20804 {
20805
20806 }
20807
20808 static bool rack_mod_inited = false;
20809
20810 static int
20811 tcp_addrack(module_t mod, int32_t type, void *data)
20812 {
20813         int32_t err = 0;
20814         int num_stacks;
20815
20816         switch (type) {
20817         case MOD_LOAD:
20818                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
20819                     sizeof(struct rack_sendmap),
20820                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
20821
20822                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
20823                     sizeof(struct tcp_rack),
20824                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
20825
20826                 sysctl_ctx_init(&rack_sysctl_ctx);
20827                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
20828                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
20829                     OID_AUTO,
20830 #ifdef STACKALIAS
20831                     __XSTRING(STACKALIAS),
20832 #else
20833                     __XSTRING(STACKNAME),
20834 #endif
20835                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
20836                     "");
20837                 if (rack_sysctl_root == NULL) {
20838                         printf("Failed to add sysctl node\n");
20839                         err = EFAULT;
20840                         goto free_uma;
20841                 }
20842                 rack_init_sysctls();
20843                 num_stacks = nitems(rack_stack_names);
20844                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
20845                     rack_stack_names, &num_stacks);
20846                 if (err) {
20847                         printf("Failed to register %s stack name for "
20848                             "%s module\n", rack_stack_names[num_stacks],
20849                             __XSTRING(MODNAME));
20850                         sysctl_ctx_free(&rack_sysctl_ctx);
20851 free_uma:
20852                         uma_zdestroy(rack_zone);
20853                         uma_zdestroy(rack_pcb_zone);
20854                         rack_counter_destroy();
20855                         printf("Failed to register rack module -- err:%d\n", err);
20856                         return (err);
20857                 }
20858                 tcp_lro_reg_mbufq();
20859                 rack_mod_inited = true;
20860                 break;
20861         case MOD_QUIESCE:
20862                 err = deregister_tcp_functions(&__tcp_rack, true, false);
20863                 break;
20864         case MOD_UNLOAD:
20865                 err = deregister_tcp_functions(&__tcp_rack, false, true);
20866                 if (err == EBUSY)
20867                         break;
20868                 if (rack_mod_inited) {
20869                         uma_zdestroy(rack_zone);
20870                         uma_zdestroy(rack_pcb_zone);
20871                         sysctl_ctx_free(&rack_sysctl_ctx);
20872                         rack_counter_destroy();
20873                         rack_mod_inited = false;
20874                 }
20875                 tcp_lro_dereg_mbufq();
20876                 err = 0;
20877                 break;
20878         default:
20879                 return (EOPNOTSUPP);
20880         }
20881         return (err);
20882 }
20883
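      /*
       * Operational note (a usage sketch, not kernel code): once built, the
       * stack is loaded with "kldload tcp_rack" and can then be selected
       * system-wide via the net.inet.tcp.functions_default sysctl or per
       * connection with the TCP_FUNCTION_BLK socket option.
       */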
20884 static moduledata_t tcp_rack = {
20885         .name = __XSTRING(MODNAME),
20886         .evhand = tcp_addrack,
20887         .priv = 0
20888 };
20889
20890 MODULE_VERSION(MODNAME, 1);
20891 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
20892 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);