]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/tcp_stacks/rack.c
Update to bmake-20220724
[FreeBSD/FreeBSD.git] / sys / netinet / tcp_stacks / rack.c
1 /*-
2  * Copyright (c) 2016-2020 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 #include "opt_ratelimit.h"
35 #include "opt_kern_tls.h"
36 #include <sys/param.h>
37 #include <sys/arb.h>
38 #include <sys/module.h>
39 #include <sys/kernel.h>
40 #ifdef TCP_HHOOK
41 #include <sys/hhook.h>
42 #endif
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/mbuf.h>
48 #include <sys/proc.h>           /* for proc0 declaration */
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #ifdef STATS
54 #include <sys/qmath.h>
55 #include <sys/tree.h>
56 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
57 #else
58 #include <sys/tree.h>
59 #endif
60 #include <sys/refcount.h>
61 #include <sys/queue.h>
62 #include <sys/tim_filter.h>
63 #include <sys/smp.h>
64 #include <sys/kthread.h>
65 #include <sys/kern_prefetch.h>
66 #include <sys/protosw.h>
67 #ifdef TCP_ACCOUNTING
68 #include <sys/sched.h>
69 #include <machine/cpu.h>
70 #endif
71 #include <vm/uma.h>
72
73 #include <net/route.h>
74 #include <net/route/nhop.h>
75 #include <net/vnet.h>
76
77 #define TCPSTATES               /* for logging */
78
79 #include <netinet/in.h>
80 #include <netinet/in_kdtrace.h>
81 #include <netinet/in_pcb.h>
82 #include <netinet/ip.h>
83 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
84 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
85 #include <netinet/ip_var.h>
86 #include <netinet/ip6.h>
87 #include <netinet6/in6_pcb.h>
88 #include <netinet6/ip6_var.h>
89 #include <netinet/tcp.h>
90 #define TCPOUTFLAGS
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_log_buf.h>
93 #include <netinet/tcp_seq.h>
94 #include <netinet/tcp_timer.h>
95 #include <netinet/tcp_var.h>
96 #include <netinet/tcp_syncache.h>
97 #include <netinet/tcp_hpts.h>
98 #include <netinet/tcp_ratelimit.h>
99 #include <netinet/tcp_accounting.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/cc/cc.h>
102 #include <netinet/cc/cc_newreno.h>
103 #include <netinet/tcp_fastopen.h>
104 #include <netinet/tcp_lro.h>
105 #ifdef NETFLIX_SHARED_CWND
106 #include <netinet/tcp_shared_cwnd.h>
107 #endif
108 #ifdef TCPDEBUG
109 #include <netinet/tcp_debug.h>
110 #endif                          /* TCPDEBUG */
111 #ifdef TCP_OFFLOAD
112 #include <netinet/tcp_offload.h>
113 #endif
114 #ifdef INET6
115 #include <netinet6/tcp6_var.h>
116 #endif
117 #include <netinet/tcp_ecn.h>
118
119 #include <netipsec/ipsec_support.h>
120
121 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
122 #include <netipsec/ipsec.h>
123 #include <netipsec/ipsec6.h>
124 #endif                          /* IPSEC */
125
126 #include <netinet/udp.h>
127 #include <netinet/udp_var.h>
128 #include <machine/in_cksum.h>
129
130 #ifdef MAC
131 #include <security/mac/mac_framework.h>
132 #endif
133 #include "sack_filter.h"
134 #include "tcp_rack.h"
135 #include "rack_bbr_common.h"
136
137 uma_zone_t rack_zone;
138 uma_zone_t rack_pcb_zone;
139
140 #ifndef TICKS2SBT
141 #define TICKS2SBT(__t)  (tick_sbt * ((sbintime_t)(__t)))
142 #endif
143
144 VNET_DECLARE(uint32_t, newreno_beta);
145 VNET_DECLARE(uint32_t, newreno_beta_ecn);
146 #define V_newreno_beta VNET(newreno_beta)
147 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
148
149
150 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
151 MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
152
153 struct sysctl_ctx_list rack_sysctl_ctx;
154 struct sysctl_oid *rack_sysctl_root;
155
156 #define CUM_ACKED 1
157 #define SACKED 2
158
159 /*
160  * The RACK module incorporates a number of
161  * TCP ideas that have been put out into the IETF
162  * over the last few years:
163  * - Matt Mathis's Rate Halving which slowly drops
164  *    the congestion window so that the ack clock can
165  *    be maintained during a recovery.
166  * - Yuchung Cheng's RACK TCP (for which it is named) that
167  *    will stop us using the number of dup acks and instead
168  *    use time as the gauge of when we retransmit.
169  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
170  *    of Dukkipati et.al.
171  * RACK depends on SACK, so if an endpoint arrives that
172  * cannot do SACK the state machine below will shuttle the
173  * connection back to using the "default" TCP stack that is
174  * in FreeBSD.
175  *
176  * To implement RACK the original TCP stack was first decomposed
177  * into a functional state machine with individual states
178  * for each of the possible TCP connection states. The do_segment
179  * functions role in life is to mandate the connection supports SACK
180  * initially and then assure that the RACK state matches the connection
181  * state before calling the states do_segment function. Each
182  * state is simplified due to the fact that the original do_segment
183  * has been decomposed and we *know* what state we are in (no
184  * switches on the state) and all tests for SACK are gone. This
185  * greatly simplifies what each state does.
186  *
187  * TCP output is also over-written with a new version since it
188  * must maintain the new rack scoreboard.
189  *
190  */
191 static int32_t rack_tlp_thresh = 1;
192 static int32_t rack_tlp_limit = 2;      /* No more than 2 TLPs w-out new data */
193 static int32_t rack_tlp_use_greater = 1;
194 static int32_t rack_reorder_thresh = 2;
195 static int32_t rack_reorder_fade = 60000000;    /* 0 - never fade, def 60,000,000
196                                                  * - 60 seconds */
197 static uint8_t rack_req_measurements = 1;
198 /* Attack threshold detections */
199 static uint32_t rack_highest_sack_thresh_seen = 0;
200 static uint32_t rack_highest_move_thresh_seen = 0;
201 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
202 static int32_t rack_hw_pace_extra_slots = 2;    /* 2 extra MSS time betweens */
203 static int32_t rack_hw_rate_caps = 1; /* 1; */
204 static int32_t rack_hw_rate_min = 0; /* 1500000;*/
205 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
206 static int32_t rack_hw_up_only = 1;
207 static int32_t rack_stats_gets_ms_rtt = 1;
208 static int32_t rack_prr_addbackmax = 2;
209 static int32_t rack_do_hystart = 0;
210 static int32_t rack_apply_rtt_with_reduced_conf = 0;
211
212 static int32_t rack_pkt_delay = 1000;
213 static int32_t rack_send_a_lot_in_prr = 1;
214 static int32_t rack_min_to = 1000;      /* Number of microsecond  min timeout */
215 static int32_t rack_verbose_logging = 0;
216 static int32_t rack_ignore_data_after_close = 1;
217 static int32_t rack_enable_shared_cwnd = 1;
218 static int32_t rack_use_cmp_acks = 1;
219 static int32_t rack_use_fsb = 1;
220 static int32_t rack_use_rfo = 1;
221 static int32_t rack_use_rsm_rfo = 1;
222 static int32_t rack_max_abc_post_recovery = 2;
223 static int32_t rack_client_low_buf = 0;
224 static int32_t rack_dsack_std_based = 0x3;      /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
225 #ifdef TCP_ACCOUNTING
226 static int32_t rack_tcp_accounting = 0;
227 #endif
228 static int32_t rack_limits_scwnd = 1;
229 static int32_t rack_enable_mqueue_for_nonpaced = 0;
230 static int32_t rack_disable_prr = 0;
231 static int32_t use_rack_rr = 1;
232 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
233 static int32_t rack_persist_min = 250000;       /* 250usec */
234 static int32_t rack_persist_max = 2000000;      /* 2 Second in usec's */
235 static int32_t rack_sack_not_required = 1;      /* set to one to allow non-sack to use rack */
236 static int32_t rack_default_init_window = 0;    /* Use system default */
237 static int32_t rack_limit_time_with_srtt = 0;
238 static int32_t rack_autosndbuf_inc = 20;        /* In percentage form */
239 static int32_t rack_enobuf_hw_boost_mult = 2;   /* How many times the hw rate we boost slot using time_between */
240 static int32_t rack_enobuf_hw_max = 12000;      /* 12 ms in usecs */
241 static int32_t rack_enobuf_hw_min = 10000;      /* 10 ms in usecs */
242 static int32_t rack_hw_rwnd_factor = 2;         /* How many max_segs the rwnd must be before we hold off sending */
243
244 /*
245  * Currently regular tcp has a rto_min of 30ms
246  * the backoff goes 12 times so that ends up
247  * being a total of 122.850 seconds before a
248  * connection is killed.
249  */
250 static uint32_t rack_def_data_window = 20;
251 static uint32_t rack_goal_bdp = 2;
252 static uint32_t rack_min_srtts = 1;
253 static uint32_t rack_min_measure_usec = 0;
254 static int32_t rack_tlp_min = 10000;    /* 10ms */
255 static int32_t rack_rto_min = 30000;    /* 30,000 usec same as main freebsd */
256 static int32_t rack_rto_max = 4000000;  /* 4 seconds in usec's */
257 static const int32_t rack_free_cache = 2;
258 static int32_t rack_hptsi_segments = 40;
259 static int32_t rack_rate_sample_method = USE_RTT_LOW;
260 static int32_t rack_pace_every_seg = 0;
261 static int32_t rack_delayed_ack_time = 40000;   /* 40ms in usecs */
262 static int32_t rack_slot_reduction = 4;
263 static int32_t rack_wma_divisor = 8;            /* For WMA calculation */
264 static int32_t rack_cwnd_block_ends_measure = 0;
265 static int32_t rack_rwnd_block_ends_measure = 0;
266 static int32_t rack_def_profile = 0;
267
268 static int32_t rack_lower_cwnd_at_tlp = 0;
269 static int32_t rack_limited_retran = 0;
270 static int32_t rack_always_send_oldest = 0;
271 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
272
273 static uint16_t rack_per_of_gp_ss = 250;        /* 250 % slow-start */
274 static uint16_t rack_per_of_gp_ca = 200;        /* 200 % congestion-avoidance */
275 static uint16_t rack_per_of_gp_rec = 200;       /* 200 % of bw */
276
277 /* Probertt */
278 static uint16_t rack_per_of_gp_probertt = 60;   /* 60% of bw */
279 static uint16_t rack_per_of_gp_lowthresh = 40;  /* 40% is bottom */
280 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
281 static uint16_t rack_atexit_prtt_hbp = 130;     /* Clamp to 130% on exit prtt if highly buffered path */
282 static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */
283
284 static uint32_t rack_max_drain_wait = 2;        /* How man gp srtt's before we give up draining */
285 static uint32_t rack_must_drain = 1;            /* How many GP srtt's we *must* wait */
286 static uint32_t rack_probertt_use_min_rtt_entry = 1;    /* Use the min to calculate the goal else gp_srtt */
287 static uint32_t rack_probertt_use_min_rtt_exit = 0;
288 static uint32_t rack_probe_rtt_sets_cwnd = 0;
289 static uint32_t rack_probe_rtt_safety_val = 2000000;    /* No more than 2 sec in probe-rtt */
290 static uint32_t rack_time_between_probertt = 9600000;   /* 9.6 sec in usecs */
291 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;       /* How many srtt periods does probe-rtt last top fraction */
292 static uint32_t rack_probertt_gpsrtt_cnt_div = 0;       /* How many srtt periods does probe-rtt last bottom fraction */
293 static uint32_t rack_min_probertt_hold = 40000;         /* Equal to delayed ack time */
294 static uint32_t rack_probertt_filter_life = 10000000;
295 static uint32_t rack_probertt_lower_within = 10;
296 static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds)  to count as a lowering */
297 static int32_t rack_pace_one_seg = 0;           /* Shall we pace for less than 1.4Meg 1MSS at a time */
298 static int32_t rack_probertt_clear_is = 1;
299 static int32_t rack_max_drain_hbp = 1;          /* Extra drain times gpsrtt for highly buffered paths */
300 static int32_t rack_hbp_thresh = 3;             /* what is the divisor max_rtt/min_rtt to decided a hbp */
301
302 /* Part of pacing */
303 static int32_t rack_max_per_above = 30;         /* When we go to increment stop if above 100+this% */
304
305 /* Timely information */
306 /* Combine these two gives the range of 'no change' to bw */
307 /* ie the up/down provide the upper and lower bound */
308 static int32_t rack_gp_per_bw_mul_up = 2;       /* 2% */
309 static int32_t rack_gp_per_bw_mul_down = 4;     /* 4% */
310 static int32_t rack_gp_rtt_maxmul = 3;          /* 3 x maxmin */
311 static int32_t rack_gp_rtt_minmul = 1;          /* minrtt + (minrtt/mindiv) is lower rtt */
312 static int32_t rack_gp_rtt_mindiv = 4;          /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
313 static int32_t rack_gp_decrease_per = 20;       /* 20% decrease in multiplier */
314 static int32_t rack_gp_increase_per = 2;        /* 2% increase in multiplier */
315 static int32_t rack_per_lower_bound = 50;       /* Don't allow to drop below this multiplier */
316 static int32_t rack_per_upper_bound_ss = 0;     /* Don't allow SS to grow above this */
317 static int32_t rack_per_upper_bound_ca = 0;     /* Don't allow CA to grow above this */
318 static int32_t rack_do_dyn_mul = 0;             /* Are the rack gp multipliers dynamic */
319 static int32_t rack_gp_no_rec_chg = 1;          /* Prohibit recovery from reducing it's multiplier */
320 static int32_t rack_timely_dec_clear = 6;       /* Do we clear decrement count at a value (6)? */
321 static int32_t rack_timely_max_push_rise = 3;   /* One round of pushing */
322 static int32_t rack_timely_max_push_drop = 3;   /* Three round of pushing */
323 static int32_t rack_timely_min_segs = 4;        /* 4 segment minimum */
324 static int32_t rack_use_max_for_nobackoff = 0;
325 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
326 static int32_t rack_timely_no_stopping = 0;
327 static int32_t rack_down_raise_thresh = 100;
328 static int32_t rack_req_segs = 1;
329 static uint64_t rack_bw_rate_cap = 0;
330 static uint32_t rack_trace_point_config = 0;
331 static uint32_t rack_trace_point_bb_mode = 4;
332 static int32_t rack_trace_point_count = 0;
333
334
335 /* Weird delayed ack mode */
336 static int32_t rack_use_imac_dack = 0;
337 /* Rack specific counters */
338 counter_u64_t rack_saw_enobuf;
339 counter_u64_t rack_saw_enobuf_hw;
340 counter_u64_t rack_saw_enetunreach;
341 counter_u64_t rack_persists_sends;
342 counter_u64_t rack_persists_acks;
343 counter_u64_t rack_persists_loss;
344 counter_u64_t rack_persists_lost_ends;
345 #ifdef INVARIANTS
346 counter_u64_t rack_adjust_map_bw;
347 #endif
348 /* Tail loss probe counters */
349 counter_u64_t rack_tlp_tot;
350 counter_u64_t rack_tlp_newdata;
351 counter_u64_t rack_tlp_retran;
352 counter_u64_t rack_tlp_retran_bytes;
353 counter_u64_t rack_to_tot;
354 counter_u64_t rack_hot_alloc;
355 counter_u64_t rack_to_alloc;
356 counter_u64_t rack_to_alloc_hard;
357 counter_u64_t rack_to_alloc_emerg;
358 counter_u64_t rack_to_alloc_limited;
359 counter_u64_t rack_alloc_limited_conns;
360 counter_u64_t rack_split_limited;
361
362 counter_u64_t rack_multi_single_eq;
363 counter_u64_t rack_proc_non_comp_ack;
364
365 counter_u64_t rack_fto_send;
366 counter_u64_t rack_fto_rsm_send;
367 counter_u64_t rack_nfto_resend;
368 counter_u64_t rack_non_fto_send;
369 counter_u64_t rack_extended_rfo;
370
371 counter_u64_t rack_sack_proc_all;
372 counter_u64_t rack_sack_proc_short;
373 counter_u64_t rack_sack_proc_restart;
374 counter_u64_t rack_sack_attacks_detected;
375 counter_u64_t rack_sack_attacks_reversed;
376 counter_u64_t rack_sack_used_next_merge;
377 counter_u64_t rack_sack_splits;
378 counter_u64_t rack_sack_used_prev_merge;
379 counter_u64_t rack_sack_skipped_acked;
380 counter_u64_t rack_ack_total;
381 counter_u64_t rack_express_sack;
382 counter_u64_t rack_sack_total;
383 counter_u64_t rack_move_none;
384 counter_u64_t rack_move_some;
385
386 counter_u64_t rack_input_idle_reduces;
387 counter_u64_t rack_collapsed_win;
388 counter_u64_t rack_try_scwnd;
389 counter_u64_t rack_hw_pace_init_fail;
390 counter_u64_t rack_hw_pace_lost;
391
392 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
393 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
394
395
396 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
397
/*
 * Set tv to (value) + (slop), clamped to the range [tvmin, tvmax].
 * The comparisons against the bounds are done as u_long.
 *
 * Every argument is parenthesized in the expansion so that callers may
 * pass arbitrary expressions (e.g. a conditional expression for slop).
 * Note that tv is evaluated several times, so it must be a plain lvalue
 * without side effects.
 */
#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {  \
        (tv) = (value) + (slop);   \
        if ((u_long)(tv) < (u_long)(tvmin)) \
                (tv) = (tvmin); \
        if ((u_long)(tv) > (u_long)(tvmax)) \
                (tv) = (tvmax); \
} while (0)
405
406 static void
407 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
408
409 static int
410 rack_process_ack(struct mbuf *m, struct tcphdr *th,
411     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
412     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
413 static int
414 rack_process_data(struct mbuf *m, struct tcphdr *th,
415     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
416     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
417 static void
418 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
419    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
420 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
421 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
422     uint8_t limit_type);
423 static struct rack_sendmap *
424 rack_check_recovery_mode(struct tcpcb *tp,
425     uint32_t tsused);
426 static void
427 rack_cong_signal(struct tcpcb *tp,
428                  uint32_t type, uint32_t ack, int );
429 static void rack_counter_destroy(void);
430 static int
431 rack_ctloutput(struct inpcb *inp, struct sockopt *sopt);
432 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
433 static void
434 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
435 static void
436 rack_do_segment(struct mbuf *m, struct tcphdr *th,
437     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
438     uint8_t iptos);
439 static void rack_dtor(void *mem, int32_t size, void *arg);
440 static void
441 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
442     uint32_t flex1, uint32_t flex2,
443     uint32_t flex3, uint32_t flex4,
444     uint32_t flex5, uint32_t flex6,
445     uint16_t flex7, uint8_t mod);
446
447 static void
448 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
449    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
450    struct rack_sendmap *rsm, uint8_t quality);
451 static struct rack_sendmap *
452 rack_find_high_nonack(struct tcp_rack *rack,
453     struct rack_sendmap *rsm);
454 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
455 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
456 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
457 static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt);
458 static void
459 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
460                             tcp_seq th_ack, int line, uint8_t quality);
461 static uint32_t
462 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
463 static int32_t rack_handoff_ok(struct tcpcb *tp);
464 static int32_t rack_init(struct tcpcb *tp);
465 static void rack_init_sysctls(void);
466 static void
467 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
468     struct tcphdr *th, int entered_rec, int dup_ack_struck);
469 static void
470 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
471     uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
472     struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
473
474 static void
475 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
476     struct rack_sendmap *rsm);
477 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
478 static int32_t rack_output(struct tcpcb *tp);
479
480 static uint32_t
481 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
482     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
483     uint32_t cts, int *moved_two);
484 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
485 static void rack_remxt_tmr(struct tcpcb *tp);
486 static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
487 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
488 static int32_t rack_stopall(struct tcpcb *tp);
489 static void
490 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
491     uint32_t delta);
492 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
493 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
494 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
495 static uint32_t
496 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
497     struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
498 static void
499 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
500     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
501 static int
502 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
503     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
504 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
505 static int
506 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
507     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
508     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
509 static int
510 rack_do_closing(struct mbuf *m, struct tcphdr *th,
511     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
512     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
513 static int
514 rack_do_established(struct mbuf *m, struct tcphdr *th,
515     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
516     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
517 static int
518 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
519     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
520     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
521 static int
522 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
523     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
524     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
525 static int
526 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
527     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
528     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
529 static int
530 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
531     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
532     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
533 static int
534 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
535     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
536     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
537 static int
538 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
539     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
540     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
541 struct rack_sendmap *
542 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
543     uint32_t tsused);
544 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
545     uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
546 static void
547      tcp_rack_partialack(struct tcpcb *tp);
548 static int
549 rack_set_profile(struct tcp_rack *rack, int prof);
550 static void
551 rack_apply_deferred_options(struct tcp_rack *rack);
552
553 int32_t rack_clear_counter=0;
554
555 static inline void
556 rack_trace_point(struct tcp_rack *rack, int num)
557 {
558         if (((rack_trace_point_config == num)  ||
559              (rack_trace_point_config = 0xffffffff)) &&
560             (rack_trace_point_bb_mode != 0) &&
561             (rack_trace_point_count > 0) &&
562             (rack->rc_tp->t_logstate == 0)) {
563                 int res;
564                 res = atomic_fetchadd_int(&rack_trace_point_count, -1);
565                 if (res > 0) {
566                         rack->rc_tp->t_logstate = rack_trace_point_bb_mode;
567                 } else {
568                         /* Loss a race assure its zero now */
569                         rack_trace_point_count = 0;
570                 }
571         }
572 }
573
574 static void
575 rack_set_cc_pacing(struct tcp_rack *rack)
576 {
577         struct sockopt sopt;
578         struct cc_newreno_opts opt;
579         struct newreno old, *ptr;
580         struct tcpcb *tp;
581         int error;
582
583         if (rack->rc_pacing_cc_set)
584                 return;
585
586         tp = rack->rc_tp;
587         if (tp->cc_algo == NULL) {
588                 /* Tcb is leaving */
589                 return;
590         }
591         rack->rc_pacing_cc_set = 1;
592         if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
593                 /* Not new-reno we can't play games with beta! */
594                 goto out;
595         }
596         ptr = ((struct newreno *)tp->ccv->cc_data);
597         if (CC_ALGO(tp)->ctl_output == NULL)  {
598                 /* Huh, why does new_reno no longer have a set function? */
599                 goto out;
600         }
601         if (ptr == NULL) {
602                 /* Just the default values */
603                 old.beta = V_newreno_beta_ecn;
604                 old.beta_ecn = V_newreno_beta_ecn;
605                 old.newreno_flags = 0;
606         } else {
607                 old.beta = ptr->beta;
608                 old.beta_ecn = ptr->beta_ecn;
609                 old.newreno_flags = ptr->newreno_flags;
610         }
611         sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
612         sopt.sopt_dir = SOPT_SET;
613         opt.name = CC_NEWRENO_BETA;
614         opt.val = rack->r_ctl.rc_saved_beta.beta;
615         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
616         if (error)  {
617                 goto out;
618         }
619         /*
620          * Hack alert we need to set in our newreno_flags
621          * so that Abe behavior is also applied.
622          */
623         ((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
624         opt.name = CC_NEWRENO_BETA_ECN;
625         opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
626         error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
627         if (error) {
628                 goto out;
629         }
630         /* Save off the original values for restoral */
631         memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
632 out:
633         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
634                 union tcp_log_stackspecific log;
635                 struct timeval tv;
636
637                 ptr = ((struct newreno *)tp->ccv->cc_data);
638                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
639                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
640                 if (ptr) {
641                         log.u_bbr.flex1 = ptr->beta;
642                         log.u_bbr.flex2 = ptr->beta_ecn;
643                         log.u_bbr.flex3 = ptr->newreno_flags;
644                 }
645                 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
646                 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
647                 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
648                 log.u_bbr.flex7 = rack->gp_ready;
649                 log.u_bbr.flex7 <<= 1;
650                 log.u_bbr.flex7 |= rack->use_fixed_rate;
651                 log.u_bbr.flex7 <<= 1;
652                 log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
653                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
654                 log.u_bbr.flex8 = 3;
655                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
656                                0, &log, false, NULL, NULL, 0, &tv);
657         }
658 }
659
660 static void
661 rack_undo_cc_pacing(struct tcp_rack *rack)
662 {
663         struct newreno old, *ptr;
664         struct tcpcb *tp;
665
666         if (rack->rc_pacing_cc_set == 0)
667                 return;
668         tp = rack->rc_tp;
669         rack->rc_pacing_cc_set = 0;
670         if (tp->cc_algo == NULL)
671                 /* Tcb is leaving */
672                 return;
673         if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
674                 /* Not new-reno nothing to do! */
675                 return;
676         }
677         ptr = ((struct newreno *)tp->ccv->cc_data);
678         if (ptr == NULL) {
679                 /*
680                  * This happens at rack_fini() if the
681                  * cc module gets freed on us. In that
682                  * case we loose our "new" settings but
683                  * thats ok, since the tcb is going away anyway.
684                  */
685                 return;
686         }
687         /* Grab out our set values */
688         memcpy(&old, ptr, sizeof(struct newreno));
689         /* Copy back in the original values */
690         memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
691         /* Now save back the values we had set in (for when pacing is restored) */
692         memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
693         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
694                 union tcp_log_stackspecific log;
695                 struct timeval tv;
696
697                 ptr = ((struct newreno *)tp->ccv->cc_data);
698                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
699                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
700                 log.u_bbr.flex1 = ptr->beta;
701                 log.u_bbr.flex2 = ptr->beta_ecn;
702                 log.u_bbr.flex3 = ptr->newreno_flags;
703                 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
704                 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
705                 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
706                 log.u_bbr.flex7 = rack->gp_ready;
707                 log.u_bbr.flex7 <<= 1;
708                 log.u_bbr.flex7 |= rack->use_fixed_rate;
709                 log.u_bbr.flex7 <<= 1;
710                 log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
711                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
712                 log.u_bbr.flex8 = 4;
713                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
714                                0, &log, false, NULL, NULL, 0, &tv);
715         }
716 }
717
718 #ifdef NETFLIX_PEAKRATE
719 static inline void
720 rack_update_peakrate_thr(struct tcpcb *tp)
721 {
722         /* Keep in mind that t_maxpeakrate is in B/s. */
723         uint64_t peak;
724         peak = uqmax((tp->t_maxseg * 2),
725                      (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
726         tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
727 }
728 #endif
729
730 static int
731 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
732 {
733         uint32_t stat;
734         int32_t error;
735
736         error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
737         if (error || req->newptr == NULL)
738                 return error;
739
740         error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
741         if (error)
742                 return (error);
743         if (stat == 1) {
744 #ifdef INVARIANTS
745                 printf("Clearing RACK counters\n");
746 #endif
747                 counter_u64_zero(rack_tlp_tot);
748                 counter_u64_zero(rack_tlp_newdata);
749                 counter_u64_zero(rack_tlp_retran);
750                 counter_u64_zero(rack_tlp_retran_bytes);
751                 counter_u64_zero(rack_to_tot);
752                 counter_u64_zero(rack_saw_enobuf);
753                 counter_u64_zero(rack_saw_enobuf_hw);
754                 counter_u64_zero(rack_saw_enetunreach);
755                 counter_u64_zero(rack_persists_sends);
756                 counter_u64_zero(rack_persists_acks);
757                 counter_u64_zero(rack_persists_loss);
758                 counter_u64_zero(rack_persists_lost_ends);
759 #ifdef INVARIANTS
760                 counter_u64_zero(rack_adjust_map_bw);
761 #endif
762                 counter_u64_zero(rack_to_alloc_hard);
763                 counter_u64_zero(rack_to_alloc_emerg);
764                 counter_u64_zero(rack_sack_proc_all);
765                 counter_u64_zero(rack_fto_send);
766                 counter_u64_zero(rack_fto_rsm_send);
767                 counter_u64_zero(rack_extended_rfo);
768                 counter_u64_zero(rack_hw_pace_init_fail);
769                 counter_u64_zero(rack_hw_pace_lost);
770                 counter_u64_zero(rack_non_fto_send);
771                 counter_u64_zero(rack_nfto_resend);
772                 counter_u64_zero(rack_sack_proc_short);
773                 counter_u64_zero(rack_sack_proc_restart);
774                 counter_u64_zero(rack_to_alloc);
775                 counter_u64_zero(rack_to_alloc_limited);
776                 counter_u64_zero(rack_alloc_limited_conns);
777                 counter_u64_zero(rack_split_limited);
778                 counter_u64_zero(rack_multi_single_eq);
779                 counter_u64_zero(rack_proc_non_comp_ack);
780                 counter_u64_zero(rack_sack_attacks_detected);
781                 counter_u64_zero(rack_sack_attacks_reversed);
782                 counter_u64_zero(rack_sack_used_next_merge);
783                 counter_u64_zero(rack_sack_used_prev_merge);
784                 counter_u64_zero(rack_sack_splits);
785                 counter_u64_zero(rack_sack_skipped_acked);
786                 counter_u64_zero(rack_ack_total);
787                 counter_u64_zero(rack_express_sack);
788                 counter_u64_zero(rack_sack_total);
789                 counter_u64_zero(rack_move_none);
790                 counter_u64_zero(rack_move_some);
791                 counter_u64_zero(rack_try_scwnd);
792                 counter_u64_zero(rack_collapsed_win);
793         }
794         rack_clear_counter = 0;
795         return (0);
796 }
797
798 static void
799 rack_init_sysctls(void)
800 {
801         struct sysctl_oid *rack_counters;
802         struct sysctl_oid *rack_attack;
803         struct sysctl_oid *rack_pacing;
804         struct sysctl_oid *rack_timely;
805         struct sysctl_oid *rack_timers;
806         struct sysctl_oid *rack_tlp;
807         struct sysctl_oid *rack_misc;
808         struct sysctl_oid *rack_features;
809         struct sysctl_oid *rack_measure;
810         struct sysctl_oid *rack_probertt;
811         struct sysctl_oid *rack_hw_pacing;
812         struct sysctl_oid *rack_tracepoint;
813
814         rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
815             SYSCTL_CHILDREN(rack_sysctl_root),
816             OID_AUTO,
817             "sack_attack",
818             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
819             "Rack Sack Attack Counters and Controls");
820         rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
821             SYSCTL_CHILDREN(rack_sysctl_root),
822             OID_AUTO,
823             "stats",
824             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
825             "Rack Counters");
826         SYSCTL_ADD_S32(&rack_sysctl_ctx,
827             SYSCTL_CHILDREN(rack_sysctl_root),
828             OID_AUTO, "rate_sample_method", CTLFLAG_RW,
829             &rack_rate_sample_method , USE_RTT_LOW,
830             "What method should we use for rate sampling 0=high, 1=low ");
831         /* Probe rtt related controls */
832         rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
833             SYSCTL_CHILDREN(rack_sysctl_root),
834             OID_AUTO,
835             "probertt",
836             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
837             "ProbeRTT related Controls");
838         SYSCTL_ADD_U16(&rack_sysctl_ctx,
839             SYSCTL_CHILDREN(rack_probertt),
840             OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
841             &rack_atexit_prtt_hbp, 130,
842             "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
843         SYSCTL_ADD_U16(&rack_sysctl_ctx,
844             SYSCTL_CHILDREN(rack_probertt),
845             OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
846             &rack_atexit_prtt, 130,
847             "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
848         SYSCTL_ADD_U16(&rack_sysctl_ctx,
849             SYSCTL_CHILDREN(rack_probertt),
850             OID_AUTO, "gp_per_mul", CTLFLAG_RW,
851             &rack_per_of_gp_probertt, 60,
852             "What percentage of goodput do we pace at in probertt");
853         SYSCTL_ADD_U16(&rack_sysctl_ctx,
854             SYSCTL_CHILDREN(rack_probertt),
855             OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
856             &rack_per_of_gp_probertt_reduce, 10,
857             "What percentage of goodput do we reduce every gp_srtt");
858         SYSCTL_ADD_U16(&rack_sysctl_ctx,
859             SYSCTL_CHILDREN(rack_probertt),
860             OID_AUTO, "gp_per_low", CTLFLAG_RW,
861             &rack_per_of_gp_lowthresh, 40,
862             "What percentage of goodput do we allow the multiplier to fall to");
863         SYSCTL_ADD_U32(&rack_sysctl_ctx,
864             SYSCTL_CHILDREN(rack_probertt),
865             OID_AUTO, "time_between", CTLFLAG_RW,
866             & rack_time_between_probertt, 96000000,
867             "How many useconds between the lowest rtt falling must past before we enter probertt");
868         SYSCTL_ADD_U32(&rack_sysctl_ctx,
869             SYSCTL_CHILDREN(rack_probertt),
870             OID_AUTO, "safety", CTLFLAG_RW,
871             &rack_probe_rtt_safety_val, 2000000,
872             "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
873         SYSCTL_ADD_U32(&rack_sysctl_ctx,
874             SYSCTL_CHILDREN(rack_probertt),
875             OID_AUTO, "sets_cwnd", CTLFLAG_RW,
876             &rack_probe_rtt_sets_cwnd, 0,
877             "Do we set the cwnd too (if always_lower is on)");
878         SYSCTL_ADD_U32(&rack_sysctl_ctx,
879             SYSCTL_CHILDREN(rack_probertt),
880             OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
881             &rack_max_drain_wait, 2,
882             "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
883         SYSCTL_ADD_U32(&rack_sysctl_ctx,
884             SYSCTL_CHILDREN(rack_probertt),
885             OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
886             &rack_must_drain, 1,
887             "We must drain this many gp_srtt's waiting for flight to reach goal");
888         SYSCTL_ADD_U32(&rack_sysctl_ctx,
889             SYSCTL_CHILDREN(rack_probertt),
890             OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
891             &rack_probertt_use_min_rtt_entry, 1,
892             "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
893         SYSCTL_ADD_U32(&rack_sysctl_ctx,
894             SYSCTL_CHILDREN(rack_probertt),
895             OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
896             &rack_probertt_use_min_rtt_exit, 0,
897             "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
898         SYSCTL_ADD_U32(&rack_sysctl_ctx,
899             SYSCTL_CHILDREN(rack_probertt),
900             OID_AUTO, "length_div", CTLFLAG_RW,
901             &rack_probertt_gpsrtt_cnt_div, 0,
902             "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
903         SYSCTL_ADD_U32(&rack_sysctl_ctx,
904             SYSCTL_CHILDREN(rack_probertt),
905             OID_AUTO, "length_mul", CTLFLAG_RW,
906             &rack_probertt_gpsrtt_cnt_mul, 0,
907             "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
908         SYSCTL_ADD_U32(&rack_sysctl_ctx,
909             SYSCTL_CHILDREN(rack_probertt),
910             OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
911             &rack_min_probertt_hold, 200000,
912             "What is the minimum time we hold probertt at target");
913         SYSCTL_ADD_U32(&rack_sysctl_ctx,
914             SYSCTL_CHILDREN(rack_probertt),
915             OID_AUTO, "filter_life", CTLFLAG_RW,
916             &rack_probertt_filter_life, 10000000,
917             "What is the time for the filters life in useconds");
918         SYSCTL_ADD_U32(&rack_sysctl_ctx,
919             SYSCTL_CHILDREN(rack_probertt),
920             OID_AUTO, "lower_within", CTLFLAG_RW,
921             &rack_probertt_lower_within, 10,
922             "If the rtt goes lower within this percentage of the time, go into probe-rtt");
923         SYSCTL_ADD_U32(&rack_sysctl_ctx,
924             SYSCTL_CHILDREN(rack_probertt),
925             OID_AUTO, "must_move", CTLFLAG_RW,
926             &rack_min_rtt_movement, 250,
927             "How much is the minimum movement in rtt to count as a drop for probertt purposes");
928         SYSCTL_ADD_U32(&rack_sysctl_ctx,
929             SYSCTL_CHILDREN(rack_probertt),
930             OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
931             &rack_probertt_clear_is, 1,
932             "Do we clear I/S counts on exiting probe-rtt");
933         SYSCTL_ADD_S32(&rack_sysctl_ctx,
934             SYSCTL_CHILDREN(rack_probertt),
935             OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
936             &rack_max_drain_hbp, 1,
937             "How many extra drain gpsrtt's do we get in highly buffered paths");
938         SYSCTL_ADD_S32(&rack_sysctl_ctx,
939             SYSCTL_CHILDREN(rack_probertt),
940             OID_AUTO, "hbp_threshold", CTLFLAG_RW,
941             &rack_hbp_thresh, 3,
942             "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
943
944         rack_tracepoint = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
945             SYSCTL_CHILDREN(rack_sysctl_root),
946             OID_AUTO,
947             "tp",
948             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
949             "Rack tracepoint facility");
950         SYSCTL_ADD_U32(&rack_sysctl_ctx,
951             SYSCTL_CHILDREN(rack_tracepoint),
952             OID_AUTO, "number", CTLFLAG_RW,
953             &rack_trace_point_config, 0,
954             "What is the trace point number to activate (0=none, 0xffffffff = all)?");
955         SYSCTL_ADD_U32(&rack_sysctl_ctx,
956             SYSCTL_CHILDREN(rack_tracepoint),
957             OID_AUTO, "bbmode", CTLFLAG_RW,
958             &rack_trace_point_bb_mode, 4,
959             "What is BB logging mode that is activated?");
960         SYSCTL_ADD_S32(&rack_sysctl_ctx,
961             SYSCTL_CHILDREN(rack_tracepoint),
962             OID_AUTO, "count", CTLFLAG_RW,
963             &rack_trace_point_count, 0,
964             "How many connections will have BB logging turned on that hit the tracepoint?");
965         /* Pacing related sysctls */
966         rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
967             SYSCTL_CHILDREN(rack_sysctl_root),
968             OID_AUTO,
969             "pacing",
970             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
971             "Pacing related Controls");
972         SYSCTL_ADD_S32(&rack_sysctl_ctx,
973             SYSCTL_CHILDREN(rack_pacing),
974             OID_AUTO, "max_pace_over", CTLFLAG_RW,
975             &rack_max_per_above, 30,
976             "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
977         SYSCTL_ADD_S32(&rack_sysctl_ctx,
978             SYSCTL_CHILDREN(rack_pacing),
979             OID_AUTO, "pace_to_one", CTLFLAG_RW,
980             &rack_pace_one_seg, 0,
981             "Do we allow low b/w pacing of 1MSS instead of two");
982         SYSCTL_ADD_S32(&rack_sysctl_ctx,
983             SYSCTL_CHILDREN(rack_pacing),
984             OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
985             &rack_limit_time_with_srtt, 0,
986             "Do we limit pacing time based on srtt");
987         SYSCTL_ADD_S32(&rack_sysctl_ctx,
988             SYSCTL_CHILDREN(rack_pacing),
989             OID_AUTO, "init_win", CTLFLAG_RW,
990             &rack_default_init_window, 0,
991             "Do we have a rack initial window 0 = system default");
992         SYSCTL_ADD_U16(&rack_sysctl_ctx,
993             SYSCTL_CHILDREN(rack_pacing),
994             OID_AUTO, "gp_per_ss", CTLFLAG_RW,
995             &rack_per_of_gp_ss, 250,
996             "If non zero, what percentage of goodput to pace at in slow start");
997         SYSCTL_ADD_U16(&rack_sysctl_ctx,
998             SYSCTL_CHILDREN(rack_pacing),
999             OID_AUTO, "gp_per_ca", CTLFLAG_RW,
1000             &rack_per_of_gp_ca, 150,
1001             "If non zero, what percentage of goodput to pace at in congestion avoidance");
1002         SYSCTL_ADD_U16(&rack_sysctl_ctx,
1003             SYSCTL_CHILDREN(rack_pacing),
1004             OID_AUTO, "gp_per_rec", CTLFLAG_RW,
1005             &rack_per_of_gp_rec, 200,
1006             "If non zero, what percentage of goodput to pace at in recovery");
1007         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1008             SYSCTL_CHILDREN(rack_pacing),
1009             OID_AUTO, "pace_max_seg", CTLFLAG_RW,
1010             &rack_hptsi_segments, 40,
1011             "What size is the max for TSO segments in pacing and burst mitigation");
1012         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1013             SYSCTL_CHILDREN(rack_pacing),
1014             OID_AUTO, "burst_reduces", CTLFLAG_RW,
1015             &rack_slot_reduction, 4,
1016             "When doing only burst mitigation what is the reduce divisor");
1017         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1018             SYSCTL_CHILDREN(rack_sysctl_root),
1019             OID_AUTO, "use_pacing", CTLFLAG_RW,
1020             &rack_pace_every_seg, 0,
1021             "If set we use pacing, if clear we use only the original burst mitigation");
1022         SYSCTL_ADD_U64(&rack_sysctl_ctx,
1023             SYSCTL_CHILDREN(rack_pacing),
1024             OID_AUTO, "rate_cap", CTLFLAG_RW,
1025             &rack_bw_rate_cap, 0,
1026             "If set we apply this value to the absolute rate cap used by pacing");
1027         SYSCTL_ADD_U8(&rack_sysctl_ctx,
1028             SYSCTL_CHILDREN(rack_sysctl_root),
1029             OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
1030             &rack_req_measurements, 1,
1031             "If doing dynamic pacing, how many measurements must be in before we start pacing?");
1032         /* Hardware pacing */
1033         rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1034             SYSCTL_CHILDREN(rack_sysctl_root),
1035             OID_AUTO,
1036             "hdwr_pacing",
1037             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1038             "Pacing related Controls");
1039         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1040             SYSCTL_CHILDREN(rack_hw_pacing),
1041             OID_AUTO, "rwnd_factor", CTLFLAG_RW,
1042             &rack_hw_rwnd_factor, 2,
1043             "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
1044         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1045             SYSCTL_CHILDREN(rack_hw_pacing),
1046             OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
1047             &rack_enobuf_hw_boost_mult, 2,
1048             "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
1049         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1050             SYSCTL_CHILDREN(rack_hw_pacing),
1051             OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
1052             &rack_enobuf_hw_max, 2,
1053             "What is the max boost the pacing time if we see a ENOBUFS?");
1054         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1055             SYSCTL_CHILDREN(rack_hw_pacing),
1056             OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
1057             &rack_enobuf_hw_min, 2,
1058             "What is the min boost the pacing time if we see a ENOBUFS?");
1059         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1060             SYSCTL_CHILDREN(rack_hw_pacing),
1061             OID_AUTO, "enable", CTLFLAG_RW,
1062             &rack_enable_hw_pacing, 0,
1063             "Should RACK attempt to use hw pacing?");
1064         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1065             SYSCTL_CHILDREN(rack_hw_pacing),
1066             OID_AUTO, "rate_cap", CTLFLAG_RW,
1067             &rack_hw_rate_caps, 1,
1068             "Does the highest hardware pacing rate cap the rate we will send at??");
1069         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1070             SYSCTL_CHILDREN(rack_hw_pacing),
1071             OID_AUTO, "rate_min", CTLFLAG_RW,
1072             &rack_hw_rate_min, 0,
1073             "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
1074         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1075             SYSCTL_CHILDREN(rack_hw_pacing),
1076             OID_AUTO, "rate_to_low", CTLFLAG_RW,
1077             &rack_hw_rate_to_low, 0,
1078             "If we fall below this rate, dis-engage hw pacing?");
1079         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1080             SYSCTL_CHILDREN(rack_hw_pacing),
1081             OID_AUTO, "up_only", CTLFLAG_RW,
1082             &rack_hw_up_only, 1,
1083             "Do we allow hw pacing to lower the rate selected?");
1084         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1085             SYSCTL_CHILDREN(rack_hw_pacing),
1086             OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
1087             &rack_hw_pace_extra_slots, 2,
1088             "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
1089         rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1090             SYSCTL_CHILDREN(rack_sysctl_root),
1091             OID_AUTO,
1092             "timely",
1093             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1094             "Rack Timely RTT Controls");
1095         /* Timely based GP dynmics */
1096         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1097             SYSCTL_CHILDREN(rack_timely),
1098             OID_AUTO, "upper", CTLFLAG_RW,
1099             &rack_gp_per_bw_mul_up, 2,
1100             "Rack timely upper range for equal b/w (in percentage)");
1101         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1102             SYSCTL_CHILDREN(rack_timely),
1103             OID_AUTO, "lower", CTLFLAG_RW,
1104             &rack_gp_per_bw_mul_down, 4,
1105             "Rack timely lower range for equal b/w (in percentage)");
1106         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1107             SYSCTL_CHILDREN(rack_timely),
1108             OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
1109             &rack_gp_rtt_maxmul, 3,
1110             "Rack timely multiplier of lowest rtt for rtt_max");
1111         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1112             SYSCTL_CHILDREN(rack_timely),
1113             OID_AUTO, "rtt_min_div", CTLFLAG_RW,
1114             &rack_gp_rtt_mindiv, 4,
1115             "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
1116         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1117             SYSCTL_CHILDREN(rack_timely),
1118             OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
1119             &rack_gp_rtt_minmul, 1,
1120             "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
1121         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1122             SYSCTL_CHILDREN(rack_timely),
1123             OID_AUTO, "decrease", CTLFLAG_RW,
1124             &rack_gp_decrease_per, 20,
1125             "Rack timely decrease percentage of our GP multiplication factor");
1126         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1127             SYSCTL_CHILDREN(rack_timely),
1128             OID_AUTO, "increase", CTLFLAG_RW,
1129             &rack_gp_increase_per, 2,
1130             "Rack timely increase perentage of our GP multiplication factor");
1131         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1132             SYSCTL_CHILDREN(rack_timely),
1133             OID_AUTO, "lowerbound", CTLFLAG_RW,
1134             &rack_per_lower_bound, 50,
1135             "Rack timely lowest percentage we allow GP multiplier to fall to");
1136         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1137             SYSCTL_CHILDREN(rack_timely),
1138             OID_AUTO, "upperboundss", CTLFLAG_RW,
1139             &rack_per_upper_bound_ss, 0,
1140             "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
1141         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1142             SYSCTL_CHILDREN(rack_timely),
1143             OID_AUTO, "upperboundca", CTLFLAG_RW,
1144             &rack_per_upper_bound_ca, 0,
1145             "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
1146         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1147             SYSCTL_CHILDREN(rack_timely),
1148             OID_AUTO, "dynamicgp", CTLFLAG_RW,
1149             &rack_do_dyn_mul, 0,
1150             "Rack timely do we enable dynmaic timely goodput by default");
1151         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1152             SYSCTL_CHILDREN(rack_timely),
1153             OID_AUTO, "no_rec_red", CTLFLAG_RW,
1154             &rack_gp_no_rec_chg, 1,
1155             "Rack timely do we prohibit the recovery multiplier from being lowered");
1156         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1157             SYSCTL_CHILDREN(rack_timely),
1158             OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
1159             &rack_timely_dec_clear, 6,
1160             "Rack timely what threshold do we count to before another boost during b/w decent");
1161         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1162             SYSCTL_CHILDREN(rack_timely),
1163             OID_AUTO, "max_push_rise", CTLFLAG_RW,
1164             &rack_timely_max_push_rise, 3,
1165             "Rack timely how many times do we push up with b/w increase");
1166         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1167             SYSCTL_CHILDREN(rack_timely),
1168             OID_AUTO, "max_push_drop", CTLFLAG_RW,
1169             &rack_timely_max_push_drop, 3,
1170             "Rack timely how many times do we push back on b/w decent");
1171         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1172             SYSCTL_CHILDREN(rack_timely),
1173             OID_AUTO, "min_segs", CTLFLAG_RW,
1174             &rack_timely_min_segs, 4,
1175             "Rack timely when setting the cwnd what is the min num segments");
1176         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1177             SYSCTL_CHILDREN(rack_timely),
1178             OID_AUTO, "noback_max", CTLFLAG_RW,
1179             &rack_use_max_for_nobackoff, 0,
1180             "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
1181         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1182             SYSCTL_CHILDREN(rack_timely),
1183             OID_AUTO, "interim_timely_only", CTLFLAG_RW,
1184             &rack_timely_int_timely_only, 0,
1185             "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
1186         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1187             SYSCTL_CHILDREN(rack_timely),
1188             OID_AUTO, "nonstop", CTLFLAG_RW,
1189             &rack_timely_no_stopping, 0,
1190             "Rack timely don't stop increase");
1191         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1192             SYSCTL_CHILDREN(rack_timely),
1193             OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
1194             &rack_down_raise_thresh, 100,
1195             "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
1196         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1197             SYSCTL_CHILDREN(rack_timely),
1198             OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
1199             &rack_req_segs, 1,
1200             "Bottom dragging if not these many segments outstanding and room");
1201
1202         /* TLP and Rack related parameters */
1203         rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1204             SYSCTL_CHILDREN(rack_sysctl_root),
1205             OID_AUTO,
1206             "tlp",
1207             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1208             "TLP and Rack related Controls");
1209         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1210             SYSCTL_CHILDREN(rack_tlp),
1211             OID_AUTO, "use_rrr", CTLFLAG_RW,
1212             &use_rack_rr, 1,
1213             "Do we use Rack Rapid Recovery");
1214         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1215             SYSCTL_CHILDREN(rack_tlp),
1216             OID_AUTO, "post_rec_labc", CTLFLAG_RW,
1217             &rack_max_abc_post_recovery, 2,
1218             "Since we do early recovery, do we override the l_abc to a value, if so what?");
1219         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1220             SYSCTL_CHILDREN(rack_tlp),
1221             OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
1222             &rack_non_rxt_use_cr, 0,
1223             "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
1224         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1225             SYSCTL_CHILDREN(rack_tlp),
1226             OID_AUTO, "tlpmethod", CTLFLAG_RW,
1227             &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
1228             "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
1229         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1230             SYSCTL_CHILDREN(rack_tlp),
1231             OID_AUTO, "limit", CTLFLAG_RW,
1232             &rack_tlp_limit, 2,
1233             "How many TLP's can be sent without sending new data");
1234         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1235             SYSCTL_CHILDREN(rack_tlp),
1236             OID_AUTO, "use_greater", CTLFLAG_RW,
1237             &rack_tlp_use_greater, 1,
1238             "Should we use the rack_rtt time if its greater than srtt");
1239         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1240             SYSCTL_CHILDREN(rack_tlp),
1241             OID_AUTO, "tlpminto", CTLFLAG_RW,
1242             &rack_tlp_min, 10000,
1243             "TLP minimum timeout per the specification (in microseconds)");
1244         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1245             SYSCTL_CHILDREN(rack_tlp),
1246             OID_AUTO, "send_oldest", CTLFLAG_RW,
1247             &rack_always_send_oldest, 0,
1248             "Should we always send the oldest TLP and RACK-TLP");
1249         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1250             SYSCTL_CHILDREN(rack_tlp),
1251             OID_AUTO, "rack_tlimit", CTLFLAG_RW,
1252             &rack_limited_retran, 0,
1253             "How many times can a rack timeout drive out sends");
1254         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1255             SYSCTL_CHILDREN(rack_tlp),
1256             OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
1257             &rack_lower_cwnd_at_tlp, 0,
1258             "When a TLP completes a retran should we enter recovery");
1259         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1260             SYSCTL_CHILDREN(rack_tlp),
1261             OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1262             &rack_reorder_thresh, 2,
1263             "What factor for rack will be added when seeing reordering (shift right)");
1264         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1265             SYSCTL_CHILDREN(rack_tlp),
1266             OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1267             &rack_tlp_thresh, 1,
1268             "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1269         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1270             SYSCTL_CHILDREN(rack_tlp),
1271             OID_AUTO, "reorder_fade", CTLFLAG_RW,
1272             &rack_reorder_fade, 60000000,
1273             "Does reorder detection fade, if so how many microseconds (0 means never)");
1274         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1275             SYSCTL_CHILDREN(rack_tlp),
1276             OID_AUTO, "pktdelay", CTLFLAG_RW,
1277             &rack_pkt_delay, 1000,
1278             "Extra RACK time (in microseconds) besides reordering thresh");
1279
1280         /* Timer related controls */
1281         rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1282             SYSCTL_CHILDREN(rack_sysctl_root),
1283             OID_AUTO,
1284             "timers",
1285             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1286             "Timer related controls");
1287         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1288             SYSCTL_CHILDREN(rack_timers),
1289             OID_AUTO, "persmin", CTLFLAG_RW,
1290             &rack_persist_min, 250000,
1291             "What is the minimum time in microseconds between persists");
1292         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1293             SYSCTL_CHILDREN(rack_timers),
1294             OID_AUTO, "persmax", CTLFLAG_RW,
1295             &rack_persist_max, 2000000,
1296             "What is the largest delay in microseconds between persists");
1297         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1298             SYSCTL_CHILDREN(rack_timers),
1299             OID_AUTO, "delayed_ack", CTLFLAG_RW,
1300             &rack_delayed_ack_time, 40000,
1301             "Delayed ack time (40ms in microseconds)");
1302         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1303             SYSCTL_CHILDREN(rack_timers),
1304             OID_AUTO, "minrto", CTLFLAG_RW,
1305             &rack_rto_min, 30000,
1306             "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
1307         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1308             SYSCTL_CHILDREN(rack_timers),
1309             OID_AUTO, "maxrto", CTLFLAG_RW,
1310             &rack_rto_max, 4000000,
1311             "Maximum RTO in microseconds -- should be at least as large as min_rto");
1312         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1313             SYSCTL_CHILDREN(rack_timers),
1314             OID_AUTO, "minto", CTLFLAG_RW,
1315             &rack_min_to, 1000,
1316             "Minimum rack timeout in microseconds");
1317         /* Measure controls */
1318         rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1319             SYSCTL_CHILDREN(rack_sysctl_root),
1320             OID_AUTO,
1321             "measure",
1322             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1323             "Measure related controls");
1324         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1325             SYSCTL_CHILDREN(rack_measure),
1326             OID_AUTO, "wma_divisor", CTLFLAG_RW,
1327             &rack_wma_divisor, 8,
1328             "When doing b/w calculation what is the  divisor for the WMA");
1329         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1330             SYSCTL_CHILDREN(rack_measure),
1331             OID_AUTO, "end_cwnd", CTLFLAG_RW,
1332             &rack_cwnd_block_ends_measure, 0,
1333             "Does a cwnd just-return end the measurement window (app limited)");
1334         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1335             SYSCTL_CHILDREN(rack_measure),
1336             OID_AUTO, "end_rwnd", CTLFLAG_RW,
1337             &rack_rwnd_block_ends_measure, 0,
1338             "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1339         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1340             SYSCTL_CHILDREN(rack_measure),
1341             OID_AUTO, "min_target", CTLFLAG_RW,
1342             &rack_def_data_window, 20,
1343             "What is the minimum target window (in mss) for a GP measurements");
1344         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1345             SYSCTL_CHILDREN(rack_measure),
1346             OID_AUTO, "goal_bdp", CTLFLAG_RW,
1347             &rack_goal_bdp, 2,
1348             "What is the goal BDP to measure");
1349         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1350             SYSCTL_CHILDREN(rack_measure),
1351             OID_AUTO, "min_srtts", CTLFLAG_RW,
1352             &rack_min_srtts, 1,
1353             "What is the goal BDP to measure");
1354         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1355             SYSCTL_CHILDREN(rack_measure),
1356             OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1357             &rack_min_measure_usec, 0,
1358             "What is the Minimum time time for a measurement if 0, this is off");
1359         /* Features */
1360         rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1361             SYSCTL_CHILDREN(rack_sysctl_root),
1362             OID_AUTO,
1363             "features",
1364             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1365             "Feature controls");
1366         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1367             SYSCTL_CHILDREN(rack_features),
1368             OID_AUTO, "cmpack", CTLFLAG_RW,
1369             &rack_use_cmp_acks, 1,
1370             "Should RACK have LRO send compressed acks");
1371         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1372             SYSCTL_CHILDREN(rack_features),
1373             OID_AUTO, "fsb", CTLFLAG_RW,
1374             &rack_use_fsb, 1,
1375             "Should RACK use the fast send block?");
1376         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1377             SYSCTL_CHILDREN(rack_features),
1378             OID_AUTO, "rfo", CTLFLAG_RW,
1379             &rack_use_rfo, 1,
1380             "Should RACK use rack_fast_output()?");
1381         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1382             SYSCTL_CHILDREN(rack_features),
1383             OID_AUTO, "rsmrfo", CTLFLAG_RW,
1384             &rack_use_rsm_rfo, 1,
1385             "Should RACK use rack_fast_rsm_output()?");
1386         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1387             SYSCTL_CHILDREN(rack_features),
1388             OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1389             &rack_enable_mqueue_for_nonpaced, 0,
1390             "Should RACK use mbuf queuing for non-paced connections");
1391         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1392             SYSCTL_CHILDREN(rack_features),
1393             OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1394             &rack_do_hystart, 0,
1395             "Should RACK enable HyStart++ on connections?");
1396         /* Misc rack controls */
1397         rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1398             SYSCTL_CHILDREN(rack_sysctl_root),
1399             OID_AUTO,
1400             "misc",
1401             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1402             "Misc related controls");
1403 #ifdef TCP_ACCOUNTING
1404         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1405             SYSCTL_CHILDREN(rack_misc),
1406             OID_AUTO, "tcp_acct", CTLFLAG_RW,
1407             &rack_tcp_accounting, 0,
1408             "Should we turn on TCP accounting for all rack sessions?");
1409 #endif
1410         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1411             SYSCTL_CHILDREN(rack_misc),
1412             OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
1413             &rack_apply_rtt_with_reduced_conf, 0,
1414             "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
1415         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1416             SYSCTL_CHILDREN(rack_misc),
1417             OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1418             &rack_dsack_std_based, 3,
1419             "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1420         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1421             SYSCTL_CHILDREN(rack_misc),
1422             OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1423             &rack_prr_addbackmax, 2,
1424             "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1425         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1426             SYSCTL_CHILDREN(rack_misc),
1427             OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1428             &rack_stats_gets_ms_rtt, 1,
1429             "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1430         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1431             SYSCTL_CHILDREN(rack_misc),
1432             OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1433             &rack_client_low_buf, 0,
1434             "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
1435         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1436             SYSCTL_CHILDREN(rack_misc),
1437             OID_AUTO, "defprofile", CTLFLAG_RW,
1438             &rack_def_profile, 0,
1439             "Should RACK use a default profile (0=no, num == profile num)?");
1440         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1441             SYSCTL_CHILDREN(rack_misc),
1442             OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1443             &rack_enable_shared_cwnd, 1,
1444             "Should RACK try to use the shared cwnd on connections where allowed");
1445         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1446             SYSCTL_CHILDREN(rack_misc),
1447             OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1448             &rack_limits_scwnd, 1,
1449             "Should RACK place low end time limits on the shared cwnd feature");
1450         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1451             SYSCTL_CHILDREN(rack_misc),
1452             OID_AUTO, "iMac_dack", CTLFLAG_RW,
1453             &rack_use_imac_dack, 0,
1454             "Should RACK try to emulate iMac delayed ack");
1455         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1456             SYSCTL_CHILDREN(rack_misc),
1457             OID_AUTO, "no_prr", CTLFLAG_RW,
1458             &rack_disable_prr, 0,
1459             "Should RACK not use prr and only pace (must have pacing on)");
1460         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1461             SYSCTL_CHILDREN(rack_misc),
1462             OID_AUTO, "bb_verbose", CTLFLAG_RW,
1463             &rack_verbose_logging, 0,
1464             "Should RACK black box logging be verbose");
1465         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1466             SYSCTL_CHILDREN(rack_misc),
1467             OID_AUTO, "data_after_close", CTLFLAG_RW,
1468             &rack_ignore_data_after_close, 1,
1469             "Do we hold off sending a RST until all pending data is ack'd");
1470         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1471             SYSCTL_CHILDREN(rack_misc),
1472             OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1473             &rack_sack_not_required, 1,
1474             "Do we allow rack to run on connections not supporting SACK");
1475         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1476             SYSCTL_CHILDREN(rack_misc),
1477             OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1478             &rack_send_a_lot_in_prr, 1,
1479             "Send a lot in prr");
1480         SYSCTL_ADD_S32(&rack_sysctl_ctx,
1481             SYSCTL_CHILDREN(rack_misc),
1482             OID_AUTO, "autoscale", CTLFLAG_RW,
1483             &rack_autosndbuf_inc, 20,
1484             "What percentage should rack scale up its snd buffer by?");
1485         /* Sack Attacker detection stuff */
1486         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1487             SYSCTL_CHILDREN(rack_attack),
1488             OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
1489             &rack_highest_sack_thresh_seen, 0,
1490             "Highest sack to ack ratio seen");
1491         SYSCTL_ADD_U32(&rack_sysctl_ctx,
1492             SYSCTL_CHILDREN(rack_attack),
1493             OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
1494             &rack_highest_move_thresh_seen, 0,
1495             "Highest move to non-move ratio seen");
1496         rack_ack_total = counter_u64_alloc(M_WAITOK);
1497         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1498             SYSCTL_CHILDREN(rack_attack),
1499             OID_AUTO, "acktotal", CTLFLAG_RD,
1500             &rack_ack_total,
1501             "Total number of Ack's");
1502         rack_express_sack = counter_u64_alloc(M_WAITOK);
1503         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1504             SYSCTL_CHILDREN(rack_attack),
1505             OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
1506             &rack_express_sack,
1507             "Total expresss number of Sack's");
1508         rack_sack_total = counter_u64_alloc(M_WAITOK);
1509         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1510             SYSCTL_CHILDREN(rack_attack),
1511             OID_AUTO, "sacktotal", CTLFLAG_RD,
1512             &rack_sack_total,
1513             "Total number of SACKs");
1514         rack_move_none = counter_u64_alloc(M_WAITOK);
1515         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1516             SYSCTL_CHILDREN(rack_attack),
1517             OID_AUTO, "move_none", CTLFLAG_RD,
1518             &rack_move_none,
1519             "Total number of SACK index reuse of positions under threshold");
1520         rack_move_some = counter_u64_alloc(M_WAITOK);
1521         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1522             SYSCTL_CHILDREN(rack_attack),
1523             OID_AUTO, "move_some", CTLFLAG_RD,
1524             &rack_move_some,
1525             "Total number of SACK index reuse of positions over threshold");
1526         rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
1527         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1528             SYSCTL_CHILDREN(rack_attack),
1529             OID_AUTO, "attacks", CTLFLAG_RD,
1530             &rack_sack_attacks_detected,
1531             "Total number of SACK attackers that had sack disabled");
1532         rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
1533         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1534             SYSCTL_CHILDREN(rack_attack),
1535             OID_AUTO, "reversed", CTLFLAG_RD,
1536             &rack_sack_attacks_reversed,
1537             "Total number of SACK attackers that were later determined false positive");
1538         rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
1539         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1540             SYSCTL_CHILDREN(rack_attack),
1541             OID_AUTO, "nextmerge", CTLFLAG_RD,
1542             &rack_sack_used_next_merge,
1543             "Total number of times we used the next merge");
1544         rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
1545         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1546             SYSCTL_CHILDREN(rack_attack),
1547             OID_AUTO, "prevmerge", CTLFLAG_RD,
1548             &rack_sack_used_prev_merge,
1549             "Total number of times we used the prev merge");
1550         /* Counters */
1551         rack_fto_send = counter_u64_alloc(M_WAITOK);
1552         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1553             SYSCTL_CHILDREN(rack_counters),
1554             OID_AUTO, "fto_send", CTLFLAG_RD,
1555             &rack_fto_send, "Total number of rack_fast_output sends");
1556         rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1557         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1558             SYSCTL_CHILDREN(rack_counters),
1559             OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1560             &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1561         rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1562         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1563             SYSCTL_CHILDREN(rack_counters),
1564             OID_AUTO, "nfto_resend", CTLFLAG_RD,
1565             &rack_nfto_resend, "Total number of rack_output retransmissions");
1566         rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1567         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1568             SYSCTL_CHILDREN(rack_counters),
1569             OID_AUTO, "nfto_send", CTLFLAG_RD,
1570             &rack_non_fto_send, "Total number of rack_output first sends");
1571         rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1572         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1573             SYSCTL_CHILDREN(rack_counters),
1574             OID_AUTO, "rfo_extended", CTLFLAG_RD,
1575             &rack_extended_rfo, "Total number of times we extended rfo");
1576
1577         rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1578         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1579             SYSCTL_CHILDREN(rack_counters),
1580             OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1581             &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1582         rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1583
1584         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1585             SYSCTL_CHILDREN(rack_counters),
1586             OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1587             &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
1588         rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1589         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1590             SYSCTL_CHILDREN(rack_counters),
1591             OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1592             &rack_tlp_tot,
1593             "Total number of tail loss probe expirations");
1594         rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1595         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1596             SYSCTL_CHILDREN(rack_counters),
1597             OID_AUTO, "tlp_new", CTLFLAG_RD,
1598             &rack_tlp_newdata,
1599             "Total number of tail loss probe sending new data");
1600         rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1601         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1602             SYSCTL_CHILDREN(rack_counters),
1603             OID_AUTO, "tlp_retran", CTLFLAG_RD,
1604             &rack_tlp_retran,
1605             "Total number of tail loss probe sending retransmitted data");
1606         rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1607         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1608             SYSCTL_CHILDREN(rack_counters),
1609             OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1610             &rack_tlp_retran_bytes,
1611             "Total bytes of tail loss probe sending retransmitted data");
1612         rack_to_tot = counter_u64_alloc(M_WAITOK);
1613         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1614             SYSCTL_CHILDREN(rack_counters),
1615             OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1616             &rack_to_tot,
1617             "Total number of times the rack to expired");
1618         rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1619         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1620             SYSCTL_CHILDREN(rack_counters),
1621             OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1622             &rack_saw_enobuf,
1623             "Total number of times a sends returned enobuf for non-hdwr paced connections");
1624         rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1625         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1626             SYSCTL_CHILDREN(rack_counters),
1627             OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1628             &rack_saw_enobuf_hw,
1629             "Total number of times a send returned enobuf for hdwr paced connections");
1630         rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1631         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1632             SYSCTL_CHILDREN(rack_counters),
1633             OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1634             &rack_saw_enetunreach,
1635             "Total number of times a send received a enetunreachable");
1636         rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1637         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1638             SYSCTL_CHILDREN(rack_counters),
1639             OID_AUTO, "alloc_hot", CTLFLAG_RD,
1640             &rack_hot_alloc,
1641             "Total allocations from the top of our list");
1642         rack_to_alloc = counter_u64_alloc(M_WAITOK);
1643         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1644             SYSCTL_CHILDREN(rack_counters),
1645             OID_AUTO, "allocs", CTLFLAG_RD,
1646             &rack_to_alloc,
1647             "Total allocations of tracking structures");
1648         rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1649         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1650             SYSCTL_CHILDREN(rack_counters),
1651             OID_AUTO, "allochard", CTLFLAG_RD,
1652             &rack_to_alloc_hard,
1653             "Total allocations done with sleeping the hard way");
1654         rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1655         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1656             SYSCTL_CHILDREN(rack_counters),
1657             OID_AUTO, "allocemerg", CTLFLAG_RD,
1658             &rack_to_alloc_emerg,
1659             "Total allocations done from emergency cache");
1660         rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1661         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1662             SYSCTL_CHILDREN(rack_counters),
1663             OID_AUTO, "alloc_limited", CTLFLAG_RD,
1664             &rack_to_alloc_limited,
1665             "Total allocations dropped due to limit");
1666         rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1667         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1668             SYSCTL_CHILDREN(rack_counters),
1669             OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1670             &rack_alloc_limited_conns,
1671             "Connections with allocations dropped due to limit");
1672         rack_split_limited = counter_u64_alloc(M_WAITOK);
1673         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1674             SYSCTL_CHILDREN(rack_counters),
1675             OID_AUTO, "split_limited", CTLFLAG_RD,
1676             &rack_split_limited,
1677             "Split allocations dropped due to limit");
1678         rack_persists_sends = counter_u64_alloc(M_WAITOK);
1679         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1680             SYSCTL_CHILDREN(rack_counters),
1681             OID_AUTO, "persist_sends", CTLFLAG_RD,
1682             &rack_persists_sends,
1683             "Number of times we sent a persist probe");
1684         rack_persists_acks = counter_u64_alloc(M_WAITOK);
1685         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1686             SYSCTL_CHILDREN(rack_counters),
1687             OID_AUTO, "persist_acks", CTLFLAG_RD,
1688             &rack_persists_acks,
1689             "Number of times a persist probe was acked");
1690         rack_persists_loss = counter_u64_alloc(M_WAITOK);
1691         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1692             SYSCTL_CHILDREN(rack_counters),
1693             OID_AUTO, "persist_loss", CTLFLAG_RD,
1694             &rack_persists_loss,
1695             "Number of times we detected a lost persist probe (no ack)");
1696         rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
1697         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1698             SYSCTL_CHILDREN(rack_counters),
1699             OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
1700             &rack_persists_lost_ends,
1701             "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
1702 #ifdef INVARIANTS
1703         rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1704         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1705             SYSCTL_CHILDREN(rack_counters),
1706             OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1707             &rack_adjust_map_bw,
1708             "Number of times we hit the case where the sb went up and down on a sendmap entry");
1709 #endif
1710         rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1711         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1712             SYSCTL_CHILDREN(rack_counters),
1713             OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1714             &rack_multi_single_eq,
1715             "Number of compressed acks total represented");
1716         rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1717         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1718             SYSCTL_CHILDREN(rack_counters),
1719             OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1720             &rack_proc_non_comp_ack,
1721             "Number of non compresseds acks that we processed");
1722
1723
1724         rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1725         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1726             SYSCTL_CHILDREN(rack_counters),
1727             OID_AUTO, "sack_long", CTLFLAG_RD,
1728             &rack_sack_proc_all,
1729             "Total times we had to walk whole list for sack processing");
1730         rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1731         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1732             SYSCTL_CHILDREN(rack_counters),
1733             OID_AUTO, "sack_restart", CTLFLAG_RD,
1734             &rack_sack_proc_restart,
1735             "Total times we had to walk whole list due to a restart");
1736         rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1737         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1738             SYSCTL_CHILDREN(rack_counters),
1739             OID_AUTO, "sack_short", CTLFLAG_RD,
1740             &rack_sack_proc_short,
1741             "Total times we took shortcut for sack processing");
1742         rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
1743         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1744             SYSCTL_CHILDREN(rack_attack),
1745             OID_AUTO, "skipacked", CTLFLAG_RD,
1746             &rack_sack_skipped_acked,
1747             "Total number of times we skipped previously sacked");
1748         rack_sack_splits = counter_u64_alloc(M_WAITOK);
1749         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1750             SYSCTL_CHILDREN(rack_attack),
1751             OID_AUTO, "ofsplit", CTLFLAG_RD,
1752             &rack_sack_splits,
1753             "Total number of times we did the old fashion tree split");
1754         rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1755         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1756             SYSCTL_CHILDREN(rack_counters),
1757             OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1758             &rack_input_idle_reduces,
1759             "Total number of idle reductions on input");
1760         rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1761         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1762             SYSCTL_CHILDREN(rack_counters),
1763             OID_AUTO, "collapsed_win", CTLFLAG_RD,
1764             &rack_collapsed_win,
1765             "Total number of collapsed windows");
1766         rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1767         SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1768             SYSCTL_CHILDREN(rack_counters),
1769             OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1770             &rack_try_scwnd,
1771             "Total number of scwnd attempts");
1772         COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1773         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1774             OID_AUTO, "outsize", CTLFLAG_RD,
1775             rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1776         COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1777         SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1778             OID_AUTO, "opts", CTLFLAG_RD,
1779             rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1780         SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1781             SYSCTL_CHILDREN(rack_sysctl_root),
1782             OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1783             &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1784 }
1785
1786 static __inline int
1787 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
1788 {
1789         if (SEQ_GEQ(b->r_start, a->r_start) &&
1790             SEQ_LT(b->r_start, a->r_end)) {
1791                 /*
1792                  * The entry b is within the
1793                  * block a. i.e.:
1794                  * a --   |-------------|
1795                  * b --   |----|
1796                  * <or>
1797                  * b --       |------|
1798                  * <or>
1799                  * b --       |-----------|
1800                  */
1801                 return (0);
1802         } else if (SEQ_GEQ(b->r_start, a->r_end)) {
1803                 /*
1804                  * b falls as either the next
1805                  * sequence block after a so a
1806                  * is said to be smaller than b.
1807                  * i.e:
1808                  * a --   |------|
1809                  * b --          |--------|
1810                  * or
1811                  * b --              |-----|
1812                  */
1813                 return (1);
1814         }
1815         /*
1816          * Whats left is where a is
1817          * larger than b. i.e:
1818          * a --         |-------|
1819          * b --  |---|
1820          * or even possibly
1821          * b --   |--------------|
1822          */
1823         return (-1);
1824 }
1825
1826 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1827 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
1828
1829 static uint32_t
1830 rc_init_window(struct tcp_rack *rack)
1831 {
1832         uint32_t win;
1833
1834         if (rack->rc_init_win == 0) {
1835                 /*
1836                  * Nothing set by the user, use the system stack
1837                  * default.
1838                  */
1839                 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1840         }
1841         win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win;
1842         return (win);
1843 }
1844
1845 static uint64_t
1846 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1847 {
1848         if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
1849                 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1850         else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1851                 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1852         else
1853                 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1854 }
1855
1856 static uint64_t
1857 rack_get_bw(struct tcp_rack *rack)
1858 {
1859         if (rack->use_fixed_rate) {
1860                 /* Return the fixed pacing rate */
1861                 return (rack_get_fixed_pacing_bw(rack));
1862         }
1863         if (rack->r_ctl.gp_bw == 0) {
1864                 /*
1865                  * We have yet no b/w measurement,
1866                  * if we have a user set initial bw
1867                  * return it. If we don't have that and
1868                  * we have an srtt, use the tcp IW (10) to
1869                  * calculate a fictional b/w over the SRTT
1870                  * which is more or less a guess. Note
1871                  * we don't use our IW from rack on purpose
1872                  * so if we have like IW=30, we are not
1873                  * calculating a "huge" b/w.
1874                  */
1875                 uint64_t bw, srtt;
1876                 if (rack->r_ctl.init_rate)
1877                         return (rack->r_ctl.init_rate);
1878
1879                 /* Has the user set a max peak rate? */
1880 #ifdef NETFLIX_PEAKRATE
1881                 if (rack->rc_tp->t_maxpeakrate)
1882                         return (rack->rc_tp->t_maxpeakrate);
1883 #endif
1884                 /* Ok lets come up with the IW guess, if we have a srtt */
1885                 if (rack->rc_tp->t_srtt == 0) {
1886                         /*
1887                          * Go with old pacing method
1888                          * i.e. burst mitigation only.
1889                          */
1890                         return (0);
1891                 }
1892                 /* Ok lets get the initial TCP win (not racks) */
1893                 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
1894                 srtt = (uint64_t)rack->rc_tp->t_srtt;
1895                 bw *= (uint64_t)USECS_IN_SECOND;
1896                 bw /= srtt;
1897                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
1898                         bw = rack->r_ctl.bw_rate_cap;
1899                 return (bw);
1900         } else {
1901                 uint64_t bw;
1902
1903                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
1904                         /* Averaging is done, we can return the value */
1905                         bw = rack->r_ctl.gp_bw;
1906                 } else {
1907                         /* Still doing initial average must calculate */
1908                         bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
1909                 }
1910 #ifdef NETFLIX_PEAKRATE
1911                 if ((rack->rc_tp->t_maxpeakrate) &&
1912                     (bw > rack->rc_tp->t_maxpeakrate)) {
1913                         /* The user has set a peak rate to pace at
1914                          * don't allow us to pace faster than that.
1915                          */
1916                         return (rack->rc_tp->t_maxpeakrate);
1917                 }
1918 #endif
1919                 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
1920                         bw = rack->r_ctl.bw_rate_cap;
1921                 return (bw);
1922         }
1923 }
1924
1925 static uint16_t
1926 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
1927 {
1928         if (rack->use_fixed_rate) {
1929                 return (100);
1930         } else if (rack->in_probe_rtt && (rsm == NULL))
1931                 return (rack->r_ctl.rack_per_of_gp_probertt);
1932         else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
1933                   rack->r_ctl.rack_per_of_gp_rec)) {
1934                 if (rsm) {
1935                         /* a retransmission always use the recovery rate */
1936                         return (rack->r_ctl.rack_per_of_gp_rec);
1937                 } else if (rack->rack_rec_nonrxt_use_cr) {
1938                         /* Directed to use the configured rate */
1939                         goto configured_rate;
1940                 } else if (rack->rack_no_prr &&
1941                            (rack->r_ctl.rack_per_of_gp_rec > 100)) {
1942                         /* No PRR, lets just use the b/w estimate only */
1943                         return (100);
1944                 } else {
1945                         /*
1946                          * Here we may have a non-retransmit but we
1947                          * have no overrides, so just use the recovery
1948                          * rate (prr is in effect).
1949                          */
1950                         return (rack->r_ctl.rack_per_of_gp_rec);
1951                 }
1952         }
1953 configured_rate:
1954         /* For the configured rate we look at our cwnd vs the ssthresh */
1955         if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1956                 return (rack->r_ctl.rack_per_of_gp_ss);
1957         else
1958                 return (rack->r_ctl.rack_per_of_gp_ca);
1959 }
1960
1961 static void
1962 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
1963 {
1964         /*
1965          * Types of logs (mod value)
1966          * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
1967          * 2 = a dsack round begins, persist is reset to 16.
1968          * 3 = a dsack round ends
1969          * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
1970          * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
1971          * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
1972          */
1973         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1974                 union tcp_log_stackspecific log;
1975                 struct timeval tv;
1976
1977                 memset(&log, 0, sizeof(log));
1978                 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
1979                 log.u_bbr.flex1 <<= 1;
1980                 log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
1981                 log.u_bbr.flex1 <<= 1;
1982                 log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
1983                 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
1984                 log.u_bbr.flex3 = rack->r_ctl.num_dsack;
1985                 log.u_bbr.flex4 = flex4;
1986                 log.u_bbr.flex5 = flex5;
1987                 log.u_bbr.flex6 = flex6;
1988                 log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
1989                 log.u_bbr.flex8 = mod;
1990                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1991                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
1992                     &rack->rc_inp->inp_socket->so_rcv,
1993                     &rack->rc_inp->inp_socket->so_snd,
1994                     RACK_DSACK_HANDLING, 0,
1995                     0, &log, false, &tv);
1996         }
1997 }
1998
1999 static void
2000 rack_log_hdwr_pacing(struct tcp_rack *rack,
2001                      uint64_t rate, uint64_t hw_rate, int line,
2002                      int error, uint16_t mod)
2003 {
2004         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2005                 union tcp_log_stackspecific log;
2006                 struct timeval tv;
2007                 const struct ifnet *ifp;
2008
2009                 memset(&log, 0, sizeof(log));
2010                 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2011                 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2012                 if (rack->r_ctl.crte) {
2013                         ifp = rack->r_ctl.crte->ptbl->rs_ifp;
2014                 } else if (rack->rc_inp->inp_route.ro_nh &&
2015                            rack->rc_inp->inp_route.ro_nh->nh_ifp) {
2016                         ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
2017                 } else
2018                         ifp = NULL;
2019                 if (ifp) {
2020                         log.u_bbr.flex3 = (((uint64_t)ifp  >> 32) & 0x00000000ffffffff);
2021                         log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
2022                 }
2023                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2024                 log.u_bbr.bw_inuse = rate;
2025                 log.u_bbr.flex5 = line;
2026                 log.u_bbr.flex6 = error;
2027                 log.u_bbr.flex7 = mod;
2028                 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
2029                 log.u_bbr.flex8 = rack->use_fixed_rate;
2030                 log.u_bbr.flex8 <<= 1;
2031                 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
2032                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
2033                 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
2034                 if (rack->r_ctl.crte)
2035                         log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
2036                 else
2037                         log.u_bbr.cur_del_rate = 0;
2038                 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
2039                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2040                     &rack->rc_inp->inp_socket->so_rcv,
2041                     &rack->rc_inp->inp_socket->so_snd,
2042                     BBR_LOG_HDWR_PACE, 0,
2043                     0, &log, false, &tv);
2044         }
2045 }
2046
2047 static uint64_t
2048 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2049 {
2050         /*
2051          * We allow rack_per_of_gp_xx to dictate our bw rate we want.
2052          */
2053         uint64_t bw_est, high_rate;
2054         uint64_t gain;
2055
2056         gain = (uint64_t)rack_get_output_gain(rack, rsm);
2057         bw_est = bw * gain;
2058         bw_est /= (uint64_t)100;
2059         /* Never fall below the minimum (def 64kbps) */
2060         if (bw_est < RACK_MIN_BW)
2061                 bw_est = RACK_MIN_BW;
2062         if (rack->r_rack_hw_rate_caps) {
2063                 /* Rate caps are in place */
2064                 if (rack->r_ctl.crte != NULL) {
2065                         /* We have a hdwr rate already */
2066                         high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2067                         if (bw_est >= high_rate) {
2068                                 /* We are capping bw at the highest rate table entry */
2069                                 rack_log_hdwr_pacing(rack,
2070                                                      bw_est, high_rate, __LINE__,
2071                                                      0, 3);
2072                                 bw_est = high_rate;
2073                                 if (capped)
2074                                         *capped = 1;
2075                         }
2076                 } else if ((rack->rack_hdrw_pacing == 0) &&
2077                            (rack->rack_hdw_pace_ena) &&
2078                            (rack->rack_attempt_hdwr_pace == 0) &&
2079                            (rack->rc_inp->inp_route.ro_nh != NULL) &&
2080                            (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2081                         /*
2082                          * Special case, we have not yet attempted hardware
2083                          * pacing, and yet we may, when we do, find out if we are
2084                          * above the highest rate. We need to know the maxbw for the interface
2085                          * in question (if it supports ratelimiting). We get back
2086                          * a 0, if the interface is not found in the RL lists.
2087                          */
2088                         high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2089                         if (high_rate) {
2090                                 /* Yep, we have a rate is it above this rate? */
2091                                 if (bw_est > high_rate) {
2092                                         bw_est = high_rate;
2093                                         if (capped)
2094                                                 *capped = 1;
2095                                 }
2096                         }
2097                 }
2098         }
2099         return (bw_est);
2100 }
2101
2102 static void
2103 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
2104 {
2105         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2106                 union tcp_log_stackspecific log;
2107                 struct timeval tv;
2108
2109                 if ((mod != 1) && (rack_verbose_logging == 0)) {
2110                         /*
2111                          * We get 3 values currently for mod
2112                          * 1 - We are retransmitting and this tells the reason.
2113                          * 2 - We are clearing a dup-ack count.
2114                          * 3 - We are incrementing a dup-ack count.
2115                          *
2116                          * The clear/increment are only logged
2117                          * if you have BBverbose on.
2118                          */
2119                         return;
2120                 }
2121                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2122                 log.u_bbr.flex1 = tsused;
2123                 log.u_bbr.flex2 = thresh;
2124                 log.u_bbr.flex3 = rsm->r_flags;
2125                 log.u_bbr.flex4 = rsm->r_dupack;
2126                 log.u_bbr.flex5 = rsm->r_start;
2127                 log.u_bbr.flex6 = rsm->r_end;
2128                 log.u_bbr.flex8 = mod;
2129                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2130                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2131                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2132                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2133                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2134                 log.u_bbr.pacing_gain = rack->r_must_retran;
2135                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2136                     &rack->rc_inp->inp_socket->so_rcv,
2137                     &rack->rc_inp->inp_socket->so_snd,
2138                     BBR_LOG_SETTINGS_CHG, 0,
2139                     0, &log, false, &tv);
2140         }
2141 }
2142
2143 static void
2144 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
2145 {
2146         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2147                 union tcp_log_stackspecific log;
2148                 struct timeval tv;
2149
2150                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2151                 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2152                 log.u_bbr.flex2 = to;
2153                 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2154                 log.u_bbr.flex4 = slot;
2155                 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
2156                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2157                 log.u_bbr.flex7 = rack->rc_in_persist;
2158                 log.u_bbr.flex8 = which;
2159                 if (rack->rack_no_prr)
2160                         log.u_bbr.pkts_out = 0;
2161                 else
2162                         log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2163                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2164                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2165                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2166                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2167                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2168                 log.u_bbr.pacing_gain = rack->r_must_retran;
2169                 log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
2170                 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2171                 log.u_bbr.lost = rack_rto_min;
2172                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2173                     &rack->rc_inp->inp_socket->so_rcv,
2174                     &rack->rc_inp->inp_socket->so_snd,
2175                     BBR_LOG_TIMERSTAR, 0,
2176                     0, &log, false, &tv);
2177         }
2178 }
2179
2180 static void
2181 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2182 {
2183         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2184                 union tcp_log_stackspecific log;
2185                 struct timeval tv;
2186
2187                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2188                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2189                 log.u_bbr.flex8 = to_num;
2190                 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2191                 log.u_bbr.flex2 = rack->rc_rack_rtt;
2192                 if (rsm == NULL)
2193                         log.u_bbr.flex3 = 0;
2194                 else
2195                         log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2196                 if (rack->rack_no_prr)
2197                         log.u_bbr.flex5 = 0;
2198                 else
2199                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2200                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2201                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2202                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2203                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2204                 log.u_bbr.pacing_gain = rack->r_must_retran;
2205                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2206                     &rack->rc_inp->inp_socket->so_rcv,
2207                     &rack->rc_inp->inp_socket->so_snd,
2208                     BBR_LOG_RTO, 0,
2209                     0, &log, false, &tv);
2210         }
2211 }
2212
2213 static void
2214 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
2215                  struct rack_sendmap *prev,
2216                  struct rack_sendmap *rsm,
2217                  struct rack_sendmap *next,
2218                  int flag, uint32_t th_ack, int line)
2219 {
2220         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2221                 union tcp_log_stackspecific log;
2222                 struct timeval tv;
2223
2224                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2225                 log.u_bbr.flex8 = flag;
2226                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2227                 log.u_bbr.cur_del_rate = (uint64_t)prev;
2228                 log.u_bbr.delRate = (uint64_t)rsm;
2229                 log.u_bbr.rttProp = (uint64_t)next;
2230                 log.u_bbr.flex7 = 0;
2231                 if (prev) {
2232                         log.u_bbr.flex1 = prev->r_start;
2233                         log.u_bbr.flex2 = prev->r_end;
2234                         log.u_bbr.flex7 |= 0x4;
2235                 }
2236                 if (rsm) {
2237                         log.u_bbr.flex3 = rsm->r_start;
2238                         log.u_bbr.flex4 = rsm->r_end;
2239                         log.u_bbr.flex7 |= 0x2;
2240                 }
2241                 if (next) {
2242                         log.u_bbr.flex5 = next->r_start;
2243                         log.u_bbr.flex6 = next->r_end;
2244                         log.u_bbr.flex7 |= 0x1;
2245                 }
2246                 log.u_bbr.applimited = line;
2247                 log.u_bbr.pkts_out = th_ack;
2248                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2249                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2250                 if (rack->rack_no_prr)
2251                         log.u_bbr.lost = 0;
2252                 else
2253                         log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
2254                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2255                     &rack->rc_inp->inp_socket->so_rcv,
2256                     &rack->rc_inp->inp_socket->so_snd,
2257                     TCP_LOG_MAPCHG, 0,
2258                     0, &log, false, &tv);
2259         }
2260 }
2261
2262 static void
2263 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
2264                  struct rack_sendmap *rsm, int conf)
2265 {
2266         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2267                 union tcp_log_stackspecific log;
2268                 struct timeval tv;
2269                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2270                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2271                 log.u_bbr.flex1 = t;
2272                 log.u_bbr.flex2 = len;
2273                 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
2274                 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
2275                 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
2276                 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2277                 log.u_bbr.flex7 = conf;
2278                 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
2279                 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
2280                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2281                 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2282                 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
2283                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2284                 if (rsm) {
2285                         log.u_bbr.pkt_epoch = rsm->r_start;
2286                         log.u_bbr.lost = rsm->r_end;
2287                         log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
2288                         /* We loose any upper of the 24 bits */
2289                         log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
2290                 } else {
2291                         /* Its a SYN */
2292                         log.u_bbr.pkt_epoch = rack->rc_tp->iss;
2293                         log.u_bbr.lost = 0;
2294                         log.u_bbr.cwnd_gain = 0;
2295                         log.u_bbr.pacing_gain = 0;
2296                 }
2297                 /* Write out general bits of interest rrs here */
2298                 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
2299                 log.u_bbr.use_lt_bw <<= 1;
2300                 log.u_bbr.use_lt_bw |= rack->forced_ack;
2301                 log.u_bbr.use_lt_bw <<= 1;
2302                 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
2303                 log.u_bbr.use_lt_bw <<= 1;
2304                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
2305                 log.u_bbr.use_lt_bw <<= 1;
2306                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
2307                 log.u_bbr.use_lt_bw <<= 1;
2308                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
2309                 log.u_bbr.use_lt_bw <<= 1;
2310                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
2311                 log.u_bbr.use_lt_bw <<= 1;
2312                 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
2313                 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
2314                 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
2315                 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
2316                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
2317                 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
2318                 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
2319                 log.u_bbr.bw_inuse <<= 32;
2320                 if (rsm)
2321                         log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
2322                 TCP_LOG_EVENTP(tp, NULL,
2323                     &rack->rc_inp->inp_socket->so_rcv,
2324                     &rack->rc_inp->inp_socket->so_snd,
2325                     BBR_LOG_BBRRTT, 0,
2326                     0, &log, false, &tv);
2327
2328
2329         }
2330 }
2331
2332 static void
2333 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
2334 {
2335         /*
2336          * Log the rtt sample we are
2337          * applying to the srtt algorithm in
2338          * useconds.
2339          */
2340         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2341                 union tcp_log_stackspecific log;
2342                 struct timeval tv;
2343
2344                 /* Convert our ms to a microsecond */
2345                 memset(&log, 0, sizeof(log));
2346                 log.u_bbr.flex1 = rtt;
2347                 log.u_bbr.flex2 = rack->r_ctl.ack_count;
2348                 log.u_bbr.flex3 = rack->r_ctl.sack_count;
2349                 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
2350                 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
2351                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2352                 log.u_bbr.flex7 = 1;
2353                 log.u_bbr.flex8 = rack->sack_attack_disable;
2354                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2355                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2356                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2357                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2358                 log.u_bbr.pacing_gain = rack->r_must_retran;
2359                 /*
2360                  * We capture in delRate the upper 32 bits as
2361                  * the confidence level we had declared, and the
2362                  * lower 32 bits as the actual RTT using the arrival
2363                  * timestamp.
2364                  */
2365                 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
2366                 log.u_bbr.delRate <<= 32;
2367                 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
2368                 /* Lets capture all the things that make up t_rtxcur */
2369                 log.u_bbr.applimited = rack_rto_min;
2370                 log.u_bbr.epoch = rack_rto_max;
2371                 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
2372                 log.u_bbr.lost = rack_rto_min;
2373                 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
2374                 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
2375                 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
2376                 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
2377                 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
2378                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2379                     &rack->rc_inp->inp_socket->so_rcv,
2380                     &rack->rc_inp->inp_socket->so_snd,
2381                     TCP_LOG_RTT, 0,
2382                     0, &log, false, &tv);
2383         }
2384 }
2385
2386 static void
2387 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
2388 {
2389         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
2390                 union tcp_log_stackspecific log;
2391                 struct timeval tv;
2392
2393                 /* Convert our ms to a microsecond */
2394                 memset(&log, 0, sizeof(log));
2395                 log.u_bbr.flex1 = rtt;
2396                 log.u_bbr.flex2 = send_time;
2397                 log.u_bbr.flex3 = ack_time;
2398                 log.u_bbr.flex4 = where;
2399                 log.u_bbr.flex7 = 2;
2400                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2401                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2402                     &rack->rc_inp->inp_socket->so_rcv,
2403                     &rack->rc_inp->inp_socket->so_snd,
2404                     TCP_LOG_RTT, 0,
2405                     0, &log, false, &tv);
2406         }
2407 }
2408
2409
2410
2411 static inline void
2412 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
2413 {
2414         if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
2415                 union tcp_log_stackspecific log;
2416                 struct timeval tv;
2417
2418                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2419                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2420                 log.u_bbr.flex1 = line;
2421                 log.u_bbr.flex2 = tick;
2422                 log.u_bbr.flex3 = tp->t_maxunacktime;
2423                 log.u_bbr.flex4 = tp->t_acktime;
2424                 log.u_bbr.flex8 = event;
2425                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2426                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2427                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2428                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2429                 log.u_bbr.pacing_gain = rack->r_must_retran;
2430                 TCP_LOG_EVENTP(tp, NULL,
2431                     &rack->rc_inp->inp_socket->so_rcv,
2432                     &rack->rc_inp->inp_socket->so_snd,
2433                     BBR_LOG_PROGRESS, 0,
2434                     0, &log, false, &tv);
2435         }
2436 }
2437
2438 static void
2439 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
2440 {
2441         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2442                 union tcp_log_stackspecific log;
2443
2444                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2445                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2446                 log.u_bbr.flex1 = slot;
2447                 if (rack->rack_no_prr)
2448                         log.u_bbr.flex2 = 0;
2449                 else
2450                         log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
2451                 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
2452                 log.u_bbr.flex8 = rack->rc_in_persist;
2453                 log.u_bbr.timeStamp = cts;
2454                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2455                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2456                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2457                 log.u_bbr.pacing_gain = rack->r_must_retran;
2458                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2459                     &rack->rc_inp->inp_socket->so_rcv,
2460                     &rack->rc_inp->inp_socket->so_snd,
2461                     BBR_LOG_BBRSND, 0,
2462                     0, &log, false, tv);
2463         }
2464 }
2465
2466 static void
2467 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
2468 {
2469         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2470                 union tcp_log_stackspecific log;
2471                 struct timeval tv;
2472
2473                 memset(&log, 0, sizeof(log));
2474                 log.u_bbr.flex1 = did_out;
2475                 log.u_bbr.flex2 = nxt_pkt;
2476                 log.u_bbr.flex3 = way_out;
2477                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2478                 if (rack->rack_no_prr)
2479                         log.u_bbr.flex5 = 0;
2480                 else
2481                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2482                 log.u_bbr.flex6 = nsegs;
2483                 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
2484                 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;        /* Do we have ack-can-send set */
2485                 log.u_bbr.flex7 <<= 1;
2486                 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */
2487                 log.u_bbr.flex7 <<= 1;
2488                 log.u_bbr.flex7 |= rack->r_wanted_output;       /* Do we want output */
2489                 log.u_bbr.flex8 = rack->rc_in_persist;
2490                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2491                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2492                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2493                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2494                 log.u_bbr.use_lt_bw <<= 1;
2495                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2496                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2497                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2498                 log.u_bbr.pacing_gain = rack->r_must_retran;
2499                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2500                     &rack->rc_inp->inp_socket->so_rcv,
2501                     &rack->rc_inp->inp_socket->so_snd,
2502                     BBR_LOG_DOSEG_DONE, 0,
2503                     0, &log, false, &tv);
2504         }
2505 }
2506
2507 static void
2508 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
2509 {
2510         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
2511                 union tcp_log_stackspecific log;
2512                 struct timeval tv;
2513
2514                 memset(&log, 0, sizeof(log));
2515                 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
2516                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
2517                 log.u_bbr.flex4 = arg1;
2518                 log.u_bbr.flex5 = arg2;
2519                 log.u_bbr.flex6 = arg3;
2520                 log.u_bbr.flex8 = frm;
2521                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2522                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2523                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2524                 log.u_bbr.applimited = rack->r_ctl.rc_sacked;
2525                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2526                 log.u_bbr.pacing_gain = rack->r_must_retran;
2527                 TCP_LOG_EVENTP(tp, NULL,
2528                     &tp->t_inpcb->inp_socket->so_rcv,
2529                     &tp->t_inpcb->inp_socket->so_snd,
2530                     TCP_HDWR_PACE_SIZE, 0,
2531                     0, &log, false, &tv);
2532         }
2533 }
2534
2535 static void
2536 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
2537                           uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
2538 {
2539         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2540                 union tcp_log_stackspecific log;
2541                 struct timeval tv;
2542
2543                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2544                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2545                 log.u_bbr.flex1 = slot;
2546                 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
2547                 log.u_bbr.flex4 = reason;
2548                 if (rack->rack_no_prr)
2549                         log.u_bbr.flex5 = 0;
2550                 else
2551                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2552                 log.u_bbr.flex7 = hpts_calling;
2553                 log.u_bbr.flex8 = rack->rc_in_persist;
2554                 log.u_bbr.lt_epoch = cwnd_to_use;
2555                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2556                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2557                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2558                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2559                 log.u_bbr.pacing_gain = rack->r_must_retran;
2560                 log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
2561                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2562                     &rack->rc_inp->inp_socket->so_rcv,
2563                     &rack->rc_inp->inp_socket->so_snd,
2564                     BBR_LOG_JUSTRET, 0,
2565                     tlen, &log, false, &tv);
2566         }
2567 }
2568
2569 static void
2570 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
2571                    struct timeval *tv, uint32_t flags_on_entry)
2572 {
2573         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2574                 union tcp_log_stackspecific log;
2575
2576                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2577                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
2578                 log.u_bbr.flex1 = line;
2579                 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
2580                 log.u_bbr.flex3 = flags_on_entry;
2581                 log.u_bbr.flex4 = us_cts;
2582                 if (rack->rack_no_prr)
2583                         log.u_bbr.flex5 = 0;
2584                 else
2585                         log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2586                 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2587                 log.u_bbr.flex7 = hpts_removed;
2588                 log.u_bbr.flex8 = 1;
2589                 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
2590                 log.u_bbr.timeStamp = us_cts;
2591                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2592                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2593                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2594                 log.u_bbr.pacing_gain = rack->r_must_retran;
2595                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2596                     &rack->rc_inp->inp_socket->so_rcv,
2597                     &rack->rc_inp->inp_socket->so_snd,
2598                     BBR_LOG_TIMERCANC, 0,
2599                     0, &log, false, tv);
2600         }
2601 }
2602
2603 static void
2604 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
2605                           uint32_t flex1, uint32_t flex2,
2606                           uint32_t flex3, uint32_t flex4,
2607                           uint32_t flex5, uint32_t flex6,
2608                           uint16_t flex7, uint8_t mod)
2609 {
2610         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2611                 union tcp_log_stackspecific log;
2612                 struct timeval tv;
2613
2614                 if (mod == 1) {
2615                         /* No you can't use 1, its for the real to cancel */
2616                         return;
2617                 }
2618                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2619                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2620                 log.u_bbr.flex1 = flex1;
2621                 log.u_bbr.flex2 = flex2;
2622                 log.u_bbr.flex3 = flex3;
2623                 log.u_bbr.flex4 = flex4;
2624                 log.u_bbr.flex5 = flex5;
2625                 log.u_bbr.flex6 = flex6;
2626                 log.u_bbr.flex7 = flex7;
2627                 log.u_bbr.flex8 = mod;
2628                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2629                     &rack->rc_inp->inp_socket->so_rcv,
2630                     &rack->rc_inp->inp_socket->so_snd,
2631                     BBR_LOG_TIMERCANC, 0,
2632                     0, &log, false, &tv);
2633         }
2634 }
2635
2636 static void
2637 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
2638 {
2639         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2640                 union tcp_log_stackspecific log;
2641                 struct timeval tv;
2642
2643                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2644                 log.u_bbr.flex1 = timers;
2645                 log.u_bbr.flex2 = ret;
2646                 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
2647                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2648                 log.u_bbr.flex5 = cts;
2649                 if (rack->rack_no_prr)
2650                         log.u_bbr.flex6 = 0;
2651                 else
2652                         log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
2653                 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2654                 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2655                 log.u_bbr.pacing_gain = rack->r_must_retran;
2656                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2657                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2658                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2659                     &rack->rc_inp->inp_socket->so_rcv,
2660                     &rack->rc_inp->inp_socket->so_snd,
2661                     BBR_LOG_TO_PROCESS, 0,
2662                     0, &log, false, &tv);
2663         }
2664 }
2665
2666 static void
2667 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
2668 {
2669         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
2670                 union tcp_log_stackspecific log;
2671                 struct timeval tv;
2672
2673                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
2674                 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
2675                 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
2676                 if (rack->rack_no_prr)
2677                         log.u_bbr.flex3 = 0;
2678                 else
2679                         log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
2680                 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
2681                 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
2682                 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
2683                 log.u_bbr.flex7 = line;
2684                 log.u_bbr.flex8 = frm;
2685                 log.u_bbr.pkts_out = orig_cwnd;
2686                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2687                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2688                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
2689                 log.u_bbr.use_lt_bw <<= 1;
2690                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
2691                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2692                     &rack->rc_inp->inp_socket->so_rcv,
2693                     &rack->rc_inp->inp_socket->so_snd,
2694                     BBR_LOG_BBRUPD, 0,
2695                     0, &log, false, &tv);
2696         }
2697 }
2698
#ifdef NETFLIX_EXP_DETECTION
/*
 * Record a TCP_SAD_DETECTION event.
 *
 * Captures the per-connection sack/ack counters and map allocation
 * count together with the global thresholds they are compared against.
 * lt_epoch holds tcp_force_detection in the bits above the low byte
 * and do_detection in the low bits; event is stored in flex8.
 */
static void
rack_log_sad(struct tcp_rack *rack, int event)
{
	union tcp_log_stackspecific log;
	struct timeval tv;

	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
		return;

	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	/* Per-connection state. */
	log.u_bbr.flex1 = rack->r_ctl.sack_count;
	log.u_bbr.flex2 = rack->r_ctl.ack_count;
	log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
	log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
	log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
	/* Global tunables. */
	log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
	log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
	log.u_bbr.lt_epoch = (tcp_force_detection << 8);
	log.u_bbr.lt_epoch |= rack->do_detection;
	log.u_bbr.applimited = tcp_map_minimum;
	log.u_bbr.flex7 = rack->sack_attack_disable;
	log.u_bbr.flex8 = event;
	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
	log.u_bbr.delivered = tcp_sad_decay_val;
	TCP_LOG_EVENTP(rack->rc_tp, NULL,
	    &rack->rc_inp->inp_socket->so_rcv,
	    &rack->rc_inp->inp_socket->so_snd,
	    TCP_SAD_DETECTION, 0,
	    0, &log, false, &tv);
}
#endif
2731
2732 static void
2733 rack_counter_destroy(void)
2734 {
2735         counter_u64_free(rack_fto_send);
2736         counter_u64_free(rack_fto_rsm_send);
2737         counter_u64_free(rack_nfto_resend);
2738         counter_u64_free(rack_hw_pace_init_fail);
2739         counter_u64_free(rack_hw_pace_lost);
2740         counter_u64_free(rack_non_fto_send);
2741         counter_u64_free(rack_extended_rfo);
2742         counter_u64_free(rack_ack_total);
2743         counter_u64_free(rack_express_sack);
2744         counter_u64_free(rack_sack_total);
2745         counter_u64_free(rack_move_none);
2746         counter_u64_free(rack_move_some);
2747         counter_u64_free(rack_sack_attacks_detected);
2748         counter_u64_free(rack_sack_attacks_reversed);
2749         counter_u64_free(rack_sack_used_next_merge);
2750         counter_u64_free(rack_sack_used_prev_merge);
2751         counter_u64_free(rack_tlp_tot);
2752         counter_u64_free(rack_tlp_newdata);
2753         counter_u64_free(rack_tlp_retran);
2754         counter_u64_free(rack_tlp_retran_bytes);
2755         counter_u64_free(rack_to_tot);
2756         counter_u64_free(rack_saw_enobuf);
2757         counter_u64_free(rack_saw_enobuf_hw);
2758         counter_u64_free(rack_saw_enetunreach);
2759         counter_u64_free(rack_hot_alloc);
2760         counter_u64_free(rack_to_alloc);
2761         counter_u64_free(rack_to_alloc_hard);
2762         counter_u64_free(rack_to_alloc_emerg);
2763         counter_u64_free(rack_to_alloc_limited);
2764         counter_u64_free(rack_alloc_limited_conns);
2765         counter_u64_free(rack_split_limited);
2766         counter_u64_free(rack_multi_single_eq);
2767         counter_u64_free(rack_proc_non_comp_ack);
2768         counter_u64_free(rack_sack_proc_all);
2769         counter_u64_free(rack_sack_proc_restart);
2770         counter_u64_free(rack_sack_proc_short);
2771         counter_u64_free(rack_sack_skipped_acked);
2772         counter_u64_free(rack_sack_splits);
2773         counter_u64_free(rack_input_idle_reduces);
2774         counter_u64_free(rack_collapsed_win);
2775         counter_u64_free(rack_try_scwnd);
2776         counter_u64_free(rack_persists_sends);
2777         counter_u64_free(rack_persists_acks);
2778         counter_u64_free(rack_persists_loss);
2779         counter_u64_free(rack_persists_lost_ends);
2780 #ifdef INVARIANTS
2781         counter_u64_free(rack_adjust_map_bw);
2782 #endif
2783         COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
2784         COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
2785 }
2786
2787 static struct rack_sendmap *
2788 rack_alloc(struct tcp_rack *rack)
2789 {
2790         struct rack_sendmap *rsm;
2791
2792         /*
2793          * First get the top of the list it in
2794          * theory is the "hottest" rsm we have,
2795          * possibly just freed by ack processing.
2796          */
2797         if (rack->rc_free_cnt > rack_free_cache) {
2798                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
2799                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2800                 counter_u64_add(rack_hot_alloc, 1);
2801                 rack->rc_free_cnt--;
2802                 return (rsm);
2803         }
2804         /*
2805          * Once we get under our free cache we probably
2806          * no longer have a "hot" one available. Lets
2807          * get one from UMA.
2808          */
2809         rsm = uma_zalloc(rack_zone, M_NOWAIT);
2810         if (rsm) {
2811                 rack->r_ctl.rc_num_maps_alloced++;
2812                 counter_u64_add(rack_to_alloc, 1);
2813                 return (rsm);
2814         }
2815         /*
2816          * Dig in to our aux rsm's (the last two) since
2817          * UMA failed to get us one.
2818          */
2819         if (rack->rc_free_cnt) {
2820                 counter_u64_add(rack_to_alloc_emerg, 1);
2821                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
2822                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2823                 rack->rc_free_cnt--;
2824                 return (rsm);
2825         }
2826         return (NULL);
2827 }
2828
2829 static struct rack_sendmap *
2830 rack_alloc_full_limit(struct tcp_rack *rack)
2831 {
2832         if ((V_tcp_map_entries_limit > 0) &&
2833             (rack->do_detection == 0) &&
2834             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
2835                 counter_u64_add(rack_to_alloc_limited, 1);
2836                 if (!rack->alloc_limit_reported) {
2837                         rack->alloc_limit_reported = 1;
2838                         counter_u64_add(rack_alloc_limited_conns, 1);
2839                 }
2840                 return (NULL);
2841         }
2842         return (rack_alloc(rack));
2843 }
2844
2845 /* wrapper to allocate a sendmap entry, subject to a specific limit */
2846 static struct rack_sendmap *
2847 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
2848 {
2849         struct rack_sendmap *rsm;
2850
2851         if (limit_type) {
2852                 /* currently there is only one limit type */
2853                 if (V_tcp_map_split_limit > 0 &&
2854                     (rack->do_detection == 0) &&
2855                     rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
2856                         counter_u64_add(rack_split_limited, 1);
2857                         if (!rack->alloc_limit_reported) {
2858                                 rack->alloc_limit_reported = 1;
2859                                 counter_u64_add(rack_alloc_limited_conns, 1);
2860                         }
2861                         return (NULL);
2862                 }
2863         }
2864
2865         /* allocate and mark in the limit type, if set */
2866         rsm = rack_alloc(rack);
2867         if (rsm != NULL && limit_type) {
2868                 rsm->r_limit_type = limit_type;
2869                 rack->r_ctl.rc_num_split_allocs++;
2870         }
2871         return (rsm);
2872 }
2873
2874 static void
2875 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
2876 {
2877         if (rsm->r_flags & RACK_APP_LIMITED) {
2878                 if (rack->r_ctl.rc_app_limited_cnt > 0) {
2879                         rack->r_ctl.rc_app_limited_cnt--;
2880                 }
2881         }
2882         if (rsm->r_limit_type) {
2883                 /* currently there is only one limit type */
2884                 rack->r_ctl.rc_num_split_allocs--;
2885         }
2886         if (rsm == rack->r_ctl.rc_first_appl) {
2887                 if (rack->r_ctl.rc_app_limited_cnt == 0)
2888                         rack->r_ctl.rc_first_appl = NULL;
2889                 else {
2890                         /* Follow the next one out */
2891                         struct rack_sendmap fe;
2892
2893                         fe.r_start = rsm->r_nseq_appl;
2894                         rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
2895                 }
2896         }
2897         if (rsm == rack->r_ctl.rc_resend)
2898                 rack->r_ctl.rc_resend = NULL;
2899         if (rsm == rack->r_ctl.rc_end_appl)
2900                 rack->r_ctl.rc_end_appl = NULL;
2901         if (rack->r_ctl.rc_tlpsend == rsm)
2902                 rack->r_ctl.rc_tlpsend = NULL;
2903         if (rack->r_ctl.rc_sacklast == rsm)
2904                 rack->r_ctl.rc_sacklast = NULL;
2905         memset(rsm, 0, sizeof(struct rack_sendmap));
2906         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
2907         rack->rc_free_cnt++;
2908 }
2909
2910 static void
2911 rack_free_trim(struct tcp_rack *rack)
2912 {
2913         struct rack_sendmap *rsm;
2914
2915         /*
2916          * Free up all the tail entries until
2917          * we get our list down to the limit.
2918          */
2919         while (rack->rc_free_cnt > rack_free_cache) {
2920                 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
2921                 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
2922                 rack->rc_free_cnt--;
2923                 uma_zfree(rack_zone, rsm);
2924         }
2925 }
2926
2927
2928 static uint32_t
2929 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
2930 {
2931         uint64_t srtt, bw, len, tim;
2932         uint32_t segsiz, def_len, minl;
2933
2934         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
2935         def_len = rack_def_data_window * segsiz;
2936         if (rack->rc_gp_filled == 0) {
2937                 /*
2938                  * We have no measurement (IW is in flight?) so
2939                  * we can only guess using our data_window sysctl
2940                  * value (usually 20MSS).
2941                  */
2942                 return (def_len);
2943         }
2944         /*
2945          * Now we have a number of factors to consider.
2946          *
2947          * 1) We have a desired BDP which is usually
2948          *    at least 2.
2949          * 2) We have a minimum number of rtt's usually 1 SRTT
2950          *    but we allow it too to be more.
2951          * 3) We want to make sure a measurement last N useconds (if
2952          *    we have set rack_min_measure_usec.
2953          *
2954          * We handle the first concern here by trying to create a data
2955          * window of max(rack_def_data_window, DesiredBDP). The
2956          * second concern we handle in not letting the measurement
2957          * window end normally until at least the required SRTT's
2958          * have gone by which is done further below in
2959          * rack_enough_for_measurement(). Finally the third concern
2960          * we also handle here by calculating how long that time
2961          * would take at the current BW and then return the
2962          * max of our first calculation and that length. Note
2963          * that if rack_min_measure_usec is 0, we don't deal
2964          * with concern 3. Also for both Concern 1 and 3 an
2965          * application limited period could end the measurement
2966          * earlier.
2967          *
2968          * So lets calculate the BDP with the "known" b/w using
2969          * the SRTT has our rtt and then multiply it by the
2970          * goal.
2971          */
2972         bw = rack_get_bw(rack);
2973         srtt = (uint64_t)tp->t_srtt;
2974         len = bw * srtt;
2975         len /= (uint64_t)HPTS_USEC_IN_SEC;
2976         len *= max(1, rack_goal_bdp);
2977         /* Now we need to round up to the nearest MSS */
2978         len = roundup(len, segsiz);
2979         if (rack_min_measure_usec) {
2980                 /* Now calculate our min length for this b/w */
2981                 tim = rack_min_measure_usec;
2982                 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
2983                 if (minl == 0)
2984                         minl = 1;
2985                 minl = roundup(minl, segsiz);
2986                 if (len < minl)
2987                         len = minl;
2988         }
2989         /*
2990          * Now if we have a very small window we want
2991          * to attempt to get the window that is
2992          * as small as possible. This happens on
2993          * low b/w connections and we don't want to
2994          * span huge numbers of rtt's between measurements.
2995          *
2996          * We basically include 2 over our "MIN window" so
2997          * that the measurement can be shortened (possibly) by
2998          * an ack'ed packet.
2999          */
3000         if (len < def_len)
3001                 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
3002         else
3003                 return (max((uint32_t)len, def_len));
3004
3005 }
3006
3007 static int
3008 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
3009 {
3010         uint32_t tim, srtts, segsiz;
3011
3012         /*
3013          * Has enough time passed for the GP measurement to be valid?
3014          */
3015         if ((tp->snd_max == tp->snd_una) ||
3016             (th_ack == tp->snd_max)){
3017                 /* All is acked */
3018                 *quality = RACK_QUALITY_ALLACKED;
3019                 return (1);
3020         }
3021         if (SEQ_LT(th_ack, tp->gput_seq)) {
3022                 /* Not enough bytes yet */
3023                 return (0);
3024         }
3025         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3026         if (SEQ_LT(th_ack, tp->gput_ack) &&
3027             ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
3028                 /* Not enough bytes yet */
3029                 return (0);
3030         }
3031         if (rack->r_ctl.rc_first_appl &&
3032             (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
3033                 /*
3034                  * We are up to the app limited send point
3035                  * we have to measure irrespective of the time..
3036                  */
3037                 *quality = RACK_QUALITY_APPLIMITED;
3038                 return (1);
3039         }
3040         /* Now what about time? */
3041         srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
3042         tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
3043         if (tim >= srtts) {
3044                 *quality = RACK_QUALITY_HIGH;
3045                 return (1);
3046         }
3047         /* Nope not even a full SRTT has passed */
3048         return (0);
3049 }
3050
3051 static void
3052 rack_log_timely(struct tcp_rack *rack,
3053                 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
3054                 uint64_t up_bnd, int line, uint8_t method)
3055 {
3056         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3057                 union tcp_log_stackspecific log;
3058                 struct timeval tv;
3059
3060                 memset(&log, 0, sizeof(log));
3061                 log.u_bbr.flex1 = logged;
3062                 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
3063                 log.u_bbr.flex2 <<= 4;
3064                 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
3065                 log.u_bbr.flex2 <<= 4;
3066                 log.u_bbr.flex2 |= rack->rc_gp_incr;
3067                 log.u_bbr.flex2 <<= 4;
3068                 log.u_bbr.flex2 |= rack->rc_gp_bwred;
3069                 log.u_bbr.flex3 = rack->rc_gp_incr;
3070                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3071                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
3072                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
3073                 log.u_bbr.flex7 = rack->rc_gp_bwred;
3074                 log.u_bbr.flex8 = method;
3075                 log.u_bbr.cur_del_rate = cur_bw;
3076                 log.u_bbr.delRate = low_bnd;
3077                 log.u_bbr.bw_inuse = up_bnd;
3078                 log.u_bbr.rttProp = rack_get_bw(rack);
3079                 log.u_bbr.pkt_epoch = line;
3080                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3081                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3082                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3083                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3084                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3085                 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
3086                 log.u_bbr.cwnd_gain <<= 1;
3087                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
3088                 log.u_bbr.cwnd_gain <<= 1;
3089                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
3090                 log.u_bbr.cwnd_gain <<= 1;
3091                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
3092                 log.u_bbr.lost = rack->r_ctl.rc_loss_count;
3093                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3094                     &rack->rc_inp->inp_socket->so_rcv,
3095                     &rack->rc_inp->inp_socket->so_snd,
3096                     TCP_TIMELY_WORK, 0,
3097                     0, &log, false, &tv);
3098         }
3099 }
3100
3101 static int
3102 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
3103 {
3104         /*
3105          * Before we increase we need to know if
3106          * the estimate just made was less than
3107          * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
3108          *
3109          * If we already are pacing at a fast enough
3110          * rate to push us faster there is no sense of
3111          * increasing.
3112          *
3113          * We first caculate our actual pacing rate (ss or ca multiplier
3114          * times our cur_bw).
3115          *
3116          * Then we take the last measured rate and multipy by our
3117          * maximum pacing overage to give us a max allowable rate.
3118          *
3119          * If our act_rate is smaller than our max_allowable rate
3120          * then we should increase. Else we should hold steady.
3121          *
3122          */
3123         uint64_t act_rate, max_allow_rate;
3124
3125         if (rack_timely_no_stopping)
3126                 return (1);
3127
3128         if ((cur_bw == 0) || (last_bw_est == 0)) {
3129                 /*
3130                  * Initial startup case or
3131                  * everything is acked case.
3132                  */
3133                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3134                                 __LINE__, 9);
3135                 return (1);
3136         }
3137         if (mult <= 100) {
3138                 /*
3139                  * We can always pace at or slightly above our rate.
3140                  */
3141                 rack_log_timely(rack,  mult, cur_bw, 0, 0,
3142                                 __LINE__, 9);
3143                 return (1);
3144         }
3145         act_rate = cur_bw * (uint64_t)mult;
3146         act_rate /= 100;
3147         max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
3148         max_allow_rate /= 100;
3149         if (act_rate < max_allow_rate) {
3150                 /*
3151                  * Here the rate we are actually pacing at
3152                  * is smaller than 10% above our last measurement.
3153                  * This means we are pacing below what we would
3154                  * like to try to achieve (plus some wiggle room).
3155                  */
3156                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3157                                 __LINE__, 9);
3158                 return (1);
3159         } else {
3160                 /*
3161                  * Here we are already pacing at least rack_max_per_above(10%)
3162                  * what we are getting back. This indicates most likely
3163                  * that we are being limited (cwnd/rwnd/app) and can't
3164                  * get any more b/w. There is no sense of trying to
3165                  * raise up the pacing rate its not speeding us up
3166                  * and we already are pacing faster than we are getting.
3167                  */
3168                 rack_log_timely(rack,  mult, cur_bw, act_rate, max_allow_rate,
3169                                 __LINE__, 8);
3170                 return (0);
3171         }
3172 }
3173
3174 static void
3175 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
3176 {
3177         /*
3178          * When we drag bottom, we want to assure
3179          * that no multiplier is below 1.0, if so
3180          * we want to restore it to at least that.
3181          */
3182         if (rack->r_ctl.rack_per_of_gp_rec  < 100) {
3183                 /* This is unlikely we usually do not touch recovery */
3184                 rack->r_ctl.rack_per_of_gp_rec = 100;
3185         }
3186         if (rack->r_ctl.rack_per_of_gp_ca < 100) {
3187                 rack->r_ctl.rack_per_of_gp_ca = 100;
3188         }
3189         if (rack->r_ctl.rack_per_of_gp_ss < 100) {
3190                 rack->r_ctl.rack_per_of_gp_ss = 100;
3191         }
3192 }
3193
3194 static void
3195 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
3196 {
3197         if (rack->r_ctl.rack_per_of_gp_ca > 100) {
3198                 rack->r_ctl.rack_per_of_gp_ca = 100;
3199         }
3200         if (rack->r_ctl.rack_per_of_gp_ss > 100) {
3201                 rack->r_ctl.rack_per_of_gp_ss = 100;
3202         }
3203 }
3204
3205 static void
3206 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
3207 {
3208         int32_t  calc, logged, plus;
3209
3210         logged = 0;
3211
3212         if (override) {
3213                 /*
3214                  * override is passed when we are
3215                  * loosing b/w and making one last
3216                  * gasp at trying to not loose out
3217                  * to a new-reno flow.
3218                  */
3219                 goto extra_boost;
3220         }
3221         /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */
3222         if (rack->rc_gp_incr &&
3223             ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
3224                 /*
3225                  * Reset and get 5 strokes more before the boost. Note
3226                  * that the count is 0 based so we have to add one.
3227                  */
3228 extra_boost:
3229                 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
3230                 rack->rc_gp_timely_inc_cnt = 0;
3231         } else
3232                 plus = (uint32_t)rack_gp_increase_per;
3233         /* Must be at least 1% increase for true timely increases */
3234         if ((plus < 1) &&
3235             ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
3236                 plus = 1;
3237         if (rack->rc_gp_saw_rec &&
3238             (rack->rc_gp_no_rec_chg == 0) &&
3239             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3240                                   rack->r_ctl.rack_per_of_gp_rec)) {
3241                 /* We have been in recovery ding it too */
3242                 calc = rack->r_ctl.rack_per_of_gp_rec + plus;
3243                 if (calc > 0xffff)
3244                         calc = 0xffff;
3245                 logged |= 1;
3246                 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
3247                 if (rack_per_upper_bound_ss &&
3248                     (rack->rc_dragged_bottom == 0) &&
3249                     (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
3250                         rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
3251         }
3252         if (rack->rc_gp_saw_ca &&
3253             (rack->rc_gp_saw_ss == 0) &&
3254             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3255                                   rack->r_ctl.rack_per_of_gp_ca)) {
3256                 /* In CA */
3257                 calc = rack->r_ctl.rack_per_of_gp_ca + plus;
3258                 if (calc > 0xffff)
3259                         calc = 0xffff;
3260                 logged |= 2;
3261                 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
3262                 if (rack_per_upper_bound_ca &&
3263                     (rack->rc_dragged_bottom == 0) &&
3264                     (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
3265                         rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
3266         }
3267         if (rack->rc_gp_saw_ss &&
3268             rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
3269                                   rack->r_ctl.rack_per_of_gp_ss)) {
3270                 /* In SS */
3271                 calc = rack->r_ctl.rack_per_of_gp_ss + plus;
3272                 if (calc > 0xffff)
3273                         calc = 0xffff;
3274                 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
3275                 if (rack_per_upper_bound_ss &&
3276                     (rack->rc_dragged_bottom == 0) &&
3277                     (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
3278                         rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
3279                 logged |= 4;
3280         }
3281         if (logged &&
3282             (rack->rc_gp_incr == 0)){
3283                 /* Go into increment mode */
3284                 rack->rc_gp_incr = 1;
3285                 rack->rc_gp_timely_inc_cnt = 0;
3286         }
3287         if (rack->rc_gp_incr &&
3288             logged &&
3289             (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
3290                 rack->rc_gp_timely_inc_cnt++;
3291         }
3292         rack_log_timely(rack,  logged, plus, 0, 0,
3293                         __LINE__, 1);
3294 }
3295
3296 static uint32_t
3297 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
3298 {
3299         /*
3300          * norm_grad = rtt_diff / minrtt;
3301          * new_per = curper * (1 - B * norm_grad)
3302          *
3303          * B = rack_gp_decrease_per (default 10%)
3304          * rtt_dif = input var current rtt-diff
3305          * curper = input var current percentage
3306          * minrtt = from rack filter
3307          *
3308          */
3309         uint64_t perf;
3310
3311         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3312                     ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
3313                      (((uint64_t)rtt_diff * (uint64_t)1000000)/
3314                       (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
3315                      (uint64_t)1000000)) /
3316                 (uint64_t)1000000);
3317         if (perf > curper) {
3318                 /* TSNH */
3319                 perf = curper - 1;
3320         }
3321         return ((uint32_t)perf);
3322 }
3323
3324 static uint32_t
3325 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
3326 {
3327         /*
3328          *                                   highrttthresh
3329          * result = curper * (1 - (B * ( 1 -  ------          ))
3330          *                                     gp_srtt
3331          *
3332          * B = rack_gp_decrease_per (default 10%)
3333          * highrttthresh = filter_min * rack_gp_rtt_maxmul
3334          */
3335         uint64_t perf;
3336         uint32_t highrttthresh;
3337
3338         highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3339
3340         perf = (((uint64_t)curper * ((uint64_t)1000000 -
3341                                      ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
3342                                         ((uint64_t)highrttthresh * (uint64_t)1000000) /
3343                                                     (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
3344         return (perf);
3345 }
3346
3347 static void
3348 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
3349 {
3350         uint64_t logvar, logvar2, logvar3;
3351         uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
3352
3353         if (rack->rc_gp_incr) {
3354                 /* Turn off increment counting */
3355                 rack->rc_gp_incr = 0;
3356                 rack->rc_gp_timely_inc_cnt = 0;
3357         }
3358         ss_red = ca_red = rec_red = 0;
3359         logged = 0;
3360         /* Calculate the reduction value */
3361         if (rtt_diff < 0) {
3362                 rtt_diff *= -1;
3363         }
3364         /* Must be at least 1% reduction */
3365         if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
3366                 /* We have been in recovery ding it too */
3367                 if (timely_says == 2) {
3368                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
3369                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3370                         if (alt < new_per)
3371                                 val = alt;
3372                         else
3373                                 val = new_per;
3374                 } else
3375                          val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3376                 if (rack->r_ctl.rack_per_of_gp_rec > val) {
3377                         rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
3378                         rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
3379                 } else {
3380                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3381                         rec_red = 0;
3382                 }
3383                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
3384                         rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
3385                 logged |= 1;
3386         }
3387         if (rack->rc_gp_saw_ss) {
3388                 /* Sent in SS */
3389                 if (timely_says == 2) {
3390                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
3391                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3392                         if (alt < new_per)
3393                                 val = alt;
3394                         else
3395                                 val = new_per;
3396                 } else
3397                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
3398                 if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
3399                         ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
3400                         rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
3401                 } else {
3402                         ss_red = new_per;
3403                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3404                         logvar = new_per;
3405                         logvar <<= 32;
3406                         logvar |= alt;
3407                         logvar2 = (uint32_t)rtt;
3408                         logvar2 <<= 32;
3409                         logvar2 |= (uint32_t)rtt_diff;
3410                         logvar3 = rack_gp_rtt_maxmul;
3411                         logvar3 <<= 32;
3412                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3413                         rack_log_timely(rack, timely_says,
3414                                         logvar2, logvar3,
3415                                         logvar, __LINE__, 10);
3416                 }
3417                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
3418                         rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
3419                 logged |= 4;
3420         } else if (rack->rc_gp_saw_ca) {
3421                 /* Sent in CA */
3422                 if (timely_says == 2) {
3423                         new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
3424                         alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
3425                         if (alt < new_per)
3426                                 val = alt;
3427                         else
3428                                 val = new_per;
3429                 } else
3430                         val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
3431                 if (rack->r_ctl.rack_per_of_gp_ca > val) {
3432                         ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
3433                         rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
3434                 } else {
3435                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3436                         ca_red = 0;
3437                         logvar = new_per;
3438                         logvar <<= 32;
3439                         logvar |= alt;
3440                         logvar2 = (uint32_t)rtt;
3441                         logvar2 <<= 32;
3442                         logvar2 |= (uint32_t)rtt_diff;
3443                         logvar3 = rack_gp_rtt_maxmul;
3444                         logvar3 <<= 32;
3445                         logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3446                         rack_log_timely(rack, timely_says,
3447                                         logvar2, logvar3,
3448                                         logvar, __LINE__, 10);
3449                 }
3450                 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
3451                         rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
3452                 logged |= 2;
3453         }
3454         if (rack->rc_gp_timely_dec_cnt < 0x7) {
3455                 rack->rc_gp_timely_dec_cnt++;
3456                 if (rack_timely_dec_clear &&
3457                     (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
3458                         rack->rc_gp_timely_dec_cnt = 0;
3459         }
3460         logvar = ss_red;
3461         logvar <<= 32;
3462         logvar |= ca_red;
3463         rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
3464                         __LINE__, 2);
3465 }
3466
3467 static void
3468 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
3469                      uint32_t rtt, uint32_t line, uint8_t reas)
3470 {
3471         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
3472                 union tcp_log_stackspecific log;
3473                 struct timeval tv;
3474
3475                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
3476                 log.u_bbr.flex1 = line;
3477                 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
3478                 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
3479                 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
3480                 log.u_bbr.flex5 = rtt;
3481                 log.u_bbr.flex6 = rack->rc_highly_buffered;
3482                 log.u_bbr.flex6 <<= 1;
3483                 log.u_bbr.flex6 |= rack->forced_ack;
3484                 log.u_bbr.flex6 <<= 1;
3485                 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
3486                 log.u_bbr.flex6 <<= 1;
3487                 log.u_bbr.flex6 |= rack->in_probe_rtt;
3488                 log.u_bbr.flex6 <<= 1;
3489                 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
3490                 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
3491                 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
3492                 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
3493                 log.u_bbr.flex8 = reas;
3494                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3495                 log.u_bbr.delRate = rack_get_bw(rack);
3496                 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
3497                 log.u_bbr.cur_del_rate <<= 32;
3498                 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
3499                 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
3500                 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
3501                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3502                 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
3503                 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
3504                 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
3505                 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
3506                 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3507                 log.u_bbr.rttProp = us_cts;
3508                 log.u_bbr.rttProp <<= 32;
3509                 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
3510                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3511                     &rack->rc_inp->inp_socket->so_rcv,
3512                     &rack->rc_inp->inp_socket->so_snd,
3513                     BBR_LOG_RTT_SHRINKS, 0,
3514                     0, &log, false, &rack->r_ctl.act_rcv_time);
3515         }
3516 }
3517
3518 static void
3519 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
3520 {
3521         uint64_t bwdp;
3522
3523         bwdp = rack_get_bw(rack);
3524         bwdp *= (uint64_t)rtt;
3525         bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
3526         rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
3527         if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
3528                 /*
3529                  * A window protocol must be able to have 4 packets
3530                  * outstanding as the floor in order to function
3531                  * (especially considering delayed ack :D).
3532                  */
3533                 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
3534         }
3535 }
3536
3537 static void
3538 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
3539 {
3540         /**
3541          * ProbeRTT is a bit different in rack_pacing than in
3542          * BBR. It is like BBR in that it uses the lowering of
3543          * the RTT as a signal that we saw something new and
3544          * counts from there for how long between. But it is
3545          * different in that its quite simple. It does not
3546          * play with the cwnd and wait until we get down
3547          * to N segments outstanding and hold that for
3548          * 200ms. Instead it just sets the pacing reduction
3549          * rate to a set percentage (70 by default) and hold
3550          * that for a number of recent GP Srtt's.
3551          */
3552         uint32_t segsiz;
3553
3554         if (rack->rc_gp_dyn_mul == 0)
3555                 return;
3556
3557         if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
3558                 /* We are idle */
3559                 return;
3560         }
3561         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3562             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3563                 /*
3564                  * Stop the goodput now, the idea here is
3565                  * that future measurements with in_probe_rtt
3566                  * won't register if they are not greater so
3567                  * we want to get what info (if any) is available
3568                  * now.
3569                  */
3570                 rack_do_goodput_measurement(rack->rc_tp, rack,
3571                                             rack->rc_tp->snd_una, __LINE__,
3572                                             RACK_QUALITY_PROBERTT);
3573         }
3574         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3575         rack->r_ctl.rc_time_probertt_entered = us_cts;
3576         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3577                      rack->r_ctl.rc_pace_min_segs);
3578         rack->in_probe_rtt = 1;
3579         rack->measure_saw_probe_rtt = 1;
3580         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3581         rack->r_ctl.rc_time_probertt_starts = 0;
3582         rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
3583         if (rack_probertt_use_min_rtt_entry)
3584                 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3585         else
3586                 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
3587         rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3588                              __LINE__, RACK_RTTS_ENTERPROBE);
3589 }
3590
3591 static void
3592 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
3593 {
3594         struct rack_sendmap *rsm;
3595         uint32_t segsiz;
3596
3597         segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
3598                      rack->r_ctl.rc_pace_min_segs);
3599         rack->in_probe_rtt = 0;
3600         if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
3601             SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
3602                 /*
3603                  * Stop the goodput now, the idea here is
3604                  * that future measurements with in_probe_rtt
3605                  * won't register if they are not greater so
3606                  * we want to get what info (if any) is available
3607                  * now.
3608                  */
3609                 rack_do_goodput_measurement(rack->rc_tp, rack,
3610                                             rack->rc_tp->snd_una, __LINE__,
3611                                             RACK_QUALITY_PROBERTT);
3612         } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
3613                 /*
3614                  * We don't have enough data to make a measurement.
3615                  * So lets just stop and start here after exiting
3616                  * probe-rtt. We probably are not interested in
3617                  * the results anyway.
3618                  */
3619                 rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
3620         }
3621         /*
3622          * Measurements through the current snd_max are going
3623          * to be limited by the slower pacing rate.
3624          *
3625          * We need to mark these as app-limited so we
3626          * don't collapse the b/w.
3627          */
3628         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
3629         if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
3630                 if (rack->r_ctl.rc_app_limited_cnt == 0)
3631                         rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
3632                 else {
3633                         /*
3634                          * Go out to the end app limited and mark
3635                          * this new one as next and move the end_appl up
3636                          * to this guy.
3637                          */
3638                         if (rack->r_ctl.rc_end_appl)
3639                                 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
3640                         rack->r_ctl.rc_end_appl = rsm;
3641                 }
3642                 rsm->r_flags |= RACK_APP_LIMITED;
3643                 rack->r_ctl.rc_app_limited_cnt++;
3644         }
3645         /*
3646          * Now, we need to examine our pacing rate multipliers.
3647          * If its under 100%, we need to kick it back up to
3648          * 100%. We also don't let it be over our "max" above
3649          * the actual rate i.e. 100% + rack_clamp_atexit_prtt.
3650          * Note setting clamp_atexit_prtt to 0 has the effect
3651          * of setting CA/SS to 100% always at exit (which is
3652          * the default behavior).
3653          */
3654         if (rack_probertt_clear_is) {
3655                 rack->rc_gp_incr = 0;
3656                 rack->rc_gp_bwred = 0;
3657                 rack->rc_gp_timely_inc_cnt = 0;
3658                 rack->rc_gp_timely_dec_cnt = 0;
3659         }
3660         /* Do we do any clamping at exit? */
3661         if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
3662                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
3663                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
3664         }
3665         if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
3666                 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
3667                 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
3668         }
3669         /*
3670          * Lets set rtt_diff to 0, so that we will get a "boost"
3671          * after exiting.
3672          */
3673         rack->r_ctl.rc_rtt_diff = 0;
3674
3675         /* Clear all flags so we start fresh */
3676         rack->rc_tp->t_bytes_acked = 0;
3677         rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND;
3678         /*
3679          * If configured to, set the cwnd and ssthresh to
3680          * our targets.
3681          */
3682         if (rack_probe_rtt_sets_cwnd) {
3683                 uint64_t ebdp;
3684                 uint32_t setto;
3685
3686                 /* Set ssthresh so we get into CA once we hit our target */
3687                 if (rack_probertt_use_min_rtt_exit == 1) {
3688                         /* Set to min rtt */
3689                         rack_set_prtt_target(rack, segsiz,
3690                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
3691                 } else if (rack_probertt_use_min_rtt_exit == 2) {
3692                         /* Set to current gp rtt */
3693                         rack_set_prtt_target(rack, segsiz,
3694                                              rack->r_ctl.rc_gp_srtt);
3695                 } else if (rack_probertt_use_min_rtt_exit == 3) {
3696                         /* Set to entry gp rtt */
3697                         rack_set_prtt_target(rack, segsiz,
3698                                              rack->r_ctl.rc_entry_gp_rtt);
3699                 } else {
3700                         uint64_t sum;
3701                         uint32_t setval;
3702
3703                         sum = rack->r_ctl.rc_entry_gp_rtt;
3704                         sum *= 10;
3705                         sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
3706                         if (sum >= 20) {
3707                                 /*
3708                                  * A highly buffered path needs
3709                                  * cwnd space for timely to work.
3710                                  * Lets set things up as if
3711                                  * we are heading back here again.
3712                                  */
3713                                 setval = rack->r_ctl.rc_entry_gp_rtt;
3714                         } else if (sum >= 15) {
3715                                 /*
3716                                  * Lets take the smaller of the
3717                                  * two since we are just somewhat
3718                                  * buffered.
3719                                  */
3720                                 setval = rack->r_ctl.rc_gp_srtt;
3721                                 if (setval > rack->r_ctl.rc_entry_gp_rtt)
3722                                         setval = rack->r_ctl.rc_entry_gp_rtt;
3723                         } else {
3724                                 /*
3725                                  * Here we are not highly buffered
3726                                  * and should pick the min we can to
3727                                  * keep from causing loss.
3728                                  */
3729                                 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
3730                         }
3731                         rack_set_prtt_target(rack, segsiz,
3732                                              setval);
3733                 }
3734                 if (rack_probe_rtt_sets_cwnd > 1) {
3735                         /* There is a percentage here to boost */
3736                         ebdp = rack->r_ctl.rc_target_probertt_flight;
3737                         ebdp *= rack_probe_rtt_sets_cwnd;
3738                         ebdp /= 100;
3739                         setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
3740                 } else
3741                         setto = rack->r_ctl.rc_target_probertt_flight;
3742                 rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
3743                 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
3744                         /* Enforce a min */
3745                         rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
3746                 }
3747                 /* If we set in the cwnd also set the ssthresh point so we are in CA */
3748                 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
3749         }
3750         rack_log_rtt_shrinks(rack,  us_cts,
3751                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3752                              __LINE__, RACK_RTTS_EXITPROBE);
3753         /* Clear times last so log has all the info */
3754         rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
3755         rack->r_ctl.rc_time_probertt_entered = us_cts;
3756         rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
3757         rack->r_ctl.rc_time_of_last_probertt = us_cts;
3758 }
3759
3760 static void
3761 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
3762 {
3763         /* Check in on probe-rtt */
3764         if (rack->rc_gp_filled == 0) {
3765                 /* We do not do p-rtt unless we have gp measurements */
3766                 return;
3767         }
3768         if (rack->in_probe_rtt) {
3769                 uint64_t no_overflow;
3770                 uint32_t endtime, must_stay;
3771
3772                 if (rack->r_ctl.rc_went_idle_time &&
3773                     ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
3774                         /*
3775                          * We went idle during prtt, just exit now.
3776                          */
3777                         rack_exit_probertt(rack, us_cts);
3778                 } else if (rack_probe_rtt_safety_val &&
3779                     TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
3780                     ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
3781                         /*
3782                          * Probe RTT safety value triggered!
3783                          */
3784                         rack_log_rtt_shrinks(rack,  us_cts,
3785                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3786                                              __LINE__, RACK_RTTS_SAFETY);
3787                         rack_exit_probertt(rack, us_cts);
3788                 }
3789                 /* Calculate the max we will wait */
3790                 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
3791                 if (rack->rc_highly_buffered)
3792                         endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
3793                 /* Calculate the min we must wait */
3794                 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
3795                 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
3796                     TSTMP_LT(us_cts, endtime)) {
3797                         uint32_t calc;
3798                         /* Do we lower more? */
3799 no_exit:
3800                         if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
3801                                 calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
3802                         else
3803                                 calc = 0;
3804                         calc /= max(rack->r_ctl.rc_gp_srtt, 1);
3805                         if (calc) {
3806                                 /* Maybe */
3807                                 calc *= rack_per_of_gp_probertt_reduce;
3808                                 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
3809                                 /* Limit it too */
3810                                 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
3811                                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
3812                         }
3813                         /* We must reach target or the time set */
3814                         return;
3815                 }
3816                 if (rack->r_ctl.rc_time_probertt_starts == 0) {
3817                         if ((TSTMP_LT(us_cts, must_stay) &&
3818                              rack->rc_highly_buffered) ||
3819                              (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
3820                               rack->r_ctl.rc_target_probertt_flight)) {
3821                                 /* We are not past the must_stay time */
3822                                 goto no_exit;
3823                         }
3824                         rack_log_rtt_shrinks(rack,  us_cts,
3825                                              get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3826                                              __LINE__, RACK_RTTS_REACHTARGET);
3827                         rack->r_ctl.rc_time_probertt_starts = us_cts;
3828                         if (rack->r_ctl.rc_time_probertt_starts == 0)
3829                                 rack->r_ctl.rc_time_probertt_starts = 1;
3830                         /* Restore back to our rate we want to pace at in prtt */
3831                         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
3832                 }
3833                 /*
3834                  * Setup our end time, some number of gp_srtts plus 200ms.
3835                  */
3836                 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
3837                                (uint64_t)rack_probertt_gpsrtt_cnt_mul);
3838                 if (rack_probertt_gpsrtt_cnt_div)
3839                         endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
3840                 else
3841                         endtime = 0;
3842                 endtime += rack_min_probertt_hold;
3843                 endtime += rack->r_ctl.rc_time_probertt_starts;
3844                 if (TSTMP_GEQ(us_cts,  endtime)) {
3845                         /* yes, exit probertt */
3846                         rack_exit_probertt(rack, us_cts);
3847                 }
3848
3849         } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
3850                 /* Go into probertt, its been too long since we went lower */
3851                 rack_enter_probertt(rack, us_cts);
3852         }
3853 }
3854
3855 static void
3856 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
3857                        uint32_t rtt, int32_t rtt_diff)
3858 {
3859         uint64_t cur_bw, up_bnd, low_bnd, subfr;
3860         uint32_t losses;
3861
3862         if ((rack->rc_gp_dyn_mul == 0) ||
3863             (rack->use_fixed_rate) ||
3864             (rack->in_probe_rtt) ||
3865             (rack->rc_always_pace == 0)) {
3866                 /* No dynamic GP multiplier in play */
3867                 return;
3868         }
3869         losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
3870         cur_bw = rack_get_bw(rack);
3871         /* Calculate our up and down range */
3872         up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
3873         up_bnd /= 100;
3874         up_bnd += rack->r_ctl.last_gp_comp_bw;
3875
3876         subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
3877         subfr /= 100;
3878         low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
3879         if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
3880                 /*
3881                  * This is the case where our RTT is above
3882                  * the max target and we have been configured
3883                  * to just do timely no bonus up stuff in that case.
3884                  *
3885                  * There are two configurations, set to 1, and we
3886                  * just do timely if we are over our max. If its
3887                  * set above 1 then we slam the multipliers down
3888                  * to 100 and then decrement per timely.
3889                  */
3890                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3891                                 __LINE__, 3);
3892                 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
3893                         rack_validate_multipliers_at_or_below_100(rack);
3894                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
3895         } else if ((last_bw_est < low_bnd) && !losses) {
3896                 /*
3897                  * We are decreasing this is a bit complicated this
3898                  * means we are loosing ground. This could be
3899                  * because another flow entered and we are competing
3900                  * for b/w with it. This will push the RTT up which
3901                  * makes timely unusable unless we want to get shoved
3902                  * into a corner and just be backed off (the age
3903                  * old problem with delay based CC).
3904                  *
3905                  * On the other hand if it was a route change we
3906                  * would like to stay somewhat contained and not
3907                  * blow out the buffers.
3908                  */
3909                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3910                                 __LINE__, 3);
3911                 rack->r_ctl.last_gp_comp_bw = cur_bw;
3912                 if (rack->rc_gp_bwred == 0) {
3913                         /* Go into reduction counting */
3914                         rack->rc_gp_bwred = 1;
3915                         rack->rc_gp_timely_dec_cnt = 0;
3916                 }
3917                 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
3918                     (timely_says == 0)) {
3919                         /*
3920                          * Push another time with a faster pacing
3921                          * to try to gain back (we include override to
3922                          * get a full raise factor).
3923                          */
3924                         if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
3925                             (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
3926                             (timely_says == 0) ||
3927                             (rack_down_raise_thresh == 0)) {
3928                                 /*
3929                                  * Do an override up in b/w if we were
3930                                  * below the threshold or if the threshold
3931                                  * is zero we always do the raise.
3932                                  */
3933                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
3934                         } else {
3935                                 /* Log it stays the same */
3936                                 rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
3937                                                 __LINE__, 11);
3938                         }
3939                         rack->rc_gp_timely_dec_cnt++;
3940                         /* We are not incrementing really no-count */
3941                         rack->rc_gp_incr = 0;
3942                         rack->rc_gp_timely_inc_cnt = 0;
3943                 } else {
3944                         /*
3945                          * Lets just use the RTT
3946                          * information and give up
3947                          * pushing.
3948                          */
3949                         goto use_timely;
3950                 }
3951         } else if ((timely_says != 2) &&
3952                     !losses &&
3953                     (last_bw_est > up_bnd)) {
3954                 /*
3955                  * We are increasing b/w lets keep going, updating
3956                  * our b/w and ignoring any timely input, unless
3957                  * of course we are at our max raise (if there is one).
3958                  */
3959
3960                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3961                                 __LINE__, 3);
3962                 rack->r_ctl.last_gp_comp_bw = cur_bw;
3963                 if (rack->rc_gp_saw_ss &&
3964                     rack_per_upper_bound_ss &&
3965                      (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
3966                             /*
3967                              * In cases where we can't go higher
3968                              * we should just use timely.
3969                              */
3970                             goto use_timely;
3971                 }
3972                 if (rack->rc_gp_saw_ca &&
3973                     rack_per_upper_bound_ca &&
3974                     (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
3975                             /*
3976                              * In cases where we can't go higher
3977                              * we should just use timely.
3978                              */
3979                             goto use_timely;
3980                 }
3981                 rack->rc_gp_bwred = 0;
3982                 rack->rc_gp_timely_dec_cnt = 0;
3983                 /* You get a set number of pushes if timely is trying to reduce */
3984                 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
3985                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3986                 } else {
3987                         /* Log it stays the same */
3988                         rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
3989                             __LINE__, 12);
3990                 }
3991                 return;
3992         } else {
3993                 /*
3994                  * We are staying between the lower and upper range bounds
3995                  * so use timely to decide.
3996                  */
3997                 rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
3998                                 __LINE__, 3);
3999 use_timely:
4000                 if (timely_says) {
4001                         rack->rc_gp_incr = 0;
4002                         rack->rc_gp_timely_inc_cnt = 0;
4003                         if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
4004                             !losses &&
4005                             (last_bw_est < low_bnd)) {
4006                                 /* We are loosing ground */
4007                                 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4008                                 rack->rc_gp_timely_dec_cnt++;
4009                                 /* We are not incrementing really no-count */
4010                                 rack->rc_gp_incr = 0;
4011                                 rack->rc_gp_timely_inc_cnt = 0;
4012                         } else
4013                                 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4014                 } else {
4015                         rack->rc_gp_bwred = 0;
4016                         rack->rc_gp_timely_dec_cnt = 0;
4017                         rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4018                 }
4019         }
4020 }
4021
4022 static int32_t
4023 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4024 {
4025         int32_t timely_says;
4026         uint64_t log_mult, log_rtt_a_diff;
4027
4028         log_rtt_a_diff = rtt;
4029         log_rtt_a_diff <<= 32;
4030         log_rtt_a_diff |= (uint32_t)rtt_diff;
4031         if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4032                     rack_gp_rtt_maxmul)) {
4033                 /* Reduce the b/w multiplier */
4034                 timely_says = 2;
4035                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4036                 log_mult <<= 32;
4037                 log_mult |= prev_rtt;
4038                 rack_log_timely(rack,  timely_says, log_mult,
4039                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4040                                 log_rtt_a_diff, __LINE__, 4);
4041         } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4042                            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4043                             max(rack_gp_rtt_mindiv , 1)))) {
4044                 /* Increase the b/w multiplier */
4045                 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4046                         ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4047                          max(rack_gp_rtt_mindiv , 1));
4048                 log_mult <<= 32;
4049                 log_mult |= prev_rtt;
4050                 timely_says = 0;
4051                 rack_log_timely(rack,  timely_says, log_mult ,
4052                                 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4053                                 log_rtt_a_diff, __LINE__, 5);
4054         } else {
4055                 /*
4056                  * Use a gradient to find it the timely gradient
4057                  * is:
4058                  * grad = rc_rtt_diff / min_rtt;
4059                  *
4060                  * anything below or equal to 0 will be
4061                  * a increase indication. Anything above
4062                  * zero is a decrease. Note we take care
4063                  * of the actual gradient calculation
4064                  * in the reduction (its not needed for
4065                  * increase).
4066                  */
4067                 log_mult = prev_rtt;
4068                 if (rtt_diff <= 0) {
4069                         /*
4070                          * Rttdiff is less than zero, increase the
4071                          * b/w multiplier (its 0 or negative)
4072                          */
4073                         timely_says = 0;
4074                         rack_log_timely(rack,  timely_says, log_mult,
4075                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4076                 } else {
4077                         /* Reduce the b/w multiplier */
4078                         timely_says = 1;
4079                         rack_log_timely(rack,  timely_says, log_mult,
4080                                         get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4081                 }
4082         }
4083         return (timely_says);
4084 }
4085
4086 static void
4087 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4088                             tcp_seq th_ack, int line, uint8_t quality)
4089 {
4090         uint64_t tim, bytes_ps, ltim, stim, utim;
4091         uint32_t segsiz, bytes, reqbytes, us_cts;
4092         int32_t gput, new_rtt_diff, timely_says;
4093         uint64_t  resid_bw, subpart = 0, addpart = 0, srtt;
4094         int did_add = 0;
4095
4096         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4097         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4098         if (TSTMP_GEQ(us_cts, tp->gput_ts))
4099                 tim = us_cts - tp->gput_ts;
4100         else
4101                 tim = 0;
4102         if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4103                 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4104         else
4105                 stim = 0;
4106         /*
4107          * Use the larger of the send time or ack time. This prevents us
4108          * from being influenced by ack artifacts to come up with too
4109          * high of measurement. Note that since we are spanning over many more
4110          * bytes in most of our measurements hopefully that is less likely to
4111          * occur.
4112          */
4113         if (tim > stim)
4114                 utim = max(tim, 1);
4115         else
4116                 utim = max(stim, 1);
4117         /* Lets get a msec time ltim too for the old stuff */
4118         ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
4119         gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
4120         reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4121         if ((tim == 0) && (stim == 0)) {
4122                 /*
4123                  * Invalid measurement time, maybe
4124                  * all on one ack/one send?
4125                  */
4126                 bytes = 0;
4127                 bytes_ps = 0;
4128                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4129                                            0, 0, 0, 10, __LINE__, NULL, quality);
4130                 goto skip_measurement;
4131         }
4132         if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4133                 /* We never made a us_rtt measurement? */
4134                 bytes = 0;
4135                 bytes_ps = 0;
4136                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4137                                            0, 0, 0, 10, __LINE__, NULL, quality);
4138                 goto skip_measurement;
4139         }
4140         /*
4141          * Calculate the maximum possible b/w this connection
4142          * could have. We base our calculation on the lowest
4143          * rtt we have seen during the measurement and the
4144          * largest rwnd the client has given us in that time. This
4145          * forms a BDP that is the maximum that we could ever
4146          * get to the client. Anything larger is not valid.
4147          *
4148          * I originally had code here that rejected measurements
4149          * where the time was less than 1/2 the latest us_rtt.
4150          * But after thinking on that I realized its wrong since
4151          * say you had a 150Mbps or even 1Gbps link, and you
4152          * were a long way away.. example I am in Europe (100ms rtt)
4153          * talking to my 1Gbps link in S.C. Now measuring say 150,000
4154          * bytes my time would be 1.2ms, and yet my rtt would say
4155          * the measurement was invalid the time was < 50ms. The
4156          * same thing is true for 150Mb (8ms of time).
4157          *
4158          * A better way I realized is to look at what the maximum
4159          * the connection could possibly do. This is gated on
4160          * the lowest RTT we have seen and the highest rwnd.
4161          * We should in theory never exceed that, if we are
4162          * then something on the path is storing up packets
4163          * and then feeding them all at once to our endpoint
4164          * messing up our measurement.
4165          */
4166         rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4167         rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4168         rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
4169         if (SEQ_LT(th_ack, tp->gput_seq)) {
4170                 /* No measurement can be made */
4171                 bytes = 0;
4172                 bytes_ps = 0;
4173                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4174                                            0, 0, 0, 10, __LINE__, NULL, quality);
4175                 goto skip_measurement;
4176         } else
4177                 bytes = (th_ack - tp->gput_seq);
4178         bytes_ps = (uint64_t)bytes;
4179         /*
4180          * Don't measure a b/w for pacing unless we have gotten at least
4181          * an initial windows worth of data in this measurement interval.
4182          *
4183          * Small numbers of bytes get badly influenced by delayed ack and
4184          * other artifacts. Note we take the initial window or our
4185          * defined minimum GP (defaulting to 10 which hopefully is the
4186          * IW).
4187          */
4188         if (rack->rc_gp_filled == 0) {
4189                 /*
4190                  * The initial estimate is special. We
4191                  * have blasted out an IW worth of packets
4192                  * without a real valid ack ts results. We
4193                  * then setup the app_limited_needs_set flag,
4194                  * this should get the first ack in (probably 2
4195                  * MSS worth) to be recorded as the timestamp.
4196                  * We thus allow a smaller number of bytes i.e.
4197                  * IW - 2MSS.
4198                  */
4199                 reqbytes -= (2 * segsiz);
4200                 /* Also lets fill previous for our first measurement to be neutral */
4201                 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4202         }
4203         if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4204                 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4205                                            rack->r_ctl.rc_app_limited_cnt,
4206                                            0, 0, 10, __LINE__, NULL, quality);
4207                 goto skip_measurement;
4208         }
4209         /*
4210          * We now need to calculate the Timely like status so
4211          * we can update (possibly) the b/w multipliers.
4212          */
4213         new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4214         if (rack->rc_gp_filled == 0) {
4215                 /* No previous reading */
4216                 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4217         } else {
4218                 if (rack->measure_saw_probe_rtt == 0) {
4219                         /*
4220                          * We don't want a probertt to be counted
4221                          * since it will be negative incorrectly. We
4222                          * expect to be reducing the RTT when we
4223                          * pace at a slower rate.
4224                          */
4225                         rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4226                         rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4227                 }
4228         }
4229         timely_says = rack_make_timely_judgement(rack,
4230                 rack->r_ctl.rc_gp_srtt,
4231                 rack->r_ctl.rc_rtt_diff,
4232                 rack->r_ctl.rc_prev_gp_srtt
4233                 );
4234         bytes_ps *= HPTS_USEC_IN_SEC;
4235         bytes_ps /= utim;
4236         if (bytes_ps > rack->r_ctl.last_max_bw) {
4237                 /*
4238                  * Something is on path playing
4239                  * since this b/w is not possible based
4240                  * on our BDP (highest rwnd and lowest rtt
4241                  * we saw in the measurement window).
4242                  *
4243                  * Another option here would be to
4244                  * instead skip the measurement.
4245                  */
4246                 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4247                                            bytes_ps, rack->r_ctl.last_max_bw, 0,
4248                                            11, __LINE__, NULL, quality);
4249                 bytes_ps = rack->r_ctl.last_max_bw;
4250         }
4251         /* We store gp for b/w in bytes per second */
4252         if (rack->rc_gp_filled == 0) {
4253                 /* Initial measurement */
4254                 if (bytes_ps) {
4255                         rack->r_ctl.gp_bw = bytes_ps;
4256                         rack->rc_gp_filled = 1;
4257                         rack->r_ctl.num_measurements = 1;
4258                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4259                 } else {
4260                         rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4261                                                    rack->r_ctl.rc_app_limited_cnt,
4262                                                    0, 0, 10, __LINE__, NULL, quality);
4263                 }
4264                 if (tcp_in_hpts(rack->rc_inp) &&
4265                     (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4266                         /*
4267                          * Ok we can't trust the pacer in this case
4268                          * where we transition from un-paced to paced.
4269                          * Or for that matter when the burst mitigation
4270                          * was making a wild guess and got it wrong.
4271                          * Stop the pacer and clear up all the aggregate
4272                          * delays etc.
4273                          */
4274                         tcp_hpts_remove(rack->rc_inp);
4275                         rack->r_ctl.rc_hpts_flags = 0;
4276                         rack->r_ctl.rc_last_output_to = 0;
4277                 }
4278                 did_add = 2;
4279         } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4280                 /* Still a small number run an average */
4281                 rack->r_ctl.gp_bw += bytes_ps;
4282                 addpart = rack->r_ctl.num_measurements;
4283                 rack->r_ctl.num_measurements++;
4284                 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4285                         /* We have collected enough to move forward */
4286                         rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4287                 }
4288                 did_add = 3;
4289         } else {
4290                 /*
4291                  * We want to take 1/wma of the goodput and add in to 7/8th
4292                  * of the old value weighted by the srtt. So if your measurement
4293                  * period is say 2 SRTT's long you would get 1/4 as the
4294                  * value, if it was like 1/2 SRTT then you would get 1/16th.
4295                  *
4296                  * But we must be careful not to take too much i.e. if the
4297                  * srtt is say 20ms and the measurement is taken over
4298                  * 400ms our weight would be 400/20 i.e. 20. On the
4299                  * other hand if we get a measurement over 1ms with a
4300                  * 10ms rtt we only want to take a much smaller portion.
4301                  */
4302                 if (rack->r_ctl.num_measurements < 0xff) {
4303                         rack->r_ctl.num_measurements++;
4304                 }
4305                 srtt = (uint64_t)tp->t_srtt;
4306                 if (srtt == 0) {
4307                         /*
4308                          * Strange why did t_srtt go back to zero?
4309                          */
4310                         if (rack->r_ctl.rc_rack_min_rtt)
4311                                 srtt = rack->r_ctl.rc_rack_min_rtt;
4312                         else
4313                                 srtt = HPTS_USEC_IN_MSEC;
4314                 }
4315                 /*
4316                  * XXXrrs: Note for reviewers, in playing with
4317                  * dynamic pacing I discovered this GP calculation
4318                  * as done originally leads to some undesired results.
4319                  * Basically you can get longer measurements contributing
4320                  * too much to the WMA. Thus I changed it if you are doing
4321                  * dynamic adjustments to only do the aportioned adjustment
4322                  * if we have a very small (time wise) measurement. Longer
4323                  * measurements just get there weight (defaulting to 1/8)
4324                  * add to the WMA. We may want to think about changing
4325                  * this to always do that for both sides i.e. dynamic
4326                  * and non-dynamic... but considering lots of folks
4327                  * were playing with this I did not want to change the
4328                  * calculation per.se. without your thoughts.. Lawerence?
4329                  * Peter??
4330                  */
4331                 if (rack->rc_gp_dyn_mul == 0) {
4332                         subpart = rack->r_ctl.gp_bw * utim;
4333                         subpart /= (srtt * 8);
4334                         if (subpart < (rack->r_ctl.gp_bw / 2)) {
4335                                 /*
4336                                  * The b/w update takes no more
4337                                  * away then 1/2 our running total
4338                                  * so factor it in.
4339                                  */
4340                                 addpart = bytes_ps * utim;
4341                                 addpart /= (srtt * 8);
4342                         } else {
4343                                 /*
4344                                  * Don't allow a single measurement
4345                                  * to account for more than 1/2 of the
4346                                  * WMA. This could happen on a retransmission
4347                                  * where utim becomes huge compared to
4348                                  * srtt (multiple retransmissions when using
4349                                  * the sending rate which factors in all the
4350                                  * transmissions from the first one).
4351                                  */
4352                                 subpart = rack->r_ctl.gp_bw / 2;
4353                                 addpart = bytes_ps / 2;
4354                         }
4355                         resid_bw = rack->r_ctl.gp_bw - subpart;
4356                         rack->r_ctl.gp_bw = resid_bw + addpart;
4357                         did_add = 1;
4358                 } else {
4359                         if ((utim / srtt) <= 1) {
4360                                 /*
4361                                  * The b/w update was over a small period
4362                                  * of time. The idea here is to prevent a small
4363                                  * measurement time period from counting
4364                                  * too much. So we scale it based on the
4365                                  * time so it attributes less than 1/rack_wma_divisor
4366                                  * of its measurement.
4367                                  */
4368                                 subpart = rack->r_ctl.gp_bw * utim;
4369                                 subpart /= (srtt * rack_wma_divisor);
4370                                 addpart = bytes_ps * utim;
4371                                 addpart /= (srtt * rack_wma_divisor);
4372                         } else {
4373                                 /*
4374                                  * The scaled measurement was long
4375                                  * enough so lets just add in the
4376                                  * portion of the measurement i.e. 1/rack_wma_divisor
4377                                  */
4378                                 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
4379                                 addpart = bytes_ps / rack_wma_divisor;
4380                         }
4381                         if ((rack->measure_saw_probe_rtt == 0) ||
4382                             (bytes_ps > rack->r_ctl.gp_bw)) {
4383                                 /*
4384                                  * For probe-rtt we only add it in
4385                                  * if its larger, all others we just
4386                                  * add in.
4387                                  */
4388                                 did_add = 1;
4389                                 resid_bw = rack->r_ctl.gp_bw - subpart;
4390                                 rack->r_ctl.gp_bw = resid_bw + addpart;
4391                         }
4392                 }
4393         }
4394         if ((rack->gp_ready == 0) &&
4395             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
4396                 /* We have enough measurements now */
4397                 rack->gp_ready = 1;
4398                 rack_set_cc_pacing(rack);
4399                 if (rack->defer_options)
4400                         rack_apply_deferred_options(rack);
4401         }
4402         rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
4403                                    rack_get_bw(rack), 22, did_add, NULL, quality);
4404         /* We do not update any multipliers if we are in or have seen a probe-rtt */
4405         if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
4406                 rack_update_multiplier(rack, timely_says, bytes_ps,
4407                                        rack->r_ctl.rc_gp_srtt,
4408                                        rack->r_ctl.rc_rtt_diff);
4409         rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
4410                                    rack_get_bw(rack), 3, line, NULL, quality);
4411         /* reset the gp srtt and setup the new prev */
4412         rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4413         /* Record the lost count for the next measurement */
4414         rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
4415         /*
4416          * We restart our diffs based on the gpsrtt in the
4417          * measurement window.
4418          */
4419         rack->rc_gp_rtt_set = 0;
4420         rack->rc_gp_saw_rec = 0;
4421         rack->rc_gp_saw_ca = 0;
4422         rack->rc_gp_saw_ss = 0;
4423         rack->rc_dragged_bottom = 0;
4424 skip_measurement:
4425
4426 #ifdef STATS
4427         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
4428                                  gput);
4429         /*
4430          * XXXLAS: This is a temporary hack, and should be
4431          * chained off VOI_TCP_GPUT when stats(9) grows an
4432          * API to deal with chained VOIs.
4433          */
4434         if (tp->t_stats_gput_prev > 0)
4435                 stats_voi_update_abs_s32(tp->t_stats,
4436                                          VOI_TCP_GPUT_ND,
4437                                          ((gput - tp->t_stats_gput_prev) * 100) /
4438                                          tp->t_stats_gput_prev);
4439 #endif
4440         tp->t_flags &= ~TF_GPUTINPROG;
4441         tp->t_stats_gput_prev = gput;
4442         /*
4443          * Now are we app limited now and there is space from where we
4444          * were to where we want to go?
4445          *
4446          * We don't do the other case i.e. non-applimited here since
4447          * the next send will trigger us picking up the missing data.
4448          */
4449         if (rack->r_ctl.rc_first_appl &&
4450             TCPS_HAVEESTABLISHED(tp->t_state) &&
4451             rack->r_ctl.rc_app_limited_cnt &&
4452             (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
4453             ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
4454              max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
4455                 /*
4456                  * Yep there is enough outstanding to make a measurement here.
4457                  */
4458                 struct rack_sendmap *rsm, fe;
4459
4460                 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
4461                 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
4462                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
4463                 rack->app_limited_needs_set = 0;
4464                 tp->gput_seq = th_ack;
4465                 if (rack->in_probe_rtt)
4466                         rack->measure_saw_probe_rtt = 1;
4467                 else if ((rack->measure_saw_probe_rtt) &&
4468                          (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
4469                         rack->measure_saw_probe_rtt = 0;
4470                 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
4471                         /* There is a full window to gain info from */
4472                         tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
4473                 } else {
4474                         /* We can only measure up to the applimited point */
4475                         tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
4476                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
4477                                 /*
4478                                  * We don't have enough to make a measurement.
4479                                  */
4480                                 tp->t_flags &= ~TF_GPUTINPROG;
4481                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
4482                                                            0, 0, 0, 6, __LINE__, NULL, quality);
4483                                 return;
4484                         }
4485                 }
4486                 if (tp->t_state >= TCPS_FIN_WAIT_1) {
4487                         /*
4488                          * We will get no more data into the SB
4489                          * this means we need to have the data available
4490                          * before we start a measurement.
4491                          */
4492                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) {
4493                                 /* Nope not enough data. */
4494                                 return;
4495                         }
4496                 }
4497                 tp->t_flags |= TF_GPUTINPROG;
4498                 /*
4499                  * Now we need to find the timestamp of the send at tp->gput_seq
4500                  * for the send based measurement.
4501                  */
4502                 fe.r_start = tp->gput_seq;
4503                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
4504                 if (rsm) {
4505                         /* Ok send-based limit is set */
4506                         if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
4507                                 /*
4508                                  * Move back to include the earlier part
4509                                  * so our ack time lines up right (this may
4510                                  * make an overlapping measurement but thats
4511                                  * ok).
4512                                  */
4513                                 tp->gput_seq = rsm->r_start;
4514                         }
4515                         if (rsm->r_flags & RACK_ACKED)
4516                                 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
4517                         else
4518                                 rack->app_limited_needs_set = 1;
4519                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
4520                 } else {
4521                         /*
4522                          * If we don't find the rsm due to some
4523                          * send-limit set the current time, which
4524                          * basically disables the send-limit.
4525                          */
4526                         struct timeval tv;
4527
4528                         microuptime(&tv);
4529                         rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
4530                 }
4531                 rack_log_pacing_delay_calc(rack,
4532                                            tp->gput_seq,
4533                                            tp->gput_ack,
4534                                            (uint64_t)rsm,
4535                                            tp->gput_ts,
4536                                            rack->r_ctl.rc_app_limited_cnt,
4537                                            9,
4538                                            __LINE__, NULL, quality);
4539         }
4540 }
4541
4542 /*
4543  * CC wrapper hook functions
4544  */
4545 static void
4546 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
4547     uint16_t type, int32_t recovery)
4548 {       /* Pass an ACK to the CC module; also drives goodput measurement, ABC and r_must_retran accounting */
4549         uint32_t prior_cwnd, acked;
4550         struct tcp_log_buffer *lgb = NULL;
4551         uint8_t labc_to_use, quality;
4552
4553         INP_WLOCK_ASSERT(tp->t_inpcb);
4554         tp->ccv->nsegs = nsegs;
4555         acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una);
4556         if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
4557                 uint32_t max;   /* byte cap applied to bytes_this_ack when recovery is set */
4558
4559                 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
4560                 if (tp->ccv->bytes_this_ack > max) {
4561                         tp->ccv->bytes_this_ack = max;
4562                 }
4563         }
4564 #ifdef STATS
4565         stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
4566             ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
4567 #endif
4568         quality = RACK_QUALITY_NONE;
4569         if ((tp->t_flags & TF_GPUTINPROG) &&
4570             rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
4571                 /* Measure the Goodput */
4572                 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
4573 #ifdef NETFLIX_PEAKRATE
4574                 if ((type == CC_ACK) &&
4575                     (tp->t_maxpeakrate)) {
4576                         /*
4577                          * We update t_peakrate_thr. This gives us roughly
4578                          * one update per round trip time. Note
4579                          * it will only be used if pace_always is off i.e
4580                          * we don't do this for paced flows.
4581                          */
4582                         rack_update_peakrate_thr(tp);
4583                 }
4584 #endif
4585         }
4586         /* Which way are we limited? If not cwnd limited there is no advance in CA */
4587         if (tp->snd_cwnd <= tp->snd_wnd)
4588                 tp->ccv->flags |= CCF_CWND_LIMITED;
4589         else
4590                 tp->ccv->flags &= ~CCF_CWND_LIMITED;
4591         if (tp->snd_cwnd > tp->snd_ssthresh) {
4592                 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
4593                          nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
4594                 /* To decide if a full window has been acked, use the actual cwnd we are using */
4595                 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
4596                         tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
4597                         tp->ccv->flags |= CCF_ABC_SENTAWND;
4598                 }
4599         } else {
4600                 tp->ccv->flags &= ~CCF_ABC_SENTAWND;
4601                 tp->t_bytes_acked = 0;
4602         }
4603         prior_cwnd = tp->snd_cwnd;
4604         if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
4605             (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
4606                 labc_to_use = rack->rc_labc;
4607         else
4608                 labc_to_use = rack_max_abc_post_recovery;
4609         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4610                 union tcp_log_stackspecific log;
4611                 struct timeval tv;
4612
4613                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4614                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4615                 log.u_bbr.flex1 = th_ack;
4616                 log.u_bbr.flex2 = tp->ccv->flags;
4617                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4618                 log.u_bbr.flex4 = tp->ccv->nsegs;
4619                 log.u_bbr.flex5 = labc_to_use;
4620                 log.u_bbr.flex6 = prior_cwnd;
4621                 log.u_bbr.flex7 = V_tcp_do_newsack;
4622                 log.u_bbr.flex8 = 1;
4623                 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4624                                      0, &log, false, NULL, NULL, 0, &tv);
4625         }
4626         if (CC_ALGO(tp)->ack_received != NULL) {
4627                 /* XXXLAS: Find a way to live without this */
4628                 tp->ccv->curack = th_ack;
4629                 tp->ccv->labc = labc_to_use;
4630                 tp->ccv->flags |= CCF_USE_LOCAL_ABC;
4631                 CC_ALGO(tp)->ack_received(tp->ccv, type);
4632         }
4633         if (lgb) {
4634                 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;  /* overwrite the logged prior cwnd with the cwnd after the CC hook */
4635         }
4636         if (rack->r_must_retran) {
4637                 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
4638                         /*
4639                          * We are now beyond the rxt point, so let's disable
4640                          * the flag.
4641                          */
4642                         rack->r_ctl.rc_out_at_rto = 0;
4643                         rack->r_must_retran = 0;
4644                 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
4645                         /*
4646                          * Only decrement the rc_out_at_rto if the cwnd advances
4647                          * at least a whole segment. Otherwise next time the peer
4648                          * acks, we won't be able to send. This generally happens
4649                          * when we are in Congestion Avoidance.
4650                          */
4651                         if (acked <= rack->r_ctl.rc_out_at_rto){
4652                                 rack->r_ctl.rc_out_at_rto -= acked;
4653                         } else {
4654                                 rack->r_ctl.rc_out_at_rto = 0;
4655                         }
4656                 }
4657         }
4658 #ifdef STATS
4659         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
4660 #endif
4661         if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
4662                 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
4663         }
4664 #ifdef NETFLIX_PEAKRATE
4665         /* We enforce the max peak rate if it is set and we are not pacing */
4666         if ((rack->rc_always_pace == 0) &&
4667             tp->t_peakrate_thr &&
4668             (tp->snd_cwnd > tp->t_peakrate_thr)) {
4669                 tp->snd_cwnd = tp->t_peakrate_thr;
4670         }
4671 #endif
4672 }
4673
4674 static void
4675 tcp_rack_partialack(struct tcpcb *tp)
4676 {
4677         struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
4678
4679         INP_WLOCK_ASSERT(tp->t_inpcb);
4680         /*
4681          * Output is only worth attempting when PRR has left us
4682          * something to send (rc_prr_sndcnt > 0) <or> PRR is
4683          * disabled altogether (rack_no_prr). When neither
4684          * holds there is nothing to do here; otherwise ask
4685          * for an output pass by setting r_wanted_output to
4686          * true.
4687          */
4688         if ((rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt <= 0))
4689                 return;
4690         rack->r_wanted_output = 1;
4691 }
4692
4693 static void
4694 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
4695 {       /* Leave recovery: run the CC post_recovery hook, optionally add unused PRR credit back to cwnd, clear the recovery flags */
4696         struct tcp_rack *rack;
4697         uint32_t orig_cwnd;
4698
4699         orig_cwnd = tp->snd_cwnd;
4700         INP_WLOCK_ASSERT(tp->t_inpcb);
4701         rack = (struct tcp_rack *)tp->t_fb_ptr;
4702         /* Alert the CC module, if it provides a post_recovery hook */
4703         if (CC_ALGO(tp)->post_recovery != NULL) {
4704                 tp->ccv->curack = th_ack;
4705                 CC_ALGO(tp)->post_recovery(tp->ccv);
4706                 if (tp->snd_cwnd < tp->snd_ssthresh) {
4707                         /*
4708                          * Rack has burst control and pacing
4709                          * so let's not set this any lower than
4710                          * snd_ssthresh per RFC-6582 (option 2).
4711                          */
4712                         tp->snd_cwnd = tp->snd_ssthresh;
4713                 }
4714         }
4715         if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
4716                 union tcp_log_stackspecific log;
4717                 struct timeval tv;
4718
4719                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
4720                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4721                 log.u_bbr.flex1 = th_ack;
4722                 log.u_bbr.flex2 = tp->ccv->flags;
4723                 log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
4724                 log.u_bbr.flex4 = tp->ccv->nsegs;
4725                 log.u_bbr.flex5 = V_tcp_abc_l_var;
4726                 log.u_bbr.flex6 = orig_cwnd;
4727                 log.u_bbr.flex7 = V_tcp_do_newsack;
4728                 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
4729                 log.u_bbr.flex8 = 2;
4730                 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
4731                                0, &log, false, NULL, NULL, 0, &tv);
4732         }
4733         if ((rack->rack_no_prr == 0) &&
4734             (rack->no_prr_addback == 0) &&
4735             (rack->r_ctl.rc_prr_sndcnt > 0)) {
4736                 /*
4737                  * Suck the next prr cnt back into cwnd, but
4738                  * only do that if we are not application limited.
4739                  */
4740                 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
4741                         /*
4742                          * We are allowed to add back to the cwnd the amount we did
4743                          * not get out if:
4744                          * a) no_prr_addback is off.
4745                          * b) we are not app limited
4746                          * c) we are doing prr
4747                          * <and>
4748                          * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
4749                          */
4750                         tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
4751                                             rack->r_ctl.rc_prr_sndcnt);
4752                 }
4753                 rack->r_ctl.rc_prr_sndcnt = 0;
4754                 rack_log_to_prr(rack, 1, 0, __LINE__);
4755         }
4756         rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
4757         tp->snd_recover = tp->snd_una;
4758         if (rack->r_ctl.dsack_persist) {
4759                 rack->r_ctl.dsack_persist--;    /* num_dsack is cleared below once this count reaches zero */
4760                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
4761                         rack->r_ctl.num_dsack = 0;
4762                 }
4763                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
4764         }
4765         EXIT_RECOVERY(tp->t_flags);
4766 }
4767
4768 static void
4769 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
4770 {       /* Apply the local state changes for a congestion signal of the given type, then pass it to the CC module (except CC_RTO) */
4771         struct tcp_rack *rack;
4772         uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
4773
4774         INP_WLOCK_ASSERT(tp->t_inpcb);
4775 #ifdef STATS
4776         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
4777 #endif
4778         if (IN_RECOVERY(tp->t_flags) == 0) {    /* ssthresh/cwnd are snapshotted only when not already in recovery */
4779                 in_rec_at_entry = 0;
4780                 ssthresh_enter = tp->snd_ssthresh;
4781                 cwnd_enter = tp->snd_cwnd;
4782         } else
4783                 in_rec_at_entry = 1;
4784         rack = (struct tcp_rack *)tp->t_fb_ptr;
4785         switch (type) {
4786         case CC_NDUPACK:
4787                 tp->t_flags &= ~TF_WASFRECOVERY;
4788                 tp->t_flags &= ~TF_WASCRECOVERY;
4789                 if (!IN_FASTRECOVERY(tp->t_flags)) {
4790                         rack->r_ctl.rc_prr_delivered = 0;
4791                         rack->r_ctl.rc_prr_out = 0;
4792                         if (rack->rack_no_prr == 0) {
4793                                 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
4794                                 rack_log_to_prr(rack, 2, in_rec_at_entry, line);
4795                         }
4796                         rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
4797                         tp->snd_recover = tp->snd_max;
4798                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4799                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4800                 }
4801                 break;
4802         case CC_ECN:
4803                 if (!IN_CONGRECOVERY(tp->t_flags) ||
4804                     /*
4805                      * Allow ECN reaction on ACK to CWR, if
4806                      * that data segment was also CE marked.
4807                      */
4808                     SEQ_GEQ(ack, tp->snd_recover)) {
4809                         EXIT_CONGRECOVERY(tp->t_flags);
4810                         KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
4811                         tp->snd_recover = tp->snd_max + 1;
4812                         if (tp->t_flags2 & TF2_ECN_PERMIT)
4813                                 tp->t_flags2 |= TF2_ECN_SND_CWR;
4814                 }
4815                 break;
4816         case CC_RTO:
4817                 tp->t_dupacks = 0;
4818                 tp->t_bytes_acked = 0;
4819                 EXIT_RECOVERY(tp->t_flags);
4820                 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
4821                     ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
4822                 orig_cwnd = tp->snd_cwnd;
4823                 tp->snd_cwnd = ctf_fixed_maxseg(tp);
4824                 rack_log_to_prr(rack, 16, orig_cwnd, line);
4825                 if (tp->t_flags2 & TF2_ECN_PERMIT)
4826                         tp->t_flags2 |= TF2_ECN_SND_CWR;
4827                 break;
4828         case CC_RTO_ERR:
4829                 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
4830                 /* RTO was unnecessary, so restore the saved *_prev values. */
4831                 tp->snd_cwnd = tp->snd_cwnd_prev;
4832                 tp->snd_ssthresh = tp->snd_ssthresh_prev;
4833                 tp->snd_recover = tp->snd_recover_prev;
4834                 if (tp->t_flags & TF_WASFRECOVERY) {
4835                         ENTER_FASTRECOVERY(tp->t_flags);
4836                         tp->t_flags &= ~TF_WASFRECOVERY;
4837                 }
4838                 if (tp->t_flags & TF_WASCRECOVERY) {
4839                         ENTER_CONGRECOVERY(tp->t_flags);
4840                         tp->t_flags &= ~TF_WASCRECOVERY;
4841                 }
4842                 tp->snd_nxt = tp->snd_max;
4843                 tp->t_badrxtwin = 0;
4844                 break;
4845         }
4846         if ((CC_ALGO(tp)->cong_signal != NULL)  &&
4847             (type != CC_RTO)){  /* CC_RTO was handled locally above and is not passed to the CC module */
4848                 tp->ccv->curack = ack;
4849                 CC_ALGO(tp)->cong_signal(tp->ccv, type);
4850         }
4851         if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {       /* this signal moved us into recovery: record the entry values */
4852                 rack_log_to_prr(rack, 15, cwnd_enter, line);
4853                 rack->r_ctl.dsack_byte_cnt = 0;
4854                 rack->r_ctl.retran_during_recovery = 0;
4855                 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
4856                 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
4857                 rack->r_ent_rec_ns = 1;
4858         }
4859 }
4860
4861 static inline void
4862 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
4863 {
4864         uint32_t floor_cwnd;    /* smallest cwnd we accept after idle */
4865
4866         INP_WLOCK_ASSERT(tp->t_inpcb);
4867
4868 #ifdef NETFLIX_STATS
4869         KMOD_TCPSTAT_INC(tcps_idle_restarts);
4870         if (tp->t_state == TCPS_ESTABLISHED)
4871                 KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
4872 #endif
4873         /* Let the CC module react to the idle period, if it cares to. */
4874         if (CC_ALGO(tp)->after_idle != NULL)
4875                 CC_ALGO(tp)->after_idle(tp->ccv);
4876
4877         /* A cwnd of exactly 1 means the SYN(-ACK) was lost: use t_maxseg. */
4878         floor_cwnd = (tp->snd_cwnd == 1) ? tp->t_maxseg : rc_init_window(rack);
4879
4880         /*
4881          * Coming back from idle is treated like starting with the
4882          * initial window: should the cc have clamped cwnd below that
4883          * floor, lift it back up to the floor.
4884          *
4885          */
4886         if (tp->snd_cwnd < floor_cwnd) {
4887                 tp->snd_cwnd = floor_cwnd;
4888         }
4889 }
4890
4891 /*
4892  * Indicate whether this ack should be delayed (tp is evaluated more than
4893  * once, so pass an expression without side effects).  We can delay if:
4894  *      - There is no delayed ack timer in progress.
4895  *      - Our last ack wasn't a 0-sized window. We never want to delay
4896  *        the ack that opens up a 0-sized window.
4897  *      - LRO wasn't used for this segment. We make sure by checking that the
4898  *        segment size is not larger than the MSS.
4899  *      - Delayed acks are enabled or this is a half-synchronized T/TCP
4900  *        connection.
4901  */
4902 #define DELAY_ACK(tp, tlen)                      \
4903         ((((tp)->t_flags & TF_RXWIN0SENT) == 0) && \
4904         (((tp)->t_flags & TF_DELACK) == 0) &&      \
4905         ((tlen) <= (tp)->t_maxseg) &&                \
4906         ((tp)->t_delayed_ack || ((tp)->t_flags & TF_NEEDSYN)))
4907
4908 static struct rack_sendmap *
4909 rack_find_lowest_rsm(struct tcp_rack *rack)
4910 {
4911         struct rack_sendmap *cur;
4912
4913         /*
4914          * Scan the transmit-time ordered list from its head and stop at
4915          * the first entry that has not been acked, i.e. the oldest send
4916          * still outstanding. If every entry is acked cur ends up NULL.
4917          */
4918         TAILQ_FOREACH(cur, &rack->r_ctl.rc_tmap, r_tnext) {
4919                 if ((cur->r_flags & RACK_ACKED) == 0) {
4920                         /* Found it, stop the walk here. */
4921                         break;
4922                 }
4923         }
4924         /* Either the first un-acked entry or NULL. */
4925         return (cur);
4926 }
4927
4928 static struct rack_sendmap *
4929 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
4930 {
4931         struct rack_sendmap *cur = rsm;
4932
4933         /*
4934          * Step backward through the sequence-ordered tree, starting at
4935          * rsm, and hand back the first entry that is neither acked nor
4936          * carrying the FIN. NULL means no such entry was found.
4937          */
4938         RB_FOREACH_REVERSE_FROM(cur, rack_rb_tree_head, rsm) {
4939                 /* Acked entries and the FIN are passed over. */
4940                 if ((cur->r_flags & (RACK_ACKED | RACK_HAS_FIN)) == 0) {
4941                         return (cur);
4942                 }
4943         }
4944         /* Ran off the low end without finding a candidate. */
4945         return (NULL);
4946 }
4947
4948 static uint32_t
4949 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
4950 {       /* Compute the rack threshold from srtt plus reordering allowances; the result is capped at 2 * srtt and rack_rto_max */
4951         int32_t lro;
4952         uint32_t thresh;
4953
4954         /*
4955          * lro is the flag we use to determine if we have seen reordering.
4956          * If it gets set we have seen reordering. The reorder logic
4957          * works in one of two ways:
4958          *
4959          * If reorder-fade is configured, then we track the last time we saw
4960          * re-ordering occur. If we reach the point where enough time has
4961          * passed we no longer consider reordering to be occurring.
4962          *
4963          * Or if reorder-fade is 0, then once we see reordering we consider
4964          * the connection to always be subject to reordering and just set lro
4965          * to 1.
4966          *
4967          * In the end if lro is non-zero (and the timer is not standards
4968          * based) we add the extra time for reordering in.
4969          */
4970         if (srtt == 0)
4971                 srtt = 1;
4972         if (rack->r_ctl.rc_reorder_ts) {
4973                 if (rack->r_ctl.rc_reorder_fade) {
4974                         if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
4975                                 lro = cts - rack->r_ctl.rc_reorder_ts;  /* elapsed since the last reorder was recorded */
4976                                 if (lro == 0) {
4977                                         /*
4978                                          * No time has passed since the last
4979                                          * reorder, mark it as reordering.
4980                                          */
4981                                         lro = 1;
4982                                 }
4983                         } else {
4984                                 /* Negative time? */
4985                                 lro = 0;
4986                         }
4987                         if (lro > rack->r_ctl.rc_reorder_fade) {
4988                                 /* Turn off reordering seen too */
4989                                 rack->r_ctl.rc_reorder_ts = 0;
4990                                 lro = 0;
4991                         }
4992                 } else {
4993                         /* Reordering does not fade */
4994                         lro = 1;
4995                 }
4996         } else {
4997                 lro = 0;
4998         }
4999         if (rack->rc_rack_tmr_std_based == 0) {
5000                 thresh = srtt + rack->r_ctl.rc_pkt_delay;
5001         } else {
5002                 /* Standards based pkt-delay is 1/4 srtt */
5003                 thresh = srtt +  (srtt >> 2);
5004         }
5005         if (lro && (rack->rc_rack_tmr_std_based == 0)) {
5006                 /* The shift must be set; if it is not, you get 1/4 srtt */
5007                 if (rack->r_ctl.rc_reorder_shift)
5008                         thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
5009                 else
5010                         thresh += (srtt >> 2);
5011         }
5012         if (rack->rc_rack_use_dsack &&
5013             lro &&
5014             (rack->r_ctl.num_dsack > 0)) {
5015                 /*
5016                  * We only increase the reordering window if we
5017                  * have seen reordering <and> we have a DSACK count.
5018                  */
5019                 thresh += rack->r_ctl.num_dsack * (srtt >> 2);
5020                 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
5021         }
5022         /* SRTT * 2 is the ceiling */
5023         if (thresh > (srtt * 2)) {
5024                 thresh = srtt * 2;
5025         }
5026         /* And we don't want it above the RTO max either */
5027         if (thresh > rack_rto_max) {
5028                 thresh = rack_rto_max;
5029         }
5030         rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
5031         return (thresh);
5032 }
5033
5034 static uint32_t
5035 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
5036                      struct rack_sendmap *rsm, uint32_t srtt)
5037 {       /* Compute the TLP threshold for rsm from srtt: capped at t_rxtcur and rack_rto_max, then raised to at least rack_tlp_min */
5038         struct rack_sendmap *prsm;
5039         uint32_t thresh, len;
5040         int segsiz;
5041
5042         if (srtt == 0)
5043                 srtt = 1;
5044         if (rack->r_ctl.rc_tlp_threshold)
5045                 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
5046         else
5047                 thresh = (srtt * 2);
5048
5049         /* Segment size and length of this rsm, used by the modes below */
5050         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
5051         len = rsm->r_end - rsm->r_start;
5052         if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
5053                 /* Exactly like the ID */
5054                 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
5055                         uint32_t alt_thresh;
5056                         /*
5057                          * Compensate for delayed-ack with the d-ack time.
5058                          */
5059                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5060                         if (alt_thresh > thresh)
5061                                 thresh = alt_thresh;
5062                 }
5063         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
5064                 /* 2.1 behavior */
5065                 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
5066                 if (prsm && (len <= segsiz)) {
5067                         /*
5068                          * Two packets outstanding, thresh should be (2*srtt) +
5069                          * possible inter-packet delay (if any).
5070                          */
5071                         uint32_t inter_gap = 0;
5072                         int idx, nidx;
5073
5074                         idx = rsm->r_rtr_cnt - 1;       /* slot of rsm's send time (presumably its latest send) */
5075                         nidx = prsm->r_rtr_cnt - 1;     /* same slot for prsm */
5076                         if (rsm->r_tim_lastsent[idx] >= prsm->r_tim_lastsent[nidx]) {
5077                                 /* Yes it was sent later (or at the same time); same operands as the subtraction so it cannot wrap */
5078                                 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
5079                         }
5080                         thresh += inter_gap;
5081                 } else if (len <= segsiz) {
5082                         /*
5083                          * Possibly compensate for delayed-ack.
5084                          */
5085                         uint32_t alt_thresh;
5086
5087                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5088                         if (alt_thresh > thresh)
5089                                 thresh = alt_thresh;
5090                 }
5091         } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
5092                 /* 2.2 behavior */
5093                 if (len <= segsiz) {
5094                         uint32_t alt_thresh;
5095                         /*
5096                          * Compensate for delayed-ack with the d-ack time.
5097                          */
5098                         alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5099                         if (alt_thresh > thresh)
5100                                 thresh = alt_thresh;
5101                 }
5102         }
5103         /* Not above an RTO */
5104         if (thresh > tp->t_rxtcur) {
5105                 thresh = tp->t_rxtcur;
5106         }
5107         /* Not above a RTO max */
5108         if (thresh > rack_rto_max) {
5109                 thresh = rack_rto_max;
5110         }
5111         /* Apply user supplied min TLP */
5112         if (thresh < rack_tlp_min) {
5113                 thresh = rack_tlp_min;
5114         }
5115         return (thresh);
5116 }
5117
5118 static uint32_t
5119 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
5120 {
5121         /*
5122          * Order of preference for the rtt we hand back:
5123          *  1) rc_rack_rtt, the last rtt rack measured,
5124          *  2) the srtt, when one has been set (a
5125          *     fallback we will probably never need),
5126          *  3) RACK_INITIAL_RTO as the last resort when
5127          *     neither of the above is available.
5128          */
5129         if (rack->rc_rack_rtt != 0)
5130                 return (rack->rc_rack_rtt);
5131         if (tp->t_srtt != 0)
5132                 return (tp->t_srtt);
5133         return (RACK_INITIAL_RTO);
5134 }
5135
5136 static struct rack_sendmap *
5137 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
5138 {
5139         /*
5140          * Check whether we need to fall into recovery: if the oldest un-acked
5141          * transmit is at least the rack threshold old at time tsused, signal
5142          * CC_NDUPACK and return that rsm, otherwise return NULL.
5143          */
5144         struct tcp_rack *rack;
5145         struct rack_sendmap *rsm;
5146         int32_t idx;
5147         uint32_t srtt, thresh;
5148
5149         rack = (struct tcp_rack *)tp->t_fb_ptr;
5150         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
5151                 return (NULL);
5152         }
5153         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5154         if (rsm == NULL)
5155                 return (NULL);
5156
5157
5158         if (rsm->r_flags & RACK_ACKED) {        /* head entry already acked: look for the first un-acked one */
5159                 rsm = rack_find_lowest_rsm(rack);
5160                 if (rsm == NULL)
5161                         return (NULL);
5162         }
5163         idx = rsm->r_rtr_cnt - 1;
5164         srtt = rack_grab_rtt(tp, rack);
5165         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
5166         if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {   /* tsused is before the recorded send time */
5167                 return (NULL);
5168         }
5169         if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { /* less than thresh has elapsed since that send */
5170                 return (NULL);
5171         }
5172         /* Ok if we reach here we are over-due and this rsm can be sent */
5173         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
5174         return (rsm);
5175 }
5176
5177 static uint32_t
5178 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
5179 {
5180         int32_t t;
5181         int32_t tt;
5182         uint32_t ret_val;
5183
5184         t = (tp->t_srtt + (tp->t_rttvar << 2));
5185         RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
5186             rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
5187         rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
5188         ret_val = (uint32_t)tt;
5189         return (ret_val);
5190 }
5191
/*
 * rack_timer_start() - pick which loss-detection timer to arm and for how long.
 *
 * Chooses between the persist, retransmit (RXT), RACK and tail-loss-probe
 * (TLP) timers, records the choice by or-ing the matching PACE_TMR_* bit
 * into rack->r_ctl.rc_hpts_flags, and returns the timeout to use.
 *
 * tp        - the TCP control block of the connection.
 * rack      - the rack state of that connection.
 * cts       - the caller's current timestamp; it is compared against the
 *             r_tim_lastsent[] send times (presumably microseconds, since
 *             the limits used here go through TICKS_2_USEC() - confirm).
 * sup_rack  - when non-zero the RACK/TLP selection is skipped and only
 *             the RXT path is considered.
 *
 * Returns 0 when no timer is to be started: either timers are stopped, or
 * the RXT path is taken with no unacknowledged data and nothing available
 * in the send buffer. Otherwise returns the timeout value.
 */
5192 static uint32_t
5193 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
5194 {
5195         /*
5196          * Start the FR timer, we do this based on getting the first one in
5197          * the rc_tmap. Note that if it's NULL we must stop the timer. In all
5198          * events we need to stop the running timer (if it's running) before
5199          * starting the new one.
5200          */
5201         uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
5202         uint32_t srtt_cur;
5203         int32_t idx;
5204         int32_t is_tlp_timer = 0;
5205         struct rack_sendmap *rsm;
5206
5207         if (rack->t_timers_stopped) {
5208                 /* All timers have been stopped none are to run */
5209                 return (0);
5210         }
5211         if (rack->rc_in_persist) {
5212                 /* We can't start any timer in persists */
5213                 return (rack_get_persists_timer_val(tp, rack));
5214         }
5215         rack->rc_on_min_to = 0;
             /* Before ESTABLISHED, or without SACK, only the RXT timer is used. */
5216         if ((tp->t_state < TCPS_ESTABLISHED) ||
5217             ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
5218                 goto activate_rxt;
5219         }
5220         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5221         if ((rsm == NULL) || sup_rack) {
5222                 /* Nothing on the send map or no rack */
                     /*
                      * This label is also the target of every "fall back to
                      * RXT" goto further down; rsm is re-fetched right below,
                      * so whatever value it had at the jump is discarded.
                      */
5223 activate_rxt:
5224                 time_since_sent = 0;
5225                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5226                 if (rsm) {
5227                         /*
5228                          * Should we discount the RTX timer any?
5229                          *
5230                          * We want to discount it the smallest amount.
5231                          * If a timer (Rack/TLP or RXT) has gone off more
5232                          * recently that's the discount we want to use (now - timer time).
5233                          * If the retransmit of the oldest packet was more recent then
5234                          * we want to use that (now - oldest-packet-last_transmit_time).
5235                          *
5236                          */
5237                         idx = rsm->r_rtr_cnt - 1;
5238                         if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
5239                                 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5240                         else
5241                                 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5242                         if (TSTMP_GT(cts, tstmp_touse))
5243                             time_since_sent = cts - tstmp_touse;
5244                 }
                     /* Only arm RXT if data is unacked or waiting in the send buffer. */
5245                 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
5246                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
5247                         to = tp->t_rxtcur;
                             /* Discount the time already elapsed, never going below rc_min_to. */
5248                         if (to > time_since_sent)
5249                                 to -= time_since_sent;
5250                         else
5251                                 to = rack->r_ctl.rc_min_to;
5252                         if (to == 0)
5253                                 to = 1;
5254                         /* Special case for KEEPINIT */
5255                         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
5256                             (TP_KEEPINIT(tp) != 0) &&
5257                             rsm) {
5258                                 /*
5259                                  * We have to put a ceiling on the rxt timer
5260                                  * of the keep-init timeout.
5261                                  */
5262                                 uint32_t max_time, red;
5263
5264                                 max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
                                     /* Subtract the time since the entry's first transmission. */
5265                                 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
5266                                         red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
5267                                         if (red < max_time)
5268                                                 max_time -= red;
5269                                         else
5270                                                 max_time = 1;
5271                                 }
5272                                 /* Reduce timeout to the keep value if needed */
5273                                 if (max_time < to)
5274                                         to = max_time;
5275                         }
5276                         return (to);
5277                 }
5278                 return (0);
5279         }
5280         if (rsm->r_flags & RACK_ACKED) {
5281                 rsm = rack_find_lowest_rsm(rack);
5282                 if (rsm == NULL) {
5283                         /* No lowest? */
5284                         goto activate_rxt;
5285                 }
5286         }
5287         if (rack->sack_attack_disable) {
5288                 /*
5289                  * We don't want to do
5290                  * any TLP's if you are an attacker.
5291                  * Though if you are doing what
5292                  * is expected you may still have
5293                  * SACK-PASSED marks.
5294                  */
5295                 goto activate_rxt;
5296         }
5297         /* SACK-passed or enough dup-acks selects the RACK timer, otherwise TLP */
5298         if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
5299                 if ((tp->t_flags & TF_SENTFIN) &&
5300                     ((tp->snd_max - tp->snd_una) == 1) &&
5301                     (rsm->r_flags & RACK_HAS_FIN)) {
5302                         /*
5303                          * We don't start a rack timer if all we have is a
5304                          * FIN outstanding.
5305                          */
5306                         goto activate_rxt;
5307                 }
5308                 if ((rack->use_rack_rr == 0) &&
5309                     (IN_FASTRECOVERY(tp->t_flags)) &&
5310                     (rack->rack_no_prr == 0) &&
5311                      (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
5312                         /*
5313                          * We are not cheating, in recovery  and
5314                          * not enough ack's to yet get our next
5315                          * retransmission out.
5316                          *
5317                          * Note that classified attackers do not
5318                          * get to use the rack-cheat.
5319                          */
5320                         goto activate_tlp;
5321                 }
5322                 srtt = rack_grab_rtt(tp, rack);
5323                 thresh = rack_calc_thresh_rack(rack, srtt, cts);
5324                 idx = rsm->r_rtr_cnt - 1;
                     /* exp is the time the rack threshold expires for the last send. */
5325                 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
                     /*
                      * NOTE(review): SEQ_GEQ is applied to timestamps here while
                      * the rest of the function uses TSTMP_* - presumably the
                      * two are equivalent wrap-safe compares; confirm.
                      */
5326                 if (SEQ_GEQ(exp, cts)) {
5327                         to = exp - cts;
5328                         if (to < rack->r_ctl.rc_min_to) {
5329                                 to = rack->r_ctl.rc_min_to;
5330                                 if (rack->r_rr_config == 3)
5331                                         rack->rc_on_min_to = 1;
5332                         }
5333                 } else {
                             /* Already past the expiry, use the minimum timeout. */
5334                         to = rack->r_ctl.rc_min_to;
5335                         if (rack->r_rr_config == 3)
5336                                 rack->rc_on_min_to = 1;
5337                 }
5338         } else {
5339                 /* Ok we need to do a TLP not RACK */
5340 activate_tlp:
5341                 if ((rack->rc_tlp_in_progress != 0) &&
5342                     (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
5343                         /*
5344                          * The previous send was a TLP and we have sent
5345                          * N TLP's without sending new data.
5346                          */
5347                         goto activate_rxt;
5348                 }
                     /* The TLP is timed from the entry at the tail of rc_tmap. */
5349                 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
5350                 if (rsm == NULL) {
5351                         /* We found no rsm to TLP with. */
5352                         goto activate_rxt;
5353                 }
5354                 if (rsm->r_flags & RACK_HAS_FIN) {
5355                         /* If it's a FIN we don't do TLP */
                             /* (activate_rxt re-fetches rsm, so this reset is not observed.) */
5356                         rsm = NULL;
5357                         goto activate_rxt;
5358                 }
5359                 idx = rsm->r_rtr_cnt - 1;
5360                 time_since_sent = 0;
                     /* Measure from the later of the last send and the last TLP/RXT time. */
5361                 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
5362                         tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
5363                 else
5364                         tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
5365                 if (TSTMP_GT(cts, tstmp_touse))
5366                     time_since_sent = cts - tstmp_touse;
5367                 is_tlp_timer = 1;
5368                 if (tp->t_srtt) {
5369                         if ((rack->rc_srtt_measure_made == 0) &&
5370                             (tp->t_srtt == 1)) {
5371                                 /*
5372                                  * If another stack has run and set srtt to 1,
5373                                  * then the srtt was 0, so lets use the initial.
5374                                  */
5375                                 srtt = RACK_INITIAL_RTO;
5376                         } else {
5377                                 srtt_cur = tp->t_srtt;
5378                                 srtt = srtt_cur;
5379                         }
5380                 } else
5381                         srtt = RACK_INITIAL_RTO;
5382                 /*
5383                  * If the SRTT is not keeping up and the
5384                  * rack RTT has spiked we want to use
5385                  * the last RTT not the smoothed one.
5386                  */
5387                 if (rack_tlp_use_greater &&
5388                     tp->t_srtt &&
5389                     (srtt < rack_grab_rtt(tp, rack))) {
5390                         srtt = rack_grab_rtt(tp, rack);
5391                 }
5392                 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
5393                 if (thresh > time_since_sent) {
5394                         to = thresh - time_since_sent;
5395                 } else {
                             /* Threshold already elapsed: use the minimum and log it. */
5396                         to = rack->r_ctl.rc_min_to;
5397                         rack_log_alt_to_to_cancel(rack,
5398                                                   thresh,               /* flex1 */
5399                                                   time_since_sent,      /* flex2 */
5400                                                   tstmp_touse,          /* flex3 */
5401                                                   rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
5402                                                   (uint32_t)rsm->r_tim_lastsent[idx],
5403                                                   srtt,
5404                                                   idx, 99);
5405                 }
5406                 if (to < rack_tlp_min) {
5407                         to = rack_tlp_min;
5408                 }
5409                 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
5410                         /*
5411                          * If the TLP time works out to larger than the max
5412                          * RTO lets not do TLP.. just RTO.
5413                          */
5414                         goto activate_rxt;
5415                 }
5416         }
             /* Record which of the two timers (RACK or TLP) was selected. */
5417         if (is_tlp_timer == 0) {
5418                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
5419         } else {
5420                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
5421         }
5422         if (to == 0)
5423                 to = 1;
5424         return (to);
5425 }
5426
5427 static void
5428 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5429 {
5430         if (rack->rc_in_persist == 0) {
5431                 if (tp->t_flags & TF_GPUTINPROG) {
5432                         /*
5433                          * Stop the goodput now, the calling of the
5434                          * measurement function clears the flag.
5435                          */
5436                         rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
5437                                                     RACK_QUALITY_PERSIST);
5438                 }
5439 #ifdef NETFLIX_SHARED_CWND
5440                 if (rack->r_ctl.rc_scw) {
5441                         tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5442                         rack->rack_scwnd_is_idle = 1;
5443                 }
5444 #endif
5445                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
5446                 if (rack->r_ctl.rc_went_idle_time == 0)
5447                         rack->r_ctl.rc_went_idle_time = 1;
5448                 rack_timer_cancel(tp, rack, cts, __LINE__);
5449                 rack->r_ctl.persist_lost_ends = 0;
5450                 rack->probe_not_answered = 0;
5451                 rack->forced_ack = 0;
5452                 tp->t_rxtshift = 0;
5453                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5454                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5455                 rack->rc_in_persist = 1;
5456         }
5457 }
5458
5459 static void
5460 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5461 {
5462         if (tcp_in_hpts(rack->rc_inp)) {
5463                 tcp_hpts_remove(rack->rc_inp);
5464                 rack->r_ctl.rc_hpts_flags = 0;
5465         }
5466 #ifdef NETFLIX_SHARED_CWND
5467         if (rack->r_ctl.rc_scw) {
5468                 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
5469                 rack->rack_scwnd_is_idle = 0;
5470         }
5471 #endif
5472         if (rack->rc_gp_dyn_mul &&
5473             (rack->use_fixed_rate == 0) &&
5474             (rack->rc_always_pace)) {
5475                 /*
5476                  * Do we count this as if a probe-rtt just
5477                  * finished?
5478                  */
5479                 uint32_t time_idle, idle_min;
5480
5481                 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
5482                 idle_min = rack_min_probertt_hold;
5483                 if (rack_probertt_gpsrtt_cnt_div) {
5484                         uint64_t extra;
5485                         extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
5486                                 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
5487                         extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
5488                         idle_min += (uint32_t)extra;
5489                 }
5490                 if (time_idle >= idle_min) {
5491                         /* Yes, we count it as a probe-rtt. */
5492                         uint32_t us_cts;
5493
5494                         us_cts = tcp_get_usecs(NULL);
5495                         if (rack->in_probe_rtt == 0) {
5496                                 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
5497                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
5498                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
5499                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
5500                         } else {
5501                                 rack_exit_probertt(rack, us_cts);
5502                         }
5503                 }
5504         }
5505         rack->rc_in_persist = 0;
5506         rack->r_ctl.rc_went_idle_time = 0;
5507         tp->t_rxtshift = 0;
5508         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
5509            rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
5510         rack->r_ctl.rc_agg_delayed = 0;
5511         rack->r_early = 0;
5512         rack->r_late = 0;
5513         rack->r_ctl.rc_agg_early = 0;
5514 }
5515
5516 static void
5517 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
5518                    struct hpts_diag *diag, struct timeval *tv)
5519 {
5520         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5521                 union tcp_log_stackspecific log;
5522
5523                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5524                 log.u_bbr.flex1 = diag->p_nxt_slot;
5525                 log.u_bbr.flex2 = diag->p_cur_slot;
5526                 log.u_bbr.flex3 = diag->slot_req;
5527                 log.u_bbr.flex4 = diag->inp_hptsslot;
5528                 log.u_bbr.flex5 = diag->slot_remaining;
5529                 log.u_bbr.flex6 = diag->need_new_to;
5530                 log.u_bbr.flex7 = diag->p_hpts_active;
5531                 log.u_bbr.flex8 = diag->p_on_min_sleep;
5532                 /* Hijack other fields as needed */
5533                 log.u_bbr.epoch = diag->have_slept;
5534                 log.u_bbr.lt_epoch = diag->yet_to_sleep;
5535                 log.u_bbr.pkts_out = diag->co_ret;
5536                 log.u_bbr.applimited = diag->hpts_sleep_time;
5537                 log.u_bbr.delivered = diag->p_prev_slot;
5538                 log.u_bbr.inflight = diag->p_runningslot;
5539                 log.u_bbr.bw_inuse = diag->wheel_slot;
5540                 log.u_bbr.rttProp = diag->wheel_cts;
5541                 log.u_bbr.timeStamp = cts;
5542                 log.u_bbr.delRate = diag->maxslots;
5543                 log.u_bbr.cur_del_rate = diag->p_curtick;
5544                 log.u_bbr.cur_del_rate <<= 32;
5545                 log.u_bbr.cur_del_rate |= diag->p_lasttick;
5546                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5547                     &rack->rc_inp->inp_socket->so_rcv,
5548                     &rack->rc_inp->inp_socket->so_snd,
5549                     BBR_LOG_HPTSDIAG, 0,
5550                     0, &log, false, tv);
5551         }
5552
5553 }
5554
5555 static void
5556 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
5557 {
5558         if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
5559                 union tcp_log_stackspecific log;
5560                 struct timeval tv;
5561
5562                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
5563                 log.u_bbr.flex1 = sb->sb_flags;
5564                 log.u_bbr.flex2 = len;
5565                 log.u_bbr.flex3 = sb->sb_state;
5566                 log.u_bbr.flex8 = type;
5567                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5568                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
5569                     &rack->rc_inp->inp_socket->so_rcv,
5570                     &rack->rc_inp->inp_socket->so_snd,
5571                     TCP_LOG_SB_WAKE, 0,
5572                     len, &log, false, &tv);
5573         }
5574 }
5575
5576 static void
5577 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
5578       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
5579 {
5580         struct hpts_diag diag;
5581         struct inpcb *inp;
5582         struct timeval tv;
5583         uint32_t delayed_ack = 0;
5584         uint32_t hpts_timeout;
5585         uint32_t entry_slot = slot;
5586         uint8_t stopped;
5587         uint32_t left = 0;
5588         uint32_t us_cts;
5589
5590         inp = tp->t_inpcb;
5591         if ((tp->t_state == TCPS_CLOSED) ||
5592             (tp->t_state == TCPS_LISTEN)) {
5593                 return;
5594         }
5595         if (tcp_in_hpts(inp)) {
5596                 /* Already on the pacer */
5597                 return;
5598         }
5599         stopped = rack->rc_tmr_stopped;
5600         if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
5601                 left = rack->r_ctl.rc_timer_exp - cts;
5602         }
5603         rack->r_ctl.rc_timer_exp = 0;
5604         rack->r_ctl.rc_hpts_flags = 0;
5605         us_cts = tcp_get_usecs(&tv);
5606         /* Now early/late accounting */
5607         rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
5608         if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
5609                 /*
5610                  * We have a early carry over set,
5611                  * we can always add more time so we
5612                  * can always make this compensation.
5613                  *
5614                  * Note if ack's are allowed to wake us do not
5615                  * penalize the next timer for being awoke
5616                  * by an ack aka the rc_agg_early (non-paced mode).
5617                  */
5618                 slot += rack->r_ctl.rc_agg_early;
5619                 rack->r_early = 0;
5620                 rack->r_ctl.rc_agg_early = 0;
5621         }
5622         if (rack->r_late) {
5623                 /*
5624                  * This is harder, we can
5625                  * compensate some but it
5626                  * really depends on what
5627                  * the current pacing time is.
5628                  */
5629                 if (rack->r_ctl.rc_agg_delayed >= slot) {
5630                         /*
5631                          * We can't compensate for it all.
5632                          * And we have to have some time
5633                          * on the clock. We always have a min
5634                          * 10 slots (10 x 10 i.e. 100 usecs).
5635                          */
5636                         if (slot <= HPTS_TICKS_PER_SLOT) {
5637                                 /* We gain delay */
5638                                 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
5639                                 slot = HPTS_TICKS_PER_SLOT;
5640                         } else {
5641                                 /* We take off some */
5642                                 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
5643                                 slot = HPTS_TICKS_PER_SLOT;
5644                         }
5645                 } else {
5646                         slot -= rack->r_ctl.rc_agg_delayed;
5647                         rack->r_ctl.rc_agg_delayed = 0;
5648                         /* Make sure we have 100 useconds at minimum */
5649                         if (slot < HPTS_TICKS_PER_SLOT) {
5650                                 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
5651                                 slot = HPTS_TICKS_PER_SLOT;
5652                         }
5653                         if (rack->r_ctl.rc_agg_delayed == 0)
5654                                 rack->r_late = 0;
5655                 }
5656         }
5657         if (slot) {
5658                 /* We are pacing too */
5659                 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
5660         }
5661         hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
5662 #ifdef NETFLIX_EXP_DETECTION
5663         if (rack->sack_attack_disable &&
5664             (slot < tcp_sad_pacing_interval)) {
5665                 /*
5666                  * We have a potential attacker on
5667                  * the line. We have possibly some
5668                  * (or now) pacing time set. We want to
5669                  * slow down the processing of sacks by some
5670                  * amount (if it is an attacker). Set the default
5671                  * slot for attackers in place (unless the orginal
5672                  * interval is longer). Its stored in
5673                  * micro-seconds, so lets convert to msecs.
5674                  */
5675                 slot = tcp_sad_pacing_interval;
5676         }
5677 #endif
5678         if (tp->t_flags & TF_DELACK) {
5679                 delayed_ack = TICKS_2_USEC(tcp_delacktime);
5680                 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
5681         }
5682         if (delayed_ack && ((hpts_timeout == 0) ||
5683                             (delayed_ack < hpts_timeout)))
5684                 hpts_timeout = delayed_ack;
5685         else
5686                 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
5687         /*
5688          * If no timers are going to run and we will fall off the hptsi
5689          * wheel, we resort to a keep-alive timer if its configured.
5690          */
5691         if ((hpts_timeout == 0) &&
5692             (slot == 0)) {
5693                 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
5694                     (tp->t_state <= TCPS_CLOSING)) {
5695                         /*
5696                          * Ok we have no timer (persists, rack, tlp, rxt  or
5697                          * del-ack), we don't have segments being paced. So
5698                          * all that is left is the keepalive timer.
5699                          */
5700                         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
5701                                 /* Get the established keep-alive time */
5702                                 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
5703                         } else {
5704                                 /*
5705                                  * Get the initial setup keep-alive time,
5706                                  * note that this is probably not going to
5707                                  * happen, since rack will be running a rxt timer
5708                                  * if a SYN of some sort is outstanding. It is
5709                                  * actually handled in rack_timeout_rxt().
5710                                  */
5711                                 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
5712                         }
5713                         rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
5714                         if (rack->in_probe_rtt) {
5715                                 /*
5716                                  * We want to instead not wake up a long time from
5717                                  * now but to wake up about the time we would
5718                                  * exit probe-rtt and initiate a keep-alive ack.
5719                                  * This will get us out of probe-rtt and update
5720                                  * our min-rtt.
5721                                  */
5722                                 hpts_timeout = rack_min_probertt_hold;
5723                         }
5724                 }
5725         }
5726         if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
5727             (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
5728                 /*
5729                  * RACK, TLP, persists and RXT timers all are restartable
5730                  * based on actions input .. i.e we received a packet (ack
5731                  * or sack) and that changes things (rw, or snd_una etc).
5732                  * Thus we can restart them with a new value. For
5733                  * keep-alive, delayed_ack we keep track of what was left
5734                  * and restart the timer with a smaller value.
5735                  */
5736                 if (left < hpts_timeout)
5737                         hpts_timeout = left;
5738         }
5739         if (hpts_timeout) {
5740                 /*
5741                  * Hack alert for now we can't time-out over 2,147,483
5742                  * seconds (a bit more than 596 hours), which is probably ok
5743                  * :).
5744                  */
5745                 if (hpts_timeout > 0x7ffffffe)
5746                         hpts_timeout = 0x7ffffffe;
5747                 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
5748         }
5749         rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
5750         if ((rack->gp_ready == 0) &&
5751             (rack->use_fixed_rate == 0) &&
5752             (hpts_timeout < slot) &&
5753             (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
5754                 /*
5755                  * We have no good estimate yet for the
5756                  * old clunky burst mitigation or the
5757                  * real pacing. And the tlp or rxt is smaller
5758                  * than the pacing calculation. Lets not
5759                  * pace that long since we know the calculation
5760                  * so far is not accurate. 
5761                  */
5762                 slot = hpts_timeout;
5763         }
5764         /**
5765          * Turn off all the flags for queuing by default. The
5766          * flags have important meanings to what happens when
5767          * LRO interacts with the transport. Most likely (by default now)
5768          * mbuf_queueing and ack compression are on. So the transport
5769          * has a couple of flags that control what happens (if those
5770          * are not on then these flags won't have any effect since it
5771          * won't go through the queuing LRO path).
5772          *
5773          * INP_MBUF_QUEUE_READY - This flags says that I am busy
5774          *                        pacing output, so don't disturb. But
5775          *                        it also means LRO can wake me if there
5776          *                        is a SACK arrival.
5777          *
5778          * INP_DONT_SACK_QUEUE - This flag is used in conjunction
5779          *                       with the above flag (QUEUE_READY) and
5780          *                       when present it says don't even wake me
5781          *                       if a SACK arrives.
5782          *
5783          * The idea behind these flags is that if we are pacing we
5784          * set the MBUF_QUEUE_READY and only get woken up if
5785          * a SACK arrives (which could change things) or if
5786          * our pacing timer expires. If, however, we have a rack
5787          * timer running, then we don't even want a sack to wake
5788          * us since the rack timer has to expire before we can send.
5789          *
5790          * Other cases should usually have none of the flags set
5791          * so LRO can call into us.
5792          */
5793         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
5794         if (slot) {
5795                 rack->r_ctl.rc_last_output_to = us_cts + slot;
5796                 /*
5797                  * A pacing timer (slot) is being set, in
5798                  * such a case we cannot send (we are blocked by
5799                  * the timer). So lets tell LRO that it should not
5800                  * wake us unless there is a SACK. Note this only
5801                  * will be effective if mbuf queueing is on or
5802                  * compressed acks are being processed.
5803                  */
5804                 inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
5805                 /*
5806                  * But wait if we have a Rack timer running
5807                  * even a SACK should not disturb us (with
5808                  * the exception of r_rr_config 3).
5809                  */
5810                 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
5811                     (rack->r_rr_config != 3))
5812                         inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
5813                 if (rack->rc_ack_can_sendout_data) {
5814                         /*
5815                          * Ahh but wait, this is that special case
5816                          * where the pacing timer can be disturbed
5817                          * backout the changes (used for non-paced
5818                          * burst limiting).
5819                          */
5820                         inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
5821                 }
5822                 if ((rack->use_rack_rr) &&
5823                     (rack->r_rr_config < 2) &&
5824                     ((hpts_timeout) && (hpts_timeout < slot))) {
5825                         /*
5826                          * Arrange for the hpts to kick back in after the
5827                          * t-o if the t-o does not cause a send.
5828                          */
5829                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
5830                                                    __LINE__, &diag);
5831                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5832                         rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
5833                 } else {
5834                         (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
5835                                                    __LINE__, &diag);
5836                         rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5837                         rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
5838                 }
5839         } else if (hpts_timeout) {
5840                 /*
5841                  * With respect to inp_flags2 here, lets let any new acks wake
5842                  * us up here. Since we are not pacing (no pacing timer), output
5843                  * can happen so we should let it. If its a Rack timer, then any inbound
5844                  * packet probably won't change the sending (we will be blocked)
5845                  * but it may change the prr stats so letting it in (the set defaults
5846                  * at the start of this block) are good enough.
5847                  */
5848                 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
5849                                            __LINE__, &diag);
5850                 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
5851                 rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
5852         } else {
5853                 /* No timer starting */
5854 #ifdef INVARIANTS
5855                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
5856                         panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
5857                             tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
5858                 }
5859 #endif
5860         }
5861         rack->rc_tmr_stopped = 0;
5862         if (slot)
5863                 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
5864 }
5865
5866 /*
5867  * RACK Timer, here we simply do logging and house keeping.
5868  * the normal rack_output() function will call the
5869  * appropriate thing to check if we need to do a RACK retransmit.
5870  * We return 1, saying don't proceed with rack_output only
5871  * when all timers have been stopped (destroyed PCB?).
5872  */
5873 static int
5874 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
5875 {
5876         /*
5877          * This timer simply provides an internal trigger to send out data.
5878          * The check_recovery_mode call will see if there are needed
5879          * retransmissions, if so we will enter fast-recovery. The output
5880          * call may or may not do the same thing depending on sysctl
5881          * settings.
5882          */
5883         struct rack_sendmap *rsm;
5884
5885         if (tp->t_timers->tt_flags & TT_STOPPED) {
5886                 return (1);
5887         }
5888         counter_u64_add(rack_to_tot, 1);
5889         if (rack->r_state && (rack->r_state != tp->t_state))
5890                 rack_set_state(tp, rack);
5891         rack->rc_on_min_to = 0;
5892         rsm = rack_check_recovery_mode(tp, cts);
5893         rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
5894         if (rsm) {
5895                 rack->r_ctl.rc_resend = rsm;
5896                 rack->r_timer_override = 1;
5897                 if (rack->use_rack_rr) {
5898                         /*
5899                          * Don't accumulate extra pacing delay
5900                          * we are allowing the rack timer to
5901                          * over-ride pacing i.e. rrr takes precedence
5902                          * if the pacing interval is longer than the rrr
5903                          * time (in other words we get the min pacing
5904                          * time versus rrr pacing time).
5905                          */
5906                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
5907                 }
5908         }
5909         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
5910         if (rsm == NULL) {
5911                 /* restart a timer and return 1 */
5912                 rack_start_hpts_timer(rack, tp, cts,
5913                                       0, 0, 0);
5914                 return (1);
5915         }
5916         return (0);
5917 }
5918
5919 static void
5920 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
5921 {
5922         if (rsm->m->m_len > rsm->orig_m_len) {
5923                 /*
5924                  * Mbuf grew, caused by sbcompress, our offset does
5925                  * not change.
5926                  */
5927                 rsm->orig_m_len = rsm->m->m_len;
5928         } else if (rsm->m->m_len < rsm->orig_m_len) {
5929                 /*
5930                  * Mbuf shrank, trimmed off the top by an ack, our
5931                  * offset changes.
5932                  */
5933                 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
5934                 rsm->orig_m_len = rsm->m->m_len;
5935         }
5936 }
5937
5938 static void
5939 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
5940 {
5941         struct mbuf *m;
5942         uint32_t soff;
5943
5944         if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) {
5945                 /* Fix up the orig_m_len and possibly the mbuf offset */
5946                 rack_adjust_orig_mlen(src_rsm);
5947         }
5948         m = src_rsm->m;
5949         soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
5950         while (soff >= m->m_len) {
5951                 /* Move out past this mbuf */
5952                 soff -= m->m_len;
5953                 m = m->m_next;
5954                 KASSERT((m != NULL),
5955                         ("rsm:%p nrsm:%p hit at soff:%u null m",
5956                          src_rsm, rsm, soff));
5957         }
5958         rsm->m = m;
5959         rsm->soff = soff;
5960         rsm->orig_m_len = m->m_len;
5961 }
5962
5963 static __inline void
5964 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
5965                struct rack_sendmap *rsm, uint32_t start)
5966 {
5967         int idx;
5968
5969         nrsm->r_start = start;
5970         nrsm->r_end = rsm->r_end;
5971         nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
5972         nrsm->r_flags = rsm->r_flags;
5973         nrsm->r_dupack = rsm->r_dupack;
5974         nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
5975         nrsm->r_rtr_bytes = 0;
5976         nrsm->r_fas = rsm->r_fas;
5977         rsm->r_end = nrsm->r_start;
5978         nrsm->r_just_ret = rsm->r_just_ret;
5979         for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
5980                 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
5981         }
5982         /* Now if we have SYN flag we keep it on the left edge */
5983         if (nrsm->r_flags & RACK_HAS_SYN)
5984                 nrsm->r_flags &= ~RACK_HAS_SYN;
5985         /* Now if we have a FIN flag we keep it on the right edge */
5986         if (rsm->r_flags & RACK_HAS_FIN)
5987                 rsm->r_flags &= ~RACK_HAS_FIN;
5988         /* Push bit must go to the right edge as well */
5989         if (rsm->r_flags & RACK_HAD_PUSH)
5990                 rsm->r_flags &= ~RACK_HAD_PUSH;
5991         /* Clone over the state of the hw_tls flag */
5992         nrsm->r_hw_tls = rsm->r_hw_tls;
5993         /*
5994          * Now we need to find nrsm's new location in the mbuf chain
5995          * we basically calculate a new offset, which is soff +
5996          * how much is left in original rsm. Then we walk out the mbuf
5997          * chain to find the righ position, it may be the same mbuf
5998          * or maybe not.
5999          */
6000         KASSERT(((rsm->m != NULL) ||
6001                  (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
6002                 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
6003         if (rsm->m)
6004                 rack_setup_offset_for_rsm(rsm, nrsm);
6005 }
6006
6007 static struct rack_sendmap *
6008 rack_merge_rsm(struct tcp_rack *rack,
6009                struct rack_sendmap *l_rsm,
6010                struct rack_sendmap *r_rsm)
6011 {
6012         /*
6013          * We are merging two ack'd RSM's,
6014          * the l_rsm is on the left (lower seq
6015          * values) and the r_rsm is on the right
6016          * (higher seq value). The simplest way
6017          * to merge these is to move the right
6018          * one into the left. I don't think there
6019          * is any reason we need to try to find
6020          * the oldest (or last oldest retransmitted).
6021          */
6022 #ifdef INVARIANTS
6023         struct rack_sendmap *rm;
6024 #endif
6025         rack_log_map_chg(rack->rc_tp, rack, NULL,
6026                          l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
6027         l_rsm->r_end = r_rsm->r_end;
6028         if (l_rsm->r_dupack < r_rsm->r_dupack)
6029                 l_rsm->r_dupack = r_rsm->r_dupack;
6030         if (r_rsm->r_rtr_bytes)
6031                 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
6032         if (r_rsm->r_in_tmap) {
6033                 /* This really should not happen */
6034                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
6035                 r_rsm->r_in_tmap = 0;
6036         }
6037
6038         /* Now the flags */
6039         if (r_rsm->r_flags & RACK_HAS_FIN)
6040                 l_rsm->r_flags |= RACK_HAS_FIN;
6041         if (r_rsm->r_flags & RACK_TLP)
6042                 l_rsm->r_flags |= RACK_TLP;
6043         if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
6044                 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
6045         if ((r_rsm->r_flags & RACK_APP_LIMITED)  &&
6046             ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
6047                 /*
6048                  * If both are app-limited then let the
6049                  * free lower the count. If right is app
6050                  * limited and left is not, transfer.
6051                  */
6052                 l_rsm->r_flags |= RACK_APP_LIMITED;
6053                 r_rsm->r_flags &= ~RACK_APP_LIMITED;
6054                 if (r_rsm == rack->r_ctl.rc_first_appl)
6055                         rack->r_ctl.rc_first_appl = l_rsm;
6056         }
6057 #ifndef INVARIANTS
6058         (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
6059 #else
6060         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
6061         if (rm != r_rsm) {
6062                 panic("removing head in rack:%p rsm:%p rm:%p",
6063                       rack, r_rsm, rm);
6064         }
6065 #endif
6066         if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
6067                 /* Transfer the split limit to the map we free */
6068                 r_rsm->r_limit_type = l_rsm->r_limit_type;
6069                 l_rsm->r_limit_type = 0;
6070         }
6071         rack_free(rack, r_rsm);
6072         return (l_rsm);
6073 }
6074
6075 /*
6076  * TLP Timer, here we simply setup what segment we want to
6077  * have the TLP expire on, the normal rack_output() will then
6078  * send it out.
6079  *
6080  * We return 1, saying don't proceed with rack_output only
6081  * when all timers have been stopped (destroyed PCB?).
6082  */
6083 static int
6084 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
6085 {
6086         /*
6087          * Tail Loss Probe.
6088          */
6089         struct rack_sendmap *rsm = NULL;
6090 #ifdef INVARIANTS
6091         struct rack_sendmap *insret;
6092 #endif
6093         struct socket *so;
6094         uint32_t amm;
6095         uint32_t out, avail;
6096         int collapsed_win = 0;
6097
6098         if (tp->t_timers->tt_flags & TT_STOPPED) {
6099                 return (1);
6100         }
6101         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
6102                 /* Its not time yet */
6103                 return (0);
6104         }
6105         if (ctf_progress_timeout_check(tp, true)) {
6106                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6107                 return (-ETIMEDOUT);    /* tcp_drop() */
6108         }
6109         /*
6110          * A TLP timer has expired. We have been idle for 2 rtts. So we now
6111          * need to figure out how to force a full MSS segment out.
6112          */
6113         rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
6114         rack->r_ctl.retran_during_recovery = 0;
6115         rack->r_ctl.dsack_byte_cnt = 0;
6116         counter_u64_add(rack_tlp_tot, 1);
6117         if (rack->r_state && (rack->r_state != tp->t_state))
6118                 rack_set_state(tp, rack);
6119         so = tp->t_inpcb->inp_socket;
6120         avail = sbavail(&so->so_snd);
6121         out = tp->snd_max - tp->snd_una;
6122         if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
6123                 /* special case, we need a retransmission */
6124                 collapsed_win = 1;
6125                 goto need_retran;
6126         }
6127         if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
6128                 rack->r_ctl.dsack_persist--;
6129                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6130                         rack->r_ctl.num_dsack = 0;
6131                 }
6132                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6133         }
6134         if ((tp->t_flags & TF_GPUTINPROG) &&
6135             (rack->r_ctl.rc_tlp_cnt_out == 1)) {
6136                 /*
6137                  * If this is the second in a row
6138                  * TLP and we are doing a measurement
6139                  * its time to abandon the measurement.
6140                  * Something is likely broken on
6141                  * the clients network and measuring a
6142                  * broken network does us no good.
6143                  */
6144                 tp->t_flags &= ~TF_GPUTINPROG;
6145                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6146                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6147                                            tp->gput_seq,
6148                                            0, 0, 18, __LINE__, NULL, 0);
6149         }
6150         /*
6151          * Check our send oldest always settings, and if
6152          * there is an oldest to send jump to the need_retran.
6153          */
6154         if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
6155                 goto need_retran;
6156
6157         if (avail > out) {
6158                 /* New data is available */
6159                 amm = avail - out;
6160                 if (amm > ctf_fixed_maxseg(tp)) {
6161                         amm = ctf_fixed_maxseg(tp);
6162                         if ((amm + out) > tp->snd_wnd) {
6163                                 /* We are rwnd limited */
6164                                 goto need_retran;
6165                         }
6166                 } else if (amm < ctf_fixed_maxseg(tp)) {
6167                         /* not enough to fill a MTU */
6168                         goto need_retran;
6169                 }
6170                 if (IN_FASTRECOVERY(tp->t_flags)) {
6171                         /* Unlikely */
6172                         if (rack->rack_no_prr == 0) {
6173                                 if (out + amm <= tp->snd_wnd) {
6174                                         rack->r_ctl.rc_prr_sndcnt = amm;
6175                                         rack->r_ctl.rc_tlp_new_data = amm;
6176                                         rack_log_to_prr(rack, 4, 0, __LINE__);
6177                                 }
6178                         } else
6179                                 goto need_retran;
6180                 } else {
6181                         /* Set the send-new override */
6182                         if (out + amm <= tp->snd_wnd)
6183                                 rack->r_ctl.rc_tlp_new_data = amm;
6184                         else
6185                                 goto need_retran;
6186                 }
6187                 rack->r_ctl.rc_tlpsend = NULL;
6188                 counter_u64_add(rack_tlp_newdata, 1);
6189                 goto send;
6190         }
6191 need_retran:
6192         /*
6193          * Ok we need to arrange the last un-acked segment to be re-sent, or
6194          * optionally the first un-acked segment.
6195          */
6196         if (collapsed_win == 0) {
6197                 if (rack_always_send_oldest)
6198                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6199                 else {
6200                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6201                         if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
6202                                 rsm = rack_find_high_nonack(rack, rsm);
6203                         }
6204                 }
6205                 if (rsm == NULL) {
6206 #ifdef TCP_BLACKBOX
6207                         tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6208 #endif
6209                         goto out;
6210                 }
6211         } else {
6212                 /*
6213                  * We must find the last segment
6214                  * that was acceptable by the client.
6215                  */
6216                 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6217                         if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
6218                                 /* Found one */
6219                                 break;
6220                         }
6221                 }
6222                 if (rsm == NULL) {
6223                         /* None? if so send the first */
6224                         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6225                         if (rsm == NULL) {
6226 #ifdef TCP_BLACKBOX
6227                                 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
6228 #endif
6229                                 goto out;
6230                         }
6231                 }
6232         }
6233         if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
6234                 /*
6235                  * We need to split this the last segment in two.
6236                  */
6237                 struct rack_sendmap *nrsm;
6238
6239                 nrsm = rack_alloc_full_limit(rack);
6240                 if (nrsm == NULL) {
6241                         /*
6242                          * No memory to split, we will just exit and punt
6243                          * off to the RXT timer.
6244                          */
6245                         goto out;
6246                 }
6247                 rack_clone_rsm(rack, nrsm, rsm,
6248                                (rsm->r_end - ctf_fixed_maxseg(tp)));
6249                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
6250 #ifndef INVARIANTS
6251                 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6252 #else
6253                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
6254                 if (insret != NULL) {
6255                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
6256                               nrsm, insret, rack, rsm);
6257                 }
6258 #endif
6259                 if (rsm->r_in_tmap) {
6260                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
6261                         nrsm->r_in_tmap = 1;
6262                 }
6263                 rsm = nrsm;
6264         }
6265         rack->r_ctl.rc_tlpsend = rsm;
6266 send:
6267         /* Make sure output path knows we are doing a TLP */
6268         *doing_tlp = 1;
6269         rack->r_timer_override = 1;
6270         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6271         return (0);
6272 out:
6273         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
6274         return (0);
6275 }
6276
6277 /*
6278  * Delayed ack Timer, here we simply need to setup the
6279  * ACK_NOW flag and remove the DELACK flag. From there
6280  * the output routine will send the ack out.
6281  *
6282  * We only return 1, saying don't proceed, if all timers
6283  * are stopped (destroyed PCB?).
6284  */
6285 static int
6286 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6287 {
6288         if (tp->t_timers->tt_flags & TT_STOPPED) {
6289                 return (1);
6290         }
6291         rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
6292         tp->t_flags &= ~TF_DELACK;
6293         tp->t_flags |= TF_ACKNOW;
6294         KMOD_TCPSTAT_INC(tcps_delack);
6295         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
6296         return (0);
6297 }
6298
6299 /*
6300  * Persists timer, here we simply send the
6301  * same thing as a keepalive will.
6302  * the one byte send.
6303  *
6304  * We only return 1, saying don't proceed, if all timers
6305  * are stopped (destroyed PCB?).
6306  */
6307 static int
6308 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6309 {
6310         struct tcptemp *t_template;
6311 #ifdef INVARIANTS
6312         struct inpcb *inp = tp->t_inpcb;
6313 #endif
6314         int32_t retval = 1;
6315
6316         if (tp->t_timers->tt_flags & TT_STOPPED) {
6317                 return (1);
6318         }
6319         if (rack->rc_in_persist == 0)
6320                 return (0);
6321         if (ctf_progress_timeout_check(tp, false)) {
6322                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6323                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6324                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6325                 return (-ETIMEDOUT);    /* tcp_drop() */
6326         }
6327         KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
6328         /*
6329          * Persistence timer into zero window. Force a byte to be output, if
6330          * possible.
6331          */
6332         KMOD_TCPSTAT_INC(tcps_persisttimeo);
6333         /*
6334          * Hack: if the peer is dead/unreachable, we do not time out if the
6335          * window is closed.  After a full backoff, drop the connection if
6336          * the idle time (no responses to probes) reaches the maximum
6337          * backoff that we would use if retransmitting.
6338          */
6339         if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
6340             (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
6341              TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
6342                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6343                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6344                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6345                 retval = -ETIMEDOUT;    /* tcp_drop() */
6346                 goto out;
6347         }
6348         if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
6349             tp->snd_una == tp->snd_max)
6350                 rack_exit_persist(tp, rack, cts);
6351         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
6352         /*
6353          * If the user has closed the socket then drop a persisting
6354          * connection after a much reduced timeout.
6355          */
6356         if (tp->t_state > TCPS_CLOSE_WAIT &&
6357             (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
6358                 KMOD_TCPSTAT_INC(tcps_persistdrop);
6359                 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
6360                 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
6361                 retval = -ETIMEDOUT;    /* tcp_drop() */
6362                 goto out;
6363         }
6364         t_template = tcpip_maketemplate(rack->rc_inp);
6365         if (t_template) {
6366                 /* only set it if we were answered */
6367                 if (rack->forced_ack == 0) {
6368                         rack->forced_ack = 1;
6369                         rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6370                 } else {
6371                         rack->probe_not_answered = 1;
6372                         counter_u64_add(rack_persists_loss, 1);
6373                         rack->r_ctl.persist_lost_ends++;
6374                 }
6375                 counter_u64_add(rack_persists_sends, 1);
6376                 tcp_respond(tp, t_template->tt_ipgen,
6377                             &t_template->tt_t, (struct mbuf *)NULL,
6378                             tp->rcv_nxt, tp->snd_una - 1, 0);
6379                 /* This sends an ack */
6380                 if (tp->t_flags & TF_DELACK)
6381                         tp->t_flags &= ~TF_DELACK;
6382                 free(t_template, M_TEMP);
6383         }
6384         if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
6385                 tp->t_rxtshift++;
6386 out:
6387         rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
6388         rack_start_hpts_timer(rack, tp, cts,
6389                               0, 0, 0);
6390         return (retval);
6391 }
6392
6393 /*
6394  * If a keepalive goes off, we had no other timers
6395  * happening. We always return 1 here since this
6396  * routine either drops the connection or sends
6397  * out a segment with respond.
6398  */
6399 static int
6400 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6401 {
6402         struct tcptemp *t_template;
6403         struct inpcb *inp;
6404
6405         if (tp->t_timers->tt_flags & TT_STOPPED) {
6406                 return (1);
6407         }
6408         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
6409         inp = tp->t_inpcb;
6410         rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
6411         /*
6412          * Keep-alive timer went off; send something or drop connection if
6413          * idle for too long.
6414          */
6415         KMOD_TCPSTAT_INC(tcps_keeptimeo);
6416         if (tp->t_state < TCPS_ESTABLISHED)
6417                 goto dropit;
6418         if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
6419             tp->t_state <= TCPS_CLOSING) {
6420                 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
6421                         goto dropit;
6422                 /*
6423                  * Send a packet designed to force a response if the peer is
6424                  * up and reachable: either an ACK if the connection is
6425                  * still alive, or an RST if the peer has closed the
6426                  * connection due to timeout or reboot. Using sequence
6427                  * number tp->snd_una-1 causes the transmitted zero-length
6428                  * segment to lie outside the receive window; by the
6429                  * protocol spec, this requires the correspondent TCP to
6430                  * respond.
6431                  */
6432                 KMOD_TCPSTAT_INC(tcps_keepprobe);
6433                 t_template = tcpip_maketemplate(inp);
6434                 if (t_template) {
6435                         if (rack->forced_ack == 0) {
6436                                 rack->forced_ack = 1;
6437                                 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
6438                         } else {
6439                                 rack->probe_not_answered = 1;
6440                         }
6441                         tcp_respond(tp, t_template->tt_ipgen,
6442                             &t_template->tt_t, (struct mbuf *)NULL,
6443                             tp->rcv_nxt, tp->snd_una - 1, 0);
6444                         free(t_template, M_TEMP);
6445                 }
6446         }
6447         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
6448         return (1);
6449 dropit:
6450         KMOD_TCPSTAT_INC(tcps_keepdrops);
6451         tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6452         return (-ETIMEDOUT);    /* tcp_drop() */
6453 }
6454
6455 /*
6456  * Retransmit helper function, clear up all the ack
6457  * flags and take care of important book keeping.
6458  */
6459 static void
6460 rack_remxt_tmr(struct tcpcb *tp)
6461 {
6462         /*
6463          * The retransmit timer went off, all sack'd blocks must be
6464          * un-acked.
6465          */
6466         struct rack_sendmap *rsm, *trsm = NULL;
6467         struct tcp_rack *rack;
6468
6469         rack = (struct tcp_rack *)tp->t_fb_ptr;
6470         rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
6471         rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
6472         if (rack->r_state && (rack->r_state != tp->t_state))
6473                 rack_set_state(tp, rack);
6474         /*
6475          * Ideally we would like to be able to
6476          * mark SACK-PASS on anything not acked here.
6477          *
6478          * However, if we do that we would burst out
6479          * all that data 1ms apart. This would be unwise,
6480          * so for now we will just let the normal rxt timer
6481          * and tlp timer take care of it.
6482          *
6483          * Also we really need to stick them back in sequence
6484          * order. This way we send in the proper order and any
6485          * sacks that come floating in will "re-ack" the data.
6486          * To do this we zap the tmap with an INIT and then
6487          * walk through and place every rsm in the RB tree
6488          * back in its seq ordered place.
6489          */
6490         TAILQ_INIT(&rack->r_ctl.rc_tmap);
6491         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
6492                 rsm->r_dupack = 0;
6493                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
6494                 /* We must re-add it back to the tlist */
6495                 if (trsm == NULL) {
6496                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6497                 } else {
6498                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
6499                 }
6500                 rsm->r_in_tmap = 1;
6501                 trsm = rsm;
6502                 if (rsm->r_flags & RACK_ACKED)
6503                         rsm->r_flags |= RACK_WAS_ACKED;
6504                 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
6505                 rsm->r_flags |= RACK_MUST_RXT;
6506         }
6507         /* Clear the count (we just un-acked them) */
6508         rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
6509         rack->r_ctl.rc_sacked = 0;
6510         rack->r_ctl.rc_sacklast = NULL;
6511         rack->r_ctl.rc_agg_delayed = 0;
6512         rack->r_early = 0;
6513         rack->r_ctl.rc_agg_early = 0;
6514         rack->r_late = 0;
6515         /* Clear the tlp rtx mark */
6516         rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6517         if (rack->r_ctl.rc_resend != NULL)
6518                 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
6519         rack->r_ctl.rc_prr_sndcnt = 0;
6520         rack_log_to_prr(rack, 6, 0, __LINE__);
6521         rack->r_timer_override = 1;
6522         if ((((tp->t_flags & TF_SACK_PERMIT) == 0)
6523 #ifdef NETFLIX_EXP_DETECTION
6524             || (rack->sack_attack_disable != 0)
6525 #endif
6526                     ) && ((tp->t_flags & TF_SENTFIN) == 0)) {
6527                 /*
6528                  * For non-sack customers new data
6529                  * needs to go out as retransmits until
6530                  * we retransmit up to snd_max.
6531                  */
6532                 rack->r_must_retran = 1;
6533                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
6534                                                 rack->r_ctl.rc_sacked);
6535         }
6536         rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
6537 }
6538
/*
 * Convert the tcpcb's smoothed RTT state from its tick-based fixed-point
 * form (t_srtt shifted by TCP_RTT_SHIFT, t_rttvar shifted by
 * TCP_RTTVAR_SHIFT) into plain microseconds, then recompute t_rxtcur from
 * the converted values.
 *
 * The whole-tick part of each value goes through TICKS_2_USEC(); the
 * fractional bits are converted separately so that sub-tick precision is
 * not lost. A t_srtt of 0 or 1 and a t_rttvar of 0 are left untouched.
 *
 * t_rxtcur becomes RACK_REXMTVAL(tp), plus tcp_rexmit_slop (converted to
 * usecs) once the connection has been established, capped at rack_rto_max.
 *
 * Fix relative to the previous version: the rttvar fraction is now masked
 * and scaled with TCP_RTTVAR_SHIFT. It used to reuse the srtt constants
 * (0x1f and TCP_RTT_SCALE), which only matches when both values carry the
 * same number of fractional bits.
 */
6539 static void
6540 rack_convert_rtts(struct tcpcb *tp)
6541 {
6542         if (tp->t_srtt > 1) {
6543                 uint32_t val, frac;
6544
6545                 val = tp->t_srtt >> TCP_RTT_SHIFT;
6546                 frac = tp->t_srtt & 0x1f;
6547                 tp->t_srtt = TICKS_2_USEC(val);
6548                 /*
6549                  * frac is the fractional part of the srtt (if any)
6550                  * but its in ticks and every bit represents
6551                  * 1/32nd of a tick (1/(32 * hz) of a second).
6552                  */
6553                 if (frac) {
6554                         if (hz == 1000) {
6555                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
6556                         } else {
6557                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
6558                         }
6559                         tp->t_srtt += frac;
6560                 }
6561         }
6562         if (tp->t_rttvar) {
6563                 uint32_t val, frac;
6564
6565                 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
6566                 frac = tp->t_rttvar & ((1U << TCP_RTTVAR_SHIFT) - 1);
6567                 tp->t_rttvar = TICKS_2_USEC(val);
6568                 /*
6569                  * frac is the fractional part of the rttvar (if any)
6570                  * but its in ticks and every bit represents
6571                  * 1/(1 << TCP_RTTVAR_SHIFT) of a tick.
6572                  */
6573                 if (frac) {
6574                         if (hz == 1000) {
6575                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (1ULL << TCP_RTTVAR_SHIFT));
6576                         } else {
6577                                 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (1ULL << TCP_RTTVAR_SHIFT)));
6578                         }
6579                         tp->t_rttvar += frac;
6580                 }
6581         }
     /* Rebuild the retransmit timeout from the converted values. */
6582         tp->t_rxtcur = RACK_REXMTVAL(tp);
6583         if (TCPS_HAVEESTABLISHED(tp->t_state)) {
6584                 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
6585         }
6586         if (tp->t_rxtcur > rack_rto_max) {
6587                 tp->t_rxtcur = rack_rto_max;
6588         }
6589 }
6590
/*
 * Wrapper around cc_conn_init() that applies rack specific adjustments:
 *  - if cc_conn_init() populated t_srtt (it was zero before the call and
 *    is non-zero after) the rtt values are run through
 *    rack_convert_rtts();
 *  - snd_ssthresh is raised to at least snd_wnd;
 *  - snd_cwnd is capped at rc_init_window().
 */
6591 static void
6592 rack_cc_conn_init(struct tcpcb *tp)
6593 {
6594         struct tcp_rack *rack;
6595         uint32_t srtt;
6596
6597         rack = (struct tcp_rack *)tp->t_fb_ptr;
     /* Snapshot the srtt so we can tell if cc_conn_init() filled it in. */
6598         srtt = tp->t_srtt;
6599         cc_conn_init(tp);
6600         /*
6601          * Now convert to rack's internal format,
6602          * if required (only when the srtt went from zero to non-zero).
6603          */
6604         if ((srtt == 0) && (tp->t_srtt != 0))
6605                 rack_convert_rtts(tp);
6606         /*
6607          * We want a chance to stay in slowstart as
6608          * we create a connection. TCP spec says that
6609          * initially ssthresh is infinite. For our
6610          * purposes that is the snd_wnd.
6611          */
6612         if (tp->snd_ssthresh < tp->snd_wnd) {
6613                 tp->snd_ssthresh = tp->snd_wnd;
6614         }
6615         /*
6616          * Limit the starting cwnd to one initial window: if
6617          * snd_cwnd is above rc_init_window() clamp it down.
6618          */
6619         if (rc_init_window(rack) < tp->snd_cwnd)
6620                 tp->snd_cwnd = rc_init_window(rack);
6621 }
6622
6623 /*
6624  * Re-transmit timeout! Backs off t_rxtcur and sets up to retransmit
6625  * the lowest seq number outstanding.
      *
      * Return values as implemented below:
      *   1           - the tcpcb timers are flagged TT_STOPPED, nothing done.
      *   0           - normal return (also when established and nothing is
      *                 outstanding, i.e. snd_una == snd_max).
      *   -ETIMEDOUT  - progress timeout, KEEPINIT time exceeded or
      *                 t_rxtshift passed TCP_MAXRXTSHIFT with no soft error.
      *   -t_softerror - same drop conditions with a soft error recorded.
      * The negative returns are marked "tcp_drop()"; presumably the caller
      * performs the actual drop - confirm against the caller.
6626  */
6627 static int
6628 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6629 {
6630         int32_t rexmt;
6631         int32_t retval = 0;
6632         bool isipv6;
6633
6634         if (tp->t_timers->tt_flags & TT_STOPPED) {
6635                 return (1);
6636         }
6637         if ((tp->t_flags & TF_GPUTINPROG) &&
6638             (tp->t_rxtshift)) {
6639                 /*
6640                  * We have had a second timeout
6641                  * measurements on successive rxt's are not profitable.
6642                  * It is unlikely to be of any use (the network is
6643                  * broken or the client went away).
6644                  */
6645                 tp->t_flags &= ~TF_GPUTINPROG;
6646                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6647                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
6648                                            tp->gput_seq,
6649                                            0, 0, 18, __LINE__, NULL, 0);
6650         }
6651         if (ctf_progress_timeout_check(tp, false)) {
6652                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
                     /*
                      * NOTE(review): this passes `tick` while the badrxtwin
                      * calculation further down uses `ticks`; confirm which
                      * one the logging call expects.
                      */
6653                 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
6654                 return (-ETIMEDOUT);    /* tcp_drop() */
6655         }
6656         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
6657         rack->r_ctl.retran_during_recovery = 0;
6658         rack->r_ctl.dsack_byte_cnt = 0;
             /* Remember whether we were in fast/congestion recovery. */
6659         if (IN_FASTRECOVERY(tp->t_flags))
6660                 tp->t_flags |= TF_WASFRECOVERY;
6661         else
6662                 tp->t_flags &= ~TF_WASFRECOVERY;
6663         if (IN_CONGRECOVERY(tp->t_flags))
6664                 tp->t_flags |= TF_WASCRECOVERY;
6665         else
6666                 tp->t_flags &= ~TF_WASCRECOVERY;
6667         if (TCPS_HAVEESTABLISHED(tp->t_state) &&
6668             (tp->snd_una == tp->snd_max)) {
6669                 /* Nothing outstanding .. nothing to do */
6670                 return (0);
6671         }
6672         if (rack->r_ctl.dsack_persist) {
6673                 rack->r_ctl.dsack_persist--;
6674                 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
6675                         rack->r_ctl.num_dsack = 0;
6676                 }
6677                 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
6678         }
6679         /*
6680          * Rack can only run one timer  at a time, so we cannot
6681          * run a KEEPINIT (gating SYN sending) and a retransmit
6682          * timer for the SYN. So if we are in a front state and
6683          * have a KEEPINIT timer we need to check the first transmit
6684          * against now to see if we have exceeded the KEEPINIT time
6685          * (if one is set).
6686          */
6687         if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
6688             (TP_KEEPINIT(tp) != 0)) {
6689                 struct rack_sendmap *rsm;
6690
6691                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
6692                 if (rsm) {
6693                         /* Ok we have something outstanding to test keepinit with */
6694                         if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
6695                             ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
6696                                 /* We have exceeded the KEEPINIT time */
6697                                 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
6698                                 goto drop_it;
6699                         }
6700                 }
6701         }
6702         /*
6703          * Retransmission timer went off.  Message has not been acked within
6704          * retransmit interval.  Back off to a longer retransmit interval
6705          * and retransmit one segment.
6706          */
6707         rack_remxt_tmr(tp);
6708         if ((rack->r_ctl.rc_resend == NULL) ||
6709             ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
6710                 /*
6711                  * If the rwnd collapsed on
6712                  * the one we are retransmitting
6713                  * it does not count against the
6714                  * rxt count.
6715                  */
6716                 tp->t_rxtshift++;
6717         }
6718         if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
6719                 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
6720 drop_it:
6721                 tp->t_rxtshift = TCP_MAXRXTSHIFT;
6722                 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
6723                 /* XXXGL: previously t_softerror was casted to uint16_t */
6724                 MPASS(tp->t_softerror >= 0);
6725                 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
6726                 goto out;       /* tcp_drop() */
6727         }
6728         if (tp->t_state == TCPS_SYN_SENT) {
6729                 /*
6730                  * If the SYN was retransmitted, indicate CWND to be limited
6731                  * to 1 segment in cc_conn_init().
6732                  */
6733                 tp->snd_cwnd = 1;
6734         } else if (tp->t_rxtshift == 1) {
6735                 /*
6736                  * first retransmit; record ssthresh and cwnd so they can be
6737                  * recovered if this turns out to be a "bad" retransmit. A
6738                  * retransmit is considered "bad" if an ACK for this segment
6739                  * is received within RTT/2 interval; the assumption here is
6740                  * that the ACK was already in flight.  See "On Estimating
6741                  * End-to-End Network Path Properties" by Allman and Paxson
6742                  * for more details.
6743                  */
6744                 tp->snd_cwnd_prev = tp->snd_cwnd;
6745                 tp->snd_ssthresh_prev = tp->snd_ssthresh;
6746                 tp->snd_recover_prev = tp->snd_recover;
6747                 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
6748                 tp->t_flags |= TF_PREVVALID;
6749         } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
6750                 tp->t_flags &= ~TF_PREVVALID;
6751         KMOD_TCPSTAT_INC(tcps_rexmttimeo);
             /*
              * Compute the backed-off timeout: a fixed initial RTO while in
              * the SYN states, otherwise srtt + 4 * rttvar floored at
              * rack_rto_min, both scaled by tcp_backoff[t_rxtshift].
              */
6752         if ((tp->t_state == TCPS_SYN_SENT) ||
6753             (tp->t_state == TCPS_SYN_RECEIVED))
6754                 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
6755         else
6756                 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
6757
6758         RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
6759            max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
6760         /*
6761          * We enter the path for PLMTUD if connection is established or, if
6762          * connection is FIN_WAIT_1 status, reason for the last is that if
6763          * amount of data we send is very small, we could send it in couple
6764          * of packets and process straight to FIN. In that case we won't
6765          * catch ESTABLISHED state.
6766          */
6767 #ifdef INET6
6768         isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
6769 #else
6770         isipv6 = false;
6771 #endif
             /*
              * V_tcp_pmtud_blackhole_detect as tested here: 1 enables the
              * detection for both families, 2 only when !isipv6, 3 only
              * when isipv6.
              */
6772         if (((V_tcp_pmtud_blackhole_detect == 1) ||
6773             (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
6774             (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
6775             ((tp->t_state == TCPS_ESTABLISHED) ||
6776             (tp->t_state == TCPS_FIN_WAIT_1))) {
6777                 /*
6778                  * Idea here is that at each stage of mtu probe (usually,
6779                  * 1448 -> 1188 -> 524) should be given 2 chances to recover
6780                  * before further clamping down. 'tp->t_rxtshift % 2 == 0'
6781                  * should take care of that.
6782                  */
6783                 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
6784                     (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
6785                     (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
6786                     tp->t_rxtshift % 2 == 0)) {
6787                         /*
6788                          * Enter Path MTU Black-hole Detection mechanism: -
6789                          * Disable Path MTU Discovery (IP "DF" bit). -
6790                          * Reduce MTU to lower value than what we negotiated
6791                          * with peer.
6792                          */
6793                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
6794                                 /* Record that we may have found a black hole. */
6795                                 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
6796                                 /* Keep track of previous MSS. */
6797                                 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
6798                         }
6799
6800                         /*
6801                          * Reduce the MSS to blackhole value or to the
6802                          * default in an attempt to retransmit.
                              *
                              * The preprocessor blocks below stitch the INET6
                              * and INET cases into a single if/else chain
                              * when both are configured.
6803                          */
6804 #ifdef INET6
6805                         if (isipv6 &&
6806                             tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
6807                                 /* Use the sysctl tuneable blackhole MSS. */
6808                                 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
6809                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
6810                         } else if (isipv6) {
6811                                 /* Use the default MSS. */
6812                                 tp->t_maxseg = V_tcp_v6mssdflt;
6813                                 /*
6814                                  * Disable Path MTU Discovery when we switch
6815                                  * to minmss.
6816                                  */
6817                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
6818                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
6819                         }
6820 #endif
6821 #if defined(INET6) && defined(INET)
6822                         else
6823 #endif
6824 #ifdef INET
6825                         if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
6826                                 /* Use the sysctl tuneable blackhole MSS. */
6827                                 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
6828                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
6829                         } else {
6830                                 /* Use the default MSS. */
6831                                 tp->t_maxseg = V_tcp_mssdflt;
6832                                 /*
6833                                  * Disable Path MTU Discovery when we switch
6834                                  * to minmss.
6835                                  */
6836                                 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
6837                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
6838                         }
6839 #endif
6840                 } else {
6841                         /*
6842                          * If further retransmissions are still unsuccessful
6843                          * with a lowered MTU, maybe this isn't a blackhole
6844                          * and we restore the previous MSS and blackhole
6845                          * detection flags. The limit '6' is determined by
6846                          * giving each probe stage (1448, 1188, 524) 2
6847                          * chances to recover.
6848                          */
6849                         if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
6850                             (tp->t_rxtshift >= 6)) {
6851                                 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
6852                                 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
6853                                 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
6854                                 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
6855                         }
6856                 }
6857         }
6858         /*
6859          * Disable RFC1323 and SACK if we haven't got any response to
6860          * our third SYN to work-around some broken terminal servers
6861          * (most of which have hopefully been retired) that have bad VJ
6862          * header compression code which trashes TCP segments containing
6863          * unknown-to-them TCP options.
6864          */
6865         if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
6866             (tp->t_rxtshift == 3))
6867                 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
6868         /*
6869          * If we backed off this far, our srtt estimate is probably bogus.
6870          * Clobber it so we'll take the next rtt measurement as our srtt;
6871          * move the current srtt into rttvar to keep the current retransmit
6872          * times until then.
6873          */
6874         if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
6875 #ifdef INET6
6876                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
6877                         in6_losing(tp->t_inpcb);
6878                 else
6879 #endif
6880                         in_losing(tp->t_inpcb);
6881                 tp->t_rttvar += tp->t_srtt;
6882                 tp->t_srtt = 0;
6883         }
6884         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
6885         tp->snd_recover = tp->snd_max;
6886         tp->t_flags |= TF_ACKNOW;
6887         tp->t_rtttime = 0;
6888         rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__);
6889 out:
6890         return (retval);
6891 }
6892
/*
 * Dispatch whichever rack timer is pending in rc_hpts_flags.
 *
 * Before looking at the timers, a goodput measurement that is in progress
 * is cancelled when the connection is in TCPS_FIN_WAIT_1 or later and the
 * send socket buffer no longer holds enough bytes to cover it.
 *
 * Returns 0 when no timer is pending or the timer has not expired and we
 * should simply carry on; 1 for a listen socket without pacing output
 * pending or when the connection was re-inserted into the hpts to wait
 * out the remaining time; otherwise the return value of the timeout
 * handler that was run. The negative values stored in 'ret' on the early
 * paths are only handed to rack_log_to_processing(), never returned.
 */
6893 static int
6894 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
6895 {
6896         int32_t ret = 0;
6897         int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
6898
6899         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
6900             (tp->t_flags & TF_GPUTINPROG)) {
6901                 /*
6902                  * We have a goodput in progress
6903                  * and we have entered a late state.
6904                  * Do we have enough data in the sb
6905                  * to handle the GPUT request?
6906                  */
6907                 uint32_t bytes;
6908
6909                 bytes = tp->gput_ack - tp->gput_seq;
6910                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
6911                         bytes += tp->gput_seq - tp->snd_una;
6912                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
6913                         /*
6914                          * There are not enough bytes in the socket
6915                          * buffer that have been sent to cover this
6916                          * measurement. Cancel it.
6917                          */
6918                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
6919                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
6920                                                    tp->gput_seq,
6921                                                    0, 0, 18, __LINE__, NULL, 0);
6922                         tp->t_flags &= ~TF_GPUTINPROG;
6923                 }
6924         }
6925         if (timers == 0) {
6926                 return (0);
6927         }
6928         if (tp->t_state == TCPS_LISTEN) {
6929                 /* no timers on listen sockets */
6930                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
6931                         return (0);
6932                 return (1);
6933         }
6934         if ((timers & PACE_TMR_RACK) &&
6935             rack->rc_on_min_to) {
6936                 /*
6937                  * For the rack timer when we
6938                  * are on a min-timeout (which means rrr_conf = 3)
6939                  * we don't want to check the timer. It may
6940                  * be going off for a pace and thats ok we
6941                  * want to send the retransmit (if its ready).
6942                  *
6943                  * If its on a normal rack timer (non-min) then
6944                  * we will check if its expired.
6945                  */
6946                 goto skip_time_check;
6947         }
6948         if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
6949                 uint32_t left;
6950
6951                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
6952                         ret = -1;
6953                         rack_log_to_processing(rack, cts, ret, 0);
6954                         return (0);
6955                 }
6956                 if (hpts_calling == 0) {
6957                         /*
6958                          * A user send or queued mbuf (sack) has called us? We
6959                          * return 0 and let the pacing guards
6960                          * deal with it if they should or
6961                          * should not cause a send.
6962                          */
6963                         ret = -2;
6964                         rack_log_to_processing(rack, cts, ret, 0);
6965                         return (0);
6966                 }
6967                 /*
6968                  * Ok our timer went off early and we are not paced false
6969                  * alarm, go back to sleep.
6970                  */
6971                 ret = -3;
6972                 left = rack->r_ctl.rc_timer_exp - cts;
                     /*
                      * NOTE(review): 'left' is the difference of two values
                      * on the cts timebase, which rack_timeout_rxt() compares
                      * against TICKS_2_USEC() values, so it looks like
                      * microseconds while the macro name suggests
                      * milliseconds - confirm the intended conversion.
                      */
6973                 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
6974                 rack_log_to_processing(rack, cts, ret, left);
6975                 return (1);
6976         }
6977 skip_time_check:
             /*
              * The timer is being serviced: clear the pending timer bits and
              * run exactly one handler, checked in the priority order below.
              */
6978         rack->rc_tmr_stopped = 0;
6979         rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
6980         if (timers & PACE_TMR_DELACK) {
6981                 ret = rack_timeout_delack(tp, rack, cts);
6982         } else if (timers & PACE_TMR_RACK) {
6983                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
6984                 rack->r_fast_output = 0;
6985                 ret = rack_timeout_rack(tp, rack, cts);
6986         } else if (timers & PACE_TMR_TLP) {
6987                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
6988                 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
6989         } else if (timers & PACE_TMR_RXT) {
6990                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
6991                 rack->r_fast_output = 0;
6992                 ret = rack_timeout_rxt(tp, rack, cts);
6993         } else if (timers & PACE_TMR_PERSIT) {
6994                 ret = rack_timeout_persist(tp, rack, cts);
6995         } else if (timers & PACE_TMR_KEEP) {
6996                 ret = rack_timeout_keepalive(tp, rack, cts);
6997         }
6998         rack_log_to_processing(rack, cts, ret, timers);
6999         return (ret);
7000 }
7001
/*
 * Cancel any pending rack timer and, when appropriate, pull the
 * connection out of the hpts.
 *
 * 'line' is only passed through to rack_log_to_cancel(). The 'cts'
 * argument is not referenced in the body; a fresh timestamp is taken
 * with tcp_get_usecs() instead.
 */
7002 static void
7003 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
7004 {
7005         struct timeval tv;
7006         uint32_t us_cts, flags_on_entry;
7007         uint8_t hpts_removed = 0;
7008
7009         flags_on_entry = rack->r_ctl.rc_hpts_flags;
7010         us_cts = tcp_get_usecs(&tv);
             /*
              * A pacing output is pending: remove it from the hpts if its
              * time (rc_last_output_to) has been reached or nothing is
              * outstanding.
              */
7011         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
7012             ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
7013              ((tp->snd_max - tp->snd_una) == 0))) {
7014                 tcp_hpts_remove(rack->rc_inp);
7015                 hpts_removed = 1;
7016                 /* If we were not delayed cancel out the flag. */
7017                 if ((tp->snd_max - tp->snd_una) == 0)
7018                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
7019                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7020         }
7021         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
                     /* Remember which timer(s) we stopped. */
7022                 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7023                 if (tcp_in_hpts(rack->rc_inp) &&
7024                     ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
7025                         /*
7026                          * Canceling timer's when we have no output being
7027                          * paced. We also must remove ourselves from the
7028                          * hpts.
7029                          */
7030                         tcp_hpts_remove(rack->rc_inp);
7031                         hpts_removed = 1;
7032                 }
7033                 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
7034         }
             /* Log the cancel when no hpts removal happened above. */
7035         if (hpts_removed == 0)
7036                 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7037 }
7038
/*
 * No-op: ignores both arguments and returns immediately.
 */
7039 static void
7040 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
7041 {
7042         return;
7043 }
7044
/*
 * Flag the rack control block hanging off tp->t_fb_ptr as having its
 * timers stopped (t_timers_stopped = 1). Always returns 0.
 */
7045 static int
7046 rack_stopall(struct tcpcb *tp)
7047 {
7048         struct tcp_rack *rack;
7049         rack = (struct tcp_rack *)tp->t_fb_ptr;
7050         rack->t_timers_stopped = 1;
7051         return (0);
7052 }
7053
/*
 * No-op: ignores all of its arguments and returns immediately.
 */
7054 static void
7055 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
7056 {
7057         return;
7058 }
7059
/*
 * Always returns 0, whatever timer_type is asked about.
 */
7060 static int
7061 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
7062 {
7063         return (0);
7064 }
7065
/*
 * Suspend the PERSIST, REXMT, KEEP and DELACK timers of the tcpcb via
 * tcp_timer_suspend(). If the persist timer was active beforehand, the
 * rack control block is marked as being in persist (rc_in_persist).
 */
7066 static void
7067 rack_stop_all_timers(struct tcpcb *tp)
7068 {
7069         struct tcp_rack *rack;
7070
7071         /*
7072          * Assure no timers are running.
7073          */
7074         if (tcp_timer_active(tp, TT_PERSIST)) {
7075                 /* We enter in persists, set the flag appropriately */
7076                 rack = (struct tcp_rack *)tp->t_fb_ptr;
7077                 rack->rc_in_persist = 1;
7078         }
7079         tcp_timer_suspend(tp, TT_PERSIST);
7080         tcp_timer_suspend(tp, TT_REXMT);
7081         tcp_timer_suspend(tp, TT_KEEP);
7082         tcp_timer_suspend(tp, TT_DELACK);
7083 }
7084
/*
 * Book-keeping for one more transmission of the whole of 'rsm' at
 * time 'ts':
 *  - bump r_rtr_cnt, clamping it at RACK_NUM_OF_RETRANS (and setting
 *    RACK_OVERMAX when the clamp is hit);
 *  - account the bytes as retransmitted unless this is the first send
 *    or the rsm is flagged RACK_TLP;
 *  - record 'ts' in r_tim_lastsent[] and the current flight size in r_fas;
 *  - undo a RACK_ACKED marking, move the rsm to the tail of rc_tmap,
 *    and settle the RACK_MUST_RXT / RACK_SACK_PASSED flags.
 *
 * 'tp' and 'add_flag' are not referenced in the body.
 */
7085 static void
7086 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
7087     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
7088 {
7089         int32_t idx;
7090
7091         rsm->r_rtr_cnt++;
7092         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7093         rsm->r_dupack = 0;
7094         if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
7095                 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
7096                 rsm->r_flags |= RACK_OVERMAX;
7097         }
7098         if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
7099                 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
7100                 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
7101         }
             /*
              * Slot for this transmission; once the count has been clamped
              * above this keeps overwriting the last slot.
              */
7102         idx = rsm->r_rtr_cnt - 1;
7103         rsm->r_tim_lastsent[idx] = ts;
7104         /*
7105          * Here we don't add in the len of send, since its already
7106          * in snduna <->snd_max.
7107          */
7108         rsm->r_fas = ctf_flight_size(rack->rc_tp,
7109                                      rack->r_ctl.rc_sacked);
7110         if (rsm->r_flags & RACK_ACKED) {
7111                 /* Probably MTU discovery messing with us */
7112                 rsm->r_flags &= ~RACK_ACKED;
7113                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
7114         }
             /* (Re)queue the rsm at the tail of the transmit-ordered list. */
7115         if (rsm->r_in_tmap) {
7116                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7117                 rsm->r_in_tmap = 0;
7118         }
7119         TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7120         rsm->r_in_tmap = 1;
7121         /* Take off the must retransmit flag, if its on */
7122         if (rsm->r_flags & RACK_MUST_RXT) {
7123                 if (rack->r_must_retran)
7124                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
7125                 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
7126                         /*
7127                          * We have retransmitted all we need. Clear
7128                          * any must retransmit flags.
7129                          */
7130                         rack->r_must_retran = 0;
7131                         rack->r_ctl.rc_out_at_rto = 0;
7132                 }
7133                 rsm->r_flags &= ~RACK_MUST_RXT;
7134         }
7135         if (rsm->r_flags & RACK_SACK_PASSED) {
7136                 /* We have retransmitted due to the SACK pass */
7137                 rsm->r_flags &= ~RACK_SACK_PASSED;
7138                 rsm->r_flags |= RACK_WAS_SACKPASS;
7139         }
7140 }
7141
/*
 * Account for a (re-)transmission of *lenp bytes starting at rsm->r_start.
 *
 * If the send covers the whole rsm, the rsm is updated via
 * rack_update_rsm(). When it ends exactly at r_end, *lenp is set to 0 and
 * 0 is returned; when it runs past r_end, *lenp is set to the number of
 * bytes left over and r_end (the sequence where the remainder starts) is
 * returned.
 *
 * If the send covers only the front of the rsm, the rsm is split at the
 * end of the sent range: 'rsm' keeps the sent part and is updated, a newly
 * allocated rsm holds the unsent tail. *lenp is set to 0 and 0 is
 * returned. If the allocation fails nothing is updated; *lenp is set to 0
 * and 0 is returned.
 */
7142 static uint32_t
7143 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
7144     struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
7145 {
7146         /*
7147          * We (re-)transmitted starting at rsm->r_start for some length
7148          * (possibly less than r_end).
7149          */
7150         struct rack_sendmap *nrsm;
7151 #ifdef INVARIANTS
7152         struct rack_sendmap *insret;
7153 #endif
7154         uint32_t c_end;
7155         int32_t len;
7156
7157         len = *lenp;
             /* c_end is the sequence number just past what was sent. */
7158         c_end = rsm->r_start + len;
7159         if (SEQ_GEQ(c_end, rsm->r_end)) {
7160                 /*
7161                  * We retransmitted the whole piece or more than the whole
7162                  * slopping into the next rsm.
7163                  */
7164                 rack_update_rsm(tp, rack, rsm, ts, add_flag);
7165                 if (c_end == rsm->r_end) {
7166                         *lenp = 0;
7167                         return (0);
7168                 } else {
7169                         int32_t act_len;
7170
7171                         /* Hangs over the end return whats left */
7172                         act_len = rsm->r_end - rsm->r_start;
7173                         *lenp = (len - act_len);
7174                         return (rsm->r_end);
7175                 }
7176                 /* We don't get out of this block. */
7177         }
7178         /*
7179          * Here we retransmitted less than the whole thing which means we
7180          * have to split this into what was transmitted and what was not.
7181          */
7182         nrsm = rack_alloc_full_limit(rack);
7183         if (nrsm == NULL) {
7184                 /*
7185                  * We can't get memory, so lets not proceed.
7186                  */
7187                 *lenp = 0;
7188                 return (0);
7189         }
7190         /*
7191          * So here we are going to take the original rsm and make it what we
7192          * retransmitted. nrsm will be the tail portion we did not
7193          * retransmit. For example say the chunk was 1, 11 (10 bytes). And
7194          * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
7195          * 1, 6 and the new piece will be 6, 11.
7196          */
7197         rack_clone_rsm(rack, nrsm, rsm, c_end);
7198         nrsm->r_dupack = 0;
7199         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
             /*
              * Insert the tail piece in the RB tree; with INVARIANTS a
              * failed insert (non-NULL return) panics.
              */
7200 #ifndef INVARIANTS
7201         (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7202 #else
7203         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7204         if (insret != NULL) {
7205                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7206                       nrsm, insret, rack, rsm);
7207         }
7208 #endif
7209         if (rsm->r_in_tmap) {
7210                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7211                 nrsm->r_in_tmap = 1;
7212         }
             /* The front piece no longer carries the FIN flag. */
7213         rsm->r_flags &= (~RACK_HAS_FIN);
7214         rack_update_rsm(tp, rack, rsm, ts, add_flag);
7215         /* Log a split of rsm into rsm and nrsm */
7216         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7217         *lenp = 0;
7218         return (0);
7219 }
7220
7221 static void
7222 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
7223                 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
7224                 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
7225 {
7226         struct tcp_rack *rack;
7227         struct rack_sendmap *rsm, *nrsm, fe;
7228 #ifdef INVARIANTS
7229         struct rack_sendmap *insret;
7230 #endif
7231         register uint32_t snd_max, snd_una;
7232
7233         /*
7234          * Add to the RACK log of packets in flight or retransmitted. If
7235          * there is a TS option we will use the TS echoed, if not we will
7236          * grab a TS.
7237          *
7238          * Retransmissions will increment the count and move the ts to its
7239          * proper place. Note that if options do not include TS's then we
7240          * won't be able to effectively use the ACK for an RTT on a retran.
7241          *
7242          * Notes about r_start and r_end. Lets consider a send starting at
7243          * sequence 1 for 10 bytes. In such an example the r_start would be
7244          * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
7245          * This means that r_end is actually the first sequence for the next
7246          * slot (11).
7247          *
7248          */
7249         /*
7250          * If err is set what do we do XXXrrs? should we not add the thing?
7251          * -- i.e. return if err != 0 or should we pretend we sent it? --
7252          * i.e. proceed with add ** do this for now.
7253          */
7254         INP_WLOCK_ASSERT(tp->t_inpcb);
7255         if (err)
7256                 /*
7257                  * We don't log errors -- we could but snd_max does not
7258                  * advance in this case either.
7259                  */
7260                 return;
7261
7262         if (th_flags & TH_RST) {
7263                 /*
7264                  * We don't log resets and we return immediately from
7265                  * sending
7266                  */
7267                 return;
7268         }
7269         rack = (struct tcp_rack *)tp->t_fb_ptr;
7270         snd_una = tp->snd_una;
7271         snd_max = tp->snd_max;
7272         if (th_flags & (TH_SYN | TH_FIN)) {
7273                 /*
7274                  * The call to rack_log_output is made before bumping
7275                  * snd_max. This means we can record one extra byte on a SYN
7276                  * or FIN if seq_out is adding more on and a FIN is present
7277                  * (and we are not resending).
7278                  */
7279                 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
7280                         len++;
7281                 if (th_flags & TH_FIN)
7282                         len++;
7283                 if (SEQ_LT(snd_max, tp->snd_nxt)) {
7284                         /*
7285                          * The add/update as not been done for the FIN/SYN
7286                          * yet.
7287                          */
7288                         snd_max = tp->snd_nxt;
7289                 }
7290         }
7291         if (SEQ_LEQ((seq_out + len), snd_una)) {
7292                 /* Are sending an old segment to induce an ack (keep-alive)? */
7293                 return;
7294         }
7295         if (SEQ_LT(seq_out, snd_una)) {
7296                 /* huh? should we panic? */
7297                 uint32_t end;
7298
7299                 end = seq_out + len;
7300                 seq_out = snd_una;
7301                 if (SEQ_GEQ(end, seq_out))
7302                         len = end - seq_out;
7303                 else
7304                         len = 0;
7305         }
7306         if (len == 0) {
7307                 /* We don't log zero window probes */
7308                 return;
7309         }
7310         if (IN_FASTRECOVERY(tp->t_flags)) {
7311                 rack->r_ctl.rc_prr_out += len;
7312         }
7313         /* First question is it a retransmission or new? */
7314         if (seq_out == snd_max) {
7315                 /* Its new */
7316 again:
7317                 rsm = rack_alloc(rack);
7318                 if (rsm == NULL) {
7319                         /*
7320                          * Hmm out of memory and the tcb got destroyed while
7321                          * we tried to wait.
7322                          */
7323                         return;
7324                 }
7325                 if (th_flags & TH_FIN) {
7326                         rsm->r_flags = RACK_HAS_FIN|add_flag;
7327                 } else {
7328                         rsm->r_flags = add_flag;
7329                 }
7330                 if (hw_tls)
7331                         rsm->r_hw_tls = 1;
7332                 rsm->r_tim_lastsent[0] = cts;
7333                 rsm->r_rtr_cnt = 1;
7334                 rsm->r_rtr_bytes = 0;
7335                 if (th_flags & TH_SYN) {
7336                         /* The data space is one beyond snd_una */
7337                         rsm->r_flags |= RACK_HAS_SYN;
7338                 }
7339                 rsm->r_start = seq_out;
7340                 rsm->r_end = rsm->r_start + len;
7341                 rsm->r_dupack = 0;
7342                 /*
7343                  * save off the mbuf location that
7344                  * sndmbuf_noadv returned (which is
7345                  * where we started copying from)..
7346                  */
7347                 rsm->m = s_mb;
7348                 rsm->soff = s_moff;
7349                 /*
7350                  * Here we do add in the len of send, since its not yet
7351                  * reflected in in snduna <->snd_max
7352                  */
7353                 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
7354                                               rack->r_ctl.rc_sacked) +
7355                               (rsm->r_end - rsm->r_start));
7356                 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
7357                 if (rsm->m) {
7358                         if (rsm->m->m_len <= rsm->soff) {
7359                                 /*
7360                                  * XXXrrs Question, will this happen?
7361                                  *
7362                                  * If sbsndptr is set at the correct place
7363                                  * then s_moff should always be somewhere
7364                                  * within rsm->m. But if the sbsndptr was
7365                                  * off then that won't be true. If it occurs
7366                                  * we need to walkout to the correct location.
7367                                  */
7368                                 struct mbuf *lm;
7369
7370                                 lm = rsm->m;
7371                                 while (lm->m_len <= rsm->soff) {
7372                                         rsm->soff -= lm->m_len;
7373                                         lm = lm->m_next;
7374                                         KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
7375                                                              __func__, rack, s_moff, s_mb, rsm->soff));
7376                                 }
7377                                 rsm->m = lm;
7378                         }
7379                         rsm->orig_m_len = rsm->m->m_len;
7380                 } else
7381                         rsm->orig_m_len = 0;
7382                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7383                 /* Log a new rsm */
7384                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
7385 #ifndef INVARIANTS
7386                 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7387 #else
7388                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7389                 if (insret != NULL) {
7390                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7391                               nrsm, insret, rack, rsm);
7392                 }
7393 #endif
7394                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7395                 rsm->r_in_tmap = 1;
7396                 /*
7397                  * Special case detection, is there just a single
7398                  * packet outstanding when we are not in recovery?
7399                  *
7400                  * If this is true mark it so.
7401                  */
7402                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
7403                     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
7404                         struct rack_sendmap *prsm;
7405
7406                         prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
7407                         if (prsm)
7408                                 prsm->r_one_out_nr = 1;
7409                 }
7410                 return;
7411         }
7412         /*
7413          * If we reach here its a retransmission and we need to find it.
7414          */
7415         memset(&fe, 0, sizeof(fe));
7416 more:
7417         if (hintrsm && (hintrsm->r_start == seq_out)) {
7418                 rsm = hintrsm;
7419                 hintrsm = NULL;
7420         } else {
7421                 /* No hints sorry */
7422                 rsm = NULL;
7423         }
7424         if ((rsm) && (rsm->r_start == seq_out)) {
7425                 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7426                 if (len == 0) {
7427                         return;
7428                 } else {
7429                         goto more;
7430                 }
7431         }
7432         /* Ok it was not the last pointer go through it the hard way. */
7433 refind:
7434         fe.r_start = seq_out;
7435         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
7436         if (rsm) {
7437                 if (rsm->r_start == seq_out) {
7438                         seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
7439                         if (len == 0) {
7440                                 return;
7441                         } else {
7442                                 goto refind;
7443                         }
7444                 }
7445                 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
7446                         /* Transmitted within this piece */
7447                         /*
7448                          * Ok we must split off the front and then let the
7449                          * update do the rest
7450                          */
7451                         nrsm = rack_alloc_full_limit(rack);
7452                         if (nrsm == NULL) {
7453                                 rack_update_rsm(tp, rack, rsm, cts, add_flag);
7454                                 return;
7455                         }
7456                         /*
7457                          * copy rsm to nrsm and then trim the front of rsm
7458                          * to not include this part.
7459                          */
7460                         rack_clone_rsm(rack, nrsm, rsm, seq_out);
7461                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7462 #ifndef INVARIANTS
7463                         (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7464 #else
7465                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
7466                         if (insret != NULL) {
7467                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
7468                                       nrsm, insret, rack, rsm);
7469                         }
7470 #endif
7471                         if (rsm->r_in_tmap) {
7472                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7473                                 nrsm->r_in_tmap = 1;
7474                         }
7475                         rsm->r_flags &= (~RACK_HAS_FIN);
7476                         seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
7477                         if (len == 0) {
7478                                 return;
7479                         } else if (len > 0)
7480                                 goto refind;
7481                 }
7482         }
7483         /*
7484          * Hmm not found in map did they retransmit both old and on into the
7485          * new?
7486          */
7487         if (seq_out == tp->snd_max) {
7488                 goto again;
7489         } else if (SEQ_LT(seq_out, tp->snd_max)) {
7490 #ifdef INVARIANTS
7491                 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
7492                        seq_out, len, tp->snd_una, tp->snd_max);
7493                 printf("Starting Dump of all rack entries\n");
7494                 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
7495                         printf("rsm:%p start:%u end:%u\n",
7496                                rsm, rsm->r_start, rsm->r_end);
7497                 }
7498                 printf("Dump complete\n");
7499                 panic("seq_out not found rack:%p tp:%p",
7500                       rack, tp);
7501 #endif
7502         } else {
7503 #ifdef INVARIANTS
7504                 /*
7505                  * Hmm beyond sndmax? (only if we are using the new rtt-pack
7506                  * flag)
7507                  */
7508                 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
7509                       seq_out, len, tp->snd_max, tp);
7510 #endif
7511         }
7512 }
7513
7514 /*
7515  * Record one of the RTT updates from an ack into
7516  * our sample structure.
7517  */
7518
7519 static void
7520 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
7521                     int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
7522 {
7523         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7524             (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
7525                 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
7526         }
7527         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7528             (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
7529                 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
7530         }
7531         if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
7532             if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
7533                 rack->r_ctl.rc_gp_lowrtt = us_rtt;
7534             if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
7535                     rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
7536         }
7537         if ((confidence == 1) &&
7538             ((rsm == NULL) ||
7539              (rsm->r_just_ret) ||
7540              (rsm->r_one_out_nr &&
7541               len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
7542                 /*
7543                  * If the rsm had a just return
7544                  * hit it then we can't trust the
7545                  * rtt measurement for buffer deterimination
7546                  * Note that a confidence of 2, indicates
7547                  * SACK'd which overrides the r_just_ret or
7548                  * the r_one_out_nr. If it was a CUM-ACK and
7549                  * we had only two outstanding, but get an
7550                  * ack for only 1. Then that also lowers our
7551                  * confidence.
7552                  */
7553                 confidence = 0;
7554         }
7555         if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
7556             (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
7557                 if (rack->r_ctl.rack_rs.confidence == 0) {
7558                         /*
7559                          * We take anything with no current confidence
7560                          * saved.
7561                          */
7562                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7563                         rack->r_ctl.rack_rs.confidence = confidence;
7564                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7565                 } else if (confidence || rack->r_ctl.rack_rs.confidence) {
7566                         /*
7567                          * Once we have a confident number,
7568                          * we can update it with a smaller
7569                          * value since this confident number
7570                          * may include the DSACK time until
7571                          * the next segment (the second one) arrived.
7572                          */
7573                         rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
7574                         rack->r_ctl.rack_rs.confidence = confidence;
7575                         rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
7576                 }
7577         }
7578         rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
7579         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
7580         rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
7581         rack->r_ctl.rack_rs.rs_rtt_cnt++;
7582 }
7583
7584 /*
7585  * Collect new round-trip time estimate
7586  * and update averages and current timeout.
7587  */
7588 static void
7589 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
7590 {
7591         int32_t delta;
7592         int32_t rtt;
7593
7594         if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
7595                 /* No valid sample */
7596                 return;
7597         if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
7598                 /* We are to use the lowest RTT seen in a single ack */
7599                 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
7600         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
7601                 /* We are to use the highest RTT seen in a single ack */
7602                 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
7603         } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
7604                 /* We are to use the average RTT seen in a single ack */
7605                 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
7606                                 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
7607         } else {
7608 #ifdef INVARIANTS
7609                 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
7610 #endif
7611                 return;
7612         }
7613         if (rtt == 0)
7614                 rtt = 1;
7615         if (rack->rc_gp_rtt_set == 0) {
7616                 /*
7617                  * With no RTT we have to accept
7618                  * even one we are not confident of.
7619                  */
7620                 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
7621                 rack->rc_gp_rtt_set = 1;
7622         } else if (rack->r_ctl.rack_rs.confidence) {
7623                 /* update the running gp srtt */
7624                 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
7625                 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
7626         }
7627         if (rack->r_ctl.rack_rs.confidence) {
7628                 /*
7629                  * record the low and high for highly buffered path computation,
7630                  * we only do this if we are confident (not a retransmission).
7631                  */
7632                 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
7633                         rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7634                 }
7635                 if (rack->rc_highly_buffered == 0) {
7636                         /*
7637                          * Currently once we declare a path has
7638                          * highly buffered there is no going
7639                          * back, which may be a problem...
7640                          */
7641                         if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
7642                                 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
7643                                                      rack->r_ctl.rc_highest_us_rtt,
7644                                                      rack->r_ctl.rc_lowest_us_rtt,
7645                                                      RACK_RTTS_SEEHBP);
7646                                 rack->rc_highly_buffered = 1;
7647                         }
7648                 }
7649         }
7650         if ((rack->r_ctl.rack_rs.confidence) ||
7651             (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
7652                 /*
7653                  * If we are highly confident of it <or> it was
7654                  * never retransmitted we accept it as the last us_rtt.
7655                  */
7656                 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7657                 /* The lowest rtt can be set if its was not retransmited */
7658                 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
7659                         rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
7660                         if (rack->r_ctl.rc_lowest_us_rtt == 0)
7661                                 rack->r_ctl.rc_lowest_us_rtt = 1;
7662                 }
7663         }
7664         rack = (struct tcp_rack *)tp->t_fb_ptr;
7665         if (tp->t_srtt != 0) {
7666                 /*
7667                  * We keep a simple srtt in microseconds, like our rtt
7668                  * measurement. We don't need to do any tricks with shifting
7669                  * etc. Instead we just add in 1/8th of the new measurement
7670                  * and subtract out 1/8 of the old srtt. We do the same with
7671                  * the variance after finding the absolute value of the
7672                  * difference between this sample and the current srtt.
7673                  */
7674                 delta = tp->t_srtt - rtt;
7675                 /* Take off 1/8th of the current sRTT */
7676                 tp->t_srtt -= (tp->t_srtt >> 3);
7677                 /* Add in 1/8th of the new RTT just measured */
7678                 tp->t_srtt += (rtt >> 3);
7679                 if (tp->t_srtt <= 0)
7680                         tp->t_srtt = 1;
7681                 /* Now lets make the absolute value of the variance */
7682                 if (delta < 0)
7683                         delta = -delta;
7684                 /* Subtract out 1/8th */
7685                 tp->t_rttvar -= (tp->t_rttvar >> 3);
7686                 /* Add in 1/8th of the new variance we just saw */
7687                 tp->t_rttvar += (delta >> 3);
7688                 if (tp->t_rttvar <= 0)
7689                         tp->t_rttvar = 1;
7690                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
7691                         tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7692         } else {
7693                 /*
7694                  * No rtt measurement yet - use the unsmoothed rtt. Set the
7695                  * variance to half the rtt (so our first retransmit happens
7696                  * at 3*rtt).
7697                  */
7698                 tp->t_srtt = rtt;
7699                 tp->t_rttvar = rtt >> 1;
7700                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
7701         }
7702         rack->rc_srtt_measure_made = 1;
7703         KMOD_TCPSTAT_INC(tcps_rttupdated);
7704         tp->t_rttupdated++;
7705 #ifdef STATS
7706         if (rack_stats_gets_ms_rtt == 0) {
7707                 /* Send in the microsecond rtt used for rxt timeout purposes */
7708                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
7709         } else if (rack_stats_gets_ms_rtt == 1) {
7710                 /* Send in the millisecond rtt used for rxt timeout purposes */
7711                 int32_t ms_rtt;
7712
7713                 /* Round up */
7714                 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7715                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7716         } else if (rack_stats_gets_ms_rtt == 2) {
7717                 /* Send in the millisecond rtt has close to the path RTT as we can get  */
7718                 int32_t ms_rtt;
7719
7720                 /* Round up */
7721                 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
7722                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
7723         }  else {
7724                 /* Send in the microsecond rtt has close to the path RTT as we can get  */
7725                 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
7726         }
7727
7728 #endif
7729         /*
7730          * the retransmit should happen at rtt + 4 * rttvar. Because of the
7731          * way we do the smoothing, srtt and rttvar will each average +1/2
7732          * tick of bias.  When we compute the retransmit timer, we want 1/2
7733          * tick of rounding and 1 extra tick because of +-1/2 tick
7734          * uncertainty in the firing of the timer.  The bias will give us
7735          * exactly the 1.5 tick we need.  But, because the bias is
7736          * statistical, we have to test that we don't drop below the minimum
7737          * feasible timer (which is 2 ticks).
7738          */
7739         tp->t_rxtshift = 0;
7740         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
7741                       max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
7742         rack_log_rtt_sample(rack, rtt);
7743         tp->t_softerror = 0;
7744 }
7745
7746
7747 static void
7748 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
7749 {
7750         /*
7751          * Apply to filter the inbound us-rtt at us_cts.
7752          */
7753         uint32_t old_rtt;
7754
7755         old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
7756         apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
7757                                us_rtt, us_cts);
7758         if (old_rtt > us_rtt) {
7759                 /* We just hit a new lower rtt time */
7760                 rack_log_rtt_shrinks(rack,  us_cts,  old_rtt,
7761                                      __LINE__, RACK_RTTS_NEWRTT);
7762                 /*
7763                  * Only count it if its lower than what we saw within our
7764                  * calculated range.
7765                  */
7766                 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
7767                         if (rack_probertt_lower_within &&
7768                             rack->rc_gp_dyn_mul &&
7769                             (rack->use_fixed_rate == 0) &&
7770                             (rack->rc_always_pace)) {
7771                                 /*
7772                                  * We are seeing a new lower rtt very close
7773                                  * to the time that we would have entered probe-rtt.
7774                                  * This is probably due to the fact that a peer flow
7775                                  * has entered probe-rtt. Lets go in now too.
7776                                  */
7777                                 uint32_t val;
7778
7779                                 val = rack_probertt_lower_within * rack_time_between_probertt;
7780                                 val /= 100;
7781                                 if ((rack->in_probe_rtt == 0)  &&
7782                                     ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
7783                                         rack_enter_probertt(rack, us_cts);
7784                                 }
7785                         }
7786                         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
7787                 }
7788         }
7789 }
7790
7791 static int
7792 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
7793     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
7794 {
7795         uint32_t us_rtt;
7796         int32_t i, all;
7797         uint32_t t, len_acked;
7798
7799         if ((rsm->r_flags & RACK_ACKED) ||
7800             (rsm->r_flags & RACK_WAS_ACKED))
7801                 /* Already done */
7802                 return (0);
7803         if (rsm->r_no_rtt_allowed) {
7804                 /* Not allowed */
7805                 return (0);
7806         }
7807         if (ack_type == CUM_ACKED) {
7808                 if (SEQ_GT(th_ack, rsm->r_end)) {
7809                         len_acked = rsm->r_end - rsm->r_start;
7810                         all = 1;
7811                 } else {
7812                         len_acked = th_ack - rsm->r_start;
7813                         all = 0;
7814                 }
7815         } else {
7816                 len_acked = rsm->r_end - rsm->r_start;
7817                 all = 0;
7818         }
7819         if (rsm->r_rtr_cnt == 1) {
7820
7821                 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
7822                 if ((int)t <= 0)
7823                         t = 1;
7824                 if (!tp->t_rttlow || tp->t_rttlow > t)
7825                         tp->t_rttlow = t;
7826                 if (!rack->r_ctl.rc_rack_min_rtt ||
7827                     SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
7828                         rack->r_ctl.rc_rack_min_rtt = t;
7829                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
7830                                 rack->r_ctl.rc_rack_min_rtt = 1;
7831                         }
7832                 }
7833                 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
7834                         us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
7835                 else
7836                         us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
7837                 if (us_rtt == 0)
7838                         us_rtt = 1;
7839                 if (CC_ALGO(tp)->rttsample != NULL) {
7840                         /* Kick the RTT to the CC */
7841                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
7842                 }
7843                 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
7844                 if (ack_type == SACKED) {
7845                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
7846                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
7847                 } else {
7848                         /*
7849                          * We need to setup what our confidence
7850                          * is in this ack.
7851                          *
7852                          * If the rsm was app limited and it is
7853                          * less than a mss in length (the end
7854                          * of the send) then we have a gap. If we
7855                          * were app limited but say we were sending
7856                          * multiple MSS's then we are more confident
7857                          * in it.
7858                          *
7859                          * When we are not app-limited then we see if
7860                          * the rsm is being included in the current
7861                          * measurement, we tell this by the app_limited_needs_set
7862                          * flag.
7863                          *
7864                          * Note that being cwnd blocked is not applimited
7865                          * as well as the pacing delay between packets which
7866                          * are sending only 1 or 2 MSS's also will show up
7867                          * in the RTT. We probably need to examine this algorithm
7868                          * a bit more and enhance it to account for the delay
7869                          * between rsm's. We could do that by saving off the
7870                          * pacing delay of each rsm (in an rsm) and then
7871                          * factoring that in somehow though for now I am
7872                          * not sure how :)
7873                          */
7874                         int calc_conf = 0;
7875
7876                         if (rsm->r_flags & RACK_APP_LIMITED) {
7877                                 if (all && (len_acked <= ctf_fixed_maxseg(tp)))
7878                                         calc_conf = 0;
7879                                 else
7880                                         calc_conf = 1;
7881                         } else if (rack->app_limited_needs_set == 0) {
7882                                 calc_conf = 1;
7883                         } else {
7884                                 calc_conf = 0;
7885                         }
7886                         rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
7887                         tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
7888                                             calc_conf, rsm, rsm->r_rtr_cnt);
7889                 }
7890                 if ((rsm->r_flags & RACK_TLP) &&
7891                     (!IN_FASTRECOVERY(tp->t_flags))) {
7892                         /* Segment was a TLP and our retrans matched */
7893                         if (rack->r_ctl.rc_tlp_cwnd_reduce) {
7894                                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
7895                         }
7896                 }
7897                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
7898                         /* New more recent rack_tmit_time */
7899                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
7900                         rack->rc_rack_rtt = t;
7901                 }
7902                 return (1);
7903         }
7904         /*
7905          * We clear the soft/rxtshift since we got an ack.
7906          * There is no assurance we will call the commit() function
7907          * so we need to clear these to avoid incorrect handling.
7908          */
7909         tp->t_rxtshift = 0;
7910         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
7911                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
7912         tp->t_softerror = 0;
7913         if (to && (to->to_flags & TOF_TS) &&
7914             (ack_type == CUM_ACKED) &&
7915             (to->to_tsecr) &&
7916             ((rsm->r_flags & RACK_OVERMAX) == 0)) {
7917                 /*
7918                  * Now which timestamp does it match? In this block the ACK
7919                  * must be coming from a previous transmission.
7920                  */
7921                 for (i = 0; i < rsm->r_rtr_cnt; i++) {
7922                         if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
7923                                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
7924                                 if ((int)t <= 0)
7925                                         t = 1;
7926                                 if (CC_ALGO(tp)->rttsample != NULL) {
7927                                         /*
7928                                          * Kick the RTT to the CC, here
7929                                          * we lie a bit in that we know the
7930                                          * retransmission is correct even though
7931                                          * we retransmitted. This is because
7932                                          * we match the timestamps.
7933                                          */
7934                                         if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
7935                                                 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
7936                                         else
7937                                                 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
7938                                         CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
7939                                 }
7940                                 if ((i + 1) < rsm->r_rtr_cnt) {
7941                                         /*
7942                                          * The peer ack'd from our previous
7943                                          * transmission. We have a spurious
7944                                          * retransmission and thus we dont
7945                                          * want to update our rack_rtt.
7946                                          *
7947                                          * Hmm should there be a CC revert here?
7948                                          *
7949                                          */
7950                                         return (0);
7951                                 }
7952                                 if (!tp->t_rttlow || tp->t_rttlow > t)
7953                                         tp->t_rttlow = t;
7954                                 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
7955                                         rack->r_ctl.rc_rack_min_rtt = t;
7956                                         if (rack->r_ctl.rc_rack_min_rtt == 0) {
7957                                                 rack->r_ctl.rc_rack_min_rtt = 1;
7958                                         }
7959                                 }
7960                                 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
7961                                            (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
7962                                         /* New more recent rack_tmit_time */
7963                                         rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
7964                                         rack->rc_rack_rtt = t;
7965                                 }
7966                                 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
7967                                 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
7968                                                     rsm->r_rtr_cnt);
7969                                 return (1);
7970                         }
7971                 }
7972                 goto ts_not_found;
7973         } else {
7974                 /*
7975                  * Ok its a SACK block that we retransmitted. or a windows
7976                  * machine without timestamps. We can tell nothing from the
7977                  * time-stamp since its not there or the time the peer last
7978                  * received a segment that moved forward its cum-ack point.
7979                  */
7980 ts_not_found:
7981                 i = rsm->r_rtr_cnt - 1;
7982                 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
7983                 if ((int)t <= 0)
7984                         t = 1;
7985                 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
7986                         /*
7987                          * We retransmitted and the ack came back in less
7988                          * than the smallest rtt we have observed. We most
7989                          * likely did an improper retransmit as outlined in
7990                          * 6.2 Step 2 point 2 in the rack-draft so we
7991                          * don't want to update our rack_rtt. We in
7992                          * theory (in future) might want to think about reverting our
7993                          * cwnd state but we won't for now.
7994                          */
7995                         return (0);
7996                 } else if (rack->r_ctl.rc_rack_min_rtt) {
7997                         /*
7998                          * We retransmitted it and the retransmit did the
7999                          * job.
8000                          */
8001                         if (!rack->r_ctl.rc_rack_min_rtt ||
8002                             SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8003                                 rack->r_ctl.rc_rack_min_rtt = t;
8004                                 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8005                                         rack->r_ctl.rc_rack_min_rtt = 1;
8006                                 }
8007                         }
8008                         if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
8009                                 /* New more recent rack_tmit_time */
8010                                 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
8011                                 rack->rc_rack_rtt = t;
8012                         }
8013                         return (1);
8014                 }
8015         }
8016         return (0);
8017 }
8018
8019 /*
8020  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
8021  */
8022 static void
8023 rack_log_sack_passed(struct tcpcb *tp,
8024     struct tcp_rack *rack, struct rack_sendmap *rsm)
8025 {
8026         struct rack_sendmap *nrsm;
8027
8028         nrsm = rsm;
8029         TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
8030             rack_head, r_tnext) {
8031                 if (nrsm == rsm) {
8032                         /* Skip orginal segment he is acked */
8033                         continue;
8034                 }
8035                 if (nrsm->r_flags & RACK_ACKED) {
8036                         /*
8037                          * Skip ack'd segments, though we
8038                          * should not see these, since tmap
8039                          * should not have ack'd segments.
8040                          */
8041                         continue;
8042                 }
8043                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8044                         /*
8045                          * We found one that is already marked
8046                          * passed, we have been here before and
8047                          * so all others below this are marked.
8048                          */
8049                         break;
8050                 }
8051                 nrsm->r_flags |= RACK_SACK_PASSED;
8052                 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
8053         }
8054 }
8055
8056 static void
8057 rack_need_set_test(struct tcpcb *tp,
8058                    struct tcp_rack *rack,
8059                    struct rack_sendmap *rsm,
8060                    tcp_seq th_ack,
8061                    int line,
8062                    int use_which)
8063 {
8064
8065         if ((tp->t_flags & TF_GPUTINPROG) &&
8066             SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8067                 /*
8068                  * We were app limited, and this ack
8069                  * butts up or goes beyond the point where we want
8070                  * to start our next measurement. We need
8071                  * to record the new gput_ts as here and
8072                  * possibly update the start sequence.
8073                  */
8074                 uint32_t seq, ts;
8075
8076                 if (rsm->r_rtr_cnt > 1) {
8077                         /*
8078                          * This is a retransmit, can we
8079                          * really make any assessment at this
8080                          * point?  We are not really sure of
8081                          * the timestamp, is it this or the
8082                          * previous transmission?
8083                          *
8084                          * Lets wait for something better that
8085                          * is not retransmitted.
8086                          */
8087                         return;
8088                 }
8089                 seq = tp->gput_seq;
8090                 ts = tp->gput_ts;
8091                 rack->app_limited_needs_set = 0;
8092                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
8093                 /* Do we start at a new end? */
8094                 if ((use_which == RACK_USE_BEG) &&
8095                     SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
8096                         /*
8097                          * When we get an ACK that just eats
8098                          * up some of the rsm, we set RACK_USE_BEG
8099                          * since whats at r_start (i.e. th_ack)
8100                          * is left unacked and thats where the
8101                          * measurement not starts.
8102                          */
8103                         tp->gput_seq = rsm->r_start;
8104                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8105                 }
8106                 if ((use_which == RACK_USE_END) &&
8107                     SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
8108                             /*
8109                              * We use the end when the cumack
8110                              * is moving forward and completely
8111                              * deleting the rsm passed so basically
8112                              * r_end holds th_ack.
8113                              *
8114                              * For SACK's we also want to use the end
8115                              * since this piece just got sacked and
8116                              * we want to target anything after that
8117                              * in our measurement.
8118                              */
8119                             tp->gput_seq = rsm->r_end;
8120                             rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8121                 }
8122                 if (use_which == RACK_USE_END_OR_THACK) {
8123                         /*
8124                          * special case for ack moving forward,
8125                          * not a sack, we need to move all the
8126                          * way up to where this ack cum-ack moves
8127                          * to.
8128                          */
8129                         if (SEQ_GT(th_ack, rsm->r_end))
8130                                 tp->gput_seq = th_ack;
8131                         else
8132                                 tp->gput_seq = rsm->r_end;
8133                         rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8134                 }
8135                 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
8136                         /*
8137                          * We moved beyond this guy's range, re-calculate
8138                          * the new end point.
8139                          */
8140                         if (rack->rc_gp_filled == 0) {
8141                                 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
8142                         } else {
8143                                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
8144                         }
8145                 }
8146                 /*
8147                  * We are moving the goal post, we may be able to clear the
8148                  * measure_saw_probe_rtt flag.
8149                  */
8150                 if ((rack->in_probe_rtt == 0) &&
8151                     (rack->measure_saw_probe_rtt) &&
8152                     (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
8153                         rack->measure_saw_probe_rtt = 0;
8154                 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
8155                                            seq, tp->gput_seq, 0, 5, line, NULL, 0);
8156                 if (rack->rc_gp_filled &&
8157                     ((tp->gput_ack - tp->gput_seq) <
8158                      max(rc_init_window(rack), (MIN_GP_WIN *
8159                                                 ctf_fixed_maxseg(tp))))) {
8160                         uint32_t ideal_amount;
8161
8162                         ideal_amount = rack_get_measure_window(tp, rack);
8163                         if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
8164                                 /*
8165                                  * There is no sense of continuing this measurement
8166                                  * because its too small to gain us anything we
8167                                  * trust. Skip it and that way we can start a new
8168                                  * measurement quicker.
8169                                  */
8170                                 tp->t_flags &= ~TF_GPUTINPROG;
8171                                 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
8172                                                            0, 0, 0, 6, __LINE__, NULL, 0);
8173                         } else {
8174                                 /*
8175                                  * Reset the window further out.
8176                                  */
8177                                 tp->gput_ack = tp->gput_seq + ideal_amount;
8178                         }
8179                 }
8180         }
8181 }
8182
8183 static inline int
8184 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
8185 {
8186         if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
8187                 /* Behind our TLP definition or right at */
8188                 return (0);
8189         }
8190         if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
8191                 /* The start is beyond or right at our end of TLP definition */
8192                 return (0);
8193         }
8194         /* It has to be a sub-part of the original TLP recorded */
8195         return (1);
8196 }
8197
8198
8199 static uint32_t
8200 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
8201                    struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
8202 {
8203         uint32_t start, end, changed = 0;
8204         struct rack_sendmap stack_map;
8205         struct rack_sendmap *rsm, *nrsm, fe, *prev, *next;
8206 #ifdef INVARIANTS
8207         struct rack_sendmap *insret;
8208 #endif
8209         int32_t used_ref = 1;
8210         int moved = 0;
8211
8212         start = sack->start;
8213         end = sack->end;
8214         rsm = *prsm;
8215         memset(&fe, 0, sizeof(fe));
8216 do_rest_ofb:
8217         if ((rsm == NULL) ||
8218             (SEQ_LT(end, rsm->r_start)) ||
8219             (SEQ_GEQ(start, rsm->r_end)) ||
8220             (SEQ_LT(start, rsm->r_start))) {
8221                 /*
8222                  * We are not in the right spot,
8223                  * find the correct spot in the tree.
8224                  */
8225                 used_ref = 0;
8226                 fe.r_start = start;
8227                 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
8228                 moved++;
8229         }
8230         if (rsm == NULL) {
8231                 /* TSNH */
8232                 goto out;
8233         }
8234         /* Ok we have an ACK for some piece of this rsm */
8235         if (rsm->r_start != start) {
8236                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8237                         /*
8238                          * Before any splitting or hookery is
8239                          * done is it a TLP of interest i.e. rxt?
8240                          */
8241                         if ((rsm->r_flags & RACK_TLP) &&
8242                             (rsm->r_rtr_cnt > 1)) {
8243                                 /*
8244                                  * We are splitting a rxt TLP, check
8245                                  * if we need to save off the start/end
8246                                  */
8247                                 if (rack->rc_last_tlp_acked_set &&
8248                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8249                                         /*
8250                                          * We already turned this on since we are inside
8251                                          * the previous one was a partially sack now we
8252                                          * are getting another one (maybe all of it).
8253                                          *
8254                                          */
8255                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8256                                         /*
8257                                          * Lets make sure we have all of it though.
8258                                          */
8259                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8260                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8261                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8262                                                                      rack->r_ctl.last_tlp_acked_end);
8263                                         }
8264                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8265                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8266                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8267                                                                      rack->r_ctl.last_tlp_acked_end);
8268                                         }
8269                                 } else {
8270                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8271                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8272                                         rack->rc_last_tlp_past_cumack = 0;
8273                                         rack->rc_last_tlp_acked_set = 1;
8274                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8275                                 }
8276                         }
8277                         /**
8278                          * Need to split this in two pieces the before and after,
8279                          * the before remains in the map, the after must be
8280                          * added. In other words we have:
8281                          * rsm        |--------------|
8282                          * sackblk        |------->
8283                          * rsm will become
8284                          *     rsm    |---|
8285                          * and nrsm will be  the sacked piece
8286                          *     nrsm       |----------|
8287                          *
8288                          * But before we start down that path lets
8289                          * see if the sack spans over on top of
8290                          * the next guy and it is already sacked.
8291                          *
8292                          */
8293                         next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8294                         if (next && (next->r_flags & RACK_ACKED) &&
8295                             SEQ_GEQ(end, next->r_start)) {
8296                                 /**
8297                                  * So the next one is already acked, and
8298                                  * we can thus by hookery use our stack_map
8299                                  * to reflect the piece being sacked and
8300                                  * then adjust the two tree entries moving
8301                                  * the start and ends around. So we start like:
8302                                  *  rsm     |------------|             (not-acked)
8303                                  *  next                 |-----------| (acked)
8304                                  *  sackblk        |-------->
8305                                  *  We want to end like so:
8306                                  *  rsm     |------|                   (not-acked)
8307                                  *  next           |-----------------| (acked)
8308                                  *  nrsm           |-----|
8309                                  * Where nrsm is a temporary stack piece we
8310                                  * use to update all the gizmos.
8311                                  */
8312                                 /* Copy up our fudge block */
8313                                 nrsm = &stack_map;
8314                                 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8315                                 /* Now adjust our tree blocks */
8316                                 rsm->r_end = start;
8317                                 next->r_start = start;
8318                                 /* Now we must adjust back where next->m is */
8319                                 rack_setup_offset_for_rsm(rsm, next);
8320
8321                                 /* We don't need to adjust rsm, it did not change */
8322                                 /* Clear out the dup ack count of the remainder */
8323                                 rsm->r_dupack = 0;
8324                                 rsm->r_just_ret = 0;
8325                                 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8326                                 /* Now lets make sure our fudge block is right */
8327                                 nrsm->r_start = start;
8328                                 /* Now lets update all the stats and such */
8329                                 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8330                                 if (rack->app_limited_needs_set)
8331                                         rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8332                                 changed += (nrsm->r_end - nrsm->r_start);
8333                                 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8334                                 if (nrsm->r_flags & RACK_SACK_PASSED) {
8335                                         rack->r_ctl.rc_reorder_ts = cts;
8336                                 }
8337                                 /*
8338                                  * Now we want to go up from rsm (the
8339                                  * one left un-acked) to the next one
8340                                  * in the tmap. We do this so when
8341                                  * we walk backwards we include marking
8342                                  * sack-passed on rsm (The one passed in
8343                                  * is skipped since it is generally called
8344                                  * on something sacked before removing it
8345                                  * from the tmap).
8346                                  */
8347                                 if (rsm->r_in_tmap) {
8348                                         nrsm = TAILQ_NEXT(rsm, r_tnext);
8349                                         /*
8350                                          * Now that we have the next
8351                                          * one walk backwards from there.
8352                                          */
8353                                         if (nrsm && nrsm->r_in_tmap)
8354                                                 rack_log_sack_passed(tp, rack, nrsm);
8355                                 }
8356                                 /* Now are we done? */
8357                                 if (SEQ_LT(end, next->r_end) ||
8358                                     (end == next->r_end)) {
8359                                         /* Done with block */
8360                                         goto out;
8361                                 }
8362                                 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
8363                                 counter_u64_add(rack_sack_used_next_merge, 1);
8364                                 /* Postion for the next block */
8365                                 start = next->r_end;
8366                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
8367                                 if (rsm == NULL)
8368                                         goto out;
8369                         } else {
8370                                 /**
8371                                  * We can't use any hookery here, so we
8372                                  * need to split the map. We enter like
8373                                  * so:
8374                                  *  rsm      |--------|
8375                                  *  sackblk       |----->
8376                                  * We will add the new block nrsm and
8377                                  * that will be the new portion, and then
8378                                  * fall through after reseting rsm. So we
8379                                  * split and look like this:
8380                                  *  rsm      |----|
8381                                  *  sackblk       |----->
8382                                  *  nrsm          |---|
8383                                  * We then fall through reseting
8384                                  * rsm to nrsm, so the next block
8385                                  * picks it up.
8386                                  */
8387                                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8388                                 if (nrsm == NULL) {
8389                                         /*
8390                                          * failed XXXrrs what can we do but lose the sack
8391                                          * info?
8392                                          */
8393                                         goto out;
8394                                 }
8395                                 counter_u64_add(rack_sack_splits, 1);
8396                                 rack_clone_rsm(rack, nrsm, rsm, start);
8397                                 rsm->r_just_ret = 0;
8398 #ifndef INVARIANTS
8399                                 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8400 #else
8401                                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8402                                 if (insret != NULL) {
8403                                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8404                                               nrsm, insret, rack, rsm);
8405                                 }
8406 #endif
8407                                 if (rsm->r_in_tmap) {
8408                                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8409                                         nrsm->r_in_tmap = 1;
8410                                 }
8411                                 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
8412                                 rsm->r_flags &= (~RACK_HAS_FIN);
8413                                 /* Position us to point to the new nrsm that starts the sack blk */
8414                                 rsm = nrsm;
8415                         }
8416                 } else {
8417                         /* Already sacked this piece */
8418                         counter_u64_add(rack_sack_skipped_acked, 1);
8419                         moved++;
8420                         if (end == rsm->r_end) {
8421                                 /* Done with block */
8422                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8423                                 goto out;
8424                         } else if (SEQ_LT(end, rsm->r_end)) {
8425                                 /* A partial sack to a already sacked block */
8426                                 moved++;
8427                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8428                                 goto out;
8429                         } else {
8430                                 /*
8431                                  * The end goes beyond this guy
8432                                  * reposition the start to the
8433                                  * next block.
8434                                  */
8435                                 start = rsm->r_end;
8436                                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8437                                 if (rsm == NULL)
8438                                         goto out;
8439                         }
8440                 }
8441         }
8442         if (SEQ_GEQ(end, rsm->r_end)) {
8443                 /**
8444                  * The end of this block is either beyond this guy or right
8445                  * at this guy. I.e.:
8446                  *  rsm ---                 |-----|
8447                  *  end                     |-----|
8448                  *  <or>
8449                  *  end                     |---------|
8450                  */
8451                 if ((rsm->r_flags & RACK_ACKED) == 0) {
8452                         /*
8453                          * Is it a TLP of interest?
8454                          */
8455                         if ((rsm->r_flags & RACK_TLP) &&
8456                             (rsm->r_rtr_cnt > 1)) {
8457                                 /*
8458                                  * We are splitting a rxt TLP, check
8459                                  * if we need to save off the start/end
8460                                  */
8461                                 if (rack->rc_last_tlp_acked_set &&
8462                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8463                                         /*
8464                                          * We already turned this on since we are inside
8465                                          * the previous one was a partially sack now we
8466                                          * are getting another one (maybe all of it).
8467                                          */
8468                                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8469                                         /*
8470                                          * Lets make sure we have all of it though.
8471                                          */
8472                                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8473                                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8474                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8475                                                                      rack->r_ctl.last_tlp_acked_end);
8476                                         }
8477                                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8478                                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8479                                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8480                                                                      rack->r_ctl.last_tlp_acked_end);
8481                                         }
8482                                 } else {
8483                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8484                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8485                                         rack->rc_last_tlp_past_cumack = 0;
8486                                         rack->rc_last_tlp_acked_set = 1;
8487                                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8488                                 }
8489                         }
8490                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8491                         changed += (rsm->r_end - rsm->r_start);
8492                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8493                         if (rsm->r_in_tmap) /* should be true */
8494                                 rack_log_sack_passed(tp, rack, rsm);
8495                         /* Is Reordering occuring? */
8496                         if (rsm->r_flags & RACK_SACK_PASSED) {
8497                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8498                                 rack->r_ctl.rc_reorder_ts = cts;
8499                         }
8500                         if (rack->app_limited_needs_set)
8501                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8502                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8503                         rsm->r_flags |= RACK_ACKED;
8504                         if (rsm->r_in_tmap) {
8505                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8506                                 rsm->r_in_tmap = 0;
8507                         }
8508                         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
8509                 } else {
8510                         counter_u64_add(rack_sack_skipped_acked, 1);
8511                         moved++;
8512                 }
8513                 if (end == rsm->r_end) {
8514                         /* This block only - done, setup for next */
8515                         goto out;
8516                 }
8517                 /*
8518                  * There is more not coverend by this rsm move on
8519                  * to the next block in the RB tree.
8520                  */
8521                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8522                 start = rsm->r_end;
8523                 rsm = nrsm;
8524                 if (rsm == NULL)
8525                         goto out;
8526                 goto do_rest_ofb;
8527         }
8528         /**
8529          * The end of this sack block is smaller than
8530          * our rsm i.e.:
8531          *  rsm ---                 |-----|
8532          *  end                     |--|
8533          */
8534         if ((rsm->r_flags & RACK_ACKED) == 0) {
8535                 /*
8536                  * Is it a TLP of interest?
8537                  */
8538                 if ((rsm->r_flags & RACK_TLP) &&
8539                     (rsm->r_rtr_cnt > 1)) {
8540                         /*
8541                          * We are splitting a rxt TLP, check
8542                          * if we need to save off the start/end
8543                          */
8544                         if (rack->rc_last_tlp_acked_set &&
8545                             (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8546                                 /*
8547                                  * We already turned this on since we are inside
8548                                  * the previous one was a partially sack now we
8549                                  * are getting another one (maybe all of it).
8550                                  */
8551                                 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8552                                 /*
8553                                  * Lets make sure we have all of it though.
8554                                  */
8555                                 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8556                                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8557                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8558                                                              rack->r_ctl.last_tlp_acked_end);
8559                                 }
8560                                 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8561                                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8562                                         rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8563                                                              rack->r_ctl.last_tlp_acked_end);
8564                                 }
8565                         } else {
8566                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8567                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8568                                 rack->rc_last_tlp_past_cumack = 0;
8569                                 rack->rc_last_tlp_acked_set = 1;
8570                                 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8571                         }
8572                 }
8573                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8574                 if (prev &&
8575                     (prev->r_flags & RACK_ACKED)) {
8576                         /**
8577                          * Goal, we want the right remainder of rsm to shrink
8578                          * in place and span from (rsm->r_start = end) to rsm->r_end.
8579                          * We want to expand prev to go all the way
8580                          * to prev->r_end <- end.
8581                          * so in the tree we have before:
8582                          *   prev     |--------|         (acked)
8583                          *   rsm               |-------| (non-acked)
8584                          *   sackblk           |-|
8585                          * We churn it so we end up with
8586                          *   prev     |----------|       (acked)
8587                          *   rsm                 |-----| (non-acked)
8588                          *   nrsm              |-| (temporary)
8589                          *
8590                          * Note if either prev/rsm is a TLP we don't
8591                          * do this.
8592                          */
8593                         nrsm = &stack_map;
8594                         memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
8595                         prev->r_end = end;
8596                         rsm->r_start = end;
8597                         /* Now adjust nrsm (stack copy) to be
8598                          * the one that is the small
8599                          * piece that was "sacked".
8600                          */
8601                         nrsm->r_end = end;
8602                         rsm->r_dupack = 0;
8603                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8604                         /*
8605                          * Now that the rsm has had its start moved forward
8606                          * lets go ahead and get its new place in the world.
8607                          */
8608                         rack_setup_offset_for_rsm(prev, rsm);
8609                         /*
8610                          * Now nrsm is our new little piece
8611                          * that is acked (which was merged
8612                          * to prev). Update the rtt and changed
8613                          * based on that. Also check for reordering.
8614                          */
8615                         rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
8616                         if (rack->app_limited_needs_set)
8617                                 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
8618                         changed += (nrsm->r_end - nrsm->r_start);
8619                         rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
8620                         if (nrsm->r_flags & RACK_SACK_PASSED) {
8621                                 rack->r_ctl.rc_reorder_ts = cts;
8622                         }
8623                         rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
8624                         rsm = prev;
8625                         counter_u64_add(rack_sack_used_prev_merge, 1);
8626                 } else {
8627                         /**
8628                          * This is the case where our previous
8629                          * block is not acked either, so we must
8630                          * split the block in two.
8631                          */
8632                         nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
8633                         if (nrsm == NULL) {
8634                                 /* failed rrs what can we do but loose the sack info? */
8635                                 goto out;
8636                         }
8637                         if ((rsm->r_flags & RACK_TLP) &&
8638                             (rsm->r_rtr_cnt > 1)) {
8639                                 /*
8640                                  * We are splitting a rxt TLP, check
8641                                  * if we need to save off the start/end
8642                                  */
8643                                 if (rack->rc_last_tlp_acked_set &&
8644                                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8645                                             /*
8646                                              * We already turned this on since this block is inside
8647                                              * the previous one was a partially sack now we
8648                                              * are getting another one (maybe all of it).
8649                                              */
8650                                             rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8651                                             /*
8652                                              * Lets make sure we have all of it though.
8653                                              */
8654                                             if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
8655                                                     rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8656                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8657                                                                          rack->r_ctl.last_tlp_acked_end);
8658                                             }
8659                                             if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
8660                                                     rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8661                                                     rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
8662                                                                          rack->r_ctl.last_tlp_acked_end);
8663                                             }
8664                                     } else {
8665                                             rack->r_ctl.last_tlp_acked_start = rsm->r_start;
8666                                             rack->r_ctl.last_tlp_acked_end = rsm->r_end;
8667                                             rack->rc_last_tlp_acked_set = 1;
8668                                             rack->rc_last_tlp_past_cumack = 0;
8669                                             rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
8670                                     }
8671                         }
8672                         /**
8673                          * In this case nrsm becomes
8674                          * nrsm->r_start = end;
8675                          * nrsm->r_end = rsm->r_end;
8676                          * which is un-acked.
8677                          * <and>
8678                          * rsm->r_end = nrsm->r_start;
8679                          * i.e. the remaining un-acked
8680                          * piece is left on the left
8681                          * hand side.
8682                          *
8683                          * So we start like this
8684                          * rsm      |----------| (not acked)
8685                          * sackblk  |---|
8686                          * build it so we have
8687                          * rsm      |---|         (acked)
8688                          * nrsm         |------|  (not acked)
8689                          */
8690                         counter_u64_add(rack_sack_splits, 1);
8691                         rack_clone_rsm(rack, nrsm, rsm, end);
8692                         rsm->r_flags &= (~RACK_HAS_FIN);
8693                         rsm->r_just_ret = 0;
8694 #ifndef INVARIANTS
8695                         (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8696 #else
8697                         insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
8698                         if (insret != NULL) {
8699                                 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
8700                                       nrsm, insret, rack, rsm);
8701                         }
8702 #endif
8703                         if (rsm->r_in_tmap) {
8704                                 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8705                                 nrsm->r_in_tmap = 1;
8706                         }
8707                         nrsm->r_dupack = 0;
8708                         rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
8709                         rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
8710                         changed += (rsm->r_end - rsm->r_start);
8711                         rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
8712                         if (rsm->r_in_tmap) /* should be true */
8713                                 rack_log_sack_passed(tp, rack, rsm);
8714                         /* Is Reordering occuring? */
8715                         if (rsm->r_flags & RACK_SACK_PASSED) {
8716                                 rsm->r_flags &= ~RACK_SACK_PASSED;
8717                                 rack->r_ctl.rc_reorder_ts = cts;
8718                         }
8719                         if (rack->app_limited_needs_set)
8720                                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
8721                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
8722                         rsm->r_flags |= RACK_ACKED;
8723                         rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
8724                         if (rsm->r_in_tmap) {
8725                                 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8726                                 rsm->r_in_tmap = 0;
8727                         }
8728                 }
8729         } else if (start != end){
8730                 /*
8731                  * The block was already acked.
8732                  */
8733                 counter_u64_add(rack_sack_skipped_acked, 1);
8734                 moved++;
8735         }
8736 out:
8737         if (rsm &&
8738             ((rsm->r_flags & RACK_TLP) == 0) &&
8739             (rsm->r_flags & RACK_ACKED)) {
8740                 /*
8741                  * Now can we merge where we worked
8742                  * with either the previous or
8743                  * next block?
8744                  */
8745                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8746                 while (next) {
8747                         if (next->r_flags & RACK_TLP)
8748                                 break;
8749                         if (next->r_flags & RACK_ACKED) {
8750                         /* yep this and next can be merged */
8751                                 rsm = rack_merge_rsm(rack, rsm, next);
8752                                 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8753                         } else
8754                                 break;
8755                 }
8756                 /* Now what about the previous? */
8757                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8758                 while (prev) {
8759                         if (prev->r_flags & RACK_TLP)
8760                                 break;
8761                         if (prev->r_flags & RACK_ACKED) {
8762                                 /* yep the previous and this can be merged */
8763                                 rsm = rack_merge_rsm(rack, prev, rsm);
8764                                 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8765                         } else
8766                                 break;
8767                 }
8768         }
8769         if (used_ref == 0) {
8770                 counter_u64_add(rack_sack_proc_all, 1);
8771         } else {
8772                 counter_u64_add(rack_sack_proc_short, 1);
8773         }
8774         /* Save off the next one for quick reference. */
8775         if (rsm)
8776                 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8777         else
8778                 nrsm = NULL;
8779         *prsm = rack->r_ctl.rc_sacklast = nrsm;
8780         /* Pass back the moved. */
8781         *moved_two = moved;
8782         return (changed);
8783 }
8784
8785 static void inline
8786 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
8787 {
8788         struct rack_sendmap *tmap;
8789
8790         tmap = NULL;
8791         while (rsm && (rsm->r_flags & RACK_ACKED)) {
8792                 /* Its no longer sacked, mark it so */
8793                 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
8794 #ifdef INVARIANTS
8795                 if (rsm->r_in_tmap) {
8796                         panic("rack:%p rsm:%p flags:0x%x in tmap?",
8797                               rack, rsm, rsm->r_flags);
8798                 }
8799 #endif
8800                 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
8801                 /* Rebuild it into our tmap */
8802                 if (tmap == NULL) {
8803                         TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8804                         tmap = rsm;
8805                 } else {
8806                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
8807                         tmap = rsm;
8808                 }
8809                 tmap->r_in_tmap = 1;
8810                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
8811         }
8812         /*
8813          * Now lets possibly clear the sack filter so we start
8814          * recognizing sacks that cover this area.
8815          */
8816         sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
8817
8818 }
8819
8820 static void
8821 rack_do_decay(struct tcp_rack *rack)
8822 {
8823         struct timeval res;
8824
8825 #define timersub(tvp, uvp, vvp)                                         \
8826         do {                                                            \
8827                 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
8828                 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
8829                 if ((vvp)->tv_usec < 0) {                               \
8830                         (vvp)->tv_sec--;                                \
8831                         (vvp)->tv_usec += 1000000;                      \
8832                 }                                                       \
8833         } while (0)
8834
8835         timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
8836 #undef timersub
8837
8838         rack->r_ctl.input_pkt++;
8839         if ((rack->rc_in_persist) ||
8840             (res.tv_sec >= 1) ||
8841             (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
8842                 /*
8843                  * Check for decay of non-SAD,
8844                  * we want all SAD detection metrics to
8845                  * decay 1/4 per second (or more) passed.
8846                  */
8847 #ifdef NETFLIX_EXP_DETECTION
8848                 uint32_t pkt_delta;
8849
8850                 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
8851 #endif
8852                 /* Update our saved tracking values */
8853                 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
8854                 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
8855                 /* Now do we escape without decay? */
8856 #ifdef NETFLIX_EXP_DETECTION
8857                 if (rack->rc_in_persist ||
8858                     (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
8859                     (pkt_delta < tcp_sad_low_pps)){
8860                         /*
8861                          * We don't decay idle connections
8862                          * or ones that have a low input pps.
8863                          */
8864                         return;
8865                 }
8866                 /* Decay the counters */
8867                 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
8868                                                         tcp_sad_decay_val);
8869                 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
8870                                                          tcp_sad_decay_val);
8871                 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
8872                                                                tcp_sad_decay_val);
8873                 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
8874                                                                 tcp_sad_decay_val);
8875 #endif
8876         }
8877 }
8878
8879 static void
8880 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
8881 {
8882         struct rack_sendmap *rsm;
8883 #ifdef INVARIANTS
8884         struct rack_sendmap *rm;
8885 #endif
8886
8887         /*
8888          * The ACK point is advancing to th_ack, we must drop off
8889          * the packets in the rack log and calculate any eligble
8890          * RTT's.
8891          */
8892         rack->r_wanted_output = 1;
8893
8894         /* Tend any TLP that has been marked for 1/2 the seq space (its old)  */
8895         if ((rack->rc_last_tlp_acked_set == 1)&&
8896             (rack->rc_last_tlp_past_cumack == 1) &&
8897             (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
8898                 /*
8899                  * We have reached the point where our last rack
8900                  * tlp retransmit sequence is ahead of the cum-ack.
8901                  * This can only happen when the cum-ack moves all
8902                  * the way around (its been a full 2^^31+1 bytes
8903                  * or more since we sent a retransmitted TLP). Lets
8904                  * turn off the valid flag since its not really valid.
8905                  *
8906                  * Note since sack's also turn on this event we have
8907                  * a complication, we have to wait to age it out until
8908                  * the cum-ack is by the TLP before checking which is
8909                  * what the next else clause does.
8910                  */
8911                 rack_log_dsack_event(rack, 9, __LINE__,
8912                                      rack->r_ctl.last_tlp_acked_start,
8913                                      rack->r_ctl.last_tlp_acked_end);
8914                 rack->rc_last_tlp_acked_set = 0;
8915                 rack->rc_last_tlp_past_cumack = 0;
8916         } else if ((rack->rc_last_tlp_acked_set == 1) &&
8917                    (rack->rc_last_tlp_past_cumack == 0) &&
8918                    (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
8919                 /*
8920                  * It is safe to start aging TLP's out.
8921                  */
8922                 rack->rc_last_tlp_past_cumack = 1;
8923         }
8924         /* We do the same for the tlp send seq as well */
8925         if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
8926             (rack->rc_last_sent_tlp_past_cumack == 1) &&
8927             (SEQ_GT(rack->r_ctl.last_sent_tlp_seq,  th_ack))) {
8928                 rack_log_dsack_event(rack, 9, __LINE__,
8929                                      rack->r_ctl.last_sent_tlp_seq,
8930                                      (rack->r_ctl.last_sent_tlp_seq +
8931                                       rack->r_ctl.last_sent_tlp_len));
8932                 rack->rc_last_sent_tlp_seq_valid = 0;
8933                 rack->rc_last_sent_tlp_past_cumack = 0;
8934         } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
8935                    (rack->rc_last_sent_tlp_past_cumack == 0) &&
8936                    (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
8937                 /*
8938                  * It is safe to start aging TLP's send.
8939                  */
8940                 rack->rc_last_sent_tlp_past_cumack = 1;
8941         }
8942 more:
8943         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
8944         if (rsm == NULL) {
8945                 if ((th_ack - 1) == tp->iss) {
8946                         /*
8947                          * For the SYN incoming case we will not
8948                          * have called tcp_output for the sending of
8949                          * the SYN, so there will be no map. All
8950                          * other cases should probably be a panic.
8951                          */
8952                         return;
8953                 }
8954                 if (tp->t_flags & TF_SENTFIN) {
8955                         /* if we sent a FIN we often will not have map */
8956                         return;
8957                 }
8958 #ifdef INVARIANTS
8959                 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
8960                       tp,
8961                       tp->t_state, th_ack, rack,
8962                       tp->snd_una, tp->snd_max, tp->snd_nxt);
8963 #endif
8964                 return;
8965         }
8966         if (SEQ_LT(th_ack, rsm->r_start)) {
8967                 /* Huh map is missing this */
8968 #ifdef INVARIANTS
8969                 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
8970                        rsm->r_start,
8971                        th_ack, tp->t_state, rack->r_state);
8972 #endif
8973                 return;
8974         }
8975         rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
8976
8977         /* Now was it a retransmitted TLP? */
8978         if ((rsm->r_flags & RACK_TLP) &&
8979             (rsm->r_rtr_cnt > 1)) {
8980                 /*
8981                  * Yes, this rsm was a TLP and retransmitted, remember that
8982                  * since if a DSACK comes back on this we don't want
8983                  * to think of it as a reordered segment. This may
8984                  * get updated again with possibly even other TLPs
8985                  * in flight, but thats ok. Only when we don't send
8986                  * a retransmitted TLP for 1/2 the sequences space
8987                  * will it get turned off (above).
8988                  */
8989                 if (rack->rc_last_tlp_acked_set &&
8990                     (is_rsm_inside_declared_tlp_block(rack, rsm))) {
8991                         /*
8992                          * We already turned this on since the end matches,
8993                          * the previous one was a partially ack now we
8994                          * are getting another one (maybe all of it).
8995                          */
8996                         rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
8997                         /*
8998                          * Lets make sure we have all of it though.
8999                          */
9000                         if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9001                                 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9002                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9003                                                      rack->r_ctl.last_tlp_acked_end);
9004                         }
9005                         if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9006                                 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9007                                 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9008                                                      rack->r_ctl.last_tlp_acked_end);
9009                         }
9010                 } else {
9011                         rack->rc_last_tlp_past_cumack = 1;
9012                         rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9013                         rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9014                         rack->rc_last_tlp_acked_set = 1;
9015                         rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9016                 }
9017         }
9018         /* Now do we consume the whole thing? */
9019         if (SEQ_GEQ(th_ack, rsm->r_end)) {
9020                 /* Its all consumed. */
9021                 uint32_t left;
9022                 uint8_t newly_acked;
9023
9024                 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
9025                 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
9026                 rsm->r_rtr_bytes = 0;
9027                 /* Record the time of highest cumack sent */
9028                 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9029 #ifndef INVARIANTS
9030                 (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
9031 #else
9032                 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
9033                 if (rm != rsm) {
9034                         panic("removing head in rack:%p rsm:%p rm:%p",
9035                               rack, rsm, rm);
9036                 }
9037 #endif
9038                 if (rsm->r_in_tmap) {
9039                         TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9040                         rsm->r_in_tmap = 0;
9041                 }
9042                 newly_acked = 1;
9043                 if (rsm->r_flags & RACK_ACKED) {
9044                         /*
9045                          * It was acked on the scoreboard -- remove
9046                          * it from total
9047                          */
9048                         rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
9049                         newly_acked = 0;
9050                 } else if (rsm->r_flags & RACK_SACK_PASSED) {
9051                         /*
9052                          * There are segments ACKED on the
9053                          * scoreboard further up. We are seeing
9054                          * reordering.
9055                          */
9056                         rsm->r_flags &= ~RACK_SACK_PASSED;
9057                         rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9058                         rsm->r_flags |= RACK_ACKED;
9059                         rack->r_ctl.rc_reorder_ts = cts;
9060                         if (rack->r_ent_rec_ns) {
9061                                 /*
9062                                  * We have sent no more, and we saw an sack
9063                                  * then ack arrive.
9064                                  */
9065                                 rack->r_might_revert = 1;
9066                         }
9067                 }
9068                 if ((rsm->r_flags & RACK_TO_REXT) &&
9069                     (tp->t_flags & TF_RCVD_TSTMP) &&
9070                     (to->to_flags & TOF_TS) &&
9071                     (to->to_tsecr != 0) &&
9072                     (tp->t_flags & TF_PREVVALID)) {
9073                         /*
9074                          * We can use the timestamp to see
9075                          * if this retransmission was from the
9076                          * first transmit. If so we made a mistake.
9077                          */
9078                         tp->t_flags &= ~TF_PREVVALID;
9079                         if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
9080                                 /* The first transmit is what this ack is for */
9081                                 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__);
9082                         }
9083                 }
9084                 left = th_ack - rsm->r_end;
9085                 if (rack->app_limited_needs_set && newly_acked)
9086                         rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
9087                 /* Free back to zone */
9088                 rack_free(rack, rsm);
9089                 if (left) {
9090                         goto more;
9091                 }
9092                 /* Check for reneging */
9093                 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9094                 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
9095                         /*
9096                          * The peer has moved snd_una up to
9097                          * the edge of this send, i.e. one
9098                          * that it had previously acked. The only
9099                          * way that can be true if the peer threw
9100                          * away data (space issues) that it had
9101                          * previously sacked (else it would have
9102                          * given us snd_una up to (rsm->r_end).
9103                          * We need to undo the acked markings here.
9104                          *
9105                          * Note we have to look to make sure th_ack is
9106                          * our rsm->r_start in case we get an old ack
9107                          * where th_ack is behind snd_una.
9108                          */
9109                         rack_peer_reneges(rack, rsm, th_ack);
9110                 }
9111                 return;
9112         }
9113         if (rsm->r_flags & RACK_ACKED) {
9114                 /*
9115                  * It was acked on the scoreboard -- remove it from
9116                  * total for the part being cum-acked.
9117                  */
9118                 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
9119         }
9120         /*
9121          * Clear the dup ack count for
9122          * the piece that remains.
9123          */
9124         rsm->r_dupack = 0;
9125         rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9126         if (rsm->r_rtr_bytes) {
9127                 /*
9128                  * It was retransmitted adjust the
9129                  * sack holes for what was acked.
9130                  */
9131                 int ack_am;
9132
9133                 ack_am = (th_ack - rsm->r_start);
9134                 if (ack_am >= rsm->r_rtr_bytes) {
9135                         rack->r_ctl.rc_holes_rxt -= ack_am;
9136                         rsm->r_rtr_bytes -= ack_am;
9137                 }
9138         }
9139         /*
9140          * Update where the piece starts and record
9141          * the time of send of highest cumack sent.
9142          */
9143         rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9144         rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
9145         /* Now we need to move our offset forward too */
9146         if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
9147                 /* Fix up the orig_m_len and possibly the mbuf offset */
9148                 rack_adjust_orig_mlen(rsm);
9149         }
9150         rsm->soff += (th_ack - rsm->r_start);
9151         rsm->r_start = th_ack;
9152         /* Now do we need to move the mbuf fwd too? */
9153         if (rsm->m) {
9154                 while (rsm->soff >= rsm->m->m_len) {
9155                         rsm->soff -= rsm->m->m_len;
9156                         rsm->m = rsm->m->m_next;
9157                         KASSERT((rsm->m != NULL),
9158                                 (" nrsm:%p hit at soff:%u null m",
9159                                  rsm, rsm->soff));
9160                 }
9161                 rsm->orig_m_len = rsm->m->m_len;
9162         }
9163         if (rack->app_limited_needs_set)
9164                 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
9165 }
9166
9167 static void
9168 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
9169 {
9170         struct rack_sendmap *rsm;
9171         int sack_pass_fnd = 0;
9172
9173         if (rack->r_might_revert) {
9174                 /*
9175                  * Ok we have reordering, have not sent anything, we
9176                  * might want to revert the congestion state if nothing
9177                  * further has SACK_PASSED on it. Lets check.
9178                  *
9179                  * We also get here when we have DSACKs come in for
9180                  * all the data that we FR'd. Note that a rxt or tlp
9181                  * timer clears this from happening.
9182                  */
9183
9184                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
9185                         if (rsm->r_flags & RACK_SACK_PASSED) {
9186                                 sack_pass_fnd = 1;
9187                                 break;
9188                         }
9189                 }
9190                 if (sack_pass_fnd == 0) {
9191                         /*
9192                          * We went into recovery
9193                          * incorrectly due to reordering!
9194                          */
9195                         int orig_cwnd;
9196
9197                         rack->r_ent_rec_ns = 0;
9198                         orig_cwnd = tp->snd_cwnd;
9199                         tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
9200                         tp->snd_recover = tp->snd_una;
9201                         rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
9202                         EXIT_RECOVERY(tp->t_flags);
9203                 }
9204                 rack->r_might_revert = 0;
9205         }
9206 }
9207
#ifdef NETFLIX_EXP_DETECTION
/*
 * Evaluate the SACK-attack heuristics for this connection.
 *
 * Two per-mille ratios are computed from the counters kept in r_ctl:
 * sack_count against ack_count, and sack_moved_extra against the sum
 * of sack_moved_extra and sack_noextra_move.  While SACK processing is
 * enabled, exceeding both configured thresholds disables it and clamps
 * cwnd to the current flight size.  While it is disabled, dropping to
 * the restoral threshold (or below the map minimum) re-enables it,
 * resets the counters and restores the saved cwnd if that is larger.
 */
static void
rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t bytes_this_ack, uint32_t segsiz)
{
	uint64_t sack_ratio, move_ratio, move_total;

	/* Detection must be requested, either per-connection or globally. */
	if (!(rack->do_detection || tcp_force_detection))
		return;
	/* Both thresholds have to be configured. */
	if (!tcp_sack_to_ack_thresh || !tcp_sack_to_move_thresh)
		return;
	/* Only bother with a large map, or if we are already disabled. */
	if (!(rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) &&
	    !rack->sack_attack_disable)
		return;

	/* Log detecting */
	rack_log_sad(rack, 1);

	/* SACKs per ACK, scaled by 1000. */
	if (rack->r_ctl.ack_count) {
		sack_ratio = ((uint64_t)(rack->r_ctl.sack_count) * (uint64_t)(1000)) /
		    (uint64_t)(rack->r_ctl.ack_count);
	} else {
		/* We really should not hit here */
		sack_ratio = 1000;
	}
	/* Share of SACK moves that needed extra work, scaled by 1000. */
	move_total = rack->r_ctl.sack_moved_extra;
	move_total += rack->r_ctl.sack_noextra_move;
	if (move_total) {
		move_ratio = ((uint64_t)rack->r_ctl.sack_moved_extra * (uint64_t)1000) /
		    move_total;
	} else {
		/* No moves, thats pretty good */
		move_ratio = 0;
	}

	if (rack->sack_attack_disable == 0) {
		/* Track the high-water marks while SACK is still enabled. */
		if (sack_ratio > rack_highest_sack_thresh_seen)
			rack_highest_sack_thresh_seen = (uint32_t)sack_ratio;
		if (move_ratio > rack_highest_move_thresh_seen)
			rack_highest_move_thresh_seen = (uint32_t)move_ratio;

		if ((sack_ratio > tcp_sack_to_ack_thresh) &&
		    (move_ratio > tcp_sack_to_move_thresh)) {
			/* Disable sack processing */
			rack->sack_attack_disable = 1;
			if (rack->r_rep_attack == 0) {
				rack->r_rep_attack = 1;
				counter_u64_add(rack_sack_attacks_detected, 1);
			}
			if (tcp_attack_on_turns_on_logging) {
				/*
				 * Turn on logging, used for debugging
				 * false positives.
				 */
				rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
			}
			/* Clamp the cwnd at flight size */
			rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
			rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
			rack_log_sad(rack, 2);
		}
		return;
	}

	/* We are sack-disabled check for false positives */
	if ((sack_ratio <= tcp_restoral_thresh) ||
	    (rack->r_ctl.rc_num_maps_alloced  < tcp_map_minimum)) {
		rack->sack_attack_disable = 0;
		rack_log_sad(rack, 3);
		/* Restart counting */
		rack->r_ctl.sack_count = 0;
		rack->r_ctl.sack_moved_extra = 0;
		rack->r_ctl.sack_noextra_move = 1;
		rack->r_ctl.ack_count = max(1,
		      (bytes_this_ack / segsiz));

		if (rack->r_rep_reverse == 0) {
			rack->r_rep_reverse = 1;
			counter_u64_add(rack_sack_attacks_reversed, 1);
		}
		/* Restore the cwnd */
		if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
			rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
	}
}
#endif
9295
9296 static int
9297 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
9298 {
9299
9300         uint32_t am, l_end;
9301         int was_tlp = 0;
9302
9303         if (SEQ_GT(end, start))
9304                 am = end - start;
9305         else
9306                 am = 0;
9307         if ((rack->rc_last_tlp_acked_set ) &&
9308             (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
9309             (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
9310                 /*
9311                  * The DSACK is because of a TLP which we don't
9312                  * do anything with the reordering window over since
9313                  * it was not reordering that caused the DSACK but
9314                  * our previous retransmit TLP.
9315                  */
9316                 rack_log_dsack_event(rack, 7, __LINE__, start, end);
9317                 was_tlp = 1;
9318                 goto skip_dsack_round;
9319         }
9320         if (rack->rc_last_sent_tlp_seq_valid) {
9321                 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
9322                 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
9323                     (SEQ_LEQ(end, l_end))) {
9324                         /*
9325                          * This dsack is from the last sent TLP, ignore it
9326                          * for reordering purposes.
9327                          */
9328                         rack_log_dsack_event(rack, 7, __LINE__, start, end);
9329                         was_tlp = 1;
9330                         goto skip_dsack_round;
9331                 }
9332         }
9333         if (rack->rc_dsack_round_seen == 0) {
9334                 rack->rc_dsack_round_seen = 1;
9335                 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
9336                 rack->r_ctl.num_dsack++;
9337                 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
9338                 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
9339         }
9340 skip_dsack_round:
9341         /*
9342          * We keep track of how many DSACK blocks we get
9343          * after a recovery incident.
9344          */
9345         rack->r_ctl.dsack_byte_cnt += am;
9346         if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
9347             rack->r_ctl.retran_during_recovery &&
9348             (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
9349                 /*
9350                  * False recovery most likely culprit is reordering. If
9351                  * nothing else is missing we need to revert.
9352                  */
9353                 rack->r_might_revert = 1;
9354                 rack_handle_might_revert(rack->rc_tp, rack);
9355                 rack->r_might_revert = 0;
9356                 rack->r_ctl.retran_during_recovery = 0;
9357                 rack->r_ctl.dsack_byte_cnt = 0;
9358         }
9359         return (was_tlp);
9360 }
9361
9362 static void
9363 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
9364 {
9365         /* Deal with changed and PRR here (in recovery only) */
9366         uint32_t pipe, snd_una;
9367
9368         rack->r_ctl.rc_prr_delivered += changed;
9369
9370         if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
9371                 /*
9372                  * It is all outstanding, we are application limited
9373                  * and thus we don't need more room to send anything.
9374                  * Note we use tp->snd_una here and not th_ack because
9375                  * the data as yet not been cut from the sb.
9376                  */
9377                 rack->r_ctl.rc_prr_sndcnt = 0;
9378                 return;
9379         }
9380         /* Compute prr_sndcnt */
9381         if (SEQ_GT(tp->snd_una, th_ack)) {
9382                 snd_una = tp->snd_una;
9383         } else {
9384                 snd_una = th_ack;
9385         }
9386         pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
9387         if (pipe > tp->snd_ssthresh) {
9388                 long sndcnt;
9389
9390                 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
9391                 if (rack->r_ctl.rc_prr_recovery_fs > 0)
9392                         sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
9393                 else {
9394                         rack->r_ctl.rc_prr_sndcnt = 0;
9395                         rack_log_to_prr(rack, 9, 0, __LINE__);
9396                         sndcnt = 0;
9397                 }
9398                 sndcnt++;
9399                 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
9400                         sndcnt -= rack->r_ctl.rc_prr_out;
9401                 else
9402                         sndcnt = 0;
9403                 rack->r_ctl.rc_prr_sndcnt = sndcnt;
9404                 rack_log_to_prr(rack, 10, 0, __LINE__);
9405         } else {
9406                 uint32_t limit;
9407
9408                 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
9409                         limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
9410                 else
9411                         limit = 0;
9412                 if (changed > limit)
9413                         limit = changed;
9414                 limit += ctf_fixed_maxseg(tp);
9415                 if (tp->snd_ssthresh > pipe) {
9416                         rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
9417                         rack_log_to_prr(rack, 11, 0, __LINE__);
9418                 } else {
9419                         rack->r_ctl.rc_prr_sndcnt = min(0, limit);
9420                         rack_log_to_prr(rack, 12, 0, __LINE__);
9421                 }
9422         }
9423 }
9424
9425 static void
9426 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
9427 {
9428         uint32_t changed;
9429         struct tcp_rack *rack;
9430         struct rack_sendmap *rsm;
9431         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
9432         register uint32_t th_ack;
9433         int32_t i, j, k, num_sack_blks = 0;
9434         uint32_t cts, acked, ack_point;
9435         int loop_start = 0, moved_two = 0;
9436         uint32_t tsused;
9437
9438
9439         INP_WLOCK_ASSERT(tp->t_inpcb);
9440         if (tcp_get_flags(th) & TH_RST) {
9441                 /* We don't log resets */
9442                 return;
9443         }
9444         rack = (struct tcp_rack *)tp->t_fb_ptr;
9445         cts = tcp_get_usecs(NULL);
9446         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
9447         changed = 0;
9448         th_ack = th->th_ack;
9449         if (rack->sack_attack_disable == 0)
9450                 rack_do_decay(rack);
9451         if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
9452                 /*
9453                  * You only get credit for
9454                  * MSS and greater (and you get extra
9455                  * credit for larger cum-ack moves).
9456                  */
9457                 int ac;
9458
9459                 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
9460                 rack->r_ctl.ack_count += ac;
9461                 counter_u64_add(rack_ack_total, ac);
9462         }
9463         if (rack->r_ctl.ack_count > 0xfff00000) {
9464                 /*
9465                  * reduce the number to keep us under
9466                  * a uint32_t.
9467                  */
9468                 rack->r_ctl.ack_count /= 2;
9469                 rack->r_ctl.sack_count /= 2;
9470         }
9471         if (SEQ_GT(th_ack, tp->snd_una)) {
9472                 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
9473                 tp->t_acktime = ticks;
9474         }
9475         if (rsm && SEQ_GT(th_ack, rsm->r_start))
9476                 changed = th_ack - rsm->r_start;
9477         if (changed) {
9478                 rack_process_to_cumack(tp, rack, th_ack, cts, to);
9479         }
9480         if ((to->to_flags & TOF_SACK) == 0) {
9481                 /* We are done nothing left and no sack. */
9482                 rack_handle_might_revert(tp, rack);
9483                 /*
9484                  * For cases where we struck a dup-ack
9485                  * with no SACK, add to the changes so
9486                  * PRR will work right.
9487                  */
9488                 if (dup_ack_struck && (changed == 0)) {
9489                         changed += ctf_fixed_maxseg(rack->rc_tp);
9490                 }
9491                 goto out;
9492         }
9493         /* Sack block processing */
9494         if (SEQ_GT(th_ack, tp->snd_una))
9495                 ack_point = th_ack;
9496         else
9497                 ack_point = tp->snd_una;
9498         for (i = 0; i < to->to_nsacks; i++) {
9499                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
9500                       &sack, sizeof(sack));
9501                 sack.start = ntohl(sack.start);
9502                 sack.end = ntohl(sack.end);
9503                 if (SEQ_GT(sack.end, sack.start) &&
9504                     SEQ_GT(sack.start, ack_point) &&
9505                     SEQ_LT(sack.start, tp->snd_max) &&
9506                     SEQ_GT(sack.end, ack_point) &&
9507                     SEQ_LEQ(sack.end, tp->snd_max)) {
9508                         sack_blocks[num_sack_blks] = sack;
9509                         num_sack_blks++;
9510                 } else if (SEQ_LEQ(sack.start, th_ack) &&
9511                            SEQ_LEQ(sack.end, th_ack)) {
9512                         int was_tlp;
9513
9514                         was_tlp = rack_note_dsack(rack, sack.start, sack.end);
9515                         /*
9516                          * Its a D-SACK block.
9517                          */
9518                         tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
9519                 }
9520         }
9521         if (rack->rc_dsack_round_seen) {
9522                 /* Is the dsack round over? */
9523                 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
9524                         /* Yes it is */
9525                         rack->rc_dsack_round_seen = 0;
9526                         rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
9527                 }
9528         }
9529         /*
9530          * Sort the SACK blocks so we can update the rack scoreboard with
9531          * just one pass.
9532          */
9533         num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
9534                                          num_sack_blks, th->th_ack);
9535         ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
9536         if (num_sack_blks == 0) {
9537                 /* Nothing to sack (DSACKs?) */
9538                 goto out_with_totals;
9539         }
9540         if (num_sack_blks < 2) {
9541                 /* Only one, we don't need to sort */
9542                 goto do_sack_work;
9543         }
9544         /* Sort the sacks */
9545         for (i = 0; i < num_sack_blks; i++) {
9546                 for (j = i + 1; j < num_sack_blks; j++) {
9547                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
9548                                 sack = sack_blocks[i];
9549                                 sack_blocks[i] = sack_blocks[j];
9550                                 sack_blocks[j] = sack;
9551                         }
9552                 }
9553         }
9554         /*
9555          * Now are any of the sack block ends the same (yes some
9556          * implementations send these)?
9557          */
9558 again:
9559         if (num_sack_blks == 0)
9560                 goto out_with_totals;
9561         if (num_sack_blks > 1) {
9562                 for (i = 0; i < num_sack_blks; i++) {
9563                         for (j = i + 1; j < num_sack_blks; j++) {
9564                                 if (sack_blocks[i].end == sack_blocks[j].end) {
9565                                         /*
9566                                          * Ok these two have the same end we
9567                                          * want the smallest end and then
9568                                          * throw away the larger and start
9569                                          * again.
9570                                          */
9571                                         if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
9572                                                 /*
9573                                                  * The second block covers
9574                                                  * more area use that
9575                                                  */
9576                                                 sack_blocks[i].start = sack_blocks[j].start;
9577                                         }
9578                                         /*
9579                                          * Now collapse out the dup-sack and
9580                                          * lower the count
9581                                          */
9582                                         for (k = (j + 1); k < num_sack_blks; k++) {
9583                                                 sack_blocks[j].start = sack_blocks[k].start;
9584                                                 sack_blocks[j].end = sack_blocks[k].end;
9585                                                 j++;
9586                                         }
9587                                         num_sack_blks--;
9588                                         goto again;
9589                                 }
9590                         }
9591                 }
9592         }
9593 do_sack_work:
9594         /*
9595          * First lets look to see if
9596          * we have retransmitted and
9597          * can use the transmit next?
9598          */
9599         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
9600         if (rsm &&
9601             SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
9602             SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
9603                 /*
9604                  * We probably did the FR and the next
9605                  * SACK in continues as we would expect.
9606                  */
9607                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
9608                 if (acked) {
9609                         rack->r_wanted_output = 1;
9610                         changed += acked;
9611                 }
9612                 if (num_sack_blks == 1) {
9613                         /*
9614                          * This is what we would expect from
9615                          * a normal implementation to happen
9616                          * after we have retransmitted the FR,
9617                          * i.e the sack-filter pushes down
9618                          * to 1 block and the next to be retransmitted
9619                          * is the sequence in the sack block (has more
9620                          * are acked). Count this as ACK'd data to boost
9621                          * up the chances of recovering any false positives.
9622                          */
9623                         rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
9624                         counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
9625                         counter_u64_add(rack_express_sack, 1);
9626                         if (rack->r_ctl.ack_count > 0xfff00000) {
9627                                 /*
9628                                  * reduce the number to keep us under
9629                                  * a uint32_t.
9630                                  */
9631                                 rack->r_ctl.ack_count /= 2;
9632                                 rack->r_ctl.sack_count /= 2;
9633                         }
9634                         goto out_with_totals;
9635                 } else {
9636                         /*
9637                          * Start the loop through the
9638                          * rest of blocks, past the first block.
9639                          */
9640                         moved_two = 0;
9641                         loop_start = 1;
9642                 }
9643         }
9644         /* Its a sack of some sort */
9645         rack->r_ctl.sack_count++;
9646         if (rack->r_ctl.sack_count > 0xfff00000) {
9647                 /*
9648                  * reduce the number to keep us under
9649                  * a uint32_t.
9650                  */
9651                 rack->r_ctl.ack_count /= 2;
9652                 rack->r_ctl.sack_count /= 2;
9653         }
9654         counter_u64_add(rack_sack_total, 1);
9655         if (rack->sack_attack_disable) {
9656                 /* An attacker disablement is in place */
9657                 if (num_sack_blks > 1) {
9658                         rack->r_ctl.sack_count += (num_sack_blks - 1);
9659                         rack->r_ctl.sack_moved_extra++;
9660                         counter_u64_add(rack_move_some, 1);
9661                         if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
9662                                 rack->r_ctl.sack_moved_extra /= 2;
9663                                 rack->r_ctl.sack_noextra_move /= 2;
9664                         }
9665                 }
9666                 goto out;
9667         }
9668         rsm = rack->r_ctl.rc_sacklast;
9669         for (i = loop_start; i < num_sack_blks; i++) {
9670                 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
9671                 if (acked) {
9672                         rack->r_wanted_output = 1;
9673                         changed += acked;
9674                 }
9675                 if (moved_two) {
9676                         /*
9677                          * If we did not get a SACK for at least a MSS and
9678                          * had to move at all, or if we moved more than our
9679                          * threshold, it counts against the "extra" move.
9680                          */
9681                         rack->r_ctl.sack_moved_extra += moved_two;
9682                         counter_u64_add(rack_move_some, 1);
9683                 } else {
9684                         /*
9685                          * else we did not have to move
9686                          * any more than we would expect.
9687                          */
9688                         rack->r_ctl.sack_noextra_move++;
9689                         counter_u64_add(rack_move_none, 1);
9690                 }
9691                 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
9692                         /*
9693                          * If the SACK was not a full MSS then
9694                          * we add to sack_count the number of
9695                          * MSS's (or possibly more than
9696                          * a MSS if its a TSO send) we had to skip by.
9697                          */
9698                         rack->r_ctl.sack_count += moved_two;
9699                         counter_u64_add(rack_sack_total, moved_two);
9700                 }
9701                 /*
9702                  * Now we need to setup for the next
9703                  * round. First we make sure we won't
9704                  * exceed the size of our uint32_t on
9705                  * the various counts, and then clear out
9706                  * moved_two.
9707                  */
9708                 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
9709                     (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
9710                         rack->r_ctl.sack_moved_extra /= 2;
9711                         rack->r_ctl.sack_noextra_move /= 2;
9712                 }
9713                 if (rack->r_ctl.sack_count > 0xfff00000) {
9714                         rack->r_ctl.ack_count /= 2;
9715                         rack->r_ctl.sack_count /= 2;
9716                 }
9717                 moved_two = 0;
9718         }
9719 out_with_totals:
9720         if (num_sack_blks > 1) {
9721                 /*
9722                  * You get an extra stroke if
9723                  * you have more than one sack-blk, this
9724                  * could be where we are skipping forward
9725                  * and the sack-filter is still working, or
9726                  * it could be an attacker constantly
9727                  * moving us.
9728                  */
9729                 rack->r_ctl.sack_moved_extra++;
9730                 counter_u64_add(rack_move_some, 1);
9731         }
9732 out:
9733 #ifdef NETFLIX_EXP_DETECTION
9734         rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
9735 #endif
9736         if (changed) {
9737                 /* Something changed cancel the rack timer */
9738                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
9739         }
9740         tsused = tcp_get_usecs(NULL);
9741         rsm = tcp_rack_output(tp, rack, tsused);
9742         if ((!IN_FASTRECOVERY(tp->t_flags)) &&
9743             rsm &&
9744             ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
9745                 /* Enter recovery */
9746                 entered_recovery = 1;
9747                 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
9748                 /*
9749                  * When we enter recovery we need to assure we send
9750                  * one packet.
9751                  */
9752                 if (rack->rack_no_prr == 0) {
9753                         rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
9754                         rack_log_to_prr(rack, 8, 0, __LINE__);
9755                 }
9756                 rack->r_timer_override = 1;
9757                 rack->r_early = 0;
9758                 rack->r_ctl.rc_agg_early = 0;
9759         } else if (IN_FASTRECOVERY(tp->t_flags) &&
9760                    rsm &&
9761                    (rack->r_rr_config == 3)) {
9762                 /*
9763                  * Assure we can output and we get no
9764                  * remembered pace time except the retransmit.
9765                  */
9766                 rack->r_timer_override = 1;
9767                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
9768                 rack->r_ctl.rc_resend = rsm;
9769         }
9770         if (IN_FASTRECOVERY(tp->t_flags) &&
9771             (rack->rack_no_prr == 0) &&
9772             (entered_recovery == 0)) {
9773                 rack_update_prr(tp, rack, changed, th_ack);
9774                 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
9775                      ((tcp_in_hpts(rack->rc_inp) == 0) &&
9776                       ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
9777                         /*
9778                          * If you are pacing output you don't want
9779                          * to override.
9780                          */
9781                         rack->r_early = 0;
9782                         rack->r_ctl.rc_agg_early = 0;
9783                         rack->r_timer_override = 1;
9784                 }
9785         }
9786 }
9787
9788 static void
9789 rack_strike_dupack(struct tcp_rack *rack)
9790 {
9791         struct rack_sendmap *rsm;
9792
9793         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
9794         while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
9795                 rsm = TAILQ_NEXT(rsm, r_tnext);
9796                 if (rsm->r_flags & RACK_MUST_RXT) {
9797                         /* Sendmap entries that are marked to
9798                          * be retransmitted do not need dupack's
9799                          * struck. We get these marks for a number
9800                          * of reasons (rxt timeout with no sack, 
9801                          * mtu change, or rwnd collapses). When
9802                          * these events occur, we know we must retransmit
9803                          * them and mark the sendmap entries. Dupack counting
9804                          * is not needed since we are already set to retransmit
9805                          * it as soon as we can.
9806                          */
9807                         continue;
9808                 }
9809         }
9810         if (rsm && (rsm->r_dupack < 0xff)) {
9811                 rsm->r_dupack++;
9812                 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
9813                         struct timeval tv;
9814                         uint32_t cts;
9815                         /*
9816                          * Here we see if we need to retransmit. For
9817                          * a SACK type connection if enough time has passed
9818                          * we will get a return of the rsm. For a non-sack
9819                          * connection we will get the rsm returned if the
9820                          * dupack value is 3 or more.
9821                          */
9822                         cts = tcp_get_usecs(&tv);
9823                         rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
9824                         if (rack->r_ctl.rc_resend != NULL) {
9825                                 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
9826                                         rack_cong_signal(rack->rc_tp, CC_NDUPACK,
9827                                                          rack->rc_tp->snd_una, __LINE__);
9828                                 }
9829                                 rack->r_wanted_output = 1;
9830                                 rack->r_timer_override = 1;
9831                                 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
9832                         }
9833                 } else {
9834                         rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
9835                 }
9836         }
9837 }
9838
9839 static void
9840 rack_check_bottom_drag(struct tcpcb *tp,
9841                        struct tcp_rack *rack,
9842                        struct socket *so, int32_t acked)
9843 {
9844         uint32_t segsiz, minseg;
9845
9846         segsiz = ctf_fixed_maxseg(tp);
9847         minseg = segsiz;
9848
9849         if (tp->snd_max == tp->snd_una) {
9850                 /*
9851                  * We are doing dynamic pacing and we are way
9852                  * under. Basically everything got acked while
9853                  * we were still waiting on the pacer to expire.
9854                  *
9855                  * This means we need to boost the b/w in
9856                  * addition to any earlier boosting of
9857                  * the multiplier.
9858                  */
9859                 rack->rc_dragged_bottom = 1;
9860                 rack_validate_multipliers_at_or_above100(rack);
9861                 /*
9862                  * Lets use the segment bytes acked plus
9863                  * the lowest RTT seen as the basis to
9864                  * form a b/w estimate. This will be off
9865                  * due to the fact that the true estimate
9866                  * should be around 1/2 the time of the RTT
9867                  * but we can settle for that.
9868                  */
9869                 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
9870                     acked) {
9871                         uint64_t bw, calc_bw, rtt;
9872
9873                         rtt = rack->r_ctl.rack_rs.rs_us_rtt;
9874                         if (rtt == 0) {
9875                                 /* no us sample is there a ms one? */
9876                                 if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
9877                                         rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
9878                                 } else {
9879                                         goto no_measurement;
9880                                 }
9881                         }
9882                         bw = acked;
9883                         calc_bw = bw * 1000000;
9884                         calc_bw /= rtt;
9885                         if (rack->r_ctl.last_max_bw &&
9886                             (rack->r_ctl.last_max_bw < calc_bw)) {
9887                                 /*
9888                                  * If we have a last calculated max bw
9889                                  * enforce it.
9890                                  */
9891                                 calc_bw = rack->r_ctl.last_max_bw;
9892                         }
9893                         /* now plop it in */
9894                         if (rack->rc_gp_filled == 0) {
9895                                 if (calc_bw > ONE_POINT_TWO_MEG) {
9896                                         /*
9897                                          * If we have no measurement
9898                                          * don't let us set in more than
9899                                          * 1.2Mbps. If we are still too
9900                                          * low after pacing with this we
9901                                          * will hopefully have a max b/w
9902                                          * available to sanity check things.
9903                                          */
9904                                         calc_bw = ONE_POINT_TWO_MEG;
9905                                 }
9906                                 rack->r_ctl.rc_rtt_diff = 0;
9907                                 rack->r_ctl.gp_bw = calc_bw;
9908                                 rack->rc_gp_filled = 1;
9909                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
9910                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
9911                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
9912                         } else if (calc_bw > rack->r_ctl.gp_bw) {
9913                                 rack->r_ctl.rc_rtt_diff = 0;
9914                                 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
9915                                         rack->r_ctl.num_measurements = RACK_REQ_AVG;
9916                                 rack->r_ctl.gp_bw = calc_bw;
9917                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
9918                         } else
9919                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
9920                         if ((rack->gp_ready == 0) &&
9921                             (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
9922                                 /* We have enough measurements now */
9923                                 rack->gp_ready = 1;
9924                                 rack_set_cc_pacing(rack);
9925                                 if (rack->defer_options)
9926                                         rack_apply_deferred_options(rack);
9927                         }
9928                         /*
9929                          * For acks over 1mss we do a extra boost to simulate
9930                          * where we would get 2 acks (we want 110 for the mul).
9931                          */
9932                         if (acked > segsiz)
9933                                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
9934                 } else {
9935                         /*
9936                          * zero rtt possibly?, settle for just an old increase.
9937                          */
9938 no_measurement:
9939                         rack_increase_bw_mul(rack, -1, 0, 0, 1);
9940                 }
9941         } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
9942                    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
9943                                                minseg)) &&
9944                    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
9945                    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
9946                    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
9947                     (segsiz * rack_req_segs))) {
9948                 /*
9949                  * We are doing dynamic GP pacing and
9950                  * we have everything except 1MSS or less
9951                  * bytes left out. We are still pacing away.
9952                  * And there is data that could be sent, This
9953                  * means we are inserting delayed ack time in
9954                  * our measurements because we are pacing too slow.
9955                  */
9956                 rack_validate_multipliers_at_or_above100(rack);
9957                 rack->rc_dragged_bottom = 1;
9958                 rack_increase_bw_mul(rack, -1, 0, 0, 1);
9959         }
9960 }
9961
9962
9963
9964 static void
9965 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
9966 {
9967         /*
9968          * The fast output path is enabled and we
9969          * have moved the cumack forward. Lets see if
9970          * we can expand forward the fast path length by
9971          * that amount. What we would ideally like to
9972          * do is increase the number of bytes in the
9973          * fast path block (left_to_send) by the
9974          * acked amount. However we have to gate that
9975          * by two factors:
9976          * 1) The amount outstanding and the rwnd of the peer
9977          *    (i.e. we don't want to exceed the rwnd of the peer).
9978          *    <and>
9979          * 2) The amount of data left in the socket buffer (i.e.
9980          *    we can't send beyond what is in the buffer).
9981          *
9982          * Note that this does not take into account any increase
9983          * in the cwnd. We will only extend the fast path by
9984          * what was acked.
9985          */
9986         uint32_t new_total, gating_val;
9987
9988         new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
9989         gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
9990                          (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
9991         if (new_total <= gating_val) {
9992                 /* We can increase left_to_send by the acked amount */
9993                 counter_u64_add(rack_extended_rfo, 1);
9994                 rack->r_ctl.fsb.left_to_send = new_total;
9995                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
9996                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
9997                          rack, rack->r_ctl.fsb.left_to_send,
9998                          sbavail(&rack->rc_inp->inp_socket->so_snd),
9999                          (tp->snd_max - tp->snd_una)));
10000
10001         }
10002 }
10003
10004 static void
10005 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
10006 {
10007         /*
10008          * Here any sendmap entry that points to the
10009          * beginning mbuf must be adjusted to the correct
10010          * offset. This must be called with:
10011          * 1) The socket buffer locked
10012          * 2) snd_una adjusted to its new postion.
10013          *
10014          * Note that (2) implies rack_ack_received has also
10015          * been called.
10016          *
10017          * We grab the first mbuf in the socket buffer and
10018          * then go through the front of the sendmap, recalculating
10019          * the stored offset for any sendmap entry that has
10020          * that mbuf. We must use the sb functions to do this
10021          * since its possible an add was done has well as
10022          * the subtraction we may have just completed. This should
10023          * not be a penalty though, since we just referenced the sb
10024          * to go in and trim off the mbufs that we freed (of course
10025          * there will be a penalty for the sendmap references though).
10026          */
10027         struct mbuf *m;
10028         struct rack_sendmap *rsm;
10029
10030         SOCKBUF_LOCK_ASSERT(sb);
10031         m = sb->sb_mb;
10032         rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
10033         if ((rsm == NULL) || (m == NULL)) {
10034                 /* Nothing outstanding */
10035                 return;
10036         }
10037         while (rsm->m && (rsm->m == m)) {
10038                 /* one to adjust */
10039 #ifdef INVARIANTS
10040                 struct mbuf *tm;
10041                 uint32_t soff;
10042
10043                 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
10044                 if (rsm->orig_m_len != m->m_len) {
10045                         rack_adjust_orig_mlen(rsm);
10046                 }
10047                 if (rsm->soff != soff) {
10048                         /*
10049                          * This is not a fatal error, we anticipate it
10050                          * might happen (the else code), so we count it here
10051                          * so that under invariant we can see that it really
10052                          * does happen.
10053                          */
10054                         counter_u64_add(rack_adjust_map_bw, 1);
10055                 }
10056                 rsm->m = tm;
10057                 rsm->soff = soff;
10058                 if (tm)
10059                         rsm->orig_m_len = rsm->m->m_len;
10060                 else
10061                         rsm->orig_m_len = 0;
10062 #else
10063                 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
10064                 if (rsm->m)
10065                         rsm->orig_m_len = rsm->m->m_len;
10066                 else
10067                         rsm->orig_m_len = 0;
10068 #endif
10069                 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
10070                               rsm);
10071                 if (rsm == NULL)
10072                         break;
10073         }
10074 }
10075
10076 /*
10077  * Return value of 1, we do not need to call rack_process_data().
10078  * return value of 0, rack_process_data can be called.
10079  * For ret_val if its 0 the TCP is locked, if its non-zero
10080  * its unlocked and probably unsafe to touch the TCB.
10081  */
10082 static int
10083 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
10084     struct tcpcb *tp, struct tcpopt *to,
10085     uint32_t tiwin, int32_t tlen,
10086     int32_t * ofia, int32_t thflags, int32_t *ret_val)
10087 {
10088         int32_t ourfinisacked = 0;
10089         int32_t nsegs, acked_amount;
10090         int32_t acked;
10091         struct mbuf *mfree;
10092         struct tcp_rack *rack;
10093         int32_t under_pacing = 0;
10094         int32_t recovery = 0;
10095
10096         rack = (struct tcp_rack *)tp->t_fb_ptr;
10097         if (SEQ_GT(th->th_ack, tp->snd_max)) {
10098                 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
10099                                       &rack->r_ctl.challenge_ack_ts,
10100                                       &rack->r_ctl.challenge_ack_cnt);
10101                 rack->r_wanted_output = 1;
10102                 return (1);
10103         }
10104         if (rack->gp_ready &&
10105             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
10106                 under_pacing = 1;
10107         }
10108         if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
10109                 int in_rec, dup_ack_struck = 0;
10110
10111                 in_rec = IN_FASTRECOVERY(tp->t_flags);
10112                 if (rack->rc_in_persist) {
10113                         tp->t_rxtshift = 0;
10114                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10115                                       rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10116                 }
10117                 if ((th->th_ack == tp->snd_una) &&
10118                     (tiwin == tp->snd_wnd) &&
10119                     ((to->to_flags & TOF_SACK) == 0)) {
10120                         rack_strike_dupack(rack);
10121                         dup_ack_struck = 1;
10122                 }
10123                 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
10124         }
10125         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
10126                 /*
10127                  * Old ack, behind (or duplicate to) the last one rcv'd
10128                  * Note: We mark reordering is occuring if its
10129                  * less than and we have not closed our window.
10130                  */
10131                 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
10132                         rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
10133                 }
10134                 return (0);
10135         }
10136         /*
10137          * If we reach this point, ACK is not a duplicate, i.e., it ACKs
10138          * something we sent.
10139          */
10140         if (tp->t_flags & TF_NEEDSYN) {
10141                 /*
10142                  * T/TCP: Connection was half-synchronized, and our SYN has
10143                  * been ACK'd (so connection is now fully synchronized).  Go
10144                  * to non-starred state, increment snd_una for ACK of SYN,
10145                  * and check if we can do window scaling.
10146                  */
10147                 tp->t_flags &= ~TF_NEEDSYN;
10148                 tp->snd_una++;
10149                 /* Do window scaling? */
10150                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
10151                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
10152                         tp->rcv_scale = tp->request_r_scale;
10153                         /* Send window already scaled. */
10154                 }
10155         }
10156         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10157         INP_WLOCK_ASSERT(tp->t_inpcb);
10158
10159         acked = BYTES_THIS_ACK(tp, th);
10160         if (acked) {
10161                 /*
10162                  * Any time we move the cum-ack forward clear
10163                  * keep-alive tied probe-not-answered. The
10164                  * persists clears its own on entry.
10165                  */
10166                 rack->probe_not_answered = 0;
10167         }
10168         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
10169         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
10170         /*
10171          * If we just performed our first retransmit, and the ACK arrives
10172          * within our recovery window, then it was a mistake to do the
10173          * retransmit in the first place.  Recover our original cwnd and
10174          * ssthresh, and proceed to transmit where we left off.
10175          */
10176         if ((tp->t_flags & TF_PREVVALID) &&
10177             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
10178                 tp->t_flags &= ~TF_PREVVALID;
10179                 if (tp->t_rxtshift == 1 &&
10180                     (int)(ticks - tp->t_badrxtwin) < 0)
10181                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
10182         }
10183         if (acked) {
10184                 /* assure we are not backed off */
10185                 tp->t_rxtshift = 0;
10186                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
10187                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
10188                 rack->rc_tlp_in_progress = 0;
10189                 rack->r_ctl.rc_tlp_cnt_out = 0;
10190                 /*
10191                  * If it is the RXT timer we want to
10192                  * stop it, so we can restart a TLP.
10193                  */
10194                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
10195                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10196 #ifdef NETFLIX_HTTP_LOGGING
10197                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
10198 #endif
10199         }
10200         /*
10201          * If we have a timestamp reply, update smoothed round trip time. If
10202          * no timestamp is present but transmit timer is running and timed
10203          * sequence number was acked, update smoothed round trip time. Since
10204          * we now have an rtt measurement, cancel the timer backoff (cf.,
10205          * Phil Karn's retransmit alg.). Recompute the initial retransmit
10206          * timer.
10207          *
10208          * Some boxes send broken timestamp replies during the SYN+ACK
10209          * phase, ignore timestamps of 0 or we could calculate a huge RTT
10210          * and blow up the retransmit timer.
10211          */
10212         /*
10213          * If all outstanding data is acked, stop retransmit timer and
10214          * remember to restart (more output or persist). If there is more
10215          * data to be acked, restart retransmit timer, using current
10216          * (possibly backed-off) value.
10217          */
10218         if (acked == 0) {
10219                 if (ofia)
10220                         *ofia = ourfinisacked;
10221                 return (0);
10222         }
10223         if (IN_RECOVERY(tp->t_flags)) {
10224                 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
10225                     (SEQ_LT(th->th_ack, tp->snd_max))) {
10226                         tcp_rack_partialack(tp);
10227                 } else {
10228                         rack_post_recovery(tp, th->th_ack);
10229                         recovery = 1;
10230                 }
10231         }
10232         /*
10233          * Let the congestion control algorithm update congestion control
10234          * related information. This typically means increasing the
10235          * congestion window.
10236          */
10237         rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
10238         SOCKBUF_LOCK(&so->so_snd);
10239         acked_amount = min(acked, (int)sbavail(&so->so_snd));
10240         tp->snd_wnd -= acked_amount;
10241         mfree = sbcut_locked(&so->so_snd, acked_amount);
10242         if ((sbused(&so->so_snd) == 0) &&
10243             (acked > acked_amount) &&
10244             (tp->t_state >= TCPS_FIN_WAIT_1) &&
10245             (tp->t_flags & TF_SENTFIN)) {
10246                 /*
10247                  * We must be sure our fin
10248                  * was sent and acked (we can be
10249                  * in FIN_WAIT_1 without having
10250                  * sent the fin).
10251                  */
10252                 ourfinisacked = 1;
10253         }
10254         tp->snd_una = th->th_ack;
10255         if (acked_amount && sbavail(&so->so_snd))
10256                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
10257         rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
10258         /* NB: sowwakeup_locked() does an implicit unlock. */
10259         sowwakeup_locked(so);
10260         m_freem(mfree);
10261         if (SEQ_GT(tp->snd_una, tp->snd_recover))
10262                 tp->snd_recover = tp->snd_una;
10263
10264         if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
10265                 tp->snd_nxt = tp->snd_una;
10266         }
10267         if (under_pacing &&
10268             (rack->use_fixed_rate == 0) &&
10269             (rack->in_probe_rtt == 0) &&
10270             rack->rc_gp_dyn_mul &&
10271             rack->rc_always_pace) {
10272                 /* Check if we are dragging bottom */
10273                 rack_check_bottom_drag(tp, rack, so, acked);
10274         }
10275         if (tp->snd_una == tp->snd_max) {
10276                 /* Nothing left outstanding */
10277                 tp->t_flags &= ~TF_PREVVALID;
10278                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
10279                 rack->r_ctl.retran_during_recovery = 0;
10280                 rack->r_ctl.dsack_byte_cnt = 0;
10281                 if (rack->r_ctl.rc_went_idle_time == 0)
10282                         rack->r_ctl.rc_went_idle_time = 1;
10283                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
10284                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
10285                         tp->t_acktime = 0;
10286                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10287                 /* Set need output so persist might get set */
10288                 rack->r_wanted_output = 1;
10289                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
10290                 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
10291                     (sbavail(&so->so_snd) == 0) &&
10292                     (tp->t_flags2 & TF2_DROP_AF_DATA)) {
10293                         /*
10294                          * The socket was gone and the
10295                          * peer sent data (now or in the past), time to
10296                          * reset him.
10297                          */
10298                         *ret_val = 1;
10299                         /* tcp_close will kill the inp pre-log the Reset */
10300                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
10301                         tp = tcp_close(tp);
10302                         ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
10303                         return (1);
10304                 }
10305         }
10306         if (ofia)
10307                 *ofia = ourfinisacked;
10308         return (0);
10309 }
10310
10311 static void
10312 rack_collapsed_window(struct tcp_rack *rack)
10313 {
10314         /*
10315          * Now we must walk the
10316          * send map and divide the
10317          * ones left stranded. These
10318          * guys can't cause us to abort
10319          * the connection and are really
10320          * "unsent". However if a buggy
10321          * client actually did keep some
10322          * of the data i.e. collapsed the win
10323          * and refused to ack and then opened
10324          * the win and acked that data. We would
10325          * get into an ack war, the simplier
10326          * method then of just pretending we
10327          * did not send those segments something
10328          * won't work.
10329          */
10330         struct rack_sendmap *rsm, *nrsm, fe;
10331 #ifdef INVARIANTS
10332         struct rack_sendmap *insret;
10333 #endif
10334         tcp_seq max_seq;
10335
10336         rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
10337         max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
10338         memset(&fe, 0, sizeof(fe));
10339         fe.r_start = max_seq;
10340         /* Find the first seq past or at maxseq */
10341         rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
10342         if (rsm == NULL) {
10343                 /* Nothing to do strange */
10344                 rack->rc_has_collapsed = 0;
10345                 return;
10346         }
10347         /*
10348          * Now do we need to split at
10349          * the collapse point?
10350          */
10351         if (SEQ_GT(max_seq, rsm->r_start)) {
10352                 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
10353                 if (nrsm == NULL) {
10354                         /* We can't get a rsm, mark all? */
10355                         nrsm = rsm;
10356                         goto no_split;
10357                 }
10358                 /* Clone it */
10359                 rack_clone_rsm(rack, nrsm, rsm, max_seq);
10360 #ifndef INVARIANTS
10361                 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
10362 #else
10363                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
10364                 if (insret != NULL) {
10365                         panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
10366                               nrsm, insret, rack, rsm);
10367                 }
10368 #endif
10369                 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
10370                 if (rsm->r_in_tmap) {
10371                         TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10372                         nrsm->r_in_tmap = 1;
10373                 }
10374                 /*
10375                  * Set in the new RSM as the
10376                  * collapsed starting point
10377                  */
10378                 rsm = nrsm;
10379         }
10380 no_split:
10381         counter_u64_add(rack_collapsed_win, 1);
10382         RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
10383                 nrsm->r_flags |= RACK_RWND_COLLAPSED;
10384         }
10385         rack->rc_has_collapsed = 1;
10386 }
10387
10388 static void
10389 rack_un_collapse_window(struct tcp_rack *rack)
10390 {
10391         struct rack_sendmap *rsm;
10392         int cnt = 0;;
10393
10394         rack->r_ctl.rc_out_at_rto = 0;
10395         rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
10396         RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
10397                 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
10398                         rsm->r_flags &= ~RACK_RWND_COLLAPSED;
10399                         rsm->r_flags |= RACK_MUST_RXT;
10400                         if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
10401                                 rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
10402                                 rack->r_ctl.rc_out_at_rto += (rsm->r_end - rsm->r_start);
10403                         }
10404                         cnt++;
10405                 }
10406                 else
10407                         break;
10408         }
10409         rack->rc_has_collapsed = 0;
10410         if (cnt) {
10411                 rack->r_must_retran = 1;
10412         }
10413 }
10414
10415 static void
10416 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
10417                         int32_t tlen, int32_t tfo_syn)
10418 {
10419         if (DELAY_ACK(tp, tlen) || tfo_syn) {
10420                 if (rack->rc_dack_mode &&
10421                     (tlen > 500) &&
10422                     (rack->rc_dack_toggle == 1)) {
10423                         goto no_delayed_ack;
10424                 }
10425                 rack_timer_cancel(tp, rack,
10426                                   rack->r_ctl.rc_rcvtime, __LINE__);
10427                 tp->t_flags |= TF_DELACK;
10428         } else {
10429 no_delayed_ack:
10430                 rack->r_wanted_output = 1;
10431                 tp->t_flags |= TF_ACKNOW;
10432                 if (rack->rc_dack_mode) {
10433                         if (tp->t_flags & TF_DELACK)
10434                                 rack->rc_dack_toggle = 1;
10435                         else
10436                                 rack->rc_dack_toggle = 0;
10437                 }
10438         }
10439 }
10440
10441 static void
10442 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
10443 {
10444         /*
10445          * If fast output is in progress, lets validate that
10446          * the new window did not shrink on us and make it
10447          * so fast output should end.
10448          */
10449         if (rack->r_fast_output) {
10450                 uint32_t out;
10451
10452                 /*
10453                  * Calculate what we will send if left as is
10454                  * and compare that to our send window.
10455                  */
10456                 out = ctf_outstanding(tp);
10457                 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
10458                         /* ok we have an issue */
10459                         if (out >= tp->snd_wnd) {
10460                                 /* Turn off fast output the window is met or collapsed */
10461                                 rack->r_fast_output = 0;
10462                         } else {
10463                                 /* we have some room left */
10464                                 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
10465                                 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
10466                                         /* If not at least 1 full segment never mind */
10467                                         rack->r_fast_output = 0;
10468                                 }
10469                         }
10470                 }
10471         }
10472 }
10473
10474
10475 /*
       * Process the window update, payload and FIN of an inbound segment.
       *
       * In order, this function:
       *  - updates snd_wnd/snd_wl1/snd_wl2 from an ACK-bearing segment and
       *    re-validates any fast output in progress;
       *  - marks or unmarks the send map when the window no longer covers
       *    (or once again covers) what is outstanding;
       *  - leaves or enters persist mode based on the new window;
       *  - appends in-order payload to so->so_rcv, or passes anything else
       *    to tcp_reass(), then updates the SACK/DSACK blocks;
       *  - handles a FIN: socantrcvmore(), rcv_nxt advance and the state
       *    transition;
       *  - sets rack->r_wanted_output when an ACK or more data should go
       *    out.
       *
       * m is freed, appended to the socket buffer or handed to tcp_reass()
       * on every path except the FIN_WAIT_2 return, where it has already
       * been dealt with earlier in the function.  drop_hdrlen is the number
       * of leading bytes trimmed from m before the payload is used.  tlen
       * is the payload length.  tiwin is the window value stored into
       * snd_wnd.  nxt_pkt is not referenced in this function.
       *
10476  * Return value of 1, the TCB is unlocked and most
10477  * likely gone, return value of 0, the TCP is still
10478  * locked.
10479  */
10480 static int
10481 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
10482     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
10483     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
10484 {
10485         /*
10486          * Update window information. Don't look at window if no ACK: TAC's
10487          * send garbage on first SYN.
10488          */
10489         int32_t nsegs;
10490         int32_t tfo_syn;
10491         struct tcp_rack *rack;
10492
10493         rack = (struct tcp_rack *)tp->t_fb_ptr;
10494         INP_WLOCK_ASSERT(tp->t_inpcb);
10495         nsegs = max(1, m->m_pkthdr.lro_nsegs);
              /*
               * Take the window when the segment is newer by sequence number,
               * or same sequence but newer by ACK, or same sequence and ACK
               * but advertising a larger window.
               */
10496         if ((thflags & TH_ACK) &&
10497             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
10498             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
10499             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
10500                 /* keep track of pure window updates */
10501                 if (tlen == 0 &&
10502                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
10503                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
10504                 tp->snd_wnd = tiwin;
10505                 rack_validate_fo_sendwin_up(tp, rack);
10506                 tp->snd_wl1 = th->th_seq;
10507                 tp->snd_wl2 = th->th_ack;
10508                 if (tp->snd_wnd > tp->max_sndwnd)
10509                         tp->max_sndwnd = tp->snd_wnd;
10510                 rack->r_wanted_output = 1;
10511         } else if (thflags & TH_ACK) {
                      /* Same ACK point but a smaller window: accept the shrink. */
10512                 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
10513                         tp->snd_wnd = tiwin;
10514                         rack_validate_fo_sendwin_up(tp, rack);
10515                         tp->snd_wl1 = th->th_seq;
10516                         tp->snd_wl2 = th->th_ack;
10517                 }
10518         }
10519         if (tp->snd_wnd < ctf_outstanding(tp))
10520                 /* The peer collapsed the window */
10521                 rack_collapsed_window(rack);
10522         else if (rack->rc_has_collapsed)
                      /* The window covers what is outstanding again */
10523                 rack_un_collapse_window(rack);
10524         /* Was persist timer active and now we have window space? */
10525         if ((rack->rc_in_persist != 0) &&
10526             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
10527                                 rack->r_ctl.rc_pace_min_segs))) {
10528                 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10529                 tp->snd_nxt = tp->snd_max;
10530                 /* Make sure we output to start the timer */
10531                 rack->r_wanted_output = 1;
10532         }
10533         /* Do we enter persists? */
10534         if ((rack->rc_in_persist == 0) &&
10535             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
10536             TCPS_HAVEESTABLISHED(tp->t_state) &&
10537             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
10538             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
10539             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
10540                 /*
10541                  * Here the rwnd is less than the smaller of half the
10542                  * highest rwnd seen and the pacing size, we are established,
10543                  * nothing is outstanding (or the window has collapsed), and
10544                  * there is more data to send than the window allows. Enter persists.
10545                  */
10546                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
10547         }
              /*
               * TF2_DROP_AF_DATA is set: free the segment without looking at
               * its payload or FIN.
               */
10548         if (tp->t_flags2 & TF2_DROP_AF_DATA) {
10549                 m_freem(m);
10550                 return (0);
10551         }
10552         /*
10553          * We don't process the URG bit; just drag
10554          * rcv_up along with rcv_nxt.
10555          */
10556         tp->rcv_up = tp->rcv_nxt;
10557         INP_WLOCK_ASSERT(tp->t_inpcb);
10558
10559         /*
10560          * Process the segment text, merging it into the TCP sequencing
10561          * queue, and arranging for acknowledgment of receipt if necessary.
10562          * This process logically involves adjusting tp->rcv_wnd as data is
10563          * presented to the user (this happens in tcp_usrreq.c, case
10564          * PRU_RCVD).  If a FIN has already been received on this connection
10565          * then we just ignore the text.
10566          */
10567         tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
10568                    IS_FASTOPEN(tp->t_flags));
              /*
               * NOTE(review): the (tfo_syn && tlen > 0) term is already implied
               * by the leading tlen term, so it does not widen the condition.
               */
10569         if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
10570             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
                      /*
                       * Remember the segment as it arrived; tcp_reass() below is
                       * passed &tlen and may change it, and rcv_nxt is advanced.
                       */
10571                 tcp_seq save_start = th->th_seq;
10572                 tcp_seq save_rnxt  = tp->rcv_nxt;
10573                 int     save_tlen  = tlen;
10574
10575                 m_adj(m, drop_hdrlen);  /* delayed header drop */
10576                 /*
10577                  * Insert segment which includes th into TCP reassembly
10578                  * queue with control block tp.  Set thflags to whether
10579                  * reassembly now includes a segment with FIN.  This handles
10580                  * the common case inline (segment is the next to be
10581                  * received on an established connection, and the queue is
10582                  * empty), avoiding linkage into and removal from the queue
10583                  * and repetition of various conversions. Set DELACK for
10584                  * segments received in order, but ack immediately when
10585                  * segments are out of order (so fast retransmit can work).
10586                  */
10587                 if (th->th_seq == tp->rcv_nxt &&
10588                     SEGQ_EMPTY(tp) &&
10589                     (TCPS_HAVEESTABLISHED(tp->t_state) ||
10590                     tfo_syn)) {
10591 #ifdef NETFLIX_SB_LIMITS
10592                         u_int mcnt, appended;
10593
                              /*
                               * Reserve mcnt against the socket buffer's shared
                               * limit up front; on failure the segment is dropped.
                               */
10594                         if (so->so_rcv.sb_shlim) {
10595                                 mcnt = m_memcnt(m);
10596                                 appended = 0;
10597                                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
10598                                     CFO_NOSLEEP, NULL) == false) {
10599                                         counter_u64_add(tcp_sb_shlim_fails, 1);
10600                                         m_freem(m);
10601                                         return (0);
10602                                 }
10603                         }
10604 #endif
10605                         rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
10606                         tp->rcv_nxt += tlen;
                              /*
                               * First payload bytes seen on this connection: stamp
                               * t_fbyte_in with ticks (never 0, since 0 means unset).
                               */
10607                         if (tlen &&
10608                             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
10609                             (tp->t_fbyte_in == 0)) {
10610                                 tp->t_fbyte_in = ticks;
10611                                 if (tp->t_fbyte_in == 0)
10612                                         tp->t_fbyte_in = 1;
10613                                 if (tp->t_fbyte_out && tp->t_fbyte_in)
10614                                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
10615                         }
                              /* From here on thflags only says whether a FIN was seen. */
10616                         thflags = tcp_get_flags(th) & TH_FIN;
10617                         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
10618                         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
10619                         SOCKBUF_LOCK(&so->so_rcv);
10620                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10621                                 m_freem(m);
10622                         } else
10623 #ifdef NETFLIX_SB_LIMITS
10624                                 appended =
10625 #endif
10626                                         sbappendstream_locked(&so->so_rcv, m, 0);
10627
10628                         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
10629                         /* NB: sorwakeup_locked() does an implicit unlock. */
10630                         sorwakeup_locked(so);
10631 #ifdef NETFLIX_SB_LIMITS
                              /* Give back whatever part of the reservation went unused. */
10632                         if (so->so_rcv.sb_shlim && appended != mcnt)
10633                                 counter_fo_release(so->so_rcv.sb_shlim,
10634                                     mcnt - appended);
10635 #endif
10636                 } else {
10637                         /*
10638                          * XXX: Due to the header drop above "th" is
10639                          * theoretically invalid by now.  Fortunately
10640                          * m_adj() doesn't actually free any mbufs when
10641                          * trimming from the head.
10642                          */
10643                         tcp_seq temp = save_start;
10644
10645                         thflags = tcp_reass(tp, th, &temp, &tlen, m);
10646                         tp->t_flags |= TF_ACKNOW;
10647                         if (tp->t_flags & TF_WAKESOR) {
10648                                 tp->t_flags &= ~TF_WAKESOR;
10649                                 /* NB: sorwakeup_locked() does an implicit unlock. */
10650                                 sorwakeup_locked(so);
10651                         }
10652                 }
                      /*
                       * Update the SACK/DSACK blocks we will report, comparing the
                       * segment as it arrived (save_*) with tlen and rcv_nxt as
                       * they stand now.
                       */
10653                 if ((tp->t_flags & TF_SACK_PERMIT) &&
10654                     (save_tlen > 0) &&
10655                     TCPS_HAVEESTABLISHED(tp->t_state)) {
10656                         if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
10657                                 /*
10658                                  * DSACK actually handled in the fastpath
10659                                  * above.
10660                                  */
10661                                 RACK_OPTS_INC(tcp_sack_path_1);
10662                                 tcp_update_sack_list(tp, save_start,
10663                                     save_start + save_tlen);
10664                         } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
10665                                 if ((tp->rcv_numsacks >= 1) &&
10666                                     (tp->sackblks[0].end == save_start)) {
10667                                         /*
10668                                          * Partial overlap, recorded at todrop
10669                                          * above.
10670                                          */
10671                                         RACK_OPTS_INC(tcp_sack_path_2a);
10672                                         tcp_update_sack_list(tp,
10673                                             tp->sackblks[0].start,
10674                                             tp->sackblks[0].end);
10675                                 } else {
10676                                         RACK_OPTS_INC(tcp_sack_path_2b);
10677                                         tcp_update_dsack_list(tp, save_start,
10678                                             save_start + save_tlen);
10679                                 }
10680                         } else if (tlen >= save_tlen) {
10681                                 /* Update of sackblks. */
10682                                 RACK_OPTS_INC(tcp_sack_path_3);
10683                                 tcp_update_dsack_list(tp, save_start,
10684                                     save_start + save_tlen);
10685                         } else if (tlen > 0) {
10686                                 RACK_OPTS_INC(tcp_sack_path_4);
10687                                 tcp_update_dsack_list(tp, save_start,
10688                                     save_start + tlen);
10689                         }
10690                 }
10691         } else {
                      /*
                       * No payload or FIN to process, or a FIN was already received:
                       * free the mbuf and make sure no FIN is acted on below.
                       */
10692                 m_freem(m);
10693                 thflags &= ~TH_FIN;
10694         }
10695
10696         /*
10697          * If FIN is received ACK the FIN and let the user know that the
10698          * connection is closing.
10699          */
10700         if (thflags & TH_FIN) {
10701                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
10702                         /* The socket upcall is handled by socantrcvmore. */
10703                         socantrcvmore(so);
10704                         /*
10705                          * If connection is half-synchronized (ie NEEDSYN
10706                          * flag on) then delay ACK, so it may be piggybacked
10707                          * when SYN is sent. Otherwise, since we received a
10708                          * FIN then no more input can be expected, send ACK
10709                          * now.
10710                          */
10711                         if (tp->t_flags & TF_NEEDSYN) {
10712                                 rack_timer_cancel(tp, rack,
10713                                     rack->r_ctl.rc_rcvtime, __LINE__);
10714                                 tp->t_flags |= TF_DELACK;
10715                         } else {
10716                                 tp->t_flags |= TF_ACKNOW;
10717                         }
                              /* The FIN occupies one sequence number. */
10718                         tp->rcv_nxt++;
10719                 }
                      /* States not listed in the switch are left unchanged. */
10720                 switch (tp->t_state) {
10721                         /*
10722                          * In SYN_RECEIVED and ESTABLISHED STATES enter the
10723                          * CLOSE_WAIT state.
10724                          */
10725                 case TCPS_SYN_RECEIVED:
10726                         tp->t_starttime = ticks;
10727                         /* FALLTHROUGH */
10728                 case TCPS_ESTABLISHED:
10729                         rack_timer_cancel(tp, rack,
10730                             rack->r_ctl.rc_rcvtime, __LINE__);
10731                         tcp_state_change(tp, TCPS_CLOSE_WAIT);
10732                         break;
10733
10734                         /*
10735                          * If still in FIN_WAIT_1 STATE FIN has not been
10736                          * acked so enter the CLOSING state.
10737                          */
10738                 case TCPS_FIN_WAIT_1:
10739                         rack_timer_cancel(tp, rack,
10740                             rack->r_ctl.rc_rcvtime, __LINE__);
10741                         tcp_state_change(tp, TCPS_CLOSING);
10742                         break;
10743
10744                         /*
10745                          * In FIN_WAIT_2 state enter the TIME_WAIT state,
10746                          * starting the time-wait timer, turning off the
10747                          * other standard timers.
10748                          */
10749                 case TCPS_FIN_WAIT_2:
10750                         rack_timer_cancel(tp, rack,
10751                             rack->r_ctl.rc_rcvtime, __LINE__);
10752                         tcp_twstart(tp);
                              /* Per the function contract, tp must not be used after this. */
10753                         return (1);
10754                 }
10755         }
10756         /*
10757          * Return any desired output: ask for a send if an ACK is due now
10758          * or the send buffer holds more than what is currently outstanding.
10759          */
10759         if ((tp->t_flags & TF_ACKNOW) ||
10760             (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
10761                 rack->r_wanted_output = 1;
10762         }
10763         INP_WLOCK_ASSERT(tp->t_inpcb);
10764         return (0);
10765 }
10766
10767 /*
10768  * Here nothing is really faster, its just that we
10769  * have broken out the fast-data path also just like
10770  * the fast-ack.
10771  */
10772 static int
10773 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
10774     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10775     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
10776 {
10777         int32_t nsegs;
10778         int32_t newsize = 0;    /* automatic sockbuf scaling */
10779         struct tcp_rack *rack;
10780 #ifdef NETFLIX_SB_LIMITS
10781         u_int mcnt, appended;
10782 #endif
10783 #ifdef TCPDEBUG
10784         /*
10785          * The size of tcp_saveipgen must be the size of the max ip header,
10786          * now IPv6.
10787          */
10788         u_char tcp_saveipgen[IP6_HDR_LEN];
10789         struct tcphdr tcp_savetcp;
10790         short ostate = 0;
10791
10792 #endif
10793         /*
10794          * If last ACK falls within this segment's sequence numbers, record
10795          * the timestamp. NOTE that the test is modified according to the
10796          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
10797          */
10798         if (__predict_false(th->th_seq != tp->rcv_nxt)) {
10799                 return (0);
10800         }
10801         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
10802                 return (0);
10803         }
10804         if (tiwin && tiwin != tp->snd_wnd) {
10805                 return (0);
10806         }
10807         if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
10808                 return (0);
10809         }
10810         if (__predict_false((to->to_flags & TOF_TS) &&
10811             (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
10812                 return (0);
10813         }
10814         if (__predict_false((th->th_ack != tp->snd_una))) {
10815                 return (0);
10816         }
10817         if (__predict_false(tlen > sbspace(&so->so_rcv))) {
10818                 return (0);
10819         }
10820         if ((to->to_flags & TOF_TS) != 0 &&
10821             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
10822                 tp->ts_recent_age = tcp_ts_getticks();
10823                 tp->ts_recent = to->to_tsval;
10824         }
10825         rack = (struct tcp_rack *)tp->t_fb_ptr;
10826         /*
10827          * This is a pure, in-sequence data packet with nothing on the
10828          * reassembly queue and we have enough buffer space to take it.
10829          */
10830         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10831
10832 #ifdef NETFLIX_SB_LIMITS
10833         if (so->so_rcv.sb_shlim) {
10834                 mcnt = m_memcnt(m);
10835                 appended = 0;
10836                 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
10837                     CFO_NOSLEEP, NULL) == false) {
10838                         counter_u64_add(tcp_sb_shlim_fails, 1);
10839                         m_freem(m);
10840                         return (1);
10841                 }
10842         }
10843 #endif
10844         /* Clean receiver SACK report if present */
10845         if (tp->rcv_numsacks)
10846                 tcp_clean_sackreport(tp);
10847         KMOD_TCPSTAT_INC(tcps_preddat);
10848         tp->rcv_nxt += tlen;
10849         if (tlen &&
10850             ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
10851             (tp->t_fbyte_in == 0)) {
10852                 tp->t_fbyte_in = ticks;
10853                 if (tp->t_fbyte_in == 0)
10854                         tp->t_fbyte_in = 1;
10855                 if (tp->t_fbyte_out && tp->t_fbyte_in)
10856                         tp->t_flags2 |= TF2_FBYTES_COMPLETE;
10857         }
10858         /*
10859          * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
10860          */
10861         tp->snd_wl1 = th->th_seq;
10862         /*
10863          * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
10864          */
10865         tp->rcv_up = tp->rcv_nxt;
10866         KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
10867         KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
10868 #ifdef TCPDEBUG
10869         if (so->so_options & SO_DEBUG)
10870                 tcp_trace(TA_INPUT, ostate, tp,
10871                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
10872 #endif
10873         newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
10874
10875         /* Add data to socket buffer. */
10876         SOCKBUF_LOCK(&so->so_rcv);
10877         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
10878                 m_freem(m);
10879         } else {
10880                 /*
10881                  * Set new socket buffer size. Give up when limit is
10882                  * reached.
10883                  */
10884                 if (newsize)
10885                         if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
10886                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
10887                 m_adj(m, drop_hdrlen);  /* delayed header drop */
10888 #ifdef NETFLIX_SB_LIMITS
10889                 appended =
10890 #endif
10891                         sbappendstream_locked(&so->so_rcv, m, 0);
10892                 ctf_calc_rwin(so, tp);
10893         }
10894         rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
10895         /* NB: sorwakeup_locked() does an implicit unlock. */
10896         sorwakeup_locked(so);
10897 #ifdef NETFLIX_SB_LIMITS
10898         if (so->so_rcv.sb_shlim && mcnt != appended)
10899                 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
10900 #endif
10901         rack_handle_delayed_ack(tp, rack, tlen, 0);
10902         if (tp->snd_una == tp->snd_max)
10903                 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
10904         return (1);
10905 }
10906
10907 /*
10908  * This subfunction is used to try to highly optimize the
10909  * fast path. We again allow window updates that are
10910  * in sequence to remain in the fast-path. We also add
10911  * in the __predict's to attempt to help the compiler.
10912  * Note that if we return a 0, then we can *not* process
10913  * it and the caller should push the packet into the
10914  * slow-path.
       *
       * Returns 1 once the ACK has been consumed here (the mbuf m has
       * then been freed).  Every "return (0)" happens in the checks at
       * the top, before tp is modified and with m left untouched.
       *
       * drop_hdrlen, tlen and nxt_pkt are not referenced in the body;
       * cts is only used as the time handed to rack_exit_persist().
10915  */
10916 static int
10917 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
10918     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
10919     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
10920 {
10921         int32_t acked;
10922         int32_t nsegs;
10923 #ifdef TCPDEBUG
10924         /*
10925          * The size of tcp_saveipgen must be the size of the max ip header,
10926          * now IPv6.
               *
               * NOTE(review): nothing in this function writes tcp_saveipgen or
               * tcp_savetcp, and ostate stays 0, before they are handed to
               * tcp_trace() below -- confirm whether tracing unfilled copies
               * is intended.
10927          */
10928         u_char tcp_saveipgen[IP6_HDR_LEN];
10929         struct tcphdr tcp_savetcp;
10930         short ostate = 0;
10931 #endif
10932         int32_t under_pacing = 0;
10933         struct tcp_rack *rack;
10934
              /*
               * Eligibility checks.  Each failure returns 0 without having
               * changed any connection state.
               */
10935         if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
10936                 /* Old ack, behind (or duplicate to) the last one rcv'd */
10937                 return (0);
10938         }
10939         if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
10940                 /* Above what we have sent? */
10941                 return (0);
10942         }
10943         if (__predict_false(tp->snd_nxt != tp->snd_max)) {
10944                 /* We are retransmitting */
10945                 return (0);
10946         }
10947         if (__predict_false(tiwin == 0)) {
10948                 /* zero window */
10949                 return (0);
10950         }
10951         if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
10952                 /* We need a SYN or a FIN, unlikely.. */
10953                 return (0);
10954         }
10955         if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
10956                 /* Timestamp is behind .. old ack with seq wrap? */
10957                 return (0);
10958         }
10959         if (__predict_false(IN_RECOVERY(tp->t_flags))) {
10960                 /* Still recovering */
10961                 return (0);
10962         }
10963         rack = (struct tcp_rack *)tp->t_fb_ptr;
10964         if (rack->r_ctl.rc_sacked) {
10965                 /* We have sack holes on our scoreboard */
10966                 return (0);
10967         }
10968         /* Ok if we reach here, we can process a fast-ack */
10969         if (rack->gp_ready &&
10970             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
                      /* Remembered for the bottom-drag check near the end. */
10971                 under_pacing = 1;
10972         }
              /* Count at least one segment even if lro_nsegs is zero. */
10973         nsegs = max(1, m->m_pkthdr.lro_nsegs);
10974         rack_log_ack(tp, to, th, 0, 0);
10975         /* Did the window get updated? */
10976         if (tiwin != tp->snd_wnd) {
10977                 tp->snd_wnd = tiwin;
10978                 rack_validate_fo_sendwin_up(tp, rack);
10979                 tp->snd_wl1 = th->th_seq;
                      /* Track the largest window the peer has advertised. */
10980                 if (tp->snd_wnd > tp->max_sndwnd)
10981                         tp->max_sndwnd = tp->snd_wnd;
10982         }
10983         /* Do we exit persists? */
10984         if ((rack->rc_in_persist != 0) &&
10985             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
10986                                rack->r_ctl.rc_pace_min_segs))) {
10987                 rack_exit_persist(tp, rack, cts);
10988         }
10989         /* Do we enter persists? */
10990         if ((rack->rc_in_persist == 0) &&
10991             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
10992             TCPS_HAVEESTABLISHED(tp->t_state) &&
10993             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
10994             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
10995             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
10996                 /*
10997                  * The send window is below min(rc_high_rwnd/2,
10998                  * rc_pace_min_segs), we are established, nothing is
10999                  * outstanding (or the window has collapsed) and more
11000                  * data is queued than the window allows: enter persists.
11001                  */
11002                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
11003         }
11004         /*
11005          * If last ACK falls within this segment's sequence numbers, record
11006          * the timestamp. NOTE that the test is modified according to the
11007          * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
11008          */
11009         if ((to->to_flags & TOF_TS) != 0 &&
11010             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
11011                 tp->ts_recent_age = tcp_ts_getticks();
11012                 tp->ts_recent = to->to_tsval;
11013         }
11014         /*
11015          * This is a pure ack for outstanding data.
11016          */
11017         KMOD_TCPSTAT_INC(tcps_predack);
11018
11019         /*
11020          * "bad retransmit" recovery.
               *
               * Only considered when TF_PREVVALID is set and no timestamp has
               * been received on this connection (TF_RCVD_TSTMP clear).  The
               * flag is consumed; if exactly one retransmit was made
               * (t_rxtshift == 1) and ticks is still before t_badrxtwin, the
               * RTO is reported to congestion control as erroneous.
11021          */
11022         if ((tp->t_flags & TF_PREVVALID) &&
11023             ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
11024                 tp->t_flags &= ~TF_PREVVALID;
11025                 if (tp->t_rxtshift == 1 &&
11026                     (int)(ticks - tp->t_badrxtwin) < 0)
11027                         rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
11028         }
11029         /*
11030          * Compute how many bytes this ACK newly acknowledges.
11031          *
11032          * NOTE(review): the SEQ_LEQ(th_ack, snd_una) rejection at the
11033          * top suggests this is always non-zero here, which would make
11034          * the "if (acked)" tests below purely defensive -- confirm.
11035          */
11036         acked = BYTES_THIS_ACK(tp, th);
11037
11038 #ifdef TCP_HHOOK
11039         /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
11040         hhook_run_tcp_est_in(tp, th, to);
11041 #endif
11042         KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
11043         KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
11044         if (acked) {
11045                 struct mbuf *mfree;
11046
11047                 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
11048                 SOCKBUF_LOCK(&so->so_snd);
                      /* Detach the acked bytes; the chain is freed further down. */
11049                 mfree = sbcut_locked(&so->so_snd, acked);
11050                 tp->snd_una = th->th_ack;
11051                 /* Note we want to hold the sb lock through the sendmap adjust */
11052                 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
11053                 /* Wake up the socket if we have room to write more */
11054                 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
                      /*
                       * NOTE(review): no explicit SOCKBUF_UNLOCK follows;
                       * presumably sowwakeup_locked() releases so_snd -- confirm.
                       */
11055                 sowwakeup_locked(so);
11056                 m_freem(mfree);
                      /*
                       * Forward progress: clear the retransmit backoff shift and
                       * refresh t_rxtcur (RACK_TCPT_RANGESET is defined elsewhere).
                       */
11057                 tp->t_rxtshift = 0;
11058                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11059                               rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11060                 rack->rc_tlp_in_progress = 0;
11061                 rack->r_ctl.rc_tlp_cnt_out = 0;
11062                 /*
11063                  * If it is the RXT timer we want to
11064                  * stop it, so we can restart a TLP.
11065                  */
11066                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
11067                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11068 #ifdef NETFLIX_HTTP_LOGGING
11069                 tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
11070 #endif
11071         }
11072         /*
11073          * Compare the (possibly just updated) send window with what
11074          * ctf_outstanding() reports: if the window is smaller, mark it
11075          * collapsed; otherwise undo a previously recorded collapse.
11076          */
11077         if (tp->snd_wnd < ctf_outstanding(tp)) {
11078                 /* The peer collapsed the window */
11079                 rack_collapsed_window(rack);
11080         } else if (rack->rc_has_collapsed)
11081                 rack_un_collapse_window(rack);
11082
11083         /*
11084          * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
11085          */
11086         tp->snd_wl2 = th->th_ack;
11087         tp->t_dupacks = 0;
              /* The segment is consumed here; from now on only 1 is returned. */
11088         m_freem(m);
11089         /* ND6_HINT(tp);         *//* Some progress has been made. */
11090
11091         /*
11092          * Wrap up: optionally trace, run the pacing "bottom drag" check,
11093          * and if everything sent is now acked record the idle time and
11094          * cancel the timer via rack_timer_cancel().  If the send buffer
11095          * still holds data, r_wanted_output is set before returning.
11096          *
11097          */
11098 #ifdef TCPDEBUG
11099         if (so->so_options & SO_DEBUG)
11100                 tcp_trace(TA_INPUT, ostate, tp,
11101                     (void *)tcp_saveipgen,
11102                     &tcp_savetcp, 0);
11103 #endif
11104         if (under_pacing &&
11105             (rack->use_fixed_rate == 0) &&
11106             (rack->in_probe_rtt == 0) &&
11107             rack->rc_gp_dyn_mul &&
11108             rack->rc_always_pace) {
11109                 /* Check if we are dragging bottom */
11110                 rack_check_bottom_drag(tp, rack, so, acked);
11111         }
11112         if (tp->snd_una == tp->snd_max) {
                      /* Nothing is outstanding any more: reset per-flight state. */
11113                 tp->t_flags &= ~TF_PREVVALID;
11114                 rack->r_ctl.retran_during_recovery = 0;
11115                 rack->r_ctl.dsack_byte_cnt = 0;
11116                 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
                      /* Never store 0 as the idle time; bump it to 1 instead. */
11117                 if (rack->r_ctl.rc_went_idle_time == 0)
11118                         rack->r_ctl.rc_went_idle_time = 1;
11119                 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
11120                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
11121                         tp->t_acktime = 0;
11122                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11123         }
11124         if (acked && rack->r_fast_output)
11125                 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
11126         if (sbavail(&so->so_snd)) {
11127                 rack->r_wanted_output = 1;
11128         }
11129         return (1);
11130 }
11131
11132 /*
       * Process a segment received while the connection is in SYN_SENT.
       *
11133  * Return value of 1, the TCB is unlocked and most
11134  * likely gone, return value of 0, the TCB is still
11135  * locked.
       *
       * The drop paths below return 1 directly.  Otherwise the result is
       * ret_val as filled in by rack_process_ack() when that call returns
       * non-zero, or whatever rack_process_data() returns.
11136  */
11137 static int
11138 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
11139     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11140     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11141 {
11142         int32_t ret_val = 0;
11143         int32_t todrop;
11144         int32_t ourfinisacked = 0;
11145         struct tcp_rack *rack;
11146
11147         ctf_calc_rwin(so, tp);
11148         /*
11149          * If the state is SYN_SENT: if seg contains an ACK, but not for our
11150          * SYN, drop the input. if seg contains a RST, then drop the
11151          * connection. if seg does not contain SYN, then drop it. Otherwise
11152          * this is an acceptable SYN segment initialize tp->rcv_nxt and
11153          * tp->irs if seg contains ack then advance tp->snd_una if seg
11154          * contains an ECE and ECN support is enabled, the stream is ECN
11155          * capable. if SYN has been acked change to ESTABLISHED else
11156          * SYN_RCVD state arrange for segment to be acked (eventually)
11157          * continue processing rest of data/controls.
11158          */
              /* An ACK at or below iss, or above snd_max, is answered with a reset. */
11159         if ((thflags & TH_ACK) &&
11160             (SEQ_LEQ(th->th_ack, tp->iss) ||
11161             SEQ_GT(th->th_ack, tp->snd_max))) {
11162                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11163                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11164                 return (1);
11165         }
              /* RST with an acceptable ACK: drop the connection with ECONNREFUSED. */
11166         if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
11167                 TCP_PROBE5(connect__refused, NULL, tp,
11168                     mtod(m, const char *), tp, th);
11169                 tp = tcp_drop(tp, ECONNREFUSED);
11170                 ctf_do_drop(m, tp);
11171                 return (1);
11172         }
              /* A RST without ACK only discards the segment. */
11173         if (thflags & TH_RST) {
11174                 ctf_do_drop(m, tp);
11175                 return (1);
11176         }
              /* Anything without SYN is discarded as well. */
11177         if (!(thflags & TH_SYN)) {
11178                 ctf_do_drop(m, tp);
11179                 return (1);
11180         }
              /* Acceptable SYN: record the peer's initial sequence number. */
11181         tp->irs = th->th_seq;
11182         tcp_rcvseqinit(tp);
11183         rack = (struct tcp_rack *)tp->t_fb_ptr;
11184         if (thflags & TH_ACK) {
                      /* Set when a TFO SYN carried data that is not fully acked. */
11185                 int tfo_partial = 0;
11186
11187                 KMOD_TCPSTAT_INC(tcps_connects);
11188                 soisconnected(so);
11189 #ifdef MAC
11190                 mac_socketpeer_set_from_mbuf(m, so);
11191 #endif
11192                 /* Do window scaling on this connection? */
11193                 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11194                     (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11195                         tp->rcv_scale = tp->request_r_scale;
11196                 }
11197                 tp->rcv_adv += min(tp->rcv_wnd,
11198                     TCP_MAXWIN << tp->rcv_scale);
11199                 /*
11200                  * If not all the data that was sent in the TFO SYN
11201                  * has been acked, resend the remainder right away.
11202                  */
11203                 if (IS_FASTOPEN(tp->t_flags) &&
11204                     (tp->snd_una != tp->snd_max)) {
11205                         tp->snd_nxt = th->th_ack;
11206                         tfo_partial = 1;
11207                 }
11208                 /*
11209                  * If there's data, delay ACK; if there's also a FIN ACKNOW
11210                  * will be turned on later.
11211                  */
11212                 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
11213                         rack_timer_cancel(tp, rack,
11214                                           rack->r_ctl.rc_rcvtime, __LINE__);
11215                         tp->t_flags |= TF_DELACK;
11216                 } else {
11217                         rack->r_wanted_output = 1;
11218                         tp->t_flags |= TF_ACKNOW;
11219                         rack->rc_dack_toggle = 0;
11220                 }
11221
11222                 tcp_ecn_input_syn_sent(tp, thflags, iptos);
11223
11224                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
11225                         /*
11226                          * We advance snd_una for the
11227                          * fast open case. If th_ack is
11228                          * acknowledging data beyond
11229                          * snd_una we can't just call
11230                          * ack-processing since the
11231                          * data stream in our send-map
11232                          * will start at snd_una + 1 (one
11233                          * beyond the SYN). If its just
11234                          * equal we don't need to do that
11235                          * and there is no send_map.
11236                          */
11237                         tp->snd_una++;
11238                 }
11239                 /*
11240                  * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
11241                  * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
11242                  */
11243                 tp->t_starttime = ticks;
11244                 if (tp->t_flags & TF_NEEDFIN) {
11245                         tcp_state_change(tp, TCPS_FIN_WAIT_1);
11246                         tp->t_flags &= ~TF_NEEDFIN;
                              /* Only the local copy of the flags is changed. */
11247                         thflags &= ~TH_SYN;
11248                 } else {
11249                         tcp_state_change(tp, TCPS_ESTABLISHED);
11250                         TCP_PROBE5(connect__established, NULL, tp,
11251                             mtod(m, const char *), tp, th);
11252                         rack_cc_conn_init(tp);
11253                 }
11254         } else {
11255                 /*
11256                  * Received initial SYN in SYN-SENT[*] state => simultaneous
11257                  * open.  If segment contains CC option and there is a
11258                  * cached CC, apply TAO test. If it succeeds, connection is
11259                  * half-synchronized. Otherwise, do 3-way handshake:
11260                  * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
11261                  * there was no CC option, clear cached CC value.
                       *
                       * NOTE(review): no CC option / TAO handling is visible in
                       * this branch; it only sets TF_ACKNOW | TF_NEEDSYN and
                       * moves to SYN_RECEIVED.
11262                  */
11263                 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
11264                 tcp_state_change(tp, TCPS_SYN_RECEIVED);
11265         }
11266         INP_WLOCK_ASSERT(tp->t_inpcb);
11267         /*
11268          * Advance th->th_seq to correspond to first data byte. If data,
11269          * trim to stay within window, dropping FIN if necessary.
11270          */
11271         th->th_seq++;
11272         if (tlen > tp->rcv_wnd) {
11273                 todrop = tlen - tp->rcv_wnd;
                      /* A negative count trims from the tail of the chain (mbuf(9)). */
11274                 m_adj(m, -todrop);
11275                 tlen = tp->rcv_wnd;
11276                 thflags &= ~TH_FIN;
11277                 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
11278                 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
11279         }
              /* th_seq was advanced past the SYN above, so this is the SYN's seq. */
11280         tp->snd_wl1 = th->th_seq - 1;
11281         tp->rcv_up = th->th_seq;
11282         /*
11283          * Client side of transaction: already sent SYN and data. If the
11284          * remote host used T/TCP to validate the SYN, our data will be
11285          * ACK'd; if so, enter normal data segment processing in the middle
11286          * of step 5, ack processing. Otherwise, goto step 6.
11287          */
11288         if (thflags & TH_ACK) {
11289                 /* For syn-sent we need to possibly update the rtt */
11290                 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11291                         uint32_t t, mcts;
11292
                              /*
                               * Elapsed timestamp ticks since the echoed value,
                               * scaled by HPTS_USEC_IN_MSEC (presumably ms to
                               * usec -- confirm against the macro).
                               */
11293                         mcts = tcp_ts_getticks();
11294                         t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
                              /* Keep the lowest sample seen; 0 means "none yet". */
11295                         if (!tp->t_rttlow || tp->t_rttlow > t)
11296                                 tp->t_rttlow = t;
11297                         rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
11298                         tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11299                         tcp_rack_xmit_timer_commit(rack, tp);
11300                 }
                      /* Non-zero means ACK processing finished the segment. */
11301                 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
11302                         return (ret_val);
11303                 /* We may have changed to FIN_WAIT_1 above */
11304                 if (tp->t_state == TCPS_FIN_WAIT_1) {
11305                         /*
11306                          * In FIN_WAIT_1 STATE in addition to the processing
11307                          * for the ESTABLISHED state if our FIN is now
11308                          * acknowledged then enter FIN_WAIT_2.
11309                          */
11310                         if (ourfinisacked) {
11311                                 /*
11312                                  * If we can't receive any more data, then
11313                                  * closing user can proceed. Starting the
11314                                  * timer is contrary to the specification,
11315                                  * but if we don't get a FIN we'll hang
11316                                  * forever.
11317                                  *
11318                                  * XXXjl: we should release the tp also, and
11319                                  * use a compressed state.
11320                                  */
11321                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11322                                         soisdisconnected(so);
11323                                         tcp_timer_activate(tp, TT_2MSL,
11324                                             (tcp_fast_finwait2_recycle ?
11325                                             tcp_finwait2_timeout :
11326                                             TP_MAXIDLE(tp)));
11327                                 }
11328                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
11329                         }
11330                 }
11331         }
              /* Hand any remaining data/FIN to the common data path. */
11332         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11333            tiwin, thflags, nxt_pkt));
11334 }
11335
11336 /*
       * Process a segment received while the connection is in SYN_RECEIVED.
       *
11337  * Return value of 1, the TCB is unlocked and most
11338  * likely gone, return value of 0, the TCB is still
11339  * locked.
       *
       * The iptos argument is not referenced in this function.
11340  */
11341 static int
11342 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
11343     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11344     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11345 {
11346         struct tcp_rack *rack;
11347         int32_t ret_val = 0;
11348         int32_t ourfinisacked = 0;
11349
11350         ctf_calc_rwin(so, tp);
              /* An ACK at or below snd_una, or above snd_max, is answered with a reset. */
11351         if ((thflags & TH_ACK) &&
11352             (SEQ_LEQ(th->th_ack, tp->snd_una) ||
11353             SEQ_GT(th->th_ack, tp->snd_max))) {
11354                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11355                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11356                 return (1);
11357         }
11358         rack = (struct tcp_rack *)tp->t_fb_ptr;
11359         if (IS_FASTOPEN(tp->t_flags)) {
11360                 /*
11361                  * When a TFO connection is in SYN_RECEIVED, the
11362                  * only valid packets are the initial SYN, a
11363                  * retransmit/copy of the initial SYN (possibly with
11364                  * a subset of the original data), a valid ACK, a
11365                  * FIN, or a RST.
11366                  */
11367                 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
                              /* SYN|ACK is not in the list above: reset. */
11368                         tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11369                         ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11370                         return (1);
11371                 } else if (thflags & TH_SYN) {
11372                         /* non-initial SYN is ignored */
                              /*
                               * The drop only happens while one of the RXT, TLP
                               * or RACK timer flags is set in rc_hpts_flags;
                               * otherwise processing continues below.
                               */
11373                         if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
11374                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
11375                             (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
11376                                 ctf_do_drop(m, NULL);
11377                                 return (0);
11378                         }
11379                 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
                              /* None of ACK/FIN/RST set: discard, keep the TCB. */
11380                         ctf_do_drop(m, NULL);
11381                         return (0);
11382                 }
11383         }
11384
              /* A RST, or a FIN when t_fin_is_rst is set, goes to reset processing. */
11385         if ((thflags & TH_RST) ||
11386             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11387                 return (__ctf_process_rst(m, th, so, tp,
11388                                           &rack->r_ctl.challenge_ack_ts,
11389                                           &rack->r_ctl.challenge_ack_cnt));
11390         /*
11391          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11392          * it's less than ts_recent, drop it.
11393          */
11394         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11395             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11396                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11397                         return (ret_val);
11398         }
11399         /*
11400          * In the SYN-RECEIVED state, validate that the packet belongs to
11401          * this connection before trimming the data to fit the receive
11402          * window.  Check the sequence number versus IRS since we know the
11403          * sequence numbers haven't wrapped.  This is a partial fix for the
11404          * "LAND" DoS attack.
11405          */
11406         if (SEQ_LT(th->th_seq, tp->irs)) {
11407                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
11408                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11409                 return (1);
11410         }
              /* tlen, thflags and drop_hdrlen are passed by address and may be adjusted. */
11411         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11412                               &rack->r_ctl.challenge_ack_ts,
11413                               &rack->r_ctl.challenge_ack_cnt)) {
11414                 return (ret_val);
11415         }
11416         /*
11417          * If last ACK falls within this segment's sequence numbers, record
11418          * its timestamp. NOTE: 1) That the test incorporates suggestions
11419          * from the latest proposal of the tcplw@cray.com list (Braden
11420          * 1993/04/26). 2) That updating only on newer timestamps interferes
11421          * with our earlier PAWS tests, so this check should be solely
11422          * predicated on the sequence space of this segment. 3) That we
11423          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11424          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11425          * SEG.Len, This modified check allows us to overcome RFC1323's
11426          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11427          * p.869. In such cases, we can still calculate the RTT correctly
11428          * when RCV.NXT == Last.ACK.Sent.
11429          */
11430         if ((to->to_flags & TOF_TS) != 0 &&
11431             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11432             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11433             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11434                 tp->ts_recent_age = tcp_ts_getticks();
11435                 tp->ts_recent = to->to_tsval;
11436         }
11437         tp->snd_wnd = tiwin;
11438         rack_validate_fo_sendwin_up(tp, rack);
11439         /*
11440          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11441          * is on (half-synchronized state), then queue data for later
11442          * processing; else drop segment and return.
11443          */
11444         if ((thflags & TH_ACK) == 0) {
                      /* For TFO, congestion control is initialised on this path. */
11445                 if (IS_FASTOPEN(tp->t_flags)) {
11446                         rack_cc_conn_init(tp);
11447                 }
11448                 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11449                     tiwin, thflags, nxt_pkt));
11450         }
11451         KMOD_TCPSTAT_INC(tcps_connects);
11452         soisconnected(so);
11453         /* Do window scaling? */
11454         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11455             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11456                 tp->rcv_scale = tp->request_r_scale;
11457         }
11458         /*
11459          * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
11460          * FIN-WAIT-1
11461          */
11462         tp->t_starttime = ticks;
              /* Release the TFO pending counter reference, if one is held. */
11463         if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
11464                 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
11465                 tp->t_tfo_pending = NULL;
11466         }
11467         if (tp->t_flags & TF_NEEDFIN) {
11468                 tcp_state_change(tp, TCPS_FIN_WAIT_1);
11469                 tp->t_flags &= ~TF_NEEDFIN;
11470         } else {
11471                 tcp_state_change(tp, TCPS_ESTABLISHED);
11472                 TCP_PROBE5(accept__established, NULL, tp,
11473                     mtod(m, const char *), tp, th);
11474                 /*
11475                  * TFO connections call cc_conn_init() during SYN
11476                  * processing.  Calling it again here for such connections
11477                  * is not harmless as it would undo the snd_cwnd reduction
11478                  * that occurs when a TFO SYN|ACK is retransmitted.
11479                  */
11480                 if (!IS_FASTOPEN(tp->t_flags))
11481                         rack_cc_conn_init(tp);
11482         }
11483         /*
11484          * Account for the ACK of our SYN prior to
11485          * regular ACK processing below, except for
11486          * simultaneous SYN, which is handled later.
11487          */
11488         if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
11489                 tp->snd_una++;
11490         /*
11491          * If segment contains data or ACK, will call tcp_reass() later; if
11492          * not, do so now to pass queued data to user.
11493          */
11494         if (tlen == 0 && (thflags & TH_FIN) == 0) {
11495                 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
11496                     (struct mbuf *)0);
                      /*
                       * NOTE(review): no SOCKBUF_LOCK is taken here, so
                       * tcp_reass() presumably returns with so_rcv locked
                       * whenever it sets TF_WAKESOR -- confirm.
                       */
11497                 if (tp->t_flags & TF_WAKESOR) {
11498                         tp->t_flags &= ~TF_WAKESOR;
11499                         /* NB: sorwakeup_locked() does an implicit unlock. */
11500                         sorwakeup_locked(so);
11501                 }
11502         }
11503         tp->snd_wl1 = th->th_seq - 1;
11504         /* For syn-recv we need to possibly update the rtt */
11505         if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
11506                 uint32_t t, mcts;
11507
                      /*
                       * Elapsed timestamp ticks since the echoed value, scaled
                       * by HPTS_USEC_IN_MSEC (presumably ms to usec -- confirm
                       * against the macro).
                       */
11508                 mcts = tcp_ts_getticks();
11509                 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
                      /* Keep the lowest sample seen; 0 means "none yet". */
11510                 if (!tp->t_rttlow || tp->t_rttlow > t)
11511                         tp->t_rttlow = t;
11512                 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
11513                 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
11514                 tcp_rack_xmit_timer_commit(rack, tp);
11515         }
              /* Non-zero means ACK processing finished the segment. */
11516         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
11517                 return (ret_val);
11518         }
11519         if (tp->t_state == TCPS_FIN_WAIT_1) {
11520                 /* We could have gone to FIN_WAIT_1 (or EST) above */
11521                 /*
11522                  * In FIN_WAIT_1 STATE in addition to the processing for the
11523                  * ESTABLISHED state if our FIN is now acknowledged then
11524                  * enter FIN_WAIT_2.
11525                  */
11526                 if (ourfinisacked) {
11527                         /*
11528                          * If we can't receive any more data, then closing
11529                          * user can proceed. Starting the timer is contrary
11530                          * to the specification, but if we don't get a FIN
11531                          * we'll hang forever.
11532                          *
11533                          * XXXjl: we should release the tp also, and use a
11534                          * compressed state.
11535                          */
11536                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11537                                 soisdisconnected(so);
11538                                 tcp_timer_activate(tp, TT_2MSL,
11539                                     (tcp_fast_finwait2_recycle ?
11540                                     tcp_finwait2_timeout :
11541                                     TP_MAXIDLE(tp)));
11542                         }
11543                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
11544                 }
11545         }
              /* Hand any remaining data/FIN to the common data path. */
11546         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11547             tiwin, thflags, nxt_pkt));
11548 }
11549
11550 /*
11551  * Return value of 1, the TCB is unlocked and most
11552  * likely gone, return value of 0, the TCP is still
11553  * locked.
11554  */
11555 static int
11556 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
11557     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11558     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11559 {
11560         int32_t ret_val = 0;
11561         struct tcp_rack *rack;
11562
11563         /*
11564          * Header prediction: check for the two common cases of a
11565          * uni-directional data xfer.  If the packet has no control flags,
11566          * is in-sequence, the window didn't change and we're not
11567          * retransmitting, it's a candidate.  If the length is zero and the
11568          * ack moved forward, we're the sender side of the xfer.  Just free
11569          * the data acked & wake any higher level process that was blocked
11570          * waiting for space.  If the length is non-zero and the ack didn't
11571          * move, we're the receiver side.  If we're getting packets in-order
11572          * (the reassembly queue is empty), add the data toc The socket
11573          * buffer and note that we need a delayed ack. Make sure that the
11574          * hidden state-flags are also off. Since we check for
11575          * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
11576          */
11577         rack = (struct tcp_rack *)tp->t_fb_ptr;
11578         if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
11579             __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
11580             __predict_true(SEGQ_EMPTY(tp)) &&
11581             __predict_true(th->th_seq == tp->rcv_nxt)) {
11582                 if (tlen == 0) {
11583                         if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
11584                             tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
11585                                 return (0);
11586                         }
11587                 } else {
11588                         if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
11589                             tiwin, nxt_pkt, iptos)) {
11590                                 return (0);
11591                         }
11592                 }
11593         }
11594         ctf_calc_rwin(so, tp);
11595
11596         if ((thflags & TH_RST) ||
11597             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11598                 return (__ctf_process_rst(m, th, so, tp,
11599                                           &rack->r_ctl.challenge_ack_ts,
11600                                           &rack->r_ctl.challenge_ack_cnt));
11601
11602         /*
11603          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11604          * synchronized state.
11605          */
11606         if (thflags & TH_SYN) {
11607                 ctf_challenge_ack(m, th, tp, &ret_val);
11608                 return (ret_val);
11609         }
11610         /*
11611          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11612          * it's less than ts_recent, drop it.
11613          */
11614         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11615             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11616                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11617                         return (ret_val);
11618         }
11619         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11620                               &rack->r_ctl.challenge_ack_ts,
11621                               &rack->r_ctl.challenge_ack_cnt)) {
11622                 return (ret_val);
11623         }
11624         /*
11625          * If last ACK falls within this segment's sequence numbers, record
11626          * its timestamp. NOTE: 1) That the test incorporates suggestions
11627          * from the latest proposal of the tcplw@cray.com list (Braden
11628          * 1993/04/26). 2) That updating only on newer timestamps interferes
11629          * with our earlier PAWS tests, so this check should be solely
11630          * predicated on the sequence space of this segment. 3) That we
11631          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11632          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11633          * SEG.Len, This modified check allows us to overcome RFC1323's
11634          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11635          * p.869. In such cases, we can still calculate the RTT correctly
11636          * when RCV.NXT == Last.ACK.Sent.
11637          */
11638         if ((to->to_flags & TOF_TS) != 0 &&
11639             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11640             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11641             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11642                 tp->ts_recent_age = tcp_ts_getticks();
11643                 tp->ts_recent = to->to_tsval;
11644         }
11645         /*
11646          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11647          * is on (half-synchronized state), then queue data for later
11648          * processing; else drop segment and return.
11649          */
11650         if ((thflags & TH_ACK) == 0) {
11651                 if (tp->t_flags & TF_NEEDSYN) {
11652                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11653                             tiwin, thflags, nxt_pkt));
11654
11655                 } else if (tp->t_flags & TF_ACKNOW) {
11656                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11657                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11658                         return (ret_val);
11659                 } else {
11660                         ctf_do_drop(m, NULL);
11661                         return (0);
11662                 }
11663         }
11664         /*
11665          * Ack processing.
11666          */
11667         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11668                 return (ret_val);
11669         }
11670         if (sbavail(&so->so_snd)) {
11671                 if (ctf_progress_timeout_check(tp, true)) {
11672                         rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
11673                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11674                         return (1);
11675                 }
11676         }
11677         /* State changes only happen in rack_process_data() */
11678         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11679             tiwin, thflags, nxt_pkt));
11680 }
11681
11682 /*
11683  * Return value of 1, the TCB is unlocked and most
11684  * likely gone, return value of 0, the TCP is still
11685  * locked.
11686  */
11687 static int
11688 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
11689     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11690     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11691 {
11692         int32_t ret_val = 0;
11693         struct tcp_rack *rack;
11694
11695         rack = (struct tcp_rack *)tp->t_fb_ptr;
11696         ctf_calc_rwin(so, tp);
11697         if ((thflags & TH_RST) ||
11698             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11699                 return (__ctf_process_rst(m, th, so, tp,
11700                                           &rack->r_ctl.challenge_ack_ts,
11701                                           &rack->r_ctl.challenge_ack_cnt));
11702         /*
11703          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11704          * synchronized state.
11705          */
11706         if (thflags & TH_SYN) {
11707                 ctf_challenge_ack(m, th, tp, &ret_val);
11708                 return (ret_val);
11709         }
11710         /*
11711          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11712          * it's less than ts_recent, drop it.
11713          */
11714         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11715             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11716                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11717                         return (ret_val);
11718         }
11719         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11720                               &rack->r_ctl.challenge_ack_ts,
11721                               &rack->r_ctl.challenge_ack_cnt)) {
11722                 return (ret_val);
11723         }
11724         /*
11725          * If last ACK falls within this segment's sequence numbers, record
11726          * its timestamp. NOTE: 1) That the test incorporates suggestions
11727          * from the latest proposal of the tcplw@cray.com list (Braden
11728          * 1993/04/26). 2) That updating only on newer timestamps interferes
11729          * with our earlier PAWS tests, so this check should be solely
11730          * predicated on the sequence space of this segment. 3) That we
11731          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11732          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11733          * SEG.Len, This modified check allows us to overcome RFC1323's
11734          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11735          * p.869. In such cases, we can still calculate the RTT correctly
11736          * when RCV.NXT == Last.ACK.Sent.
11737          */
11738         if ((to->to_flags & TOF_TS) != 0 &&
11739             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11740             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11741             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11742                 tp->ts_recent_age = tcp_ts_getticks();
11743                 tp->ts_recent = to->to_tsval;
11744         }
11745         /*
11746          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11747          * is on (half-synchronized state), then queue data for later
11748          * processing; else drop segment and return.
11749          */
11750         if ((thflags & TH_ACK) == 0) {
11751                 if (tp->t_flags & TF_NEEDSYN) {
11752                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11753                             tiwin, thflags, nxt_pkt));
11754
11755                 } else if (tp->t_flags & TF_ACKNOW) {
11756                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11757                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11758                         return (ret_val);
11759                 } else {
11760                         ctf_do_drop(m, NULL);
11761                         return (0);
11762                 }
11763         }
11764         /*
11765          * Ack processing.
11766          */
11767         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
11768                 return (ret_val);
11769         }
11770         if (sbavail(&so->so_snd)) {
11771                 if (ctf_progress_timeout_check(tp, true)) {
11772                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
11773                                                 tp, tick, PROGRESS_DROP, __LINE__);
11774                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11775                         return (1);
11776                 }
11777         }
11778         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11779             tiwin, thflags, nxt_pkt));
11780 }
11781
11782 static int
11783 rack_check_data_after_close(struct mbuf *m,
11784     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
11785 {
11786         struct tcp_rack *rack;
11787
11788         rack = (struct tcp_rack *)tp->t_fb_ptr;
11789         if (rack->rc_allow_data_af_clo == 0) {
11790         close_now:
11791                 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11792                 /* tcp_close will kill the inp pre-log the Reset */
11793                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
11794                 tp = tcp_close(tp);
11795                 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
11796                 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
11797                 return (1);
11798         }
11799         if (sbavail(&so->so_snd) == 0)
11800                 goto close_now;
11801         /* Ok we allow data that is ignored and a followup reset */
11802         tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
11803         tp->rcv_nxt = th->th_seq + *tlen;
11804         tp->t_flags2 |= TF2_DROP_AF_DATA;
11805         rack->r_wanted_output = 1;
11806         *tlen = 0;
11807         return (0);
11808 }
11809
11810 /*
11811  * Return value of 1, the TCB is unlocked and most
11812  * likely gone, return value of 0, the TCP is still
11813  * locked.
11814  */
11815 static int
11816 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
11817     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11818     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11819 {
11820         int32_t ret_val = 0;
11821         int32_t ourfinisacked = 0;
11822         struct tcp_rack *rack;
11823
11824         rack = (struct tcp_rack *)tp->t_fb_ptr;
11825         ctf_calc_rwin(so, tp);
11826
11827         if ((thflags & TH_RST) ||
11828             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11829                 return (__ctf_process_rst(m, th, so, tp,
11830                                           &rack->r_ctl.challenge_ack_ts,
11831                                           &rack->r_ctl.challenge_ack_cnt));
11832         /*
11833          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11834          * synchronized state.
11835          */
11836         if (thflags & TH_SYN) {
11837                 ctf_challenge_ack(m, th, tp, &ret_val);
11838                 return (ret_val);
11839         }
11840         /*
11841          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11842          * it's less than ts_recent, drop it.
11843          */
11844         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11845             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11846                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11847                         return (ret_val);
11848         }
11849         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11850                               &rack->r_ctl.challenge_ack_ts,
11851                               &rack->r_ctl.challenge_ack_cnt)) {
11852                 return (ret_val);
11853         }
11854         /*
11855          * If new data are received on a connection after the user processes
11856          * are gone, then RST the other end.
11857          */
11858         if ((tp->t_flags & TF_CLOSED) && tlen &&
11859             rack_check_data_after_close(m, tp, &tlen, th, so))
11860                 return (1);
11861         /*
11862          * If last ACK falls within this segment's sequence numbers, record
11863          * its timestamp. NOTE: 1) That the test incorporates suggestions
11864          * from the latest proposal of the tcplw@cray.com list (Braden
11865          * 1993/04/26). 2) That updating only on newer timestamps interferes
11866          * with our earlier PAWS tests, so this check should be solely
11867          * predicated on the sequence space of this segment. 3) That we
11868          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11869          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11870          * SEG.Len, This modified check allows us to overcome RFC1323's
11871          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11872          * p.869. In such cases, we can still calculate the RTT correctly
11873          * when RCV.NXT == Last.ACK.Sent.
11874          */
11875         if ((to->to_flags & TOF_TS) != 0 &&
11876             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
11877             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
11878             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
11879                 tp->ts_recent_age = tcp_ts_getticks();
11880                 tp->ts_recent = to->to_tsval;
11881         }
11882         /*
11883          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
11884          * is on (half-synchronized state), then queue data for later
11885          * processing; else drop segment and return.
11886          */
11887         if ((thflags & TH_ACK) == 0) {
11888                 if (tp->t_flags & TF_NEEDSYN) {
11889                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11890                             tiwin, thflags, nxt_pkt));
11891                 } else if (tp->t_flags & TF_ACKNOW) {
11892                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
11893                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
11894                         return (ret_val);
11895                 } else {
11896                         ctf_do_drop(m, NULL);
11897                         return (0);
11898                 }
11899         }
11900         /*
11901          * Ack processing.
11902          */
11903         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
11904                 return (ret_val);
11905         }
11906         if (ourfinisacked) {
11907                 /*
11908                  * If we can't receive any more data, then closing user can
11909                  * proceed. Starting the timer is contrary to the
11910                  * specification, but if we don't get a FIN we'll hang
11911                  * forever.
11912                  *
11913                  * XXXjl: we should release the tp also, and use a
11914                  * compressed state.
11915                  */
11916                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
11917                         soisdisconnected(so);
11918                         tcp_timer_activate(tp, TT_2MSL,
11919                             (tcp_fast_finwait2_recycle ?
11920                             tcp_finwait2_timeout :
11921                             TP_MAXIDLE(tp)));
11922                 }
11923                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
11924         }
11925         if (sbavail(&so->so_snd)) {
11926                 if (ctf_progress_timeout_check(tp, true)) {
11927                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
11928                                                 tp, tick, PROGRESS_DROP, __LINE__);
11929                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
11930                         return (1);
11931                 }
11932         }
11933         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
11934             tiwin, thflags, nxt_pkt));
11935 }
11936
11937 /*
11938  * Return value of 1, the TCB is unlocked and most
11939  * likely gone, return value of 0, the TCP is still
11940  * locked.
11941  */
11942 static int
11943 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
11944     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
11945     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
11946 {
11947         int32_t ret_val = 0;
11948         int32_t ourfinisacked = 0;
11949         struct tcp_rack *rack;
11950
11951         rack = (struct tcp_rack *)tp->t_fb_ptr;
11952         ctf_calc_rwin(so, tp);
11953
11954         if ((thflags & TH_RST) ||
11955             (tp->t_fin_is_rst && (thflags & TH_FIN)))
11956                 return (__ctf_process_rst(m, th, so, tp,
11957                                           &rack->r_ctl.challenge_ack_ts,
11958                                           &rack->r_ctl.challenge_ack_cnt));
11959         /*
11960          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
11961          * synchronized state.
11962          */
11963         if (thflags & TH_SYN) {
11964                 ctf_challenge_ack(m, th, tp, &ret_val);
11965                 return (ret_val);
11966         }
11967         /*
11968          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
11969          * it's less than ts_recent, drop it.
11970          */
11971         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
11972             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
11973                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
11974                         return (ret_val);
11975         }
11976         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
11977                               &rack->r_ctl.challenge_ack_ts,
11978                               &rack->r_ctl.challenge_ack_cnt)) {
11979                 return (ret_val);
11980         }
11981         /*
11982          * If new data are received on a connection after the user processes
11983          * are gone, then RST the other end.
11984          */
11985         if ((tp->t_flags & TF_CLOSED) && tlen &&
11986             rack_check_data_after_close(m, tp, &tlen, th, so))
11987                 return (1);
11988         /*
11989          * If last ACK falls within this segment's sequence numbers, record
11990          * its timestamp. NOTE: 1) That the test incorporates suggestions
11991          * from the latest proposal of the tcplw@cray.com list (Braden
11992          * 1993/04/26). 2) That updating only on newer timestamps interferes
11993          * with our earlier PAWS tests, so this check should be solely
11994          * predicated on the sequence space of this segment. 3) That we
11995          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
11996          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
11997          * SEG.Len, This modified check allows us to overcome RFC1323's
11998          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
11999          * p.869. In such cases, we can still calculate the RTT correctly
12000          * when RCV.NXT == Last.ACK.Sent.
12001          */
12002         if ((to->to_flags & TOF_TS) != 0 &&
12003             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12004             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12005             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12006                 tp->ts_recent_age = tcp_ts_getticks();
12007                 tp->ts_recent = to->to_tsval;
12008         }
12009         /*
12010          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12011          * is on (half-synchronized state), then queue data for later
12012          * processing; else drop segment and return.
12013          */
12014         if ((thflags & TH_ACK) == 0) {
12015                 if (tp->t_flags & TF_NEEDSYN) {
12016                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12017                             tiwin, thflags, nxt_pkt));
12018                 } else if (tp->t_flags & TF_ACKNOW) {
12019                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12020                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12021                         return (ret_val);
12022                 } else {
12023                         ctf_do_drop(m, NULL);
12024                         return (0);
12025                 }
12026         }
12027         /*
12028          * Ack processing.
12029          */
12030         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12031                 return (ret_val);
12032         }
12033         if (ourfinisacked) {
12034                 tcp_twstart(tp);
12035                 m_freem(m);
12036                 return (1);
12037         }
12038         if (sbavail(&so->so_snd)) {
12039                 if (ctf_progress_timeout_check(tp, true)) {
12040                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12041                                                 tp, tick, PROGRESS_DROP, __LINE__);
12042                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12043                         return (1);
12044                 }
12045         }
12046         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12047             tiwin, thflags, nxt_pkt));
12048 }
12049
12050 /*
12051  * Return value of 1, the TCB is unlocked and most
12052  * likely gone, return value of 0, the TCP is still
12053  * locked.
12054  */
12055 static int
12056 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12057     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12058     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12059 {
12060         int32_t ret_val = 0;
12061         int32_t ourfinisacked = 0;
12062         struct tcp_rack *rack;
12063
12064         rack = (struct tcp_rack *)tp->t_fb_ptr;
12065         ctf_calc_rwin(so, tp);
12066
12067         if ((thflags & TH_RST) ||
12068             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12069                 return (__ctf_process_rst(m, th, so, tp,
12070                                           &rack->r_ctl.challenge_ack_ts,
12071                                           &rack->r_ctl.challenge_ack_cnt));
12072         /*
12073          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12074          * synchronized state.
12075          */
12076         if (thflags & TH_SYN) {
12077                 ctf_challenge_ack(m, th, tp, &ret_val);
12078                 return (ret_val);
12079         }
12080         /*
12081          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12082          * it's less than ts_recent, drop it.
12083          */
12084         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12085             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12086                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12087                         return (ret_val);
12088         }
12089         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12090                               &rack->r_ctl.challenge_ack_ts,
12091                               &rack->r_ctl.challenge_ack_cnt)) {
12092                 return (ret_val);
12093         }
12094         /*
12095          * If new data are received on a connection after the user processes
12096          * are gone, then RST the other end.
12097          */
12098         if ((tp->t_flags & TF_CLOSED) && tlen &&
12099             rack_check_data_after_close(m, tp, &tlen, th, so))
12100                 return (1);
12101         /*
12102          * If last ACK falls within this segment's sequence numbers, record
12103          * its timestamp. NOTE: 1) That the test incorporates suggestions
12104          * from the latest proposal of the tcplw@cray.com list (Braden
12105          * 1993/04/26). 2) That updating only on newer timestamps interferes
12106          * with our earlier PAWS tests, so this check should be solely
12107          * predicated on the sequence space of this segment. 3) That we
12108          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12109          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12110          * SEG.Len, This modified check allows us to overcome RFC1323's
12111          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12112          * p.869. In such cases, we can still calculate the RTT correctly
12113          * when RCV.NXT == Last.ACK.Sent.
12114          */
12115         if ((to->to_flags & TOF_TS) != 0 &&
12116             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12117             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12118             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12119                 tp->ts_recent_age = tcp_ts_getticks();
12120                 tp->ts_recent = to->to_tsval;
12121         }
12122         /*
12123          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12124          * is on (half-synchronized state), then queue data for later
12125          * processing; else drop segment and return.
12126          */
12127         if ((thflags & TH_ACK) == 0) {
12128                 if (tp->t_flags & TF_NEEDSYN) {
12129                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12130                             tiwin, thflags, nxt_pkt));
12131                 } else if (tp->t_flags & TF_ACKNOW) {
12132                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12133                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12134                         return (ret_val);
12135                 } else {
12136                         ctf_do_drop(m, NULL);
12137                         return (0);
12138                 }
12139         }
12140         /*
12141          * case TCPS_LAST_ACK: Ack processing.
12142          */
12143         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12144                 return (ret_val);
12145         }
12146         if (ourfinisacked) {
12147                 tp = tcp_close(tp);
12148                 ctf_do_drop(m, tp);
12149                 return (1);
12150         }
12151         if (sbavail(&so->so_snd)) {
12152                 if (ctf_progress_timeout_check(tp, true)) {
12153                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12154                                                 tp, tick, PROGRESS_DROP, __LINE__);
12155                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12156                         return (1);
12157                 }
12158         }
12159         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12160             tiwin, thflags, nxt_pkt));
12161 }
12162
12163 /*
12164  * Return value of 1, the TCB is unlocked and most
12165  * likely gone, return value of 0, the TCB is still
12166  * locked.
12167  */
12168 static int
12169 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
12170     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12171     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12172 {
12173         int32_t ret_val = 0;
12174         int32_t ourfinisacked = 0;
12175         struct tcp_rack *rack;
12176
12177         rack = (struct tcp_rack *)tp->t_fb_ptr;
12178         ctf_calc_rwin(so, tp);
12179
12180         /* Reset receive buffer auto scaling when not in bulk receive mode. */
12181         if ((thflags & TH_RST) ||
12182             (tp->t_fin_is_rst && (thflags & TH_FIN)))
12183                 return (__ctf_process_rst(m, th, so, tp,
12184                                           &rack->r_ctl.challenge_ack_ts,
12185                                           &rack->r_ctl.challenge_ack_cnt));
12186         /*
12187          * RFC5961 Section 4.2 Send challenge ACK for any SYN in
12188          * synchronized state.
12189          */
12190         if (thflags & TH_SYN) {
12191                 ctf_challenge_ack(m, th, tp, &ret_val);
12192                 return (ret_val);
12193         }
12194         /*
12195          * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12196          * it's less than ts_recent, drop it.
12197          */
12198         if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12199             TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12200                 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12201                         return (ret_val);
12202         }
12203         if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
12204                               &rack->r_ctl.challenge_ack_ts,
12205                               &rack->r_ctl.challenge_ack_cnt)) {
12206                 return (ret_val);
12207         }
12208         /*
12209          * If new data are received on a connection after the user processes
12210          * are gone, then RST the other end.
12211          */
12212         if ((tp->t_flags & TF_CLOSED) && tlen &&
12213             rack_check_data_after_close(m, tp, &tlen, th, so))
12214                 return (1);
12215         /*
12216          * If last ACK falls within this segment's sequence numbers, record
12217          * its timestamp. NOTE: 1) That the test incorporates suggestions
12218          * from the latest proposal of the tcplw@cray.com list (Braden
12219          * 1993/04/26). 2) That updating only on newer timestamps interferes
12220          * with our earlier PAWS tests, so this check should be solely
12221          * predicated on the sequence space of this segment. 3) That we
12222          * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12223          * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12224          * SEG.Len, This modified check allows us to overcome RFC1323's
12225          * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12226          * p.869. In such cases, we can still calculate the RTT correctly
12227          * when RCV.NXT == Last.ACK.Sent.
12228          */
12229         if ((to->to_flags & TOF_TS) != 0 &&
12230             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12231             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12232             ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12233                 tp->ts_recent_age = tcp_ts_getticks();
12234                 tp->ts_recent = to->to_tsval;
12235         }
12236         /*
12237          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
12238          * is on (half-synchronized state), then queue data for later
12239          * processing; else drop segment and return.
12240          */
12241         if ((thflags & TH_ACK) == 0) {
12242                 if (tp->t_flags & TF_NEEDSYN) {
12243                         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12244                             tiwin, thflags, nxt_pkt));
12245                 } else if (tp->t_flags & TF_ACKNOW) {
12246                         ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
12247                         ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
12248                         return (ret_val);
12249                 } else {
12250                         ctf_do_drop(m, NULL);
12251                         return (0);
12252                 }
12253         }
12254         /*
12255          * Ack processing.
12256          */
12257         if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
12258                 return (ret_val);
12259         }
12260         if (sbavail(&so->so_snd)) {
12261                 if (ctf_progress_timeout_check(tp, true)) {
12262                         rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
12263                                                 tp, tick, PROGRESS_DROP, __LINE__);
12264                         ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
12265                         return (1);
12266                 }
12267         }
12268         return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12269             tiwin, thflags, nxt_pkt));
12270 }
12271
12272 static void inline
12273 rack_clear_rate_sample(struct tcp_rack *rack)
12274 {
12275         rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
12276         rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
12277         rack->r_ctl.rack_rs.rs_rtt_tot = 0;
12278 }
12279
12280 static void
12281 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
12282 {
12283         uint64_t bw_est, rate_wanted;
12284         int chged = 0;
12285         uint32_t user_max, orig_min, orig_max;
12286
12287         orig_min = rack->r_ctl.rc_pace_min_segs;
12288         orig_max = rack->r_ctl.rc_pace_max_segs;
12289         user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
12290         if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
12291                 chged = 1;
12292         rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
12293         if (rack->use_fixed_rate || rack->rc_force_max_seg) {
12294                 if (user_max != rack->r_ctl.rc_pace_max_segs)
12295                         chged = 1;
12296         }
12297         if (rack->rc_force_max_seg) {
12298                 rack->r_ctl.rc_pace_max_segs = user_max;
12299         } else if (rack->use_fixed_rate) {
12300                 bw_est = rack_get_bw(rack);
12301                 if ((rack->r_ctl.crte == NULL) ||
12302                     (bw_est != rack->r_ctl.crte->rate)) {
12303                         rack->r_ctl.rc_pace_max_segs = user_max;
12304                 } else {
12305                         /* We are pacing right at the hardware rate */
12306                         uint32_t segsiz;
12307
12308                         segsiz = min(ctf_fixed_maxseg(tp),
12309                                      rack->r_ctl.rc_pace_min_segs);
12310                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
12311                                                            tp, bw_est, segsiz, 0,
12312                                                            rack->r_ctl.crte, NULL);
12313                 }
12314         } else if (rack->rc_always_pace) {
12315                 if (rack->r_ctl.gp_bw ||
12316 #ifdef NETFLIX_PEAKRATE
12317                     rack->rc_tp->t_maxpeakrate ||
12318 #endif
12319                     rack->r_ctl.init_rate) {
12320                         /* We have a rate of some sort set */
12321                         uint32_t  orig;
12322
12323                         bw_est = rack_get_bw(rack);
12324                         orig = rack->r_ctl.rc_pace_max_segs;
12325                         if (fill_override)
12326                                 rate_wanted = *fill_override;
12327                         else
12328                                 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
12329                         if (rate_wanted) {
12330                                 /* We have something */
12331                                 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
12332                                                                                    rate_wanted,
12333                                                                                    ctf_fixed_maxseg(rack->rc_tp));
12334                         } else
12335                                 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
12336                         if (orig != rack->r_ctl.rc_pace_max_segs)
12337                                 chged = 1;
12338                 } else if ((rack->r_ctl.gp_bw == 0) &&
12339                            (rack->r_ctl.rc_pace_max_segs == 0)) {
12340                         /*
12341                          * If we have nothing limit us to bursting
12342                          * out IW sized pieces.
12343                          */
12344                         chged = 1;
12345                         rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
12346                 }
12347         }
12348         if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
12349                 chged = 1;
12350                 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
12351         }
12352         if (chged)
12353                 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
12354 }
12355
12356
12357 static void
12358 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
12359 {
12360 #ifdef INET6
12361         struct ip6_hdr *ip6 = NULL;
12362 #endif
12363 #ifdef INET
12364         struct ip *ip = NULL;
12365 #endif
12366         struct udphdr *udp = NULL;
12367
12368         /* Ok lets fill in the fast block, it can only be used with no IP options! */
12369 #ifdef INET6
12370         if (rack->r_is_v6) {
12371                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
12372                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
12373                 if (tp->t_port) {
12374                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12375                         udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
12376                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12377                         udp->uh_dport = tp->t_port;
12378                         rack->r_ctl.fsb.udp = udp;
12379                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12380                 } else
12381                 {
12382                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
12383                         rack->r_ctl.fsb.udp = NULL;
12384                 }
12385                 tcpip_fillheaders(rack->rc_inp,
12386                                   tp->t_port,
12387                                   ip6, rack->r_ctl.fsb.th);
12388         } else
12389 #endif                          /* INET6 */
12390         {
12391                 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
12392                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
12393                 if (tp->t_port) {
12394                         rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
12395                         udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
12396                         udp->uh_sport = htons(V_tcp_udp_tunneling_port);
12397                         udp->uh_dport = tp->t_port;
12398                         rack->r_ctl.fsb.udp = udp;
12399                         rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
12400                 } else
12401                 {
12402                         rack->r_ctl.fsb.udp = NULL;
12403                         rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
12404                 }
12405                 tcpip_fillheaders(rack->rc_inp,
12406                                   tp->t_port,
12407                                   ip, rack->r_ctl.fsb.th);
12408         }
12409         rack->r_fsb_inited = 1;
12410 }
12411
12412 static int
12413 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
12414 {
12415         /*
12416          * Allocate the larger of spaces V6 if available else just
12417          * V4 and include udphdr (overbook)
12418          */
12419 #ifdef INET6
12420         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
12421 #else
12422         rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
12423 #endif
12424         rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
12425                                             M_TCPFSB, M_NOWAIT|M_ZERO);
12426         if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
12427                 return (ENOMEM);
12428         }
12429         rack->r_fsb_inited = 0;
12430         return (0);
12431 }
12432
12433 static int
12434 rack_init(struct tcpcb *tp)
12435 {
12436         struct tcp_rack *rack = NULL;
12437 #ifdef INVARIANTS
12438         struct rack_sendmap *insret;
12439 #endif
12440         uint32_t iwin, snt, us_cts;
12441         int err;
12442
12443         tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
12444         if (tp->t_fb_ptr == NULL) {
12445                 /*
12446                  * We need to allocate memory but cant. The INP and INP_INFO
12447                  * locks and they are recursive (happens during setup. So a
12448                  * scheme to drop the locks fails :(
12449                  *
12450                  */
12451                 return (ENOMEM);
12452         }
12453         memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
12454
12455         rack = (struct tcp_rack *)tp->t_fb_ptr;
12456         RB_INIT(&rack->r_ctl.rc_mtree);
12457         TAILQ_INIT(&rack->r_ctl.rc_free);
12458         TAILQ_INIT(&rack->r_ctl.rc_tmap);
12459         rack->rc_tp = tp;
12460         rack->rc_inp = tp->t_inpcb;
12461         /* Set the flag */
12462         rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
12463         /* Probably not needed but lets be sure */
12464         rack_clear_rate_sample(rack);
12465         /*
12466          * Save off the default values, socket options will poke
12467          * at these if pacing is not on or we have not yet
12468          * reached where pacing is on (gp_ready/fixed enabled).
12469          * When they get set into the CC module (when gp_ready
12470          * is enabled or we enable fixed) then we will set these
12471          * values into the CC and place in here the old values
12472          * so we have a restoral. Then we will set the flag
12473          * rc_pacing_cc_set. That way whenever we turn off pacing
12474          * or switch off this stack, we will know to go restore
12475          * the saved values.
12476          */
12477         rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
12478         rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
12479         /* We want abe like behavior as well */
12480         rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
12481         rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
12482         rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
12483         rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
12484         rack->r_ctl.roundends = tp->snd_max;
12485         if (use_rack_rr)
12486                 rack->use_rack_rr = 1;
12487         if (V_tcp_delack_enabled)
12488                 tp->t_delayed_ack = 1;
12489         else
12490                 tp->t_delayed_ack = 0;
12491 #ifdef TCP_ACCOUNTING
12492         if (rack_tcp_accounting) {
12493                 tp->t_flags2 |= TF2_TCP_ACCOUNTING;
12494         }
12495 #endif
12496         if (rack_enable_shared_cwnd)
12497                 rack->rack_enable_scwnd = 1;
12498         rack->rc_user_set_max_segs = rack_hptsi_segments;
12499         rack->rc_force_max_seg = 0;
12500         if (rack_use_imac_dack)
12501                 rack->rc_dack_mode = 1;
12502         TAILQ_INIT(&rack->r_ctl.opt_list);
12503         rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
12504         rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
12505         rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
12506         rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
12507         rack->r_ctl.rc_highest_us_rtt = 0;
12508         rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
12509         rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
12510         if (rack_use_cmp_acks)
12511                 rack->r_use_cmp_ack = 1;
12512         if (rack_disable_prr)
12513                 rack->rack_no_prr = 1;
12514         if (rack_gp_no_rec_chg)
12515                 rack->rc_gp_no_rec_chg = 1;
12516         if (rack_pace_every_seg && tcp_can_enable_pacing()) {
12517                 rack->rc_always_pace = 1;
12518                 if (rack->use_fixed_rate || rack->gp_ready)
12519                         rack_set_cc_pacing(rack);
12520         } else
12521                 rack->rc_always_pace = 0;
12522         if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
12523                 rack->r_mbuf_queue = 1;
12524         else
12525                 rack->r_mbuf_queue = 0;
12526         if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
12527                 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
12528         else
12529                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
12530         rack_set_pace_segments(tp, rack, __LINE__, NULL);
12531         if (rack_limits_scwnd)
12532                 rack->r_limit_scw = 1;
12533         else
12534                 rack->r_limit_scw = 0;
12535         rack->rc_labc = V_tcp_abc_l_var;
12536         rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
12537         rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
12538         rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
12539         rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
12540         rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
12541         rack->r_ctl.rc_min_to = rack_min_to;
12542         microuptime(&rack->r_ctl.act_rcv_time);
12543         rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
12544         rack->rc_init_win = rack_default_init_window;
12545         rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
12546         if (rack_hw_up_only)
12547                 rack->r_up_only = 1;
12548         if (rack_do_dyn_mul) {
12549                 /* When dynamic adjustment is on CA needs to start at 100% */
12550                 rack->rc_gp_dyn_mul = 1;
12551                 if (rack_do_dyn_mul >= 100)
12552                         rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
12553         } else
12554                 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
12555         rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
12556         rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
12557         rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
12558         setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
12559                                 rack_probertt_filter_life);
12560         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
12561         rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
12562         rack->r_ctl.rc_time_of_last_probertt = us_cts;
12563         rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
12564         rack->r_ctl.rc_time_probertt_starts = 0;
12565         if (rack_dsack_std_based & 0x1) {
12566                 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
12567                 rack->rc_rack_tmr_std_based = 1;
12568         }
12569         if (rack_dsack_std_based & 0x2) {
12570                 /* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
12571                 rack->rc_rack_use_dsack = 1;
12572         }
12573         /* We require at least one measurement, even if the sysctl is 0 */
12574         if (rack_req_measurements)
12575                 rack->r_ctl.req_measurements = rack_req_measurements;
12576         else
12577                 rack->r_ctl.req_measurements = 1;
12578         if (rack_enable_hw_pacing)
12579                 rack->rack_hdw_pace_ena = 1;
12580         if (rack_hw_rate_caps)
12581                 rack->r_rack_hw_rate_caps = 1;
12582         /* Do we force on detection? */
12583 #ifdef NETFLIX_EXP_DETECTION
12584         if (tcp_force_detection)
12585                 rack->do_detection = 1;
12586         else
12587 #endif
12588                 rack->do_detection = 0;
12589         if (rack_non_rxt_use_cr)
12590                 rack->rack_rec_nonrxt_use_cr = 1;
12591         err = rack_init_fsb(tp, rack);
12592         if (err) {
12593                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12594                 tp->t_fb_ptr = NULL;
12595                 return (err);
12596         }
12597         if (tp->snd_una != tp->snd_max) {
12598                 /* Create a send map for the current outstanding data */
12599                 struct rack_sendmap *rsm;
12600
12601                 rsm = rack_alloc(rack);
12602                 if (rsm == NULL) {
12603                         uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12604                         tp->t_fb_ptr = NULL;
12605                         return (ENOMEM);
12606                 }
12607                 rsm->r_no_rtt_allowed = 1;
12608                 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
12609                 rsm->r_rtr_cnt = 1;
12610                 rsm->r_rtr_bytes = 0;
12611                 if (tp->t_flags & TF_SENTFIN)
12612                         rsm->r_flags |= RACK_HAS_FIN;
12613                 rsm->r_end = tp->snd_max;
12614                 if (tp->snd_una == tp->iss) {
12615                         /* The data space is one beyond snd_una */
12616                         rsm->r_flags |= RACK_HAS_SYN;
12617                         rsm->r_start = tp->iss;
12618                         rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
12619                 } else
12620                         rsm->r_start = tp->snd_una;
12621                 rsm->r_dupack = 0;
12622                 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
12623                         rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
12624                         if (rsm->m)
12625                                 rsm->orig_m_len = rsm->m->m_len;
12626                         else
12627                                 rsm->orig_m_len = 0;
12628                 } else {
12629                         /*
12630                          * This can happen if we have a stand-alone FIN or
12631                          *  SYN.
12632                          */
12633                         rsm->m = NULL;
12634                         rsm->orig_m_len = 0;
12635                         rsm->soff = 0;
12636                 }
12637 #ifndef INVARIANTS
12638                 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12639 #else
12640                 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12641                 if (insret != NULL) {
12642                         panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
12643                               insret, rack, rsm);
12644                 }
12645 #endif
12646                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
12647                 rsm->r_in_tmap = 1;
12648         }
12649         /*
12650          * Timers in Rack are kept in microseconds so lets
12651          * convert any initial incoming variables
12652          * from ticks into usecs. Note that we
12653          * also change the values of t_srtt and t_rttvar, if
12654          * they are non-zero. They are kept with a 5
12655          * bit decimal so we have to carefully convert
12656          * these to get the full precision.
12657          */
12658         rack_convert_rtts(tp);
12659         tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
12660         if (rack_do_hystart) {
12661                 tp->ccv->flags |= CCF_HYSTART_ALLOWED;
12662                 if (rack_do_hystart > 1)
12663                         tp->ccv->flags |= CCF_HYSTART_CAN_SH_CWND;
12664                 if (rack_do_hystart > 2)
12665                         tp->ccv->flags |= CCF_HYSTART_CONS_SSTH;
12666         }
12667         if (rack_def_profile)
12668                 rack_set_profile(rack, rack_def_profile);
12669         /* Cancel the GP measurement in progress */
12670         tp->t_flags &= ~TF_GPUTINPROG;
12671         if (SEQ_GT(tp->snd_max, tp->iss))
12672                 snt = tp->snd_max - tp->iss;
12673         else
12674                 snt = 0;
12675         iwin = rc_init_window(rack);
12676         if (snt < iwin) {
12677                 /* We are not past the initial window
12678                  * so we need to make sure cwnd is
12679                  * correct.
12680                  */
12681                 if (tp->snd_cwnd < iwin)
12682                         tp->snd_cwnd = iwin;
12683                 /*
12684                  * If we are within the initial window
12685                  * we want ssthresh to be unlimited. Setting
12686                  * it to the rwnd (which the default stack does
12687                  * and older racks) is not really a good idea
12688                  * since we want to be in SS and grow both the
12689                  * cwnd and the rwnd (via dynamic rwnd growth). If
12690                  * we set it to the rwnd then as the peer grows its
12691                  * rwnd we will be stuck in CA and never hit SS.
12692                  *
12693                  * Its far better to raise it up high (this takes the
12694                  * risk that there as been a loss already, probably
12695                  * we should have an indicator in all stacks of loss
12696                  * but we don't), but considering the normal use this
12697                  * is a risk worth taking. The consequences of not
12698                  * hitting SS are far worse than going one more time
12699                  * into it early on (before we have sent even a IW).
12700                  * It is highly unlikely that we will have had a loss
12701                  * before getting the IW out.
12702                  */
12703                 tp->snd_ssthresh = 0xffffffff;
12704         }
12705         rack_stop_all_timers(tp);
12706         /* Lets setup the fsb block */
12707         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
12708         rack_log_rtt_shrinks(rack,  us_cts,  tp->t_rxtcur,
12709                              __LINE__, RACK_RTTS_INIT);
12710         return (0);
12711 }
12712
12713 static int
12714 rack_handoff_ok(struct tcpcb *tp)
12715 {
12716         if ((tp->t_state == TCPS_CLOSED) ||
12717             (tp->t_state == TCPS_LISTEN)) {
12718                 /* Sure no problem though it may not stick */
12719                 return (0);
12720         }
12721         if ((tp->t_state == TCPS_SYN_SENT) ||
12722             (tp->t_state == TCPS_SYN_RECEIVED)) {
12723                 /*
12724                  * We really don't know if you support sack,
12725                  * you have to get to ESTAB or beyond to tell.
12726                  */
12727                 return (EAGAIN);
12728         }
12729         if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
12730                 /*
12731                  * Rack will only send a FIN after all data is acknowledged.
12732                  * So in this case we have more data outstanding. We can't
12733                  * switch stacks until either all data and only the FIN
12734                  * is left (in which case rack_init() now knows how
12735                  * to deal with that) <or> all is acknowledged and we
12736                  * are only left with incoming data, though why you
12737                  * would want to switch to rack after all data is acknowledged
12738                  * I have no idea (rrs)!
12739                  */
12740                 return (EAGAIN);
12741         }
12742         if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
12743                 return (0);
12744         }
12745         /*
12746          * If we reach here we don't do SACK on this connection so we can
12747          * never do rack.
12748          */
12749         return (EINVAL);
12750 }
12751
12752
12753 static void
12754 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
12755 {
12756         if (tp->t_fb_ptr) {
12757                 struct tcp_rack *rack;
12758                 struct rack_sendmap *rsm, *nrsm;
12759 #ifdef INVARIANTS
12760                 struct rack_sendmap *rm;
12761 #endif
12762
12763                 rack = (struct tcp_rack *)tp->t_fb_ptr;
12764                 if (tp->t_in_pkt) {
12765                         /*
12766                          * It is unsafe to process the packets since a
12767                          * reset may be lurking in them (its rare but it
12768                          * can occur). If we were to find a RST, then we
12769                          * would end up dropping the connection and the
12770                          * INP lock, so when we return the caller (tcp_usrreq)
12771                          * will blow up when it trys to unlock the inp.
12772                          */
12773                         struct mbuf *save, *m;
12774
12775                         m = tp->t_in_pkt;
12776                         tp->t_in_pkt = NULL;
12777                         tp->t_tail_pkt = NULL;
12778                         while (m) {
12779                                 save = m->m_nextpkt;
12780                                 m->m_nextpkt = NULL;
12781                                 m_freem(m);
12782                                 m = save;
12783                         }
12784                 }
12785                 tp->t_flags &= ~TF_FORCEDATA;
12786 #ifdef NETFLIX_SHARED_CWND
12787                 if (rack->r_ctl.rc_scw) {
12788                         uint32_t limit;
12789
12790                         if (rack->r_limit_scw)
12791                                 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
12792                         else
12793                                 limit = 0;
12794                         tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
12795                                                   rack->r_ctl.rc_scw_index,
12796                                                   limit);
12797                         rack->r_ctl.rc_scw = NULL;
12798                 }
12799 #endif
12800                 if (rack->r_ctl.fsb.tcp_ip_hdr) {
12801                         free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
12802                         rack->r_ctl.fsb.tcp_ip_hdr = NULL;
12803                         rack->r_ctl.fsb.th = NULL;
12804                 }
12805                 /* Convert back to ticks, with  */
12806                 if (tp->t_srtt > 1) {
12807                         uint32_t val, frac;
12808
12809                         val = USEC_2_TICKS(tp->t_srtt);
12810                         frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
12811                         tp->t_srtt = val << TCP_RTT_SHIFT;
12812                         /*
12813                          * frac is the fractional part here is left
12814                          * over from converting to hz and shifting.
12815                          * We need to convert this to the 5 bit
12816                          * remainder.
12817                          */
12818                         if (frac) {
12819                                 if (hz == 1000) {
12820                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
12821                                 } else {
12822                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
12823                                 }
12824                                 tp->t_srtt += frac;
12825                         }
12826                 }
12827                 if (tp->t_rttvar) {
12828                         uint32_t val, frac;
12829
12830                         val = USEC_2_TICKS(tp->t_rttvar);
12831                         frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
12832                         tp->t_rttvar = val <<  TCP_RTTVAR_SHIFT;
12833                         /*
12834                          * frac is the fractional part here is left
12835                          * over from converting to hz and shifting.
12836                          * We need to convert this to the 5 bit
12837                          * remainder.
12838                          */
12839                         if (frac) {
12840                                 if (hz == 1000) {
12841                                         frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
12842                                 } else {
12843                                         frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
12844                                 }
12845                                 tp->t_rttvar += frac;
12846                         }
12847                 }
12848                 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
12849                 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
12850                 if (rack->rc_always_pace) {
12851                         tcp_decrement_paced_conn();
12852                         rack_undo_cc_pacing(rack);
12853                         rack->rc_always_pace = 0;
12854                 }
12855                 /* Clean up any options if they were not applied */
12856                 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
12857                         struct deferred_opt_list *dol;
12858
12859                         dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
12860                         TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
12861                         free(dol, M_TCPDO);
12862                 }
12863                 /* rack does not use force data but other stacks may clear it */
12864                 if (rack->r_ctl.crte != NULL) {
12865                         tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
12866                         rack->rack_hdrw_pacing = 0;
12867                         rack->r_ctl.crte = NULL;
12868                 }
12869 #ifdef TCP_BLACKBOX
12870                 tcp_log_flowend(tp);
12871 #endif
12872                 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
12873 #ifndef INVARIANTS
12874                         (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12875 #else
12876                         rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
12877                         if (rm != rsm) {
12878                                 panic("At fini, rack:%p rsm:%p rm:%p",
12879                                       rack, rsm, rm);
12880                         }
12881 #endif
12882                         uma_zfree(rack_zone, rsm);
12883                 }
12884                 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
12885                 while (rsm) {
12886                         TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
12887                         uma_zfree(rack_zone, rsm);
12888                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
12889                 }
12890                 rack->rc_free_cnt = 0;
12891                 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
12892                 tp->t_fb_ptr = NULL;
12893         }
12894         if (tp->t_inpcb) {
12895                 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
12896                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
12897                 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
12898                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP;
12899                 /* Cancel the GP measurement in progress */
12900                 tp->t_flags &= ~TF_GPUTINPROG;
12901                 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS;
12902         }
12903         /* Make sure snd_nxt is correctly set */
12904         tp->snd_nxt = tp->snd_max;
12905 }
12906
12907 static void
12908 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
12909 {
12910         if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
12911                 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
12912         }
12913         switch (tp->t_state) {
12914         case TCPS_SYN_SENT:
12915                 rack->r_state = TCPS_SYN_SENT;
12916                 rack->r_substate = rack_do_syn_sent;
12917                 break;
12918         case TCPS_SYN_RECEIVED:
12919                 rack->r_state = TCPS_SYN_RECEIVED;
12920                 rack->r_substate = rack_do_syn_recv;
12921                 break;
12922         case TCPS_ESTABLISHED:
12923                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12924                 rack->r_state = TCPS_ESTABLISHED;
12925                 rack->r_substate = rack_do_established;
12926                 break;
12927         case TCPS_CLOSE_WAIT:
12928                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12929                 rack->r_state = TCPS_CLOSE_WAIT;
12930                 rack->r_substate = rack_do_close_wait;
12931                 break;
12932         case TCPS_FIN_WAIT_1:
12933                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12934                 rack->r_state = TCPS_FIN_WAIT_1;
12935                 rack->r_substate = rack_do_fin_wait_1;
12936                 break;
12937         case TCPS_CLOSING:
12938                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12939                 rack->r_state = TCPS_CLOSING;
12940                 rack->r_substate = rack_do_closing;
12941                 break;
12942         case TCPS_LAST_ACK:
12943                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12944                 rack->r_state = TCPS_LAST_ACK;
12945                 rack->r_substate = rack_do_lastack;
12946                 break;
12947         case TCPS_FIN_WAIT_2:
12948                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
12949                 rack->r_state = TCPS_FIN_WAIT_2;
12950                 rack->r_substate = rack_do_fin_wait_2;
12951                 break;
12952         case TCPS_LISTEN:
12953         case TCPS_CLOSED:
12954         case TCPS_TIME_WAIT:
12955         default:
12956                 break;
12957         };
12958         if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
12959                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
12960
12961 }
12962
12963 static void
12964 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
12965 {
12966         /*
12967          * We received an ack, and then did not
12968          * call send or were bounced out due to the
12969          * hpts was running. Now a timer is up as well, is
12970          * it the right timer?
12971          */
12972         struct rack_sendmap *rsm;
12973         int tmr_up;
12974
12975         tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
12976         if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
12977                 return;
12978         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
12979         if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
12980             (tmr_up == PACE_TMR_RXT)) {
12981                 /* Should be an RXT */
12982                 return;
12983         }
12984         if (rsm == NULL) {
12985                 /* Nothing outstanding? */
12986                 if (tp->t_flags & TF_DELACK) {
12987                         if (tmr_up == PACE_TMR_DELACK)
12988                                 /* We are supposed to have delayed ack up and we do */
12989                                 return;
12990                 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
12991                         /*
12992                          * if we hit enobufs then we would expect the possibility
12993                          * of nothing outstanding and the RXT up (and the hptsi timer).
12994                          */
12995                         return;
12996                 } else if (((V_tcp_always_keepalive ||
12997                              rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
12998                             (tp->t_state <= TCPS_CLOSING)) &&
12999                            (tmr_up == PACE_TMR_KEEP) &&
13000                            (tp->snd_max == tp->snd_una)) {
13001                         /* We should have keep alive up and we do */
13002                         return;
13003                 }
13004         }
13005         if (SEQ_GT(tp->snd_max, tp->snd_una) &&
13006                    ((tmr_up == PACE_TMR_TLP) ||
13007                     (tmr_up == PACE_TMR_RACK) ||
13008                     (tmr_up == PACE_TMR_RXT))) {
13009                 /*
13010                  * Either a Rack, TLP or RXT is fine if  we
13011                  * have outstanding data.
13012                  */
13013                 return;
13014         } else if (tmr_up == PACE_TMR_DELACK) {
13015                 /*
13016                  * If the delayed ack was going to go off
13017                  * before the rtx/tlp/rack timer were going to
13018                  * expire, then that would be the timer in control.
13019                  * Note we don't check the time here trusting the
13020                  * code is correct.
13021                  */
13022                 return;
13023         }
13024         /*
13025          * Ok the timer originally started is not what we want now.
13026          * We will force the hpts to be stopped if any, and restart
13027          * with the slot set to what was in the saved slot.
13028          */
13029         if (tcp_in_hpts(rack->rc_inp)) {
13030                 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
13031                         uint32_t us_cts;
13032
13033                         us_cts = tcp_get_usecs(NULL);
13034                         if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
13035                                 rack->r_early = 1;
13036                                 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
13037                         }
13038                         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
13039                 }
13040                 tcp_hpts_remove(tp->t_inpcb);
13041         }
13042         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13043         rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
13044 }
13045
13046
13047 static void
13048 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
13049 {
13050         if ((SEQ_LT(tp->snd_wl1, seq) ||
13051             (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
13052             (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
13053                 /* keep track of pure window updates */
13054                 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
13055                         KMOD_TCPSTAT_INC(tcps_rcvwinupd);
13056                 tp->snd_wnd = tiwin;
13057                 rack_validate_fo_sendwin_up(tp, rack);
13058                 tp->snd_wl1 = seq;
13059                 tp->snd_wl2 = ack;
13060                 if (tp->snd_wnd > tp->max_sndwnd)
13061                         tp->max_sndwnd = tp->snd_wnd;
13062             rack->r_wanted_output = 1;
13063         } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
13064                 tp->snd_wnd = tiwin;
13065                 rack_validate_fo_sendwin_up(tp, rack);
13066                 tp->snd_wl1 = seq;
13067                 tp->snd_wl2 = ack;
13068         } else {
13069                 /* Not a valid win update */
13070                 return;
13071         }
13072         if (tp->snd_wnd > tp->max_sndwnd)
13073                 tp->max_sndwnd = tp->snd_wnd;
13074         if (tp->snd_wnd < (tp->snd_max - high_seq)) {
13075                 /* The peer collapsed the window */
13076                 rack_collapsed_window(rack);
13077         } else if (rack->rc_has_collapsed)
13078                 rack_un_collapse_window(rack);
13079         /* Do we exit persists? */
13080         if ((rack->rc_in_persist != 0) &&
13081             (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
13082                                 rack->r_ctl.rc_pace_min_segs))) {
13083                 rack_exit_persist(tp, rack, cts);
13084         }
13085         /* Do we enter persists? */
13086         if ((rack->rc_in_persist == 0) &&
13087             (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
13088             TCPS_HAVEESTABLISHED(tp->t_state) &&
13089             ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
13090             sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
13091             (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
13092                 /*
13093                  * Here the rwnd is less than
13094                  * the pacing size, we are established,
13095                  * nothing is outstanding, and there is
13096                  * data to send. Enter persists.
13097                  */
13098                 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
13099         }
13100 }
13101
13102 static void
13103 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
13104 {
13105
13106         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13107                 union tcp_log_stackspecific log;
13108                 struct timeval ltv;
13109                 char tcp_hdr_buf[60];
13110                 struct tcphdr *th;
13111                 struct timespec ts;
13112                 uint32_t orig_snd_una;
13113                 uint8_t xx = 0;
13114
13115 #ifdef NETFLIX_HTTP_LOGGING
13116                 struct http_sendfile_track *http_req;
13117
13118                 if (SEQ_GT(ae->ack, tp->snd_una)) {
13119                         http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
13120                 } else {
13121                         http_req = tcp_http_find_req_for_seq(tp, ae->ack);
13122                 }
13123 #endif
13124                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13125                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
13126                 if (rack->rack_no_prr == 0)
13127                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
13128                 else
13129                         log.u_bbr.flex1 = 0;
13130                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
13131                 log.u_bbr.use_lt_bw <<= 1;
13132                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
13133                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
13134                 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
13135                 log.u_bbr.pkts_out = tp->t_maxseg;
13136                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
13137                 log.u_bbr.flex7 = 1;
13138                 log.u_bbr.lost = ae->flags;
13139                 log.u_bbr.cwnd_gain = ackval;
13140                 log.u_bbr.pacing_gain = 0x2;
13141                 if (ae->flags & TSTMP_HDWR) {
13142                         /* Record the hardware timestamp if present */
13143                         log.u_bbr.flex3 = M_TSTMP;
13144                         ts.tv_sec = ae->timestamp / 1000000000;
13145                         ts.tv_nsec = ae->timestamp % 1000000000;
13146                         ltv.tv_sec = ts.tv_sec;
13147                         ltv.tv_usec = ts.tv_nsec / 1000;
13148                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
13149                 } else if (ae->flags & TSTMP_LRO) {
13150                         /* Record the LRO the arrival timestamp */
13151                         log.u_bbr.flex3 = M_TSTMP_LRO;
13152                         ts.tv_sec = ae->timestamp / 1000000000;
13153                         ts.tv_nsec = ae->timestamp % 1000000000;
13154                         ltv.tv_sec = ts.tv_sec;
13155                         ltv.tv_usec = ts.tv_nsec / 1000;
13156                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
13157                 }
13158                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
13159                 /* Log the rcv time */
13160                 log.u_bbr.delRate = ae->timestamp;
13161 #ifdef NETFLIX_HTTP_LOGGING
13162                 log.u_bbr.applimited = tp->t_http_closed;
13163                 log.u_bbr.applimited <<= 8;
13164                 log.u_bbr.applimited |= tp->t_http_open;
13165                 log.u_bbr.applimited <<= 8;
13166                 log.u_bbr.applimited |= tp->t_http_req;
13167                 if (http_req) {
13168                         /* Copy out any client req info */
13169                         /* seconds */
13170                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
13171                         /* useconds */
13172                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
13173                         log.u_bbr.rttProp = http_req->timestamp;
13174                         log.u_bbr.cur_del_rate = http_req->start;
13175                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
13176                                 log.u_bbr.flex8 |= 1;
13177                         } else {
13178                                 log.u_bbr.flex8 |= 2;
13179                                 log.u_bbr.bw_inuse = http_req->end;
13180                         }
13181                         log.u_bbr.flex6 = http_req->start_seq;
13182                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
13183                                 log.u_bbr.flex8 |= 4;
13184                                 log.u_bbr.epoch = http_req->end_seq;
13185                         }
13186                 }
13187 #endif
13188                 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
13189                 th = (struct tcphdr *)tcp_hdr_buf;
13190                 th->th_seq = ae->seq;
13191                 th->th_ack = ae->ack;
13192                 th->th_win = ae->win;
13193                 /* Now fill in the ports */
13194                 th->th_sport = tp->t_inpcb->inp_fport;
13195                 th->th_dport = tp->t_inpcb->inp_lport;
13196                 tcp_set_flags(th, ae->flags);
13197                 /* Now do we have a timestamp option? */
13198                 if (ae->flags & HAS_TSTMP) {
13199                         u_char *cp;
13200                         uint32_t val;
13201
13202                         th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
13203                         cp = (u_char *)(th + 1);
13204                         *cp = TCPOPT_NOP;
13205                         cp++;
13206                         *cp = TCPOPT_NOP;
13207                         cp++;
13208                         *cp = TCPOPT_TIMESTAMP;
13209                         cp++;
13210                         *cp = TCPOLEN_TIMESTAMP;
13211                         cp++;
13212                         val = htonl(ae->ts_value);
13213                         bcopy((char *)&val,
13214                               (char *)cp, sizeof(uint32_t));
13215                         val = htonl(ae->ts_echo);
13216                         bcopy((char *)&val,
13217                               (char *)(cp + 4), sizeof(uint32_t));
13218                 } else
13219                         th->th_off = (sizeof(struct tcphdr) >> 2);
13220
13221                 /*
13222                  * For sane logging we need to play a little trick.
13223                  * If the ack were fully processed we would have moved
13224                  * snd_una to high_seq, but since compressed acks are
13225                  * processed in two phases, at this point (logging) snd_una
13226                  * won't be advanced. So we would see multiple acks showing
13227                  * the advancement. We can prevent that by "pretending" that
13228                  * snd_una was advanced and then un-advancing it so that the
13229                  * logging code has the right value for tlb_snd_una.
13230                  */
13231                 if (tp->snd_una != high_seq) {
13232                         orig_snd_una = tp->snd_una;
13233                         tp->snd_una = high_seq;
13234                         xx = 1;
13235                 } else
13236                         xx = 0;
13237                 TCP_LOG_EVENTP(tp, th,
13238                                &tp->t_inpcb->inp_socket->so_rcv,
13239                                &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
13240                                0, &log, true, &ltv);
13241                 if (xx) {
13242                         tp->snd_una = orig_snd_una;
13243                 }
13244         }
13245
13246 }
13247
13248 static void
13249 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
13250 {
13251         uint32_t us_rtt;
13252         /*
13253          * A persist or keep-alive was forced out, update our
13254          * min rtt time. Note now worry about lost responses.
13255          * When a subsequent keep-alive or persist times out
13256          * and forced_ack is still on, then the last probe
13257          * was not responded to. In such cases we have a
13258          * sysctl that controls the behavior. Either we apply
13259          * the rtt but with reduced confidence (0). Or we just
13260          * plain don't apply the rtt estimate. Having data flow
13261          * will clear the probe_not_answered flag i.e. cum-ack
13262          * move forward <or> exiting and reentering persists.
13263          */
13264
13265         rack->forced_ack = 0;
13266         rack->rc_tp->t_rxtshift = 0;
13267         if ((rack->rc_in_persist &&
13268              (tiwin == rack->rc_tp->snd_wnd)) ||
13269             (rack->rc_in_persist == 0)) {
13270                 /*
13271                  * In persists only apply the RTT update if this is
13272                  * a response to our window probe. And that
13273                  * means the rwnd sent must match the current
13274                  * snd_wnd. If it does not, then we got a
13275                  * window update ack instead. For keepalive
13276                  * we allow the answer no matter what the window.
13277                  *
13278                  * Note that if the probe_not_answered is set then
13279                  * the forced_ack_ts is the oldest one i.e. the first
13280                  * probe sent that might have been lost. This assures
13281                  * us that if we do calculate an RTT it is longer not
13282                  * some short thing.
13283                  */
13284                 if (rack->rc_in_persist)
13285                         counter_u64_add(rack_persists_acks, 1);
13286                 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
13287                 if (us_rtt == 0)
13288                         us_rtt = 1;
13289                 if (rack->probe_not_answered == 0) {
13290                         rack_apply_updated_usrtt(rack, us_rtt, us_cts);
13291                         tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
13292                 } else {
13293                         /* We have a retransmitted probe here too */
13294                         if (rack_apply_rtt_with_reduced_conf) {
13295                                 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
13296                                 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
13297                         }
13298                 }
13299         }
13300 }
13301
13302 static int
13303 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
13304 {
13305         /*
13306          * Handle a "special" compressed ack mbuf. Each incoming
13307          * ack has only four possible dispositions:
13308          *
13309          * A) It moves the cum-ack forward
13310          * B) It is behind the cum-ack.
13311          * C) It is a window-update ack.
13312          * D) It is a dup-ack.
13313          *
13314          * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
13315          * in the incoming mbuf. We also need to still pay attention
13316          * to nxt_pkt since there may be another packet after this
13317          * one.
13318          */
13319 #ifdef TCP_ACCOUNTING
13320         uint64_t ts_val;
13321         uint64_t rdstc;
13322 #endif
13323         int segsiz;
13324         struct timespec ts;
13325         struct tcp_rack *rack;
13326         struct tcp_ackent *ae;
13327         uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
13328         int cnt, i, did_out, ourfinisacked = 0;
13329         struct tcpopt to_holder, *to = NULL;
13330 #ifdef TCP_ACCOUNTING
13331         int win_up_req = 0;
13332 #endif
13333         int nsegs = 0;
13334         int under_pacing = 1;
13335         int recovery = 0;
13336 #ifdef TCP_ACCOUNTING
13337         sched_pin();
13338 #endif
13339         rack = (struct tcp_rack *)tp->t_fb_ptr;
13340         if (rack->gp_ready &&
13341             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
13342                 under_pacing = 0;
13343         else
13344                 under_pacing = 1;
13345
13346         if (rack->r_state != tp->t_state)
13347                 rack_set_state(tp, rack);
13348         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13349             (tp->t_flags & TF_GPUTINPROG)) {
13350                 /*
13351                  * We have a goodput in progress
13352                  * and we have entered a late state.
13353                  * Do we have enough data in the sb
13354                  * to handle the GPUT request?
13355                  */
13356                 uint32_t bytes;
13357
13358                 bytes = tp->gput_ack - tp->gput_seq;
13359                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
13360                         bytes += tp->gput_seq - tp->snd_una;
13361                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
13362                         /*
13363                          * There are not enough bytes in the socket
13364                          * buffer that have been sent to cover this
13365                          * measurement. Cancel it.
13366                          */
13367                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
13368                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
13369                                                    tp->gput_seq,
13370                                                    0, 0, 18, __LINE__, NULL, 0);
13371                         tp->t_flags &= ~TF_GPUTINPROG;
13372                 }
13373         }
13374         to = &to_holder;
13375         to->to_flags = 0;
13376         KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
13377                 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
13378         cnt = m->m_len / sizeof(struct tcp_ackent);
13379         counter_u64_add(rack_multi_single_eq, cnt);
13380         high_seq = tp->snd_una;
13381         the_win = tp->snd_wnd;
13382         win_seq = tp->snd_wl1;
13383         win_upd_ack = tp->snd_wl2;
13384         cts = tcp_tv_to_usectick(tv);
13385         ms_cts = tcp_tv_to_mssectick(tv);
13386         rack->r_ctl.rc_rcvtime = cts;
13387         segsiz = ctf_fixed_maxseg(tp);
13388         if ((rack->rc_gp_dyn_mul) &&
13389             (rack->use_fixed_rate == 0) &&
13390             (rack->rc_always_pace)) {
13391                 /* Check in on probertt */
13392                 rack_check_probe_rtt(rack, cts);
13393         }
13394         for (i = 0; i < cnt; i++) {
13395 #ifdef TCP_ACCOUNTING
13396                 ts_val = get_cyclecount();
13397 #endif
13398                 rack_clear_rate_sample(rack);
13399                 ae = ((mtod(m, struct tcp_ackent *)) + i);
13400                 /* Setup the window */
13401                 tiwin = ae->win << tp->snd_scale;
13402                 if (tiwin > rack->r_ctl.rc_high_rwnd)
13403                         rack->r_ctl.rc_high_rwnd = tiwin;
13404                 /* figure out the type of ack */
13405                 if (SEQ_LT(ae->ack, high_seq)) {
13406                         /* Case B*/
13407                         ae->ack_val_set = ACK_BEHIND;
13408                 } else if (SEQ_GT(ae->ack, high_seq)) {
13409                         /* Case A */
13410                         ae->ack_val_set = ACK_CUMACK;
13411                 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
13412                         /* Case D */
13413                         ae->ack_val_set = ACK_DUPACK;
13414                 } else {
13415                         /* Case C */
13416                         ae->ack_val_set = ACK_RWND;
13417                 }
13418                 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
13419                 /* Validate timestamp */
13420                 if (ae->flags & HAS_TSTMP) {
13421                         /* Setup for a timestamp */
13422                         to->to_flags = TOF_TS;
13423                         ae->ts_echo -= tp->ts_offset;
13424                         to->to_tsecr = ae->ts_echo;
13425                         to->to_tsval = ae->ts_value;
13426                         /*
13427                          * If echoed timestamp is later than the current time, fall back to
13428                          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
13429                          * were used when this connection was established.
13430                          */
13431                         if (TSTMP_GT(ae->ts_echo, ms_cts))
13432                                 to->to_tsecr = 0;
13433                         if (tp->ts_recent &&
13434                             TSTMP_LT(ae->ts_value, tp->ts_recent)) {
13435                                 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
13436 #ifdef TCP_ACCOUNTING
13437                                         rdstc = get_cyclecount();
13438                                         if (rdstc > ts_val) {
13439                                                 counter_u64_add(tcp_proc_time[ae->ack_val_set] ,
13440                                                                 (rdstc - ts_val));
13441                                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13442                                                         tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13443                                                 }
13444                                         }
13445 #endif
13446                                         continue;
13447                                 }
13448                         }
13449                         if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
13450                             SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
13451                                 tp->ts_recent_age = tcp_ts_getticks();
13452                                 tp->ts_recent = ae->ts_value;
13453                         }
13454                 } else {
13455                         /* Setup for a no options */
13456                         to->to_flags = 0;
13457                 }
13458                 /* Update the rcv time and perform idle reduction possibly */
13459                 if  (tp->t_idle_reduce &&
13460                      (tp->snd_max == tp->snd_una) &&
13461                      (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
13462                         counter_u64_add(rack_input_idle_reduces, 1);
13463                         rack_cc_after_idle(rack, tp);
13464                 }
13465                 tp->t_rcvtime = ticks;
13466                 /* Now what about ECN? */
13467                 if (tcp_ecn_input_segment(tp, ae->flags, ae->codepoint))
13468                         rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
13469 #ifdef TCP_ACCOUNTING
13470                 /* Count for the specific type of ack in */
13471                 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1);
13472                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13473                         tp->tcp_cnt_counters[ae->ack_val_set]++;
13474                 }
13475 #endif
13476                 /*
13477                  * Note how we could move up these in the determination
13478                  * above, but we don't so that way the timestamp checks (and ECN)
13479                  * is done first before we do any processing on the ACK.
13480                  * The non-compressed path through the code has this
13481                  * weakness (noted by @jtl) that it actually does some
13482                  * processing before verifying the timestamp information.
13483                  * We don't take that path here which is why we set
13484                  * the ack_val_set first, do the timestamp and ecn
13485                  * processing, and then look at what we have setup.
13486                  */
13487                 if (ae->ack_val_set == ACK_BEHIND) {
13488                         /*
13489                          * Case B flag reordering, if window is not closed
13490                          * or it could be a keep-alive or persists
13491                          */
13492                         if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
13493                                 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
13494                         }
13495                 } else if (ae->ack_val_set == ACK_DUPACK) {
13496                         /* Case D */
13497                         rack_strike_dupack(rack);
13498                 } else if (ae->ack_val_set == ACK_RWND) {
13499                         /* Case C */
13500                         if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13501                                 ts.tv_sec = ae->timestamp / 1000000000;
13502                                 ts.tv_nsec = ae->timestamp % 1000000000;
13503                                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13504                                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13505                         } else {
13506                                 rack->r_ctl.act_rcv_time = *tv;
13507                         }
13508                         if (rack->forced_ack) {
13509                                 rack_handle_probe_response(rack, tiwin,
13510                                                            tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
13511                         }
13512 #ifdef TCP_ACCOUNTING
13513                         win_up_req = 1;
13514 #endif
13515                         win_upd_ack = ae->ack;
13516                         win_seq = ae->seq;
13517                         the_win = tiwin;
13518                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13519                 } else {
13520                         /* Case A */
13521                         if (SEQ_GT(ae->ack, tp->snd_max)) {
13522                                 /*
13523                                  * We just send an ack since the incoming
13524                                  * ack is beyond the largest seq we sent.
13525                                  */
13526                                 if ((tp->t_flags & TF_ACKNOW) == 0) {
13527                                         ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
13528                                         if (tp->t_flags && TF_ACKNOW)
13529                                                 rack->r_wanted_output = 1;
13530                                 }
13531                         } else {
13532                                 nsegs++;
13533                                 /* If the window changed setup to update */
13534                                 if (tiwin != tp->snd_wnd) {
13535                                         win_upd_ack = ae->ack;
13536                                         win_seq = ae->seq;
13537                                         the_win = tiwin;
13538                                         rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13539                                 }
13540 #ifdef TCP_ACCOUNTING
13541                                 /* Account for the acks */
13542                                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13543                                         tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
13544                                 }
13545                                 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
13546                                                 (((ae->ack - high_seq) + segsiz - 1) / segsiz));
13547 #endif
13548                                 high_seq = ae->ack;
13549                                 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
13550                                         union tcp_log_stackspecific log;
13551                                         struct timeval tv;
13552
13553                                         memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13554                                         log.u_bbr.timeStamp = tcp_get_usecs(&tv);
13555                                         log.u_bbr.flex1 = high_seq;
13556                                         log.u_bbr.flex2 = rack->r_ctl.roundends;
13557                                         log.u_bbr.flex3 = rack->r_ctl.current_round;
13558                                         log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
13559                                         log.u_bbr.flex8 = 8;
13560                                         tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
13561                                                        0, &log, false, NULL, NULL, 0, &tv);
13562                                 }
13563                                 /*
13564                                  * The draft (v3) calls for us to use SEQ_GEQ, but that
13565                                  * causes issues when we are just going app limited. Let's
13566                                  * instead use SEQ_GT <or> the case where it's equal but more
13567                                  * data is outstanding.
13568                                  */
13569                                 if ((SEQ_GT(high_seq, rack->r_ctl.roundends)) ||
13570                                     ((high_seq == rack->r_ctl.roundends) &&
13571                                      SEQ_GT(tp->snd_max, tp->snd_una))) {
13572                                         rack->r_ctl.current_round++;
13573                                         rack->r_ctl.roundends = tp->snd_max;
13574                                         if (CC_ALGO(tp)->newround != NULL) {
13575                                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
13576                                         }
13577                                 }
13578                                 /* Setup our act_rcv_time */
13579                                 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13580                                         ts.tv_sec = ae->timestamp / 1000000000;
13581                                         ts.tv_nsec = ae->timestamp % 1000000000;
13582                                         rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13583                                         rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13584                                 } else {
13585                                         rack->r_ctl.act_rcv_time = *tv;
13586                                 }
13587                                 rack_process_to_cumack(tp, rack, ae->ack, cts, to);
13588                                 if (rack->rc_dsack_round_seen) {
13589                                         /* Is the dsack round over? */
13590                                         if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
13591                                                 /* Yes it is */
13592                                                 rack->rc_dsack_round_seen = 0;
13593                                                 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
13594                                         }
13595                                 }
13596                         }
13597                 }
13598                 /* And lets be sure to commit the rtt measurements for this ack */
13599                 tcp_rack_xmit_timer_commit(rack, tp);
13600 #ifdef TCP_ACCOUNTING
13601                 rdstc = get_cyclecount();
13602                 if (rdstc > ts_val) {
13603                         counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val));
13604                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13605                                 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13606                                 if (ae->ack_val_set == ACK_CUMACK)
13607                                         tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
13608                         }
13609                 }
13610 #endif
13611         }
13612 #ifdef TCP_ACCOUNTING
13613         ts_val = get_cyclecount();
13614 #endif
13615         acked_amount = acked = (high_seq - tp->snd_una);
13616         if (acked) {
13617                 /*
13618                  * Clear the probe not answered flag
13619                  * since cum-ack moved forward.
13620                  */
13621                 rack->probe_not_answered = 0;
13622                 if (rack->sack_attack_disable == 0)
13623                         rack_do_decay(rack);
13624                 if (acked >= segsiz) {
13625                         /*
13626                          * You only get credit for
13627                          * MSS and greater (and you get extra
13628                          * credit for larger cum-ack moves).
13629                          */
13630                         int ac;
13631
13632                         ac = acked / segsiz;
13633                         rack->r_ctl.ack_count += ac;
13634                         counter_u64_add(rack_ack_total, ac);
13635                 }
13636                 if (rack->r_ctl.ack_count > 0xfff00000) {
13637                         /*
13638                          * reduce the number to keep us under
13639                          * a uint32_t.
13640                          */
13641                         rack->r_ctl.ack_count /= 2;
13642                         rack->r_ctl.sack_count /= 2;
13643                 }
13644                 if (tp->t_flags & TF_NEEDSYN) {
13645                         /*
13646                          * T/TCP: Connection was half-synchronized, and our SYN has
13647                          * been ACK'd (so connection is now fully synchronized).  Go
13648                          * to non-starred state, increment snd_una for ACK of SYN,
13649                          * and check if we can do window scaling.
13650                          */
13651                         tp->t_flags &= ~TF_NEEDSYN;
13652                         tp->snd_una++;
13653                         acked_amount = acked = (high_seq - tp->snd_una);
13654                 }
13655                 if (acked > sbavail(&so->so_snd))
13656                         acked_amount = sbavail(&so->so_snd);
13657 #ifdef NETFLIX_EXP_DETECTION
13658                 /*
13659                  * We only care on a cum-ack move if we are in a sack-disabled
13660                  * state. We have already added in to the ack_count, and we never
13661                  * would disable on a cum-ack move, so we only care to do the
13662                  * detection if it may "undo" it, i.e. we were in disabled already.
13663                  */
13664                 if (rack->sack_attack_disable)
13665                         rack_do_detection(tp, rack, acked_amount, segsiz);
13666 #endif
13667                 if (IN_FASTRECOVERY(tp->t_flags) &&
13668                     (rack->rack_no_prr == 0))
13669                         rack_update_prr(tp, rack, acked_amount, high_seq);
13670                 if (IN_RECOVERY(tp->t_flags)) {
13671                         if (SEQ_LT(high_seq, tp->snd_recover) &&
13672                             (SEQ_LT(high_seq, tp->snd_max))) {
13673                                 tcp_rack_partialack(tp);
13674                         } else {
13675                                 rack_post_recovery(tp, high_seq);
13676                                 recovery = 1;
13677                         }
13678                 }
13679                 /* Handle the rack-log-ack part (sendmap) */
13680                 if ((sbused(&so->so_snd) == 0) &&
13681                     (acked > acked_amount) &&
13682                     (tp->t_state >= TCPS_FIN_WAIT_1) &&
13683                     (tp->t_flags & TF_SENTFIN)) {
13684                         /*
13685                          * We must be sure our fin
13686                          * was sent and acked (we can be
13687                          * in FIN_WAIT_1 without having
13688                          * sent the fin).
13689                          */
13690                         ourfinisacked = 1;
13691                         /*
13692                          * Lets make sure snd_una is updated
13693                          * since most likely acked_amount = 0 (it
13694                          * should be).
13695                          */
13696                         tp->snd_una = high_seq;
13697                 }
13698                 /* Did we make a RTO error? */
13699                 if ((tp->t_flags & TF_PREVVALID) &&
13700                     ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
13701                         tp->t_flags &= ~TF_PREVVALID;
13702                         if (tp->t_rxtshift == 1 &&
13703                             (int)(ticks - tp->t_badrxtwin) < 0)
13704                                 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__);
13705                 }
13706                 /* Handle the data in the socket buffer */
13707                 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
13708                 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
13709                 if (acked_amount > 0) {
13710                         struct mbuf *mfree;
13711
13712                         rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
13713                         SOCKBUF_LOCK(&so->so_snd);
13714                         mfree = sbcut_locked(&so->so_snd, acked_amount);
13715                         tp->snd_una = high_seq;
13716                         /* Note we want to hold the sb lock through the sendmap adjust */
13717                         rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
13718                         /* Wake up the socket if we have room to write more */
13719                         rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
13720                         sowwakeup_locked(so);
13721                         m_freem(mfree);
13722                 }
13723                 /* update progress */
13724                 tp->t_acktime = ticks;
13725                 rack_log_progress_event(rack, tp, tp->t_acktime,
13726                                         PROGRESS_UPDATE, __LINE__);
13727                 /* Clear out shifts and such */
13728                 tp->t_rxtshift = 0;
13729                 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
13730                                    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
13731                 rack->rc_tlp_in_progress = 0;
13732                 rack->r_ctl.rc_tlp_cnt_out = 0;
13733                 /* Send recover and snd_nxt must be dragged along */
13734                 if (SEQ_GT(tp->snd_una, tp->snd_recover))
13735                         tp->snd_recover = tp->snd_una;
13736                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
13737                         tp->snd_nxt = tp->snd_una;
13738                 /*
13739                  * If the RXT timer is running we want to
13740                  * stop it, so we can restart a TLP (or new RXT).
13741                  */
13742                 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
13743                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13744 #ifdef NETFLIX_HTTP_LOGGING
13745                 tcp_http_check_for_comp(rack->rc_tp, high_seq);
13746 #endif
13747                 tp->snd_wl2 = high_seq;
13748                 tp->t_dupacks = 0;
13749                 if (under_pacing &&
13750                     (rack->use_fixed_rate == 0) &&
13751                     (rack->in_probe_rtt == 0) &&
13752                     rack->rc_gp_dyn_mul &&
13753                     rack->rc_always_pace) {
13754                         /* Check if we are dragging bottom */
13755                         rack_check_bottom_drag(tp, rack, so, acked);
13756                 }
13757                 if (tp->snd_una == tp->snd_max) {
13758                         tp->t_flags &= ~TF_PREVVALID;
13759                         rack->r_ctl.retran_during_recovery = 0;
13760                         rack->r_ctl.dsack_byte_cnt = 0;
13761                         rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
13762                         if (rack->r_ctl.rc_went_idle_time == 0)
13763                                 rack->r_ctl.rc_went_idle_time = 1;
13764                         rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
13765                         if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
13766                                 tp->t_acktime = 0;
13767                         /* Set so we might enter persists... */
13768                         rack->r_wanted_output = 1;
13769                         rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13770                         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
13771                         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
13772                             (sbavail(&so->so_snd) == 0) &&
13773                             (tp->t_flags2 & TF2_DROP_AF_DATA)) {
13774                                 /*
13775                                  * The socket was gone and the
13776                                  * peer sent data (not now in the past), time to
13777                                  * reset him.
13778                                  */
13779                                 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
13780                                 /* tcp_close will kill the inp pre-log the Reset */
13781                                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13782 #ifdef TCP_ACCOUNTING
13783                                 rdstc = get_cyclecount();
13784                                 if (rdstc > ts_val) {
13785                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13786                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13787                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13788                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13789                                         }
13790                                 }
13791 #endif
13792                                 m_freem(m);
13793                                 tp = tcp_close(tp);
13794                                 if (tp == NULL) {
13795 #ifdef TCP_ACCOUNTING
13796                                         sched_unpin();
13797 #endif
13798                                         return (1);
13799                                 }
13800                                 /*
13801                                  * We would normally do drop-with-reset which would
13802                                  * send back a reset. We can't since we don't have
13803                                  * all the needed bits. Instead lets arrange for
13804                                  * a call to tcp_output(). That way since we
13805                                  * are in the closed state we will generate a reset.
13806                                  *
13807                                  * Note if tcp_accounting is on we don't unpin since
13808                                  * we do that after the goto label.
13809                                  */
13810                                 goto send_out_a_rst;
13811                         }
13812                         if ((sbused(&so->so_snd) == 0) &&
13813                             (tp->t_state >= TCPS_FIN_WAIT_1) &&
13814                             (tp->t_flags & TF_SENTFIN)) {
13815                                 /*
13816                                  * If we can't receive any more data, then closing user can
13817                                  * proceed. Starting the timer is contrary to the
13818                                  * specification, but if we don't get a FIN we'll hang
13819                                  * forever.
13820                                  *
13821                                  */
13822                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13823                                         soisdisconnected(so);
13824                                         tcp_timer_activate(tp, TT_2MSL,
13825                                                            (tcp_fast_finwait2_recycle ?
13826                                                             tcp_finwait2_timeout :
13827                                                             TP_MAXIDLE(tp)));
13828                                 }
13829                                 if (ourfinisacked == 0) {
13830                                         /*
13831                                          * We don't change to fin-wait-2 if we have our fin acked
13832                                          * which means we are probably in TCPS_CLOSING.
13833                                          */
13834                                         tcp_state_change(tp, TCPS_FIN_WAIT_2);
13835                                 }
13836                         }
13837                 }
13838                 /* Send buffer still holds data: request output and check the progress timeout */
13839                 if (sbavail(&so->so_snd)) {
13840                         rack->r_wanted_output = 1;
13841                         if (ctf_progress_timeout_check(tp, true)) {
13842                                 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13843                                                         tp, tick, PROGRESS_DROP, __LINE__);
13844                                 /*
13845                                  * We cheat here and don't send a RST, we should send one
13846                                  * when the pacer drops the connection.
13847                                  */
13848 #ifdef TCP_ACCOUNTING
13849                                 rdstc = get_cyclecount();
13850                                 if (rdstc > ts_val) {
13851                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13852                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13853                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13854                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13855                                         }
13856                                 }
13857                                 sched_unpin();
13858 #endif
13859                                 (void)tcp_drop(tp, ETIMEDOUT);
13860                                 m_freem(m);
13861                                 return (1);
13862                         }
13863                 }
13864                 if (ourfinisacked) {
13865                         switch(tp->t_state) {
13866                         case TCPS_CLOSING:
13867 #ifdef TCP_ACCOUNTING
13868                                 rdstc = get_cyclecount();
13869                                 if (rdstc > ts_val) {
13870                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13871                                                         (rdstc - ts_val));
13872                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13873                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13874                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13875                                         }
13876                                 }
13877                                 sched_unpin();
13878 #endif
13879                                 tcp_twstart(tp);
13880                                 m_freem(m);
13881                                 return (1);
13882                                 break;
13883                         case TCPS_LAST_ACK:
13884 #ifdef TCP_ACCOUNTING
13885                                 rdstc = get_cyclecount();
13886                                 if (rdstc > ts_val) {
13887                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13888                                                         (rdstc - ts_val));
13889                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13890                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13891                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13892                                         }
13893                                 }
13894                                 sched_unpin();
13895 #endif
13896                                 tp = tcp_close(tp);
13897                                 ctf_do_drop(m, tp);
13898                                 return (1);
13899                                 break;
13900                         case TCPS_FIN_WAIT_1:
13901 #ifdef TCP_ACCOUNTING
13902                                 rdstc = get_cyclecount();
13903                                 if (rdstc > ts_val) {
13904                                         counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
13905                                                         (rdstc - ts_val));
13906                                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13907                                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13908                                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13909                                         }
13910                                 }
13911 #endif
13912                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13913                                         soisdisconnected(so);
13914                                         tcp_timer_activate(tp, TT_2MSL,
13915                                                            (tcp_fast_finwait2_recycle ?
13916                                                             tcp_finwait2_timeout :
13917                                                             TP_MAXIDLE(tp)));
13918                                 }
13919                                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13920                                 break;
13921                         default:
13922                                 break;
13923                         }
13924                 }
13925                 if (rack->r_fast_output) {
13926                         /*
13927                          * We are doing fast output; can we expand that?
13928                          */
13929                         rack_gain_for_fastoutput(rack, tp, so, acked_amount);
13930                 }
13931 #ifdef TCP_ACCOUNTING
13932                 rdstc = get_cyclecount();
13933                 if (rdstc > ts_val) {
13934                         counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
13935                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13936                                 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
13937                                 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
13938                         }
13939                 }
13940
13941         } else if (win_up_req) {
13942                 rdstc = get_cyclecount();
13943                 if (rdstc > ts_val) {
13944                         counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val));
13945                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13946                                 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
13947                         }
13948                 }
13949 #endif
13950         }
13951         /* Now is there a next packet, if so we are done */
13952         m_freem(m);
13953         did_out = 0;
13954         if (nxt_pkt) {
13955 #ifdef TCP_ACCOUNTING
13956                 sched_unpin();
13957 #endif
13958                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
13959                 return (0);
13960         }
13961         rack_handle_might_revert(tp, rack);
13962         ctf_calc_rwin(so, tp);
13963         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
13964         send_out_a_rst:
13965                 if (tcp_output(tp) < 0) {
13966 #ifdef TCP_ACCOUNTING
13967                         sched_unpin();
13968 #endif
13969                         return (1);
13970                 }
13971                 did_out = 1;
13972         }
13973         rack_free_trim(rack);
13974 #ifdef TCP_ACCOUNTING
13975         sched_unpin();
13976 #endif
13977         rack_timer_audit(tp, rack, &so->so_snd);
13978         rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
13979         return (0);
13980 }
13981
13982
13983 static int
13984 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
13985     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
13986     int32_t nxt_pkt, struct timeval *tv)
13987 {
13988 #ifdef TCP_ACCOUNTING
13989         uint64_t ts_val;
13990 #endif
13991         int32_t thflags, retval, did_out = 0;
13992         int32_t way_out = 0;
13993         /*
13994          * cts - is the current time from tv (caller gets ts) in microseconds.
13995          * ms_cts - is the current time from tv in milliseconds.
13996          * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
13997          */
13998         uint32_t cts, us_cts, ms_cts;
13999         uint32_t tiwin, high_seq;
14000         struct timespec ts;
14001         struct tcpopt to;
14002         struct tcp_rack *rack;
14003         struct rack_sendmap *rsm;
14004         int32_t prev_state = 0;
14005 #ifdef TCP_ACCOUNTING
14006         int ack_val_set = 0xf;
14007 #endif
14008         int nsegs;
14009         /*
14010          * tv passed from common code is from either M_TSTMP_LRO or
14011          * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
14012          */
14013         rack = (struct tcp_rack *)tp->t_fb_ptr;
14014         if (m->m_flags & M_ACKCMP) {
14015                 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
14016         }
14017         if (m->m_flags & M_ACKCMP) {
14018                 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
14019         }
14020         cts = tcp_tv_to_usectick(tv);
14021         ms_cts =  tcp_tv_to_mssectick(tv);
14022         nsegs = m->m_pkthdr.lro_nsegs;
14023         counter_u64_add(rack_proc_non_comp_ack, 1);
14024         thflags = tcp_get_flags(th);
14025 #ifdef TCP_ACCOUNTING
14026         sched_pin();
14027         if (thflags & TH_ACK)
14028                 ts_val = get_cyclecount();
14029 #endif
14030         if ((m->m_flags & M_TSTMP) ||
14031             (m->m_flags & M_TSTMP_LRO)) {
14032                 mbuf_tstmp2timespec(m, &ts);
14033                 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
14034                 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
14035         } else
14036                 rack->r_ctl.act_rcv_time = *tv;
14037         kern_prefetch(rack, &prev_state);
14038         prev_state = 0;
14039         /*
14040          * Unscale the window into a 32-bit value. For the SYN_SENT state
14041          * the scale is zero.
14042          */
14043         tiwin = th->th_win << tp->snd_scale;
14044 #ifdef TCP_ACCOUNTING
14045         if (thflags & TH_ACK) {
14046                 /*
14047                  * We have a tradeoff here. We can either do what we are
14048                  * doing i.e. pinning to this CPU and then doing the accounting
14049                  * <or> we could do a critical enter, setup the rdtsc and cpu
14050                  * as in below, and then validate we are on the same CPU on
14051                  * exit. I have choosen to not do the critical enter since
14052                  * that often will gain you a context switch, and instead lock
14053                  * us (line above this if) to the same CPU with sched_pin(). This
14054                  * means we may be context switched out for a higher priority
14055                  * interupt but we won't be moved to another CPU.
14056                  *
14057                  * If this occurs (which it won't very often since we most likely
14058                  * are running this code in interupt context and only a higher
14059                  * priority will bump us ... clock?) we will falsely add in
14060                  * to the time the interupt processing time plus the ack processing
14061                  * time. This is ok since its a rare event.
14062                  */
14063                 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
14064                                                     ctf_fixed_maxseg(tp));
14065         }
14066 #endif
14067         /*
14068          * Parse options on any incoming segment.
14069          */
14070         memset(&to, 0, sizeof(to));
14071         tcp_dooptions(&to, (u_char *)(th + 1),
14072             (th->th_off << 2) - sizeof(struct tcphdr),
14073             (thflags & TH_SYN) ? TO_SYN : 0);
14074         NET_EPOCH_ASSERT();
14075         INP_WLOCK_ASSERT(tp->t_inpcb);
14076         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
14077             __func__));
14078         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
14079             __func__));
14080         if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
14081             (tp->t_flags & TF_GPUTINPROG)) {
14082                 /*
14083                  * We have a goodput in progress
14084                  * and we have entered a late state.
14085                  * Do we have enough data in the sb
14086                  * to handle the GPUT request?
14087                  */
14088                 uint32_t bytes;
14089
14090                 bytes = tp->gput_ack - tp->gput_seq;
14091                 if (SEQ_GT(tp->gput_seq, tp->snd_una))
14092                         bytes += tp->gput_seq - tp->snd_una;
14093                 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
14094                         /*
14095                          * There are not enough bytes in the socket
14096                          * buffer that have been sent to cover this
14097                          * measurement. Cancel it.
14098                          */
14099                         rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
14100                                                    rack->r_ctl.rc_gp_srtt /*flex1*/,
14101                                                    tp->gput_seq,
14102                                                    0, 0, 18, __LINE__, NULL, 0);
14103                         tp->t_flags &= ~TF_GPUTINPROG;
14104                 }
14105         }
14106         high_seq = th->th_ack;
14107         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
14108                 union tcp_log_stackspecific log;
14109                 struct timeval ltv;
14110 #ifdef NETFLIX_HTTP_LOGGING
14111                 struct http_sendfile_track *http_req;
14112
14113                 if (SEQ_GT(th->th_ack, tp->snd_una)) {
14114                         http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
14115                 } else {
14116                         http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
14117                 }
14118 #endif
14119                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14120                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
14121                 if (rack->rack_no_prr == 0)
14122                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
14123                 else
14124                         log.u_bbr.flex1 = 0;
14125                 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
14126                 log.u_bbr.use_lt_bw <<= 1;
14127                 log.u_bbr.use_lt_bw |= rack->r_might_revert;
14128                 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
14129                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14130                 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
14131                 log.u_bbr.flex3 = m->m_flags;
14132                 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
14133                 log.u_bbr.lost = thflags;
14134                 log.u_bbr.pacing_gain = 0x1;
14135 #ifdef TCP_ACCOUNTING
14136                 log.u_bbr.cwnd_gain = ack_val_set;
14137 #endif
14138                 log.u_bbr.flex7 = 2;
14139                 if (m->m_flags & M_TSTMP) {
14140                         /* Record the hardware timestamp if present */
14141                         mbuf_tstmp2timespec(m, &ts);
14142                         ltv.tv_sec = ts.tv_sec;
14143                         ltv.tv_usec = ts.tv_nsec / 1000;
14144                         log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
14145                 } else if (m->m_flags & M_TSTMP_LRO) {
14146                         /* Record the LRO the arrival timestamp */
14147                         mbuf_tstmp2timespec(m, &ts);
14148                         ltv.tv_sec = ts.tv_sec;
14149                         ltv.tv_usec = ts.tv_nsec / 1000;
14150                         log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
14151                 }
14152                 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
14153                 /* Log the rcv time */
14154                 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
14155 #ifdef NETFLIX_HTTP_LOGGING
14156                 log.u_bbr.applimited = tp->t_http_closed;
14157                 log.u_bbr.applimited <<= 8;
14158                 log.u_bbr.applimited |= tp->t_http_open;
14159                 log.u_bbr.applimited <<= 8;
14160                 log.u_bbr.applimited |= tp->t_http_req;
14161                 if (http_req) {
14162                         /* Copy out any client req info */
14163                         /* seconds */
14164                         log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
14165                         /* useconds */
14166                         log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
14167                         log.u_bbr.rttProp = http_req->timestamp;
14168                         log.u_bbr.cur_del_rate = http_req->start;
14169                         if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
14170                                 log.u_bbr.flex8 |= 1;
14171                         } else {
14172                                 log.u_bbr.flex8 |= 2;
14173                                 log.u_bbr.bw_inuse = http_req->end;
14174                         }
14175                         log.u_bbr.flex6 = http_req->start_seq;
14176                         if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
14177                                 log.u_bbr.flex8 |= 4;
14178                                 log.u_bbr.epoch = http_req->end_seq;
14179                         }
14180                 }
14181 #endif
14182                 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
14183                     tlen, &log, true, &ltv);
14184         }
14185         if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
14186                 way_out = 4;
14187                 retval = 0;
14188                 m_freem(m);
14189                 goto done_with_input;
14190         }
14191         /*
14192          * If a segment with the ACK-bit set arrives in the SYN-SENT state
14193          * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
14194          */
14195         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
14196             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
14197                 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
14198                 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
14199 #ifdef TCP_ACCOUNTING
14200                 sched_unpin();
14201 #endif
14202                 return (1);
14203         }
14204         /*
14205          * If timestamps were negotiated during SYN/ACK and a
14206          * segment without a timestamp is received, silently drop
14207          * the segment, unless it is a RST segment or missing timestamps are
14208          * tolerated.
14209          * See section 3.2 of RFC 7323.
14210          */
14211         if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
14212             ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
14213                 way_out = 5;
14214                 retval = 0;
14215                 m_freem(m);
14216                 goto done_with_input;
14217         }
14218
14219         /*
14220          * Segment received on connection. Reset idle time and keep-alive
14221          * timer. XXX: This should be done after segment validation to
14222          * ignore broken/spoofed segs.
14223          */
14224         if  (tp->t_idle_reduce &&
14225              (tp->snd_max == tp->snd_una) &&
14226              (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
14227                 counter_u64_add(rack_input_idle_reduces, 1);
14228                 rack_cc_after_idle(rack, tp);
14229         }
14230         tp->t_rcvtime = ticks;
14231 #ifdef STATS
14232         stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
14233 #endif
14234         if (tiwin > rack->r_ctl.rc_high_rwnd)
14235                 rack->r_ctl.rc_high_rwnd = tiwin;
14236         /*
14237          * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
14238          * this to occur after we've validated the segment.
14239          */
14240         if (tcp_ecn_input_segment(tp, thflags, iptos))
14241                 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
14242
14243         /*
14244          * If echoed timestamp is later than the current time, fall back to
14245          * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
14246          * were used when this connection was established.
14247          */
14248         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
14249                 to.to_tsecr -= tp->ts_offset;
14250                 if (TSTMP_GT(to.to_tsecr, ms_cts))
14251                         to.to_tsecr = 0;
14252         }
14253
14254         /*
14255          * If its the first time in we need to take care of options and
14256          * verify we can do SACK for rack!
14257          */
14258         if (rack->r_state == 0) {
14259                 /* Should be init'd by rack_init() */
14260                 KASSERT(rack->rc_inp != NULL,
14261                     ("%s: rack->rc_inp unexpectedly NULL", __func__));
14262                 if (rack->rc_inp == NULL) {
14263                         rack->rc_inp = tp->t_inpcb;
14264                 }
14265
14266                 /*
14267                  * Process options only when we get SYN/ACK back. The SYN
14268                  * case for incoming connections is handled in tcp_syncache.
14269                  * According to RFC1323 the window field in a SYN (i.e., a
14270                  * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
14271                  * this is traditional behavior, may need to be cleaned up.
14272                  */
14273                 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
14274                         /* Handle parallel SYN for ECN */
14275                         tcp_ecn_input_parallel_syn(tp, thflags, iptos);
14276                         if ((to.to_flags & TOF_SCALE) &&
14277                             (tp->t_flags & TF_REQ_SCALE)) {
14278                                 tp->t_flags |= TF_RCVD_SCALE;
14279                                 tp->snd_scale = to.to_wscale;
14280                         } else
14281                                 tp->t_flags &= ~TF_REQ_SCALE;
14282                         /*
14283                          * Initial send window.  It will be updated with the
14284                          * next incoming segment to the scaled value.
14285                          */
14286                         tp->snd_wnd = th->th_win;
14287                         rack_validate_fo_sendwin_up(tp, rack);
14288                         if ((to.to_flags & TOF_TS) &&
14289                             (tp->t_flags & TF_REQ_TSTMP)) {
14290                                 tp->t_flags |= TF_RCVD_TSTMP;
14291                                 tp->ts_recent = to.to_tsval;
14292                                 tp->ts_recent_age = cts;
14293                         } else
14294                                 tp->t_flags &= ~TF_REQ_TSTMP;
14295                         if (to.to_flags & TOF_MSS) {
14296                                 tcp_mss(tp, to.to_mss);
14297                         }
14298                         if ((tp->t_flags & TF_SACK_PERMIT) &&
14299                             (to.to_flags & TOF_SACKPERM) == 0)
14300                                 tp->t_flags &= ~TF_SACK_PERMIT;
14301                         if (IS_FASTOPEN(tp->t_flags)) {
14302                                 if (to.to_flags & TOF_FASTOPEN) {
14303                                         uint16_t mss;
14304
14305                                         if (to.to_flags & TOF_MSS)
14306                                                 mss = to.to_mss;
14307                                         else
14308                                                 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
14309                                                         mss = TCP6_MSS;
14310                                                 else
14311                                                         mss = TCP_MSS;
14312                                         tcp_fastopen_update_cache(tp, mss,
14313                                             to.to_tfo_len, to.to_tfo_cookie);
14314                                 } else
14315                                         tcp_fastopen_disable_path(tp);
14316                         }
14317                 }
14318                 /*
14319                  * At this point we are at the initial call. Here we decide
14320                  * if we are doing RACK or not. We do this by seeing if
14321                  * TF_SACK_PERMIT is set and the sack-not-required is clear.
14322                  * The code now does do dup-ack counting so if you don't
14323                  * switch back you won't get rack & TLP, but you will still
14324                  * get this stack.
14325                  */
14326
14327                 if ((rack_sack_not_required == 0) &&
14328                     ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
14329                         tcp_switch_back_to_default(tp);
14330                         (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
14331                             tlen, iptos);
14332 #ifdef TCP_ACCOUNTING
14333                         sched_unpin();
14334 #endif
14335                         return (1);
14336                 }
14337                 tcp_set_hpts(tp->t_inpcb);
14338                 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
14339         }
14340         if (thflags & TH_FIN)
14341                 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
14342         us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
14343         if ((rack->rc_gp_dyn_mul) &&
14344             (rack->use_fixed_rate == 0) &&
14345             (rack->rc_always_pace)) {
14346                 /* Check in on probertt */
14347                 rack_check_probe_rtt(rack, us_cts);
14348         }
14349         rack_clear_rate_sample(rack);
14350         if ((rack->forced_ack) &&
14351             ((tcp_get_flags(th) & TH_RST) == 0)) {
14352                 rack_handle_probe_response(rack, tiwin, us_cts);
14353         }
14354         /*
14355          * This is the one exception case where we set the rack state
14356          * always. All other times (timers etc) we must have a rack-state
14357          * set (so we assure we have done the checks above for SACK).
14358          */
14359         rack->r_ctl.rc_rcvtime = cts;
14360         if (rack->r_state != tp->t_state)
14361                 rack_set_state(tp, rack);
14362         if (SEQ_GT(th->th_ack, tp->snd_una) &&
14363             (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
14364                 kern_prefetch(rsm, &prev_state);
14365         prev_state = rack->r_state;
14366         retval = (*rack->r_substate) (m, th, so,
14367             tp, &to, drop_hdrlen,
14368             tlen, tiwin, thflags, nxt_pkt, iptos);
14369 #ifdef INVARIANTS
14370         if ((retval == 0) &&
14371             (tp->t_inpcb == NULL)) {
14372                 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
14373                     retval, tp, prev_state);
14374         }
14375 #endif
14376         if (retval == 0) {
14377                 /*
14378                  * If retval is 1 the tcb is unlocked and most likely the tp
14379                  * is gone.
14380                  */
14381                 INP_WLOCK_ASSERT(tp->t_inpcb);
14382                 if ((rack->rc_gp_dyn_mul) &&
14383                     (rack->rc_always_pace) &&
14384                     (rack->use_fixed_rate == 0) &&
14385                     rack->in_probe_rtt &&
14386                     (rack->r_ctl.rc_time_probertt_starts == 0)) {
14387                         /*
14388                          * If we are going for target, lets recheck before
14389                          * we output.
14390                          */
14391                         rack_check_probe_rtt(rack, us_cts);
14392                 }
14393                 if (rack->set_pacing_done_a_iw == 0) {
14394                         /* How much has been acked? */
14395                         if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
14396                                 /* We have enough to set in the pacing segment size */
14397                                 rack->set_pacing_done_a_iw = 1;
14398                                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14399                         }
14400                 }
14401                 tcp_rack_xmit_timer_commit(rack, tp);
14402 #ifdef TCP_ACCOUNTING
14403                 /*
14404                  * If we set the ack_val_se to what ack processing we are doing
14405                  * we also want to track how many cycles we burned. Note
14406                  * the bits after tcp_output we let be "free". This is because
14407                  * we are also tracking the tcp_output times as well. Note the
14408                  * use of 0xf here since we only have 11 counter (0 - 0xa) and
14409                  * 0xf cannot be returned and is what we initialize it too to
14410                  * indicate we are not doing the tabulations.
14411                  */
14412                 if (ack_val_set != 0xf) {
14413                         uint64_t crtsc;
14414
14415                         crtsc = get_cyclecount();
14416                         counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
14417                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14418                                 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
14419                         }
14420                 }
14421 #endif
14422                 if (nxt_pkt == 0) {
14423                         if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14424 do_output_now:
14425                                 if (tcp_output(tp) < 0)
14426                                         return (1);
14427                                 did_out = 1;
14428                         }
14429                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
14430                         rack_free_trim(rack);
14431                 }
14432                 /* Update any rounds needed */
14433                 if (rack_verbose_logging &&  (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
14434                         union tcp_log_stackspecific log;
14435                         struct timeval tv;
14436
14437                         memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14438                         log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14439                         log.u_bbr.flex1 = high_seq;
14440                         log.u_bbr.flex2 = rack->r_ctl.roundends;
14441                         log.u_bbr.flex3 = rack->r_ctl.current_round;
14442                         log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
14443                         log.u_bbr.flex8 = 9;
14444                         tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
14445                                        0, &log, false, NULL, NULL, 0, &tv);
14446                 }
14447                 /*
14448                  * The draft (v3) calls for us to use SEQ_GEQ, but that
14449                  * causes issues when we are just going app limited. Lets
14450                  * instead use SEQ_GT <or> where its equal but more data
14451                  * is outstanding.
14452                  */
14453                 if ((SEQ_GT(tp->snd_una, rack->r_ctl.roundends)) ||
14454                     ((tp->snd_una == rack->r_ctl.roundends) && SEQ_GT(tp->snd_max, tp->snd_una))) {
14455                         rack->r_ctl.current_round++;
14456                         rack->r_ctl.roundends = tp->snd_max;
14457                         if (CC_ALGO(tp)->newround != NULL) {
14458                                 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
14459                         }
14460                 }
14461                 if ((nxt_pkt == 0) &&
14462                     ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
14463                     (SEQ_GT(tp->snd_max, tp->snd_una) ||
14464                      (tp->t_flags & TF_DELACK) ||
14465                      ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
14466                       (tp->t_state <= TCPS_CLOSING)))) {
14467                         /* We could not send (probably in the hpts but stopped the timer earlier)? */
14468                         if ((tp->snd_max == tp->snd_una) &&
14469                             ((tp->t_flags & TF_DELACK) == 0) &&
14470                             (tcp_in_hpts(rack->rc_inp)) &&
14471                             (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
14472                                 /* keep alive not needed if we are hptsi output yet */
14473                                 ;
14474                         } else {
14475                                 int late = 0;
14476                                 if (tcp_in_hpts(rack->rc_inp)) {
14477                                         if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14478                                                 us_cts = tcp_get_usecs(NULL);
14479                                                 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
14480                                                         rack->r_early = 1;
14481                                                         rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
14482                                                 } else
14483                                                         late = 1;
14484                                                 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
14485                                         }
14486                                         tcp_hpts_remove(tp->t_inpcb);
14487                                 }
14488                                 if (late && (did_out == 0)) {
14489                                         /*
14490                                          * We are late in the sending
14491                                          * and we did not call the output
14492                                          * (this probably should not happen).
14493                                          */
14494                                         goto do_output_now;
14495                                 }
14496                                 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
14497                         }
14498                         way_out = 1;
14499                 } else if (nxt_pkt == 0) {
14500                         /* Do we have the correct timer running? */
14501                         rack_timer_audit(tp, rack, &so->so_snd);
14502                         way_out = 2;
14503                 }
14504         done_with_input:
14505                 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
14506                 if (did_out)
14507                         rack->r_wanted_output = 0;
14508 #ifdef INVARIANTS
14509                 if (tp->t_inpcb == NULL) {
14510                         panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
14511                               did_out,
14512                               retval, tp, prev_state);
14513                 }
14514 #endif
14515 #ifdef TCP_ACCOUNTING
14516         } else {
14517                 /*
14518                  * Track the time (see above).
14519                  */
14520                 if (ack_val_set != 0xf) {
14521                         uint64_t crtsc;
14522
14523                         crtsc = get_cyclecount();
14524                         counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
14525                         /*
14526                          * Note we *DO NOT* increment the per-tcb counters since
14527                          * in the else the TP may be gone!!
14528                          */
14529                 }
14530 #endif
14531         }
14532 #ifdef TCP_ACCOUNTING
14533         sched_unpin();
14534 #endif
14535         return (retval);
14536 }
14537
14538 void
14539 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
14540     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
14541 {
14542         struct timeval tv;
14543
14544         /* First lets see if we have old packets */
14545         if (tp->t_in_pkt) {
14546                 if (ctf_do_queued_segments(so, tp, 1)) {
14547                         m_freem(m);
14548                         return;
14549                 }
14550         }
14551         if (m->m_flags & M_TSTMP_LRO) {
14552                 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
14553                 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
14554         } else {
14555                 /* Should not be should we kassert instead? */
14556                 tcp_get_usecs(&tv);
14557         }
14558         if (rack_do_segment_nounlock(m, th, so, tp,
14559                                      drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
14560                 INP_WUNLOCK(tp->t_inpcb);
14561         }
14562 }
14563
14564 struct rack_sendmap *
14565 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
14566 {
14567         struct rack_sendmap *rsm = NULL;
14568         int32_t idx;
14569         uint32_t srtt = 0, thresh = 0, ts_low = 0;
14570
14571         /* Return the next guy to be re-transmitted */
14572         if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
14573                 return (NULL);
14574         }
14575         if (tp->t_flags & TF_SENTFIN) {
14576                 /* retran the end FIN? */
14577                 return (NULL);
14578         }
14579         /* ok lets look at this one */
14580         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
14581         if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) {
14582                 return (rsm);
14583         }
14584         if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
14585                 goto check_it;
14586         }
14587         rsm = rack_find_lowest_rsm(rack);
14588         if (rsm == NULL) {
14589                 return (NULL);
14590         }
14591 check_it:
14592         if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
14593             (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
14594                 /*
14595                  * No sack so we automatically do the 3 strikes and
14596                  * retransmit (no rack timer would be started).
14597                  */
14598
14599                 return (rsm);
14600         }
14601         if (rsm->r_flags & RACK_ACKED) {
14602                 return (NULL);
14603         }
14604         if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
14605             (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
14606                 /* Its not yet ready */
14607                 return (NULL);
14608         }
14609         srtt = rack_grab_rtt(tp, rack);
14610         idx = rsm->r_rtr_cnt - 1;
14611         ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
14612         thresh = rack_calc_thresh_rack(rack, srtt, tsused);
14613         if ((tsused == ts_low) ||
14614             (TSTMP_LT(tsused, ts_low))) {
14615                 /* No time since sending */
14616                 return (NULL);
14617         }
14618         if ((tsused - ts_low) < thresh) {
14619                 /* It has not been long enough yet */
14620                 return (NULL);
14621         }
14622         if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
14623             ((rsm->r_flags & RACK_SACK_PASSED) &&
14624              (rack->sack_attack_disable == 0))) {
14625                 /*
14626                  * We have passed the dup-ack threshold <or>
14627                  * a SACK has indicated this is missing.
14628                  * Note that if you are a declared attacker
14629                  * it is only the dup-ack threshold that
14630                  * will cause retransmits.
14631                  */
14632                 /* log retransmit reason */
14633                 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
14634                 rack->r_fast_output = 0;
14635                 return (rsm);
14636         }
14637         return (NULL);
14638 }
14639
14640 static void
14641 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
14642                            uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
14643                            int line, struct rack_sendmap *rsm, uint8_t quality)
14644 {
14645         if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
14646                 union tcp_log_stackspecific log;
14647                 struct timeval tv;
14648
14649                 memset(&log, 0, sizeof(log));
14650                 log.u_bbr.flex1 = slot;
14651                 log.u_bbr.flex2 = len;
14652                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
14653                 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
14654                 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
14655                 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
14656                 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
14657                 log.u_bbr.use_lt_bw <<= 1;
14658                 log.u_bbr.use_lt_bw |= rack->r_late;
14659                 log.u_bbr.use_lt_bw <<= 1;
14660                 log.u_bbr.use_lt_bw |= rack->r_early;
14661                 log.u_bbr.use_lt_bw <<= 1;
14662                 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
14663                 log.u_bbr.use_lt_bw <<= 1;
14664                 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
14665                 log.u_bbr.use_lt_bw <<= 1;
14666                 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
14667                 log.u_bbr.use_lt_bw <<= 1;
14668                 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
14669                 log.u_bbr.use_lt_bw <<= 1;
14670                 log.u_bbr.use_lt_bw |= rack->gp_ready;
14671                 log.u_bbr.pkt_epoch = line;
14672                 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
14673                 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
14674                 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
14675                 log.u_bbr.bw_inuse = bw_est;
14676                 log.u_bbr.delRate = bw;
14677                 if (rack->r_ctl.gp_bw == 0)
14678                         log.u_bbr.cur_del_rate = 0;
14679                 else
14680                         log.u_bbr.cur_del_rate = rack_get_bw(rack);
14681                 log.u_bbr.rttProp = len_time;
14682                 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
14683                 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
14684                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
14685                 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
14686                         /* We are in slow start */
14687                         log.u_bbr.flex7 = 1;
14688                 } else {
14689                         /* we are on congestion avoidance */
14690                         log.u_bbr.flex7 = 0;
14691                 }
14692                 log.u_bbr.flex8 = method;
14693                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14694                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14695                 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
14696                 log.u_bbr.cwnd_gain <<= 1;
14697                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
14698                 log.u_bbr.cwnd_gain <<= 1;
14699                 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
14700                 log.u_bbr.bbr_substate = quality;
14701                 TCP_LOG_EVENTP(rack->rc_tp, NULL,
14702                     &rack->rc_inp->inp_socket->so_rcv,
14703                     &rack->rc_inp->inp_socket->so_snd,
14704                     BBR_LOG_HPTSI_CALC, 0,
14705                     0, &log, false, &tv);
14706         }
14707 }
14708
14709 static uint32_t
14710 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
14711 {
14712         uint32_t new_tso, user_max;
14713
14714         user_max = rack->rc_user_set_max_segs * mss;
14715         if (rack->rc_force_max_seg) {
14716                 return (user_max);
14717         }
14718         if (rack->use_fixed_rate &&
14719             ((rack->r_ctl.crte == NULL) ||
14720              (bw != rack->r_ctl.crte->rate))) {
14721                 /* Use the user mss since we are not exactly matched */
14722                 return (user_max);
14723         }
14724         new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
14725         if (new_tso > user_max)
14726                 new_tso = user_max;
14727         return (new_tso);
14728 }
14729
14730 static int32_t
14731 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
14732 {
14733         uint64_t lentim, fill_bw;
14734
14735         /* Lets first see if we are full, if so continue with normal rate */
14736         rack->r_via_fill_cw = 0;
14737         if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
14738                 return (slot);
14739         if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
14740                 return (slot);
14741         if (rack->r_ctl.rc_last_us_rtt == 0)
14742                 return (slot);
14743         if (rack->rc_pace_fill_if_rttin_range &&
14744             (rack->r_ctl.rc_last_us_rtt >=
14745              (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
14746                 /* The rtt is huge, N * smallest, lets not fill */
14747                 return (slot);
14748         }
14749         /*
14750          * first lets calculate the b/w based on the last us-rtt
14751          * and the sndwnd.
14752          */
14753         fill_bw = rack->r_ctl.cwnd_to_use;
14754         /* Take the rwnd if its smaller */
14755         if (fill_bw > rack->rc_tp->snd_wnd)
14756                 fill_bw = rack->rc_tp->snd_wnd;
14757         if (rack->r_fill_less_agg) {
14758                 /*
14759                  * Now take away the inflight (this will reduce our
14760                  * aggressiveness and yeah, if we get that much out in 1RTT
14761                  * we will have had acks come back and still be behind).
14762                  */
14763                 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14764         }
14765         /* Now lets make it into a b/w */
14766         fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
14767         fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
14768         /* We are below the min b/w */
14769         if (non_paced)
14770                 *rate_wanted = fill_bw;
14771         if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
14772                 return (slot);
14773         if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
14774                 fill_bw = rack->r_ctl.bw_rate_cap;
14775         rack->r_via_fill_cw = 1;
14776         if (rack->r_rack_hw_rate_caps &&
14777             (rack->r_ctl.crte != NULL)) {
14778                 uint64_t high_rate;
14779
14780                 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
14781                 if (fill_bw > high_rate) {
14782                         /* We are capping bw at the highest rate table entry */
14783                         if (*rate_wanted > high_rate) {
14784                                 /* The original rate was also capped */
14785                                 rack->r_via_fill_cw = 0;
14786                         }
14787                         rack_log_hdwr_pacing(rack,
14788                                              fill_bw, high_rate, __LINE__,
14789                                              0, 3);
14790                         fill_bw = high_rate;
14791                         if (capped)
14792                                 *capped = 1;
14793                 }
14794         } else if ((rack->r_ctl.crte == NULL) &&
14795                    (rack->rack_hdrw_pacing == 0) &&
14796                    (rack->rack_hdw_pace_ena) &&
14797                    rack->r_rack_hw_rate_caps &&
14798                    (rack->rack_attempt_hdwr_pace == 0) &&
14799                    (rack->rc_inp->inp_route.ro_nh != NULL) &&
14800                    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
14801                 /*
14802                  * Ok we may have a first attempt that is greater than our top rate
14803                  * lets check.
14804                  */
14805                 uint64_t high_rate;
14806
14807                 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
14808                 if (high_rate) {
14809                         if (fill_bw > high_rate) {
14810                                 fill_bw = high_rate;
14811                                 if (capped)
14812                                         *capped = 1;
14813                         }
14814                 }
14815         }
14816         /*
14817          * Ok fill_bw holds our mythical b/w to fill the cwnd
14818          * in a rtt, what does that time wise equate too?
14819          */
14820         lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
14821         lentim /= fill_bw;
14822         *rate_wanted = fill_bw;
14823         if (non_paced || (lentim < slot)) {
14824                 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
14825                                            0, lentim, 12, __LINE__, NULL, 0);
14826                 return ((int32_t)lentim);
14827         } else
14828                 return (slot);
14829 }
14830
14831 static int32_t
14832 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
14833 {
14834         uint64_t srtt;
14835         int32_t slot = 0;
14836         int can_start_hw_pacing = 1;
14837         int err;
14838
14839         if (rack->rc_always_pace == 0) {
14840                 /*
14841                  * We use the most optimistic possible cwnd/srtt for
14842                  * sending calculations. This will make our
14843                  * calculation anticipate getting more through
14844                  * quicker then possible. But thats ok we don't want
14845                  * the peer to have a gap in data sending.
14846                  */
14847                 uint64_t cwnd, tr_perms = 0;
14848                 int32_t reduce = 0;
14849
14850         old_method:
14851                 /*
14852                  * We keep no precise pacing with the old method
14853                  * instead we use the pacer to mitigate bursts.
14854                  */
14855                 if (rack->r_ctl.rc_rack_min_rtt)
14856                         srtt = rack->r_ctl.rc_rack_min_rtt;
14857                 else
14858                         srtt = max(tp->t_srtt, 1);
14859                 if (rack->r_ctl.rc_rack_largest_cwnd)
14860                         cwnd = rack->r_ctl.rc_rack_largest_cwnd;
14861                 else
14862                         cwnd = rack->r_ctl.cwnd_to_use;
14863                 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
14864                 tr_perms = (cwnd * 1000) / srtt;
14865                 if (tr_perms == 0) {
14866                         tr_perms = ctf_fixed_maxseg(tp);
14867                 }
14868                 /*
14869                  * Calculate how long this will take to drain, if
14870                  * the calculation comes out to zero, thats ok we
14871                  * will use send_a_lot to possibly spin around for
14872                  * more increasing tot_len_this_send to the point
14873                  * that its going to require a pace, or we hit the
14874                  * cwnd. Which in that case we are just waiting for
14875                  * a ACK.
14876                  */
14877                 slot = len / tr_perms;
14878                 /* Now do we reduce the time so we don't run dry? */
14879                 if (slot && rack_slot_reduction) {
14880                         reduce = (slot / rack_slot_reduction);
14881                         if (reduce < slot) {
14882                                 slot -= reduce;
14883                         } else
14884                                 slot = 0;
14885                 }
14886                 slot *= HPTS_USEC_IN_MSEC;
14887                 if (rack->rc_pace_to_cwnd) {
14888                         uint64_t rate_wanted = 0;
14889
14890                         slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
14891                         rack->rc_ack_can_sendout_data = 1;
14892                         rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
14893                 } else
14894                         rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
14895         } else {
14896                 uint64_t bw_est, res, lentim, rate_wanted;
14897                 uint32_t orig_val, segs, oh;
14898                 int capped = 0;
14899                 int prev_fill;
14900
14901                 if ((rack->r_rr_config == 1) && rsm) {
14902                         return (rack->r_ctl.rc_min_to);
14903                 }
14904                 if (rack->use_fixed_rate) {
14905                         rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
14906                 } else if ((rack->r_ctl.init_rate == 0) &&
14907 #ifdef NETFLIX_PEAKRATE
14908                            (rack->rc_tp->t_maxpeakrate == 0) &&
14909 #endif
14910                            (rack->r_ctl.gp_bw == 0)) {
14911                         /* no way to yet do an estimate */
14912                         bw_est = rate_wanted = 0;
14913                 } else {
14914                         bw_est = rack_get_bw(rack);
14915                         rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
14916                 }
14917                 if ((bw_est == 0) || (rate_wanted == 0) ||
14918                     ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
14919                         /*
14920                          * No way yet to make a b/w estimate or
14921                          * our raise is set incorrectly.
14922                          */
14923                         goto old_method;
14924                 }
14925                 /* We need to account for all the overheads */
14926                 segs = (len + segsiz - 1) / segsiz;
14927                 /*
14928                  * We need the diff between 1514 bytes (e-mtu with e-hdr)
14929                  * and how much data we put in each packet. Yes this
14930                  * means we may be off if we are larger than 1500 bytes
14931                  * or smaller. But this just makes us more conservative.
14932                  */
14933                 if (rack_hw_rate_min &&
14934                     (bw_est < rack_hw_rate_min))
14935                         can_start_hw_pacing = 0;
14936                 if (ETHERNET_SEGMENT_SIZE > segsiz)
14937                         oh = ETHERNET_SEGMENT_SIZE - segsiz;
14938                 else
14939                         oh = 0;
14940                 segs *= oh;
14941                 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
14942                 res = lentim / rate_wanted;
14943                 slot = (uint32_t)res;
14944                 orig_val = rack->r_ctl.rc_pace_max_segs;
14945                 if (rack->r_ctl.crte == NULL) {
14946                         /*
14947                          * Only do this if we are not hardware pacing
14948                          * since if we are doing hw-pacing below we will
14949                          * set make a call after setting up or changing
14950                          * the rate.
14951                          */
14952                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
14953                 } else if (rack->rc_inp->inp_snd_tag == NULL) {
14954                         /*
14955                          * We lost our rate somehow, this can happen
14956                          * if the interface changed underneath us.
14957                          */
14958                         tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
14959                         rack->r_ctl.crte = NULL;
14960                         /* Lets re-allow attempting to setup pacing */
14961                         rack->rack_hdrw_pacing = 0;
14962                         rack->rack_attempt_hdwr_pace = 0;
14963                         rack_log_hdwr_pacing(rack,
14964                                              rate_wanted, bw_est, __LINE__,
14965                                              0, 6);
14966                 }
14967                 /* Did we change the TSO size, if so log it */
14968                 if (rack->r_ctl.rc_pace_max_segs != orig_val)
14969                         rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0);
14970                 prev_fill = rack->r_via_fill_cw;
14971                 if ((rack->rc_pace_to_cwnd) &&
14972                     (capped == 0) &&
14973                     (rack->use_fixed_rate == 0) &&
14974                     (rack->in_probe_rtt == 0) &&
14975                     (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
14976                         /*
14977                          * We want to pace at our rate *or* faster to
14978                          * fill the cwnd to the max if its not full.
14979                          */
14980                         slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
14981                 }
14982                 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
14983                     (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
14984                         if ((rack->rack_hdw_pace_ena) &&
14985                             (can_start_hw_pacing > 0) &&
14986                             (rack->rack_hdrw_pacing == 0) &&
14987                             (rack->rack_attempt_hdwr_pace == 0)) {
14988                                 /*
14989                                  * Lets attempt to turn on hardware pacing
14990                                  * if we can.
14991                                  */
14992                                 rack->rack_attempt_hdwr_pace = 1;
14993                                 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
14994                                                                        rack->rc_inp->inp_route.ro_nh->nh_ifp,
14995                                                                        rate_wanted,
14996                                                                        RS_PACING_GEQ,
14997                                                                        &err, &rack->r_ctl.crte_prev_rate);
14998                                 if (rack->r_ctl.crte) {
14999                                         rack->rack_hdrw_pacing = 1;
15000                                         rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
15001                                                                                                  0, rack->r_ctl.crte,
15002                                                                                                  NULL);
15003                                         rack_log_hdwr_pacing(rack,
15004                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15005                                                              err, 0);
15006                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15007                                 } else {
15008                                         counter_u64_add(rack_hw_pace_init_fail, 1);
15009                                 }
15010                         } else if (rack->rack_hdrw_pacing &&
15011                                    (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
15012                                 /* Do we need to adjust our rate? */
15013                                 const struct tcp_hwrate_limit_table *nrte;
15014
15015                                 if (rack->r_up_only &&
15016                                     (rate_wanted < rack->r_ctl.crte->rate)) {
15017                                         /**
15018                                          * We have four possible states here
15019                                          * having to do with the previous time
15020                                          * and this time.
15021                                          *   previous  |  this-time
15022                                          * A)     0      |     0   -- fill_cw not in the picture
15023                                          * B)     1      |     0   -- we were doing a fill-cw but now are not
15024                                          * C)     1      |     1   -- all rates from fill_cw
15025                                          * D)     0      |     1   -- we were doing non-fill and now we are filling
15026                                          *
15027                                          * For case A, C and D we don't allow a drop. But for
15028                                          * case B where we now our on our steady rate we do
15029                                          * allow a drop.
15030                                          *
15031                                          */
15032                                         if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
15033                                                 goto done_w_hdwr;
15034                                 }
15035                                 if ((rate_wanted > rack->r_ctl.crte->rate) ||
15036                                     (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
15037                                         if (rack_hw_rate_to_low &&
15038                                             (bw_est < rack_hw_rate_to_low)) {
15039                                                 /*
15040                                                  * The pacing rate is too low for hardware, but
15041                                                  * do allow hardware pacing to be restarted.
15042                                                  */
15043                                                 rack_log_hdwr_pacing(rack,
15044                                                              bw_est, rack->r_ctl.crte->rate, __LINE__,
15045                                                              0, 5);
15046                                                 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
15047                                                 rack->r_ctl.crte = NULL;
15048                                                 rack->rack_attempt_hdwr_pace = 0;
15049                                                 rack->rack_hdrw_pacing = 0;
15050                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15051                                                 goto done_w_hdwr;
15052                                         }
15053                                         nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
15054                                                                    rack->rc_tp,
15055                                                                    rack->rc_inp->inp_route.ro_nh->nh_ifp,
15056                                                                    rate_wanted,
15057                                                                    RS_PACING_GEQ,
15058                                                                    &err, &rack->r_ctl.crte_prev_rate);
15059                                         if (nrte == NULL) {
15060                                                 /* Lost the rate */
15061                                                 rack->rack_hdrw_pacing = 0;
15062                                                 rack->r_ctl.crte = NULL;
15063                                                 rack_log_hdwr_pacing(rack,
15064                                                                      rate_wanted, 0, __LINE__,
15065                                                                      err, 1);
15066                                                 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15067                                                 counter_u64_add(rack_hw_pace_lost, 1);
15068                                         } else if (nrte != rack->r_ctl.crte) {
15069                                                 rack->r_ctl.crte = nrte;
15070                                                 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
15071                                                                                                          segsiz, 0,
15072                                                                                                          rack->r_ctl.crte,
15073                                                                                                          NULL);
15074                                                 rack_log_hdwr_pacing(rack,
15075                                                                      rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15076                                                                      err, 2);
15077                                                 rack->r_ctl.last_hw_bw_req = rate_wanted;
15078                                         }
15079                                 } else {
15080                                         /* We just need to adjust the segment size */
15081                                         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
15082                                         rack_log_hdwr_pacing(rack,
15083                                                              rate_wanted, rack->r_ctl.crte->rate, __LINE__,
15084                                                              0, 4);
15085                                         rack->r_ctl.last_hw_bw_req = rate_wanted;
15086                                 }
15087                         }
15088                 }
15089                 if ((rack->r_ctl.crte != NULL) &&
15090                     (rack->r_ctl.crte->rate == rate_wanted)) {
15091                         /*
15092                          * We need to add a extra if the rates
15093                          * are exactly matched. The idea is
15094                          * we want the software to make sure the
15095                          * queue is empty before adding more, this
15096                          * gives us N MSS extra pace times where
15097                          * N is our sysctl
15098                          */
15099                         slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
15100                 }
15101 done_w_hdwr:
15102                 if (rack_limit_time_with_srtt &&
15103                     (rack->use_fixed_rate == 0) &&
15104 #ifdef NETFLIX_PEAKRATE
15105                     (rack->rc_tp->t_maxpeakrate == 0) &&
15106 #endif
15107                     (rack->rack_hdrw_pacing == 0)) {
15108                         /*
15109                          * Sanity check, we do not allow the pacing delay
15110                          * to be longer than the SRTT of the path. If it is
15111                          * a slow path, then adding a packet should increase
15112                          * the RTT and compensate for this i.e. the srtt will
15113                          * be greater so the allowed pacing time will be greater.
15114                          *
15115                          * Note this restriction is not for where a peak rate
15116                          * is set, we are doing fixed pacing or hardware pacing.
15117                          */
15118                         if (rack->rc_tp->t_srtt)
15119                                 srtt = rack->rc_tp->t_srtt;
15120                         else
15121                                 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;    /* its in ms convert */
15122                         if (srtt < (uint64_t)slot) {
15123                                 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
15124                                 slot = srtt;
15125                         }
15126                 }
15127                 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
15128         }
15129         if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
15130                 /*
15131                  * If this rate is seeing enobufs when it
15132                  * goes to send then either the nic is out
15133                  * of gas or we are mis-estimating the time
15134                  * somehow and not letting the queue empty
15135                  * completely. Lets add to the pacing time.
15136                  */
15137                 int hw_boost_delay;
15138
15139                 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
15140                 if (hw_boost_delay > rack_enobuf_hw_max)
15141                         hw_boost_delay = rack_enobuf_hw_max;
15142                 else if (hw_boost_delay < rack_enobuf_hw_min)
15143                         hw_boost_delay = rack_enobuf_hw_min;
15144                 slot += hw_boost_delay;
15145         }
15146         return (slot);
15147 }
15148
15149 static void
15150 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
15151     tcp_seq startseq, uint32_t sb_offset)
15152 {
15153         struct rack_sendmap *my_rsm = NULL;
15154         struct rack_sendmap fe;
15155
15156         if (tp->t_state < TCPS_ESTABLISHED) {
15157                 /*
15158                  * We don't start any measurements if we are
15159                  * not at least established.
15160                  */
15161                 return;
15162         }
15163         if (tp->t_state >= TCPS_FIN_WAIT_1) {
15164                 /*
15165                  * We will get no more data into the SB
15166                  * this means we need to have the data available
15167                  * before we start a measurement.
15168                  */
15169
15170                 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) <
15171                     max(rc_init_window(rack),
15172                         (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
15173                         /* Nope not enough data */
15174                         return;
15175                 }
15176         }
15177         tp->t_flags |= TF_GPUTINPROG;
15178         rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
15179         rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
15180         tp->gput_seq = startseq;
15181         rack->app_limited_needs_set = 0;
15182         if (rack->in_probe_rtt)
15183                 rack->measure_saw_probe_rtt = 1;
15184         else if ((rack->measure_saw_probe_rtt) &&
15185                  (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
15186                 rack->measure_saw_probe_rtt = 0;
15187         if (rack->rc_gp_filled)
15188                 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
15189         else {
15190                 /* Special case initial measurement */
15191                 struct timeval tv;
15192
15193                 tp->gput_ts = tcp_get_usecs(&tv);
15194                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15195         }
15196         /*
15197          * We take a guess out into the future,
15198          * if we have no measurement and no
15199          * initial rate, we measure the first
15200          * initial-windows worth of data to
15201          * speed up getting some GP measurement and
15202          * thus start pacing.
15203          */
15204         if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
15205                 rack->app_limited_needs_set = 1;
15206                 tp->gput_ack = startseq + max(rc_init_window(rack),
15207                                               (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
15208                 rack_log_pacing_delay_calc(rack,
15209                                            tp->gput_seq,
15210                                            tp->gput_ack,
15211                                            0,
15212                                            tp->gput_ts,
15213                                            rack->r_ctl.rc_app_limited_cnt,
15214                                            9,
15215                                            __LINE__, NULL, 0);
15216                 return;
15217         }
15218         if (sb_offset) {
15219                 /*
15220                  * We are out somewhere in the sb
15221                  * can we use the already outstanding data?
15222                  */
15223                 if (rack->r_ctl.rc_app_limited_cnt == 0) {
15224                         /*
15225                          * Yes first one is good and in this case
15226                          * the tp->gput_ts is correctly set based on
15227                          * the last ack that arrived (no need to
15228                          * set things up when an ack comes in).
15229                          */
15230                         my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
15231                         if ((my_rsm == NULL) ||
15232                             (my_rsm->r_rtr_cnt != 1)) {
15233                                 /* retransmission? */
15234                                 goto use_latest;
15235                         }
15236                 } else {
15237                         if (rack->r_ctl.rc_first_appl == NULL) {
15238                                 /*
15239                                  * If rc_first_appl is NULL
15240                                  * then the cnt should be 0.
15241                                  * This is probably an error, maybe
15242                                  * a KASSERT would be appropriate.
15243                                  */
15244                                 goto use_latest;
15245                         }
15246                         /*
15247                          * If we have a marker pointer to the last one that is
15248                          * app limited we can use that, but we need to set
15249                          * things up so that when it gets ack'ed we record
15250                          * the ack time (if its not already acked).
15251                          */
15252                         rack->app_limited_needs_set = 1;
15253                         /*
15254                          * We want to get to the rsm that is either
15255                          * next with space i.e. over 1 MSS or the one
15256                          * after that (after the app-limited).
15257                          */
15258                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15259                                          rack->r_ctl.rc_first_appl);
15260                         if (my_rsm) {
15261                                 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
15262                                         /* Have to use the next one */
15263                                         my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
15264                                                          my_rsm);
15265                                 else {
15266                                         /* Use after the first MSS of it is acked */
15267                                         tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
15268                                         goto start_set;
15269                                 }
15270                         }
15271                         if ((my_rsm == NULL) ||
15272                             (my_rsm->r_rtr_cnt != 1)) {
15273                                 /*
15274                                  * Either its a retransmit or
15275                                  * the last is the app-limited one.
15276                                  */
15277                                 goto use_latest;
15278                         }
15279                 }
15280                 tp->gput_seq = my_rsm->r_start;
15281 start_set:
15282                 if (my_rsm->r_flags & RACK_ACKED) {
15283                         /*
15284                          * This one has been acked use the arrival ack time
15285                          */
15286                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15287                         rack->app_limited_needs_set = 0;
15288                 }
15289                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15290                 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
15291                 rack_log_pacing_delay_calc(rack,
15292                                            tp->gput_seq,
15293                                            tp->gput_ack,
15294                                            (uint64_t)my_rsm,
15295                                            tp->gput_ts,
15296                                            rack->r_ctl.rc_app_limited_cnt,
15297                                            9,
15298                                            __LINE__, NULL, 0);
15299                 return;
15300         }
15301
15302 use_latest:
15303         /*
15304          * We don't know how long we may have been
15305          * idle or if this is the first-send. Lets
15306          * setup the flag so we will trim off
15307          * the first ack'd data so we get a true
15308          * measurement.
15309          */
15310         rack->app_limited_needs_set = 1;
15311         tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
15312         /* Find this guy so we can pull the send time */
15313         fe.r_start = startseq;
15314         my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
15315         if (my_rsm) {
15316                 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
15317                 if (my_rsm->r_flags & RACK_ACKED) {
15318                         /*
15319                          * Unlikely since its probably what was
15320                          * just transmitted (but I am paranoid).
15321                          */
15322                         tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
15323                         rack->app_limited_needs_set = 0;
15324                 }
15325                 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
15326                         /* This also is unlikely */
15327                         tp->gput_seq = my_rsm->r_start;
15328                 }
15329         } else {
15330                 /*
15331                  * TSNH unless we have some send-map limit,
15332                  * and even at that it should not be hitting
15333                  * that limit (we should have stopped sending).
15334                  */
15335                 struct timeval tv;
15336
15337                 microuptime(&tv);
15338                 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
15339         }
15340         rack_log_pacing_delay_calc(rack,
15341                                    tp->gput_seq,
15342                                    tp->gput_ack,
15343                                    (uint64_t)my_rsm,
15344                                    tp->gput_ts,
15345                                    rack->r_ctl.rc_app_limited_cnt,
15346                                    9, __LINE__, NULL, 0);
15347 }
15348
15349 static inline uint32_t
15350 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
15351     uint32_t avail, int32_t sb_offset)
15352 {
15353         uint32_t len;
15354         uint32_t sendwin;
15355
15356         if (tp->snd_wnd > cwnd_to_use)
15357                 sendwin = cwnd_to_use;
15358         else
15359                 sendwin = tp->snd_wnd;
15360         if (ctf_outstanding(tp) >= tp->snd_wnd) {
15361                 /* We never want to go over our peers rcv-window */
15362                 len = 0;
15363         } else {
15364                 uint32_t flight;
15365
15366                 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
15367                 if (flight >= sendwin) {
15368                         /*
15369                          * We have in flight what we are allowed by cwnd (if
15370                          * it was rwnd blocking it would have hit above out
15371                          * >= tp->snd_wnd).
15372                          */
15373                         return (0);
15374                 }
15375                 len = sendwin - flight;
15376                 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
15377                         /* We would send too much (beyond the rwnd) */
15378                         len = tp->snd_wnd - ctf_outstanding(tp);
15379                 }
15380                 if ((len + sb_offset) > avail) {
15381                         /*
15382                          * We don't have that much in the SB, how much is
15383                          * there?
15384                          */
15385                         len = avail - sb_offset;
15386                 }
15387         }
15388         return (len);
15389 }
15390
15391 static void
15392 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
15393              unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
15394              int rsm_is_null, int optlen, int line, uint16_t mode)
15395 {
15396         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
15397                 union tcp_log_stackspecific log;
15398                 struct timeval tv;
15399
15400                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15401                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
15402                 log.u_bbr.flex1 = error;
15403                 log.u_bbr.flex2 = flags;
15404                 log.u_bbr.flex3 = rsm_is_null;
15405                 log.u_bbr.flex4 = ipoptlen;
15406                 log.u_bbr.flex5 = tp->rcv_numsacks;
15407                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
15408                 log.u_bbr.flex7 = optlen;
15409                 log.u_bbr.flex8 = rack->r_fsb_inited;
15410                 log.u_bbr.applimited = rack->r_fast_output;
15411                 log.u_bbr.bw_inuse = rack_get_bw(rack);
15412                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
15413                 log.u_bbr.cwnd_gain = mode;
15414                 log.u_bbr.pkts_out = orig_len;
15415                 log.u_bbr.lt_epoch = len;
15416                 log.u_bbr.delivered = line;
15417                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15418                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15419                 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
15420                                len, &log, false, NULL, NULL, 0, &tv);
15421         }
15422 }
15423
15424
15425 static struct mbuf *
15426 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
15427                    struct rack_fast_send_blk *fsb,
15428                    int32_t seglimit, int32_t segsize, int hw_tls)
15429 {
15430 #ifdef KERN_TLS
15431         struct ktls_session *tls, *ntls;
15432 #ifdef INVARIANTS
15433         struct mbuf *start;
15434 #endif
15435 #endif
15436         struct mbuf *m, *n, **np, *smb;
15437         struct mbuf *top;
15438         int32_t off, soff;
15439         int32_t len = *plen;
15440         int32_t fragsize;
15441         int32_t len_cp = 0;
15442         uint32_t mlen, frags;
15443
15444         soff = off = the_off;
15445         smb = m = the_m;
15446         np = &top;
15447         top = NULL;
15448 #ifdef KERN_TLS
15449         if (hw_tls && (m->m_flags & M_EXTPG))
15450                 tls = m->m_epg_tls;
15451         else
15452                 tls = NULL;
15453 #ifdef INVARIANTS
15454         start = m;
15455 #endif
15456 #endif
15457         while (len > 0) {
15458                 if (m == NULL) {
15459                         *plen = len_cp;
15460                         break;
15461                 }
15462 #ifdef KERN_TLS
15463                 if (hw_tls) {
15464                         if (m->m_flags & M_EXTPG)
15465                                 ntls = m->m_epg_tls;
15466                         else
15467                                 ntls = NULL;
15468
15469                         /*
15470                          * Avoid mixing TLS records with handshake
15471                          * data or TLS records from different
15472                          * sessions.
15473                          */
15474                         if (tls != ntls) {
15475                                 MPASS(m != start);
15476                                 *plen = len_cp;
15477                                 break;
15478                         }
15479                 }
15480 #endif
15481                 mlen = min(len, m->m_len - off);
15482                 if (seglimit) {
15483                         /*
15484                          * For M_EXTPG mbufs, add 3 segments
15485                          * + 1 in case we are crossing page boundaries
15486                          * + 2 in case the TLS hdr/trailer are used
15487                          * It is cheaper to just add the segments
15488                          * than it is to take the cache miss to look
15489                          * at the mbuf ext_pgs state in detail.
15490                          */
15491                         if (m->m_flags & M_EXTPG) {
15492                                 fragsize = min(segsize, PAGE_SIZE);
15493                                 frags = 3;
15494                         } else {
15495                                 fragsize = segsize;
15496                                 frags = 0;
15497                         }
15498
15499                         /* Break if we really can't fit anymore. */
15500                         if ((frags + 1) >= seglimit) {
15501                                 *plen = len_cp;
15502                                 break;
15503                         }
15504
15505                         /*
15506                          * Reduce size if you can't copy the whole
15507                          * mbuf. If we can't copy the whole mbuf, also
15508                          * adjust len so the loop will end after this
15509                          * mbuf.
15510                          */
15511                         if ((frags + howmany(mlen, fragsize)) >= seglimit) {
15512                                 mlen = (seglimit - frags - 1) * fragsize;
15513                                 len = mlen;
15514                                 *plen = len_cp + len;
15515                         }
15516                         frags += howmany(mlen, fragsize);
15517                         if (frags == 0)
15518                                 frags++;
15519                         seglimit -= frags;
15520                         KASSERT(seglimit > 0,
15521                             ("%s: seglimit went too low", __func__));
15522                 }
15523                 n = m_get(M_NOWAIT, m->m_type);
15524                 *np = n;
15525                 if (n == NULL)
15526                         goto nospace;
15527                 n->m_len = mlen;
15528                 soff += mlen;
15529                 len_cp += n->m_len;
15530                 if (m->m_flags & (M_EXT|M_EXTPG)) {
15531                         n->m_data = m->m_data + off;
15532                         mb_dupcl(n, m);
15533                 } else {
15534                         bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
15535                             (u_int)n->m_len);
15536                 }
15537                 len -= n->m_len;
15538                 off = 0;
15539                 m = m->m_next;
15540                 np = &n->m_next;
15541                 if (len || (soff == smb->m_len)) {
15542                         /*
15543                          * We have more so we move forward  or
15544                          * we have consumed the entire mbuf and
15545                          * len has fell to 0.
15546                          */
15547                         soff = 0;
15548                         smb = m;
15549                 }
15550
15551         }
15552         if (fsb != NULL) {
15553                 fsb->m = smb;
15554                 fsb->off = soff;
15555                 if (smb) {
15556                         /*
15557                          * Save off the size of the mbuf. We do
15558                          * this so that we can recognize when it
15559                          * has been trimmed by sbcut() as acks
15560                          * come in.
15561                          */
15562                         fsb->o_m_len = smb->m_len;
15563                 } else {
15564                         /*
15565                          * This is the case where the next mbuf went to NULL. This
15566                          * means with this copy we have sent everything in the sb.
15567                          * In theory we could clear the fast_output flag, but lets
15568                          * not since its possible that we could get more added
15569                          * and acks that call the extend function which would let
15570                          * us send more.
15571                          */
15572                         fsb->o_m_len = 0;
15573                 }
15574         }
15575         return (top);
15576 nospace:
15577         if (top)
15578                 m_freem(top);
15579         return (NULL);
15580
15581 }
15582
15583 /*
15584  * This is a copy of m_copym(), taking the TSO segment size/limit
15585  * constraints into account, and advancing the sndptr as it goes.
15586  */
15587 static struct mbuf *
15588 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
15589                 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
15590 {
15591         struct mbuf *m, *n;
15592         int32_t soff;
15593
15594         soff = rack->r_ctl.fsb.off;
15595         m = rack->r_ctl.fsb.m;
15596         if (rack->r_ctl.fsb.o_m_len > m->m_len) {
15597                 /*
15598                  * The mbuf had the front of it chopped off by an ack
15599                  * we need to adjust the soff/off by that difference.
15600                  */
15601                 uint32_t delta;
15602
15603                 delta = rack->r_ctl.fsb.o_m_len - m->m_len;
15604                 soff -= delta;
15605         } else if (rack->r_ctl.fsb.o_m_len < m->m_len) {
15606                 /*
15607                  * The mbuf was expanded probably by
15608                  * a m_compress. Just update o_m_len.
15609                  */
15610                 rack->r_ctl.fsb.o_m_len = m->m_len;
15611         }
15612         KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
15613         KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
15614         KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
15615                                  __FUNCTION__,
15616                                  rack, *plen, m, m->m_len));
15617         /* Save off the right location before we copy and advance */
15618         *s_soff = soff;
15619         *s_mb = rack->r_ctl.fsb.m;
15620         n = rack_fo_base_copym(m, soff, plen,
15621                                &rack->r_ctl.fsb,
15622                                seglimit, segsize, rack->r_ctl.fsb.hw_tls);
15623         return (n);
15624 }
15625
15626 static int
15627 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
15628                      uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
15629 {
15630         /*
15631          * Enter the fast retransmit path. We are given that a sched_pin is
15632          * in place (if accounting is compiled in) and the cycle count taken
15633          * at the entry is in the ts_val. The concept here is that the rsm
15634          * now holds the mbuf offsets and such so we can directly transmit
15635          * without a lot of overhead, the len field is already set for
15636          * us to prohibit us from sending too much (usually its 1MSS).
15637          */
15638         struct ip *ip = NULL;
15639         struct udphdr *udp = NULL;
15640         struct tcphdr *th = NULL;
15641         struct mbuf *m = NULL;
15642         struct inpcb *inp;
15643         uint8_t *cpto;
15644         struct tcp_log_buffer *lgb;
15645 #ifdef TCP_ACCOUNTING
15646         uint64_t crtsc;
15647         int cnt_thru = 1;
15648 #endif
15649         struct tcpopt to;
15650         u_char opt[TCP_MAXOLEN];
15651         uint32_t hdrlen, optlen;
15652         int32_t slot, segsiz, max_val, tso = 0, error, ulen = 0;
15653         uint16_t flags;
15654         uint32_t if_hw_tsomaxsegcount = 0, startseq;
15655         uint32_t if_hw_tsomaxsegsize;
15656
15657 #ifdef INET6
15658         struct ip6_hdr *ip6 = NULL;
15659
15660         if (rack->r_is_v6) {
15661                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
15662                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
15663         } else
15664 #endif                          /* INET6 */
15665         {
15666                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
15667                 hdrlen = sizeof(struct tcpiphdr);
15668         }
15669         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
15670                 goto failed;
15671         }
15672         if (doing_tlp) {
15673                 /* Its a TLP add the flag, it may already be there but be sure */
15674                 rsm->r_flags |= RACK_TLP;
15675         } else {
15676                 /* If it was a TLP it is not on this retransmit */
15677                 rsm->r_flags &= ~RACK_TLP;
15678         }
15679         startseq = rsm->r_start;
15680         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
15681         inp = rack->rc_inp;
15682         to.to_flags = 0;
15683         flags = tcp_outflags[tp->t_state];
15684         if (flags & (TH_SYN|TH_RST)) {
15685                 goto failed;
15686         }
15687         if (rsm->r_flags & RACK_HAS_FIN) {
15688                 /* We can't send a FIN here */
15689                 goto failed;
15690         }
15691         if (flags & TH_FIN) {
15692                 /* We never send a FIN */
15693                 flags &= ~TH_FIN;
15694         }
15695         if (tp->t_flags & TF_RCVD_TSTMP) {
15696                 to.to_tsval = ms_cts + tp->ts_offset;
15697                 to.to_tsecr = tp->ts_recent;
15698                 to.to_flags = TOF_TS;
15699         }
15700         optlen = tcp_addoptions(&to, opt);
15701         hdrlen += optlen;
15702         udp = rack->r_ctl.fsb.udp;
15703         if (udp)
15704                 hdrlen += sizeof(struct udphdr);
15705         if (rack->r_ctl.rc_pace_max_segs)
15706                 max_val = rack->r_ctl.rc_pace_max_segs;
15707         else if (rack->rc_user_set_max_segs)
15708                 max_val = rack->rc_user_set_max_segs * segsiz;
15709         else
15710                 max_val = len;
15711         if ((tp->t_flags & TF_TSO) &&
15712             V_tcp_do_tso &&
15713             (len > segsiz) &&
15714             (tp->t_port == 0))
15715                 tso = 1;
15716 #ifdef INET6
15717         if (MHLEN < hdrlen + max_linkhdr)
15718                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
15719         else
15720 #endif
15721                 m = m_gethdr(M_NOWAIT, MT_DATA);
15722         if (m == NULL)
15723                 goto failed;
15724         m->m_data += max_linkhdr;
15725         m->m_len = hdrlen;
15726         th = rack->r_ctl.fsb.th;
15727         /* Establish the len to send */
15728         if (len > max_val)
15729                 len = max_val;
15730         if ((tso) && (len + optlen > tp->t_maxseg)) {
15731                 uint32_t if_hw_tsomax;
15732                 int32_t max_len;
15733
15734                 /* extract TSO information */
15735                 if_hw_tsomax = tp->t_tsomax;
15736                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
15737                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
15738                 /*
15739                  * Check if we should limit by maximum payload
15740                  * length:
15741                  */
15742                 if (if_hw_tsomax != 0) {
15743                         /* compute maximum TSO length */
15744                         max_len = (if_hw_tsomax - hdrlen -
15745                                    max_linkhdr);
15746                         if (max_len <= 0) {
15747                                 goto failed;
15748                         } else if (len > max_len) {
15749                                 len = max_len;
15750                         }
15751                 }
15752                 if (len <= segsiz) {
15753                         /*
15754                          * In case there are too many small fragments don't
15755                          * use TSO:
15756                          */
15757                         tso = 0;
15758                 }
15759         } else {
15760                 tso = 0;
15761         }
15762         if ((tso == 0) && (len > segsiz))
15763                 len = segsiz;
15764         if ((len == 0) ||
15765             (len <= MHLEN - hdrlen - max_linkhdr)) {
15766                 goto failed;
15767         }
15768         th->th_seq = htonl(rsm->r_start);
15769         th->th_ack = htonl(tp->rcv_nxt);
15770         /*
15771          * The PUSH bit should only be applied
15772          * if the full retransmission is made. If
15773          * we are sending less than this is the
15774          * left hand edge and should not have
15775          * the PUSH bit.
15776          */
15777         if ((rsm->r_flags & RACK_HAD_PUSH) &&
15778             (len == (rsm->r_end - rsm->r_start)))
15779                 flags |= TH_PUSH;
15780         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
15781         if (th->th_win == 0) {
15782                 tp->t_sndzerowin++;
15783                 tp->t_flags |= TF_RXWIN0SENT;
15784         } else
15785                 tp->t_flags &= ~TF_RXWIN0SENT;
15786         if (rsm->r_flags & RACK_TLP) {
15787                 /*
15788                  * TLP should not count in retran count, but
15789                  * in its own bin
15790                  */
15791                 counter_u64_add(rack_tlp_retran, 1);
15792                 counter_u64_add(rack_tlp_retran_bytes, len);
15793         } else {
15794                 tp->t_sndrexmitpack++;
15795                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
15796                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
15797         }
15798 #ifdef STATS
15799         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
15800                                  len);
15801 #endif
15802         if (rsm->m == NULL)
15803                 goto failed;
15804         if (rsm->orig_m_len != rsm->m->m_len) {
15805                 /* Fix up the orig_m_len and possibly the mbuf offset */
15806                 rack_adjust_orig_mlen(rsm);
15807         }
15808         m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
15809         if (len <= segsiz) {
15810                 /*
15811                  * Must have ran out of mbufs for the copy
15812                  * shorten it to no longer need tso. Lets
15813                  * not put on sendalot since we are low on
15814                  * mbufs.
15815                  */
15816                 tso = 0;
15817         }
15818         if ((m->m_next == NULL) || (len <= 0)){
15819                 goto failed;
15820         }
15821         if (udp) {
15822                 if (rack->r_is_v6)
15823                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
15824                 else
15825                         ulen = hdrlen + len - sizeof(struct ip);
15826                 udp->uh_ulen = htons(ulen);
15827         }
15828         m->m_pkthdr.rcvif = (struct ifnet *)0;
15829         if (TCPS_HAVERCVDSYN(tp->t_state) &&
15830             (tp->t_flags2 & TF2_ECN_PERMIT)) {
15831                 int ect = tcp_ecn_output_established(tp, &flags, len, true);
15832                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
15833                     (tp->t_flags2 & TF2_ECN_SND_ECE))
15834                     tp->t_flags2 &= ~TF2_ECN_SND_ECE;
15835 #ifdef INET6
15836                 if (rack->r_is_v6) {
15837                     ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
15838                     ip6->ip6_flow |= htonl(ect << 20);
15839                 }
15840                 else
15841 #endif
15842                 {
15843                     ip->ip_tos &= ~IPTOS_ECN_MASK;
15844                     ip->ip_tos |= ect;
15845                 }
15846         }
15847         tcp_set_flags(th, flags);
15848         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
15849 #ifdef INET6
15850         if (rack->r_is_v6) {
15851                 if (tp->t_port) {
15852                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
15853                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
15854                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
15855                         th->th_sum = htons(0);
15856                         UDPSTAT_INC(udps_opackets);
15857                 } else {
15858                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
15859                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
15860                         th->th_sum = in6_cksum_pseudo(ip6,
15861                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
15862                                                       0);
15863                 }
15864         }
15865 #endif
15866 #if defined(INET6) && defined(INET)
15867         else
15868 #endif
15869 #ifdef INET
15870         {
15871                 if (tp->t_port) {
15872                         m->m_pkthdr.csum_flags = CSUM_UDP;
15873                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
15874                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
15875                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
15876                         th->th_sum = htons(0);
15877                         UDPSTAT_INC(udps_opackets);
15878                 } else {
15879                         m->m_pkthdr.csum_flags = CSUM_TCP;
15880                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
15881                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
15882                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
15883                                                                         IPPROTO_TCP + len + optlen));
15884                 }
15885                 /* IP version must be set here for ipv4/ipv6 checking later */
15886                 KASSERT(ip->ip_v == IPVERSION,
15887                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
15888         }
15889 #endif
15890         if (tso) {
15891                 KASSERT(len > tp->t_maxseg - optlen,
15892                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
15893                 m->m_pkthdr.csum_flags |= CSUM_TSO;
15894                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
15895         }
15896 #ifdef INET6
15897         if (rack->r_is_v6) {
15898                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
15899                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
15900                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
15901                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
15902                 else
15903                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
15904         }
15905 #endif
15906 #if defined(INET) && defined(INET6)
15907         else
15908 #endif
15909 #ifdef INET
15910         {
15911                 ip->ip_len = htons(m->m_pkthdr.len);
15912                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
15913                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
15914                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
15915                         if (tp->t_port == 0 || len < V_tcp_minmss) {
15916                                 ip->ip_off |= htons(IP_DF);
15917                         }
15918                 } else {
15919                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
15920                 }
15921         }
15922 #endif
15923         /* Time to copy in our header */
15924         cpto = mtod(m, uint8_t *);
15925         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
15926         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
15927         if (optlen) {
15928                 bcopy(opt, th + 1, optlen);
15929                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
15930         } else {
15931                 th->th_off = sizeof(struct tcphdr) >> 2;
15932         }
15933         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
15934                 union tcp_log_stackspecific log;
15935
15936                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
15937                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
15938                 if (rack->rack_no_prr)
15939                         log.u_bbr.flex1 = 0;
15940                 else
15941                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
15942                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
15943                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
15944                 log.u_bbr.flex4 = max_val;
15945                 log.u_bbr.flex5 = 0;
15946                 /* Save off the early/late values */
15947                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
15948                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
15949                 log.u_bbr.bw_inuse = rack_get_bw(rack);
15950                 if (doing_tlp == 0)
15951                         log.u_bbr.flex8 = 1;
15952                 else
15953                         log.u_bbr.flex8 = 2;
15954                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
15955                 log.u_bbr.flex7 = 55;
15956                 log.u_bbr.pkts_out = tp->t_maxseg;
15957                 log.u_bbr.timeStamp = cts;
15958                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15959                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
15960                 log.u_bbr.delivered = 0;
15961                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
15962                                      len, &log, false, NULL, NULL, 0, tv);
15963         } else
15964                 lgb = NULL;
15965 #ifdef INET6
15966         if (rack->r_is_v6) {
15967                 error = ip6_output(m, NULL,
15968                                    &inp->inp_route6,
15969                                    0, NULL, NULL, inp);
15970         }
15971 #endif
15972 #if defined(INET) && defined(INET6)
15973         else
15974 #endif
15975 #ifdef INET
15976         {
15977                 error = ip_output(m, NULL,
15978                                   &inp->inp_route,
15979                                   0, 0, inp);
15980         }
15981 #endif
15982         m = NULL;
15983         if (lgb) {
15984                 lgb->tlb_errno = error;
15985                 lgb = NULL;
15986         }
15987         if (error) {
15988                 goto failed;
15989         }
15990         rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
15991                         rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls);
15992         if (doing_tlp && (rack->fast_rsm_hack == 0)) {
15993                 rack->rc_tlp_in_progress = 1;
15994                 rack->r_ctl.rc_tlp_cnt_out++;
15995         }
15996         if (error == 0) {
15997                 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
15998                 if (doing_tlp) {
15999                         rack->rc_last_sent_tlp_past_cumack = 0;
16000                         rack->rc_last_sent_tlp_seq_valid = 1;
16001                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
16002                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
16003                 }
16004         }
16005         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16006         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16007         if (IN_FASTRECOVERY(tp->t_flags) && rsm)
16008                 rack->r_ctl.retran_during_recovery += len;
16009         {
16010                 int idx;
16011
16012                 idx = (len / segsiz) + 3;
16013                 if (idx >= TCP_MSS_ACCT_ATIMER)
16014                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16015                 else
16016                         counter_u64_add(rack_out_size[idx], 1);
16017         }
16018         if (tp->t_rtttime == 0) {
16019                 tp->t_rtttime = ticks;
16020                 tp->t_rtseq = startseq;
16021                 KMOD_TCPSTAT_INC(tcps_segstimed);
16022         }
16023         counter_u64_add(rack_fto_rsm_send, 1);
16024         if (error && (error == ENOBUFS)) {
16025                 if (rack->r_ctl.crte != NULL) {
16026                         rack_trace_point(rack, RACK_TP_HWENOBUF);
16027                 } else
16028                         rack_trace_point(rack, RACK_TP_ENOBUF);
16029                 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
16030                 if (rack->rc_enobuf < 0x7f)
16031                         rack->rc_enobuf++;
16032                 if (slot < (10 * HPTS_USEC_IN_MSEC))
16033                         slot = 10 * HPTS_USEC_IN_MSEC;
16034         } else
16035                 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
16036         if ((slot == 0) ||
16037             (rack->rc_always_pace == 0) ||
16038             (rack->r_rr_config == 1)) {
16039                 /*
16040                  * We have no pacing set or we
16041                  * are using old-style rack or
16042                  * we are overriden to use the old 1ms pacing.
16043                  */
16044                 slot = rack->r_ctl.rc_min_to;
16045         }
16046         rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
16047 #ifdef TCP_ACCOUNTING
16048         crtsc = get_cyclecount();
16049         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16050                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16051         }
16052         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16053         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16054                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16055         }
16056         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16057         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16058                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
16059         }
16060         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
16061         sched_unpin();
16062 #endif
16063         return (0);
16064 failed:
16065         if (m)
16066                 m_free(m);
16067         return (-1);
16068 }
16069
16070 static void
16071 rack_sndbuf_autoscale(struct tcp_rack *rack)
16072 {
16073         /*
16074          * Grow the send socket buffer automatically.  A buffer that is
16075          * too small for the path's bandwidth-delay product caps
16076          * throughput on fast, long-delay links; one that is too large
16077          * wastes kernel memory, which matters on busy servers with many
16078          * connections.
16079          *
16080          * The buffer is stepped up only when every check below passes:
16081          *  1. the peer's receive window, scaled by 5/4, is at least the
16082          *     send buffer high-water mark;
16083          *  2. the send buffer is at least 7/8 full, so there is data to
16084          *     make use of more room;
16085          *  3. the buffered byte count is below the auto-sizing maximum;
16086          *  4. the send window (smaller of the cwnd in use and the peer's
16087          *     window) is at least the buffered byte count minus
16088          *     (snd_nxt - snd_una).
16089          *
16090          * Rack calls this far less often than the stock stack does, to
16091          * keep it off the rack_fast_output() cache lines, so each step
16092          * grows the buffer by a percentage of its current size.
16093          */
16094         struct tcpcb *tp = rack->rc_tp;
16095         struct socket *so = rack->rc_inp->inp_socket;
16096         uint32_t sendwin, scaleup;
16097
16098         /* Auto-sizing must be enabled both globally and on this socket. */
16099         if (!V_tcp_do_autosndbuf || (so->so_snd.sb_flags & SB_AUTOSIZE) == 0)
16100                 return;
16101         if ((tp->snd_wnd / 4 * 5) < so->so_snd.sb_hiwat)        /* check 1 */
16102                 return;
16103         if (sbused(&so->so_snd) < (so->so_snd.sb_hiwat / 8 * 7)) /* check 2 */
16104                 return;
16105         if (sbused(&so->so_snd) >= V_tcp_autosndbuf_max)         /* check 3 */
16106                 return;
16107         sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
16108         if (sendwin < (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una)))
16109                 return;                                          /* check 4 */
16110         /* Step by a percentage of hiwat, never less than V_tcp_autosndbuf_inc. */
16111         scaleup = V_tcp_autosndbuf_inc;
16112         if (rack_autosndbuf_inc)
16113                 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
16114         if (scaleup < V_tcp_autosndbuf_inc)
16115                 scaleup = V_tcp_autosndbuf_inc;
16116         /* New target size, clamped to the auto-sizing maximum. */
16117         scaleup += so->so_snd.sb_hiwat;
16118         if (scaleup > V_tcp_autosndbuf_max)
16119                 scaleup = V_tcp_autosndbuf_max;
16120         /* If the reservation fails, stop auto-sizing this buffer. */
16121         if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
16122                 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
16123 }
16124
16125 static int
16126 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
16127                  uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
16128 {
16129         /*
16130          * Fast-path send of new data starting at snd_max, built from the cached
16131          * header template and state kept in rack->r_ctl.fsb.  The caller has
16132          * already done sched_pin() (if accounting is compiled in) and passes the
16133          * cycle count it took at entry in ts_val.  tot_len is a running byte
16134          * total used for pacing; every segment sent here is added to it.
16135          * Returns 0 after sending and arming the hpts timer.  Returns -1 with r_fast_output
16136          * cleared on failure; *send_err is written only when ip_output()/ip6_output() fails.
16137          */
16138         struct ip *ip = NULL;
16139         struct udphdr *udp = NULL;
16140         struct tcphdr *th = NULL;
16141         struct mbuf *m, *s_mb;
16142         struct inpcb *inp;
16143         uint8_t *cpto;
16144         struct tcp_log_buffer *lgb;
16145 #ifdef TCP_ACCOUNTING
16146         uint64_t crtsc;
16147 #endif
16148         struct tcpopt to;
16149         u_char opt[TCP_MAXOLEN];
16150         uint32_t hdrlen, optlen;
16151         int cnt_thru = 1;       /* passes through the "again" loop, reported to accounting */
16152         int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
16153         uint16_t flags;
16154         uint32_t s_soff;
16155         uint32_t if_hw_tsomaxsegcount = 0, startseq;
16156         uint32_t if_hw_tsomaxsegsize;   /* NOTE(review): only assigned on the TSO path, yet always passed to rack_fo_m_copym() -- confirm the callee ignores it when the segment count is 0 */
16157         uint16_t add_flag = RACK_SENT_FP;
16158 #ifdef INET6
16159         struct ip6_hdr *ip6 = NULL;
16160         /* ip / ip6 point at the cached header template (fsb.tcp_ip_hdr), not into an mbuf. */
16161         if (rack->r_is_v6) {
16162                 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
16163                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
16164         } else
16165 #endif                          /* INET6 */
16166         {
16167                 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
16168                 hdrlen = sizeof(struct tcpiphdr);
16169         }
16170         if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {    /* t_port is set but no UDP tunneling port is configured */
16171                 m = NULL;       /* m is not allocated yet; make sure failed: does not free it */
16172                 goto failed;
16173         }
16174         startseq = tp->snd_max;         /* saved for t_rtseq if RTT timing is started below */
16175         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
16176         inp = rack->rc_inp;
16177         len = rack->r_ctl.fsb.left_to_send;     /* start from everything still queued for fast output */
16178         to.to_flags = 0;
16179         flags = rack->r_ctl.fsb.tcp_flags;
16180         if (tp->t_flags & TF_RCVD_TSTMP) {      /* build a timestamp option */
16181                 to.to_tsval = ms_cts + tp->ts_offset;
16182                 to.to_tsecr = tp->ts_recent;
16183                 to.to_flags = TOF_TS;
16184         }
16185         optlen = tcp_addoptions(&to, opt);
16186         hdrlen += optlen;
16187         udp = rack->r_ctl.fsb.udp;
16188         if (udp)        /* a cached UDP header adds to the header length */
16189                 hdrlen += sizeof(struct udphdr);
16190         if (rack->r_ctl.rc_pace_max_segs)       /* max_val: byte budget for this call */
16191                 max_val = rack->r_ctl.rc_pace_max_segs;
16192         else if (rack->rc_user_set_max_segs)
16193                 max_val = rack->rc_user_set_max_segs * segsiz;
16194         else
16195                 max_val = len;
16196         if ((tp->t_flags & TF_TSO) &&
16197             V_tcp_do_tso &&
16198             (len > segsiz) &&
16199             (tp->t_port == 0))
16200                 tso = 1;        /* tentative; re-checked on every pass below */
16201 again:
16202 #ifdef INET6
16203         if (MHLEN < hdrlen + max_linkhdr)       /* headers do not fit in a plain header mbuf */
16204                 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
16205         else
16206 #endif
16207                 m = m_gethdr(M_NOWAIT, MT_DATA);
16208         if (m == NULL)
16209                 goto failed;
16210         m->m_data += max_linkhdr;       /* skip max_linkhdr bytes at the front of the mbuf */
16211         m->m_len = hdrlen;
16212         th = rack->r_ctl.fsb.th;        /* edit the template first; it is copied into m further down */
16213         /* Establish the len to send */
16214         if (len > max_val)
16215                 len = max_val;
16216         if ((tso) && (len + optlen > tp->t_maxseg)) {
16217                 uint32_t if_hw_tsomax;
16218                 int32_t max_len;
16219
16220                 /* extract TSO information */
16221                 if_hw_tsomax = tp->t_tsomax;
16222                 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
16223                 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
16224                 /*
16225                  * Check if we should limit by maximum payload
16226                  * length:
16227                  */
16228                 if (if_hw_tsomax != 0) {
16229                         /* compute maximum TSO length */
16230                         max_len = (if_hw_tsomax - hdrlen -
16231                                    max_linkhdr);
16232                         if (max_len <= 0) {
16233                                 goto failed;
16234                         } else if (len > max_len) {
16235                                 len = max_len;
16236                         }
16237                 }
16238                 if (len <= segsiz) {
16239                         /*
16240                          * In case there are too many small fragments don't
16241                          * use TSO:
16242                          */
16243                         tso = 0;
16244                 }
16245         } else {
16246                 tso = 0;
16247         }
16248         if ((tso == 0) && (len > segsiz))       /* without TSO send at most one segsiz per pass */
16249                 len = segsiz;
16250         if ((len == 0) ||
16251             (len <= MHLEN - hdrlen - max_linkhdr)) {    /* nothing to send, or payload no larger than MHLEN minus headers: give up on the fast path */
16252                 goto failed;
16253         }
16254         sb_offset = tp->snd_max - tp->snd_una;  /* handed to rack_start_gp_measurement() below */
16255         th->th_seq = htonl(tp->snd_max);
16256         th->th_ack = htonl(tp->rcv_nxt);
16257         th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
16258         if (th->th_win == 0) {  /* track that a zero window was advertised */
16259                 tp->t_sndzerowin++;
16260                 tp->t_flags |= TF_RXWIN0SENT;
16261         } else
16262                 tp->t_flags &= ~TF_RXWIN0SENT;
16263         tp->snd_up = tp->snd_una;       /* drag it along, its deprecated */
16264         KMOD_TCPSTAT_INC(tcps_sndpack);
16265         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
16266 #ifdef STATS
16267         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
16268                                  len);
16269 #endif
16270         if (rack->r_ctl.fsb.m == NULL)  /* no cached mbuf to copy payload from */
16271                 goto failed;
16272
16273         /* s_mb and s_soff are saved for rack_log_output */
16274         m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
16275                                     &s_mb, &s_soff);
16276         if (len <= segsiz) {
16277                 /*
16278                  * Must have ran out of mbufs for the copy
16279                  * shorten it to no longer need tso. Lets
16280                  * not put on sendalot since we are low on
16281                  * mbufs.
16282                  */
16283                 tso = 0;
16284         }
16285         if (rack->r_ctl.fsb.rfo_apply_push &&
16286             (len == rack->r_ctl.fsb.left_to_send)) {    /* PUSH only when this segment sends all that is left */
16287                 flags |= TH_PUSH;
16288                 add_flag |= RACK_HAD_PUSH;
16289         }
16290         if ((m->m_next == NULL) || (len <= 0)){ /* the payload copy failed or came back empty */
16291                 goto failed;
16292         }
16293         if (udp) {      /* UDP length covers everything after the IP header */
16294                 if (rack->r_is_v6)
16295                         ulen = hdrlen + len - sizeof(struct ip6_hdr);
16296                 else
16297                         ulen = hdrlen + len - sizeof(struct ip);
16298                 udp->uh_ulen = htons(ulen);
16299         }
16300         m->m_pkthdr.rcvif = (struct ifnet *)0;
16301         if (TCPS_HAVERCVDSYN(tp->t_state) &&
16302             (tp->t_flags2 & TF2_ECN_PERMIT)) {  /* ECN: ect goes into the IP TOS / IPv6 flow ECN bits */
16303                 int ect = tcp_ecn_output_established(tp, &flags, len, false);
16304                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
16305                     (tp->t_flags2 & TF2_ECN_SND_ECE))
16306                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
16307 #ifdef INET6
16308                 if (rack->r_is_v6) {
16309                         ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
16310                         ip6->ip6_flow |= htonl(ect << 20);
16311                 }
16312                 else
16313 #endif
16314                 {
16315                         ip->ip_tos &= ~IPTOS_ECN_MASK;
16316                         ip->ip_tos |= ect;
16317                 }
16318         }
16319         tcp_set_flags(th, flags);
16320         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
16321 #ifdef INET6
16322         if (rack->r_is_v6) {    /* store the pseudo-header sum and request the checksum via csum_flags */
16323                 if (tp->t_port) {
16324                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
16325                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16326                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
16327                         th->th_sum = htons(0);
16328                         UDPSTAT_INC(udps_opackets);
16329                 } else {
16330                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
16331                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16332                         th->th_sum = in6_cksum_pseudo(ip6,
16333                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
16334                                                       0);
16335                 }
16336         }
16337 #endif
16338 #if defined(INET6) && defined(INET)
16339         else
16340 #endif
16341 #ifdef INET
16342         {       /* IPv4 counterpart of the checksum setup above */
16343                 if (tp->t_port) {
16344                         m->m_pkthdr.csum_flags = CSUM_UDP;
16345                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
16346                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
16347                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
16348                         th->th_sum = htons(0);
16349                         UDPSTAT_INC(udps_opackets);
16350                 } else {
16351                         m->m_pkthdr.csum_flags = CSUM_TCP;
16352                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
16353                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
16354                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
16355                                                                         IPPROTO_TCP + len + optlen));
16356                 }
16357                 /* IP version must be set here for ipv4/ipv6 checking later */
16358                 KASSERT(ip->ip_v == IPVERSION,
16359                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
16360         }
16361 #endif
16362         if (tso) {      /* mark the packet for segmentation offload with the per-segment payload size */
16363                 KASSERT(len > tp->t_maxseg - optlen,
16364                         ("%s: len <= tso_segsz tp:%p", __func__, tp));
16365                 m->m_pkthdr.csum_flags |= CSUM_TSO;
16366                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
16367         }
16368 #ifdef INET6
16369         if (rack->r_is_v6) {    /* finish the IPv6 header fields and update the PMTUD flag */
16370                 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
16371                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
16372                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
16373                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16374                 else
16375                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16376         }
16377 #endif
16378 #if defined(INET) && defined(INET6)
16379         else
16380 #endif
16381 #ifdef INET
16382         {       /* finish the IPv4 header fields; DF is set only without t_port or for small payloads */
16383                 ip->ip_len = htons(m->m_pkthdr.len);
16384                 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
16385                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
16386                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
16387                         if (tp->t_port == 0 || len < V_tcp_minmss) {
16388                                 ip->ip_off |= htons(IP_DF);
16389                         }
16390                 } else {
16391                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
16392                 }
16393         }
16394 #endif
16395         /* Time to copy in our header */
16396         cpto = mtod(m, uint8_t *);
16397         memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
16398         th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));     /* th now points at the copy inside m, at the template's TCP header offset */
16399         if (optlen) {   /* options go right after the TCP header in the mbuf copy */
16400                 bcopy(opt, th + 1, optlen);
16401                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
16402         } else {
16403                 th->th_off = sizeof(struct tcphdr) >> 2;
16404         }
16405         if (tp->t_logstate != TCP_LOG_STATE_OFF) {      /* black-box logging is on: record a TCP_LOG_OUT event */
16406                 union tcp_log_stackspecific log;
16407
16408                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
16409                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
16410                 if (rack->rack_no_prr)
16411                         log.u_bbr.flex1 = 0;
16412                 else
16413                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16414                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
16415                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
16416                 log.u_bbr.flex4 = max_val;
16417                 log.u_bbr.flex5 = 0;
16418                 /* Save off the early/late values */
16419                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
16420                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
16421                 log.u_bbr.bw_inuse = rack_get_bw(rack);
16422                 log.u_bbr.flex8 = 0;
16423                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
16424                 log.u_bbr.flex7 = 44;   /* presumably tags this call site; the rsm fast path above logs 55 -- confirm */
16425                 log.u_bbr.pkts_out = tp->t_maxseg;
16426                 log.u_bbr.timeStamp = cts;
16427                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16428                 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
16429                 log.u_bbr.delivered = 0;
16430                 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
16431                                      len, &log, false, NULL, NULL, 0, tv);
16432         } else
16433                 lgb = NULL;
16434 #ifdef INET6
16435         if (rack->r_is_v6) {
16436                 error = ip6_output(m, NULL,
16437                                    &inp->inp_route6,
16438                                    0, NULL, NULL, inp);
16439         }
16440 #endif
16441 #if defined(INET) && defined(INET6)
16442         else
16443 #endif
16444 #ifdef INET
16445         {
16446                 error = ip_output(m, NULL,
16447                                   &inp->inp_route,
16448                                   0, 0, inp);
16449         }
16450 #endif
16451         if (lgb) {      /* back-fill the log entry with the real send result */
16452                 lgb->tlb_errno = error;
16453                 lgb = NULL;
16454         }
16455         if (error) {
16456                 *send_err = error;      /* the only place *send_err is written */
16457                 m = NULL;       /* do not free m in failed:; presumably the output routine consumed it -- confirm */
16458                 goto failed;
16459         }
16460         rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
16461                         NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls);
16462         m = NULL;
16463         if (tp->snd_una == tp->snd_max) {       /* nothing was outstanding before this send */
16464                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
16465                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
16466                 tp->t_acktime = ticks;
16467         }
16468         if (error == 0) /* always true here: a nonzero error already jumped to failed */
16469                 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
16470
16471         rack->forced_ack = 0;   /* If we send something zap the FA flag */
16472         tot_len += len;
16473         if ((tp->t_flags & TF_GPUTINPROG) == 0) /* no goodput measurement in progress: start one */
16474                 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
16475         tp->snd_max += len;     /* advance past the data just sent */
16476         tp->snd_nxt = tp->snd_max;
16477         {       /* bump the output-size counter for this send; the index is capped at the last slot */
16478                 int idx;
16479
16480                 idx = (len / segsiz) + 3;
16481                 if (idx >= TCP_MSS_ACCT_ATIMER)
16482                         counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
16483                 else
16484                         counter_u64_add(rack_out_size[idx], 1);
16485         }
16486         if (len <= rack->r_ctl.fsb.left_to_send)        /* consume from the fast-output budget without underflow */
16487                 rack->r_ctl.fsb.left_to_send -= len;
16488         else
16489                 rack->r_ctl.fsb.left_to_send = 0;
16490         if (rack->r_ctl.fsb.left_to_send < segsiz) {    /* less than one segment left: leave fast-output mode */
16491                 rack->r_fast_output = 0;
16492                 rack->r_ctl.fsb.left_to_send = 0;
16493                 /* At the end of fast_output scale up the sb */
16494                 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
16495                 rack_sndbuf_autoscale(rack);
16496                 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
16497         }
16498         if (tp->t_rtttime == 0) {       /* no RTT sample being timed: time this send */
16499                 tp->t_rtttime = ticks;
16500                 tp->t_rtseq = startseq;
16501                 KMOD_TCPSTAT_INC(tcps_segstimed);
16502         }
16503         if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
16504             (max_val > len) &&
16505             (tso == 0)) {       /* more queued, budget left, and not using TSO: send another segment */
16506                 max_val -= len;
16507                 len = segsiz;
16508                 th = rack->r_ctl.fsb.th;
16509                 cnt_thru++;
16510                 goto again;
16511         }
16512         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
16513         counter_u64_add(rack_fto_send, 1);
16514         slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);  /* pacing delay is based on the whole burst */
16515         rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
16516 #ifdef TCP_ACCOUNTING
16517         crtsc = get_cyclecount();       /* cycles since ts_val are charged to SND_OUT_DATA */
16518         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16519                 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
16520         }
16521         counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
16522         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16523                 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
16524         }
16525         counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
16526         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16527                 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
16528         }
16529         counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz));
16530         sched_unpin();  /* pairs with the sched_pin() the caller did before entry */
16531 #endif
16532         return (0);
16533 failed:
16534         if (m)
16535                 m_free(m);      /* NOTE(review): m_free() releases a single mbuf; if m->m_next was linked (the len <= 0 bail-out above) that chain is not freed here -- confirm */
16536         rack->r_fast_output = 0;        /* any failure drops out of fast-output mode */
16537         return (-1);    /* NOTE(review): no sched_unpin() on this path; presumably left to the caller -- confirm */
16538 }
16539
16540 static int
16541 rack_output(struct tcpcb *tp)
16542 {
16543         struct socket *so;
16544         uint32_t recwin;
16545         uint32_t sb_offset, s_moff = 0;
16546         int32_t len, error = 0;
16547         uint16_t flags;
16548         struct mbuf *m, *s_mb = NULL;
16549         struct mbuf *mb;
16550         uint32_t if_hw_tsomaxsegcount = 0;
16551         uint32_t if_hw_tsomaxsegsize;
16552         int32_t segsiz, minseg;
16553         long tot_len_this_send = 0;
16554 #ifdef INET
16555         struct ip *ip = NULL;
16556 #endif
16557         struct udphdr *udp = NULL;
16558         struct tcp_rack *rack;
16559         struct tcphdr *th;
16560         uint8_t pass = 0;
16561         uint8_t mark = 0;
16562         uint8_t wanted_cookie = 0;
16563         u_char opt[TCP_MAXOLEN];
16564         unsigned ipoptlen, optlen, hdrlen, ulen=0;
16565         uint32_t rack_seq;
16566
16567 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
16568         unsigned ipsec_optlen = 0;
16569
16570 #endif
16571         int32_t idle, sendalot;
16572         int32_t sub_from_prr = 0;
16573         volatile int32_t sack_rxmit;
16574         struct rack_sendmap *rsm = NULL;
16575         int32_t tso, mtu;
16576         struct tcpopt to;
16577         int32_t slot = 0;
16578         int32_t sup_rack = 0;
16579         uint32_t cts, ms_cts, delayed, early;
16580         uint16_t add_flag = RACK_SENT_SP;
16581         /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
16582         uint8_t hpts_calling,  doing_tlp = 0;
16583         uint32_t cwnd_to_use, pace_max_seg;
16584         int32_t do_a_prefetch = 0;
16585         int32_t prefetch_rsm = 0;
16586         int32_t orig_len = 0;
16587         struct timeval tv;
16588         int32_t prefetch_so_done = 0;
16589         struct tcp_log_buffer *lgb;
16590         struct inpcb *inp;
16591         struct sockbuf *sb;
16592         uint64_t ts_val = 0;
16593 #ifdef TCP_ACCOUNTING
16594         uint64_t crtsc;
16595 #endif
16596 #ifdef INET6
16597         struct ip6_hdr *ip6 = NULL;
16598         int32_t isipv6;
16599 #endif
16600         uint8_t filled_all = 0;
16601         bool hw_tls = false;
16602
16603         /* setup and take the cache hits here */
16604         rack = (struct tcp_rack *)tp->t_fb_ptr;
16605 #ifdef TCP_ACCOUNTING
16606         sched_pin();
16607         ts_val = get_cyclecount();
16608 #endif
16609         hpts_calling = rack->rc_inp->inp_hpts_calls;
16610         NET_EPOCH_ASSERT();
16611         INP_WLOCK_ASSERT(rack->rc_inp);
16612 #ifdef TCP_OFFLOAD
16613         if (tp->t_flags & TF_TOE) {
16614 #ifdef TCP_ACCOUNTING
16615                 sched_unpin();
16616 #endif
16617                 return (tcp_offload_output(tp));
16618         }
16619 #endif
16620         /*
16621          * For TFO connections in SYN_RECEIVED, only allow the initial
16622          * SYN|ACK and those sent by the retransmit timer.
16623          */
16624         if (IS_FASTOPEN(tp->t_flags) &&
16625             (tp->t_state == TCPS_SYN_RECEIVED) &&
16626             SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
16627             (rack->r_ctl.rc_resend == NULL)) {         /* not a retransmit */
16628 #ifdef TCP_ACCOUNTING
16629                 sched_unpin();
16630 #endif
16631                 return (0);
16632         }
16633 #ifdef INET6
16634         if (rack->r_state) {
16635                 /* Use the cache line loaded if possible */
16636                 isipv6 = rack->r_is_v6;
16637         } else {
16638                 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
16639         }
16640 #endif
16641         early = 0;
16642         cts = tcp_get_usecs(&tv);
16643         ms_cts = tcp_tv_to_mssectick(&tv);
16644         if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
16645             tcp_in_hpts(rack->rc_inp)) {
16646                 /*
16647                  * We are on the hpts for some timer but not hptsi output.
16648                  * Remove from the hpts unconditionally.
16649                  */
16650                 rack_timer_cancel(tp, rack, cts, __LINE__);
16651         }
16652         /* Are we pacing and late? */
16653         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16654             TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
16655                 /* We are delayed */
16656                 delayed = cts - rack->r_ctl.rc_last_output_to;
16657         } else {
16658                 delayed = 0;
16659         }
16660         /* Do the timers, which may override the pacer */
16661         if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
16662                 int retval;
16663
16664                 retval = rack_process_timers(tp, rack, cts, hpts_calling,
16665                     &doing_tlp);
16666                 if (retval != 0) {
16667                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
16668 #ifdef TCP_ACCOUNTING
16669                         sched_unpin();
16670 #endif
16671                         /*
16672                          * If timers want tcp_drop(), then pass error out,
16673                          * otherwise suppress it.
16674                          */
16675                         return (retval < 0 ? retval : 0);
16676                 }
16677         }
16678         if (rack->rc_in_persist) {
16679                 if (tcp_in_hpts(rack->rc_inp) == 0) {
16680                         /* Timer is not running */
16681                         rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16682                 }
16683 #ifdef TCP_ACCOUNTING
16684                 sched_unpin();
16685 #endif
16686                 return (0);
16687         }
16688         if ((rack->r_timer_override) ||
16689             (rack->rc_ack_can_sendout_data) ||
16690             (delayed) ||
16691             (tp->t_state < TCPS_ESTABLISHED)) {
16692                 rack->rc_ack_can_sendout_data = 0;
16693                 if (tcp_in_hpts(rack->rc_inp))
16694                         tcp_hpts_remove(rack->rc_inp);
16695         } else if (tcp_in_hpts(rack->rc_inp)) {
16696                 /*
16697                  * On the hpts you can't pass even if ACKNOW is on, we will
16698                  * when the hpts fires.
16699                  */
16700 #ifdef TCP_ACCOUNTING
16701                 crtsc = get_cyclecount();
16702                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16703                         tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
16704                 }
16705                 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val));
16706                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16707                         tp->tcp_cnt_counters[SND_BLOCKED]++;
16708                 }
16709                 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1);
16710                 sched_unpin();
16711 #endif
16712                 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
16713                 return (0);
16714         }
16715         rack->rc_inp->inp_hpts_calls = 0;
16716         /* Finish out both pacing early and late accounting */
16717         if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16718             TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
16719                 early = rack->r_ctl.rc_last_output_to - cts;
16720         } else
16721                 early = 0;
16722         if (delayed) {
16723                 rack->r_ctl.rc_agg_delayed += delayed;
16724                 rack->r_late = 1;
16725         } else if (early) {
16726                 rack->r_ctl.rc_agg_early += early;
16727                 rack->r_early = 1;
16728         }
16729         /* Now that early/late accounting is done turn off the flag */
16730         rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
16731         rack->r_wanted_output = 0;
16732         rack->r_timer_override = 0;
16733         if ((tp->t_state != rack->r_state) &&
16734             TCPS_HAVEESTABLISHED(tp->t_state)) {
16735                 rack_set_state(tp, rack);
16736         }
16737         if ((rack->r_fast_output) &&
16738             (doing_tlp == 0) &&
16739             (tp->rcv_numsacks == 0)) {
16740                 int ret;
16741
16742                 error = 0;
16743                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
16744                 if (ret >= 0)
16745                         return(ret);
16746                 else if (error) {
16747                         inp = rack->rc_inp;
16748                         so = inp->inp_socket;
16749                         sb = &so->so_snd;
16750                         goto nomore;
16751                 }
16752         }
16753         inp = rack->rc_inp;
16754         /*
16755          * For TFO connections in SYN_SENT or SYN_RECEIVED,
16756          * only allow the initial SYN or SYN|ACK and those sent
16757          * by the retransmit timer.
16758          */
16759         if (IS_FASTOPEN(tp->t_flags) &&
16760             ((tp->t_state == TCPS_SYN_RECEIVED) ||
16761              (tp->t_state == TCPS_SYN_SENT)) &&
16762             SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
16763             (tp->t_rxtshift == 0)) {              /* not a retransmit */
16764                 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
16765                 so = inp->inp_socket;
16766                 sb = &so->so_snd;
16767                 goto just_return_nolock;
16768         }
16769         /*
16770          * Determine length of data that should be transmitted, and flags
16771          * that will be used. If there is some data or critical controls
16772          * (SYN, RST) to send, then transmit; otherwise, investigate
16773          * further.
16774          */
16775         idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
16776         if (tp->t_idle_reduce) {
16777                 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
16778                         rack_cc_after_idle(rack, tp);
16779         }
16780         tp->t_flags &= ~TF_LASTIDLE;
16781         if (idle) {
16782                 if (tp->t_flags & TF_MORETOCOME) {
16783                         tp->t_flags |= TF_LASTIDLE;
16784                         idle = 0;
16785                 }
16786         }
16787         if ((tp->snd_una == tp->snd_max) &&
16788             rack->r_ctl.rc_went_idle_time &&
16789             TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
16790                 idle = cts - rack->r_ctl.rc_went_idle_time;
16791                 if (idle > rack_min_probertt_hold) {
16792                         /* Count as a probe rtt */
16793                         if (rack->in_probe_rtt == 0) {
16794                                 rack->r_ctl.rc_lower_rtt_us_cts = cts;
16795                                 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
16796                                 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
16797                                 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
16798                         } else {
16799                                 rack_exit_probertt(rack, cts);
16800                         }
16801                 }
16802                 idle = 0;
16803         }
16804         if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED))
16805                 rack_init_fsb_block(tp, rack);
16806 again:
16807         /*
16808          * If we've recently taken a timeout, snd_max will be greater than
16809          * snd_nxt.  There may be SACK information that allows us to avoid
16810          * resending already delivered data.  Adjust snd_nxt accordingly.
16811          */
16812         sendalot = 0;
16813         cts = tcp_get_usecs(&tv);
16814         ms_cts = tcp_tv_to_mssectick(&tv);
16815         tso = 0;
16816         mtu = 0;
16817         segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
16818         minseg = segsiz;
16819         if (rack->r_ctl.rc_pace_max_segs == 0)
16820                 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
16821         else
16822                 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
16823         sb_offset = tp->snd_max - tp->snd_una;
16824         cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
16825         flags = tcp_outflags[tp->t_state];
16826         while (rack->rc_free_cnt < rack_free_cache) {
16827                 rsm = rack_alloc(rack);
16828                 if (rsm == NULL) {
16829                         if (inp->inp_hpts_calls)
16830                                 /* Retry in a ms */
16831                                 slot = (1 * HPTS_USEC_IN_MSEC);
16832                         so = inp->inp_socket;
16833                         sb = &so->so_snd;
16834                         goto just_return_nolock;
16835                 }
16836                 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
16837                 rack->rc_free_cnt++;
16838                 rsm = NULL;
16839         }
16840         if (inp->inp_hpts_calls)
16841                 inp->inp_hpts_calls = 0;
16842         sack_rxmit = 0;
16843         len = 0;
16844         rsm = NULL;
16845         if (flags & TH_RST) {
16846                 SOCKBUF_LOCK(&inp->inp_socket->so_snd);
16847                 so = inp->inp_socket;
16848                 sb = &so->so_snd;
16849                 goto send;
16850         }
16851         if (rack->r_ctl.rc_resend) {
16852                 /* Retransmit timer */
16853                 rsm = rack->r_ctl.rc_resend;
16854                 rack->r_ctl.rc_resend = NULL;
16855                 len = rsm->r_end - rsm->r_start;
16856                 sack_rxmit = 1;
16857                 sendalot = 0;
16858                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
16859                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
16860                          __func__, __LINE__,
16861                          rsm->r_start, tp->snd_una, tp, rack, rsm));
16862                 sb_offset = rsm->r_start - tp->snd_una;
16863                 if (len >= segsiz)
16864                         len = segsiz;
16865         } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
16866                 /* We have a retransmit that takes precedence */
16867                 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
16868                     ((rsm->r_flags & RACK_MUST_RXT) == 0) &&
16869                     ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
16870                         /* Enter recovery if not induced by a time-out */
16871                         rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
16872                 }
16873 #ifdef INVARIANTS
16874                 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
16875                         panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
16876                               tp, rack, rsm, rsm->r_start, tp->snd_una);
16877                 }
16878 #endif
16879                 len = rsm->r_end - rsm->r_start;
16880                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
16881                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
16882                          __func__, __LINE__,
16883                          rsm->r_start, tp->snd_una, tp, rack, rsm));
16884                 sb_offset = rsm->r_start - tp->snd_una;
16885                 sendalot = 0;
16886                 if (len >= segsiz)
16887                         len = segsiz;
16888                 if (len > 0) {
16889                         sack_rxmit = 1;
16890                         KMOD_TCPSTAT_INC(tcps_sack_rexmits);
16891                         KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
16892                             min(len, segsiz));
16893                 }
16894         } else if (rack->r_ctl.rc_tlpsend) {
16895                 /* Tail loss probe */
16896                 long cwin;
16897                 long tlen;
16898
16899                 /*
16900                  * Check if we can do a TLP with a RACK'd packet
16901                  * this can happen if we are not doing the rack
16902                  * cheat and we skipped to a TLP and it
16903                  * went off.
16904                  */
16905                 rsm = rack->r_ctl.rc_tlpsend;
16906                 /* We are doing a TLP, make sure the flag is present */
16907                 rsm->r_flags |= RACK_TLP;
16908                 rack->r_ctl.rc_tlpsend = NULL;
16909                 sack_rxmit = 1;
16910                 tlen = rsm->r_end - rsm->r_start;
16911                 if (tlen > segsiz)
16912                         tlen = segsiz;
16913                 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
16914                         ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
16915                          __func__, __LINE__,
16916                          rsm->r_start, tp->snd_una, tp, rack, rsm));
16917                 sb_offset = rsm->r_start - tp->snd_una;
16918                 cwin = min(tp->snd_wnd, tlen);
16919                 len = cwin;
16920         }
16921         if (rack->r_must_retran &&
16922             (doing_tlp == 0) &&
16923             (rsm == NULL)) {
16924                 /*
16925                  * Non-Sack and we had a RTO or Sack/non-Sack and a
16926                  * MTU change, we need to retransmit until we reach
16927                  * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
16928                  */
16929                 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
16930                         int sendwin, flight;
16931
16932                         sendwin = min(tp->snd_wnd, tp->snd_cwnd);
16933                         flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
16934                         if (flight >= sendwin) {
16935                                 so = inp->inp_socket;
16936                                 sb = &so->so_snd;
16937                                 goto just_return_nolock;
16938                         }
16939                         rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
16940                         if (rsm == NULL) {
16941                                 /* TSNH */
16942                                 rack->r_must_retran = 0;
16943                                 rack->r_ctl.rc_out_at_rto = 0;
16944                                 so = inp->inp_socket;
16945                                 sb = &so->so_snd;
16946                                 goto just_return_nolock;
16947                         }
16948                         if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
16949                                 /* It does not have the flag, we are done */
16950                                 rack->r_must_retran = 0;
16951                                 rack->r_ctl.rc_out_at_rto = 0;
16952                         } else {
16953                                 sack_rxmit = 1;
16954                                 len = rsm->r_end - rsm->r_start;
16955                                 sendalot = 0;
16956                                 sb_offset = rsm->r_start - tp->snd_una;
16957                                 if (len >= segsiz)
16958                                         len = segsiz;
16959                                 /*
16960                                  * Delay removing the flag RACK_MUST_RXT so
16961                                  * that the fastpath for retransmit will
16962                                  * work with this rsm.
16963                                  */
16964
16965                         }
16966                 } else {
16967                         /* We must be done if there is nothing outstanding */
16968                         rack->r_must_retran = 0;
16969                         rack->r_ctl.rc_out_at_rto = 0;
16970                 }
16971         }
16972         /*
16973          * Enforce a connection sendmap count limit if set
16974          * as long as we are not retransmitting.
16975          */
16976         if ((rsm == NULL) &&
16977             (rack->do_detection == 0) &&
16978             (V_tcp_map_entries_limit > 0) &&
16979             (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
16980                 counter_u64_add(rack_to_alloc_limited, 1);
16981                 if (!rack->alloc_limit_reported) {
16982                         rack->alloc_limit_reported = 1;
16983                         counter_u64_add(rack_alloc_limited_conns, 1);
16984                 }
16985                 so = inp->inp_socket;
16986                 sb = &so->so_snd;
16987                 goto just_return_nolock;
16988         }
16989         if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
16990                 /* we are retransmitting the fin */
16991                 len--;
16992                 if (len) {
16993                         /*
16994                          * When retransmitting data do *not* include the
16995                          * FIN. This could happen from a TLP probe.
16996                          */
16997                         flags &= ~TH_FIN;
16998                 }
16999         }
17000         if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
17001             ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
17002                 int ret;
17003
17004                 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
17005                 if (ret == 0)
17006                         return (0);
17007         }
17008         so = inp->inp_socket;
17009         sb = &so->so_snd;
17010         if (do_a_prefetch == 0) {
17011                 kern_prefetch(sb, &do_a_prefetch);
17012                 do_a_prefetch = 1;
17013         }
17014 #ifdef NETFLIX_SHARED_CWND
17015         if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
17016             rack->rack_enable_scwnd) {
17017                 /* We are doing cwnd sharing */
17018                 if (rack->gp_ready &&
17019                     (rack->rack_attempted_scwnd == 0) &&
17020                     (rack->r_ctl.rc_scw == NULL) &&
17021                     tp->t_lib) {
17022                         /* The pcbid is in, lets make an attempt */
17023                         counter_u64_add(rack_try_scwnd, 1);
17024                         rack->rack_attempted_scwnd = 1;
17025                         rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
17026                                                                    &rack->r_ctl.rc_scw_index,
17027                                                                    segsiz);
17028                 }
17029                 if (rack->r_ctl.rc_scw &&
17030                     (rack->rack_scwnd_is_idle == 1) &&
17031                     sbavail(&so->so_snd)) {
17032                         /* we are no longer out of data */
17033                         tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
17034                         rack->rack_scwnd_is_idle = 0;
17035                 }
17036                 if (rack->r_ctl.rc_scw) {
17037                         /* First lets update and get the cwnd */
17038                         rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
17039                                                                     rack->r_ctl.rc_scw_index,
17040                                                                     tp->snd_cwnd, tp->snd_wnd, segsiz);
17041                 }
17042         }
17043 #endif
17044         /*
17045          * Get standard flags, and add SYN or FIN if requested by 'hidden'
17046          * state flags.
17047          */
17048         if (tp->t_flags & TF_NEEDFIN)
17049                 flags |= TH_FIN;
17050         if (tp->t_flags & TF_NEEDSYN)
17051                 flags |= TH_SYN;
17052         if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
17053                 void *end_rsm;
17054                 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
17055                 if (end_rsm)
17056                         kern_prefetch(end_rsm, &prefetch_rsm);
17057                 prefetch_rsm = 1;
17058         }
17059         SOCKBUF_LOCK(sb);
17060         /*
17061          * If snd_nxt == snd_max and we have transmitted a FIN, the
17062          * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
17063          * negative length.  This can also occur when TCP opens up its
17064          * congestion window while receiving additional duplicate acks after
17065          * fast-retransmit because TCP will reset snd_nxt to snd_max after
17066          * the fast-retransmit.
17067          *
17068          * In the normal retransmit-FIN-only case, however, snd_nxt will be
17069          * set to snd_una, the sb_offset will be 0, and the length may wind
17070          * up 0.
17071          *
17072          * If sack_rxmit is true we are retransmitting from the scoreboard
17073          * in which case len is already set.
17074          */
17075         if ((sack_rxmit == 0) &&
17076             (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
17077                 uint32_t avail;
17078
17079                 avail = sbavail(sb);
17080                 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
17081                         sb_offset = tp->snd_nxt - tp->snd_una;
17082                 else
17083                         sb_offset = 0;
17084                 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
17085                         if (rack->r_ctl.rc_tlp_new_data) {
17086                                 /* TLP is forcing out new data */
17087                                 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
17088                                         rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
17089                                 }
17090                                 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
17091                                         if (tp->snd_wnd > sb_offset)
17092                                                 len = tp->snd_wnd - sb_offset;
17093                                         else
17094                                                 len = 0;
17095                                 } else {
17096                                         len = rack->r_ctl.rc_tlp_new_data;
17097                                 }
17098                                 rack->r_ctl.rc_tlp_new_data = 0;
17099                         }  else {
17100                                 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
17101                         }
17102                         if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
17103                                 /*
17104                                  * For prr=off, we need to send only 1 MSS
17105                                  * at a time. We do this because another sack could
17106                                  * be arriving that causes us to send retransmits and
17107                                  * we don't want to be on a long pace due to a larger send
17108                                  * that keeps us from sending out the retransmit.
17109                                  */
17110                                 len = segsiz;
17111                         }
17112                 } else {
17113                         uint32_t outstanding;
17114                         /*
17115                          * We are inside of a Fast recovery episode, this
17116                          * is caused by a SACK or 3 dup acks. At this point
17117                          * we have sent all the retransmissions and we rely
17118                          * on PRR to dictate what we will send in the form of
17119                          * new data.
17120                          */
17121
17122                         outstanding = tp->snd_max - tp->snd_una;
17123                         if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
17124                                 if (tp->snd_wnd > outstanding) {
17125                                         len = tp->snd_wnd - outstanding;
17126                                         /* Check to see if we have the data */
17127                                         if ((sb_offset + len) > avail) {
17128                                                 /* It does not all fit */
17129                                                 if (avail > sb_offset)
17130                                                         len = avail - sb_offset;
17131                                                 else
17132                                                         len = 0;
17133                                         }
17134                                 } else {
17135                                         len = 0;
17136                                 }
17137                         } else if (avail > sb_offset) {
17138                                 len = avail - sb_offset;
17139                         } else {
17140                                 len = 0;
17141                         }
17142                         if (len > 0) {
17143                                 if (len > rack->r_ctl.rc_prr_sndcnt) {
17144                                         len = rack->r_ctl.rc_prr_sndcnt;
17145                                 }
17146                                 if (len > 0) {
17147                                         sub_from_prr = 1;
17148                                 }
17149                         }
17150                         if (len > segsiz) {
17151                                 /*
17152                                  * We should never send more than a MSS when
17153                                  * retransmitting or sending new data in prr
17154                                  * mode unless the override flag is on. Most
17155                                  * likely the PRR algorithm is not going to
17156                                  * let us send a lot as well :-)
17157                                  */
17158                                 if (rack->r_ctl.rc_prr_sendalot == 0) {
17159                                         len = segsiz;
17160                                 }
17161                         } else if (len < segsiz) {
17162                                 /*
17163                                  * Do we send any? The idea here is if the
17164                                  * send empties the socket buffer we want to
17165                                  * do it. However if not then lets just wait
17166                                  * for our prr_sndcnt to get bigger.
17167                                  */
17168                                 long leftinsb;
17169
17170                                 leftinsb = sbavail(sb) - sb_offset;
17171                                 if (leftinsb > len) {
17172                                         /* This send does not empty the sb */
17173                                         len = 0;
17174                                 }
17175                         }
17176                 }
17177         } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
17178                 /*
17179                  * If you have not established
17180                  * and are not doing FAST OPEN
17181                  * no data please.
17182                  */
17183                 if ((sack_rxmit == 0) &&
17184                     (!IS_FASTOPEN(tp->t_flags))){
17185                         len = 0;
17186                         sb_offset = 0;
17187                 }
17188         }
17189         if (prefetch_so_done == 0) {
17190                 kern_prefetch(so, &prefetch_so_done);
17191                 prefetch_so_done = 1;
17192         }
17193         /*
17194          * Lop off SYN bit if it has already been sent.  However, if this is
17195          * SYN-SENT state and if segment contains data and if we don't know
17196          * that foreign host supports TAO, suppress sending segment.
17197          */
17198         if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
17199             ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
17200                 /*
17201                  * When sending additional segments following a TFO SYN|ACK,
17202                  * do not include the SYN bit.
17203                  */
17204                 if (IS_FASTOPEN(tp->t_flags) &&
17205                     (tp->t_state == TCPS_SYN_RECEIVED))
17206                         flags &= ~TH_SYN;
17207         }
17208         /*
17209          * Be careful not to send data and/or FIN on SYN segments. This
17210          * measure is needed to prevent interoperability problems with not
17211          * fully conformant TCP implementations.
17212          */
17213         if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
17214                 len = 0;
17215                 flags &= ~TH_FIN;
17216         }
17217         /*
17218          * On TFO sockets, ensure no data is sent in the following cases:
17219          *
17220          *  - When retransmitting SYN|ACK on a passively-created socket
17221          *
17222          *  - When retransmitting SYN on an actively created socket
17223          *
17224          *  - When sending a zero-length cookie (cookie request) on an
17225          *    actively created socket
17226          *
17227          *  - When the socket is in the CLOSED state (RST is being sent)
17228          */
17229         if (IS_FASTOPEN(tp->t_flags) &&
17230             (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
17231              ((tp->t_state == TCPS_SYN_SENT) &&
17232               (tp->t_tfo_client_cookie_len == 0)) ||
17233              (flags & TH_RST))) {
17234                 sack_rxmit = 0;
17235                 len = 0;
17236         }
17237         /* Without fast-open there should never be data sent on a SYN */
17238         if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
17239                 tp->snd_nxt = tp->iss;
17240                 len = 0;
17241         }
17242         if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
17243                 /* We only send 1 MSS if we have a DSACK block */
17244                 add_flag |= RACK_SENT_W_DSACK;
17245                 len = segsiz;
17246         }
17247         orig_len = len;
17248         if (len <= 0) {
17249                 /*
17250                  * If FIN has been sent but not acked, but we haven't been
17251                  * called to retransmit, len will be < 0.  Otherwise, window
17252                  * shrank after we sent into it.  If window shrank to 0,
17253                  * cancel pending retransmit, pull snd_nxt back to (closed)
17254                  * window, and set the persist timer if it isn't already
17255                  * going.  If the window didn't close completely, just wait
17256                  * for an ACK.
17257                  *
17258                  * We also do a general check here to ensure that we will
17259                  * set the persist timer when we have data to send, but a
17260                  * 0-byte window. This makes sure the persist timer is set
17261                  * even if the packet hits one of the "goto send" lines
17262                  * below.
17263                  */
17264                 len = 0;
17265                 if ((tp->snd_wnd == 0) &&
17266                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17267                     (tp->snd_una == tp->snd_max) &&
17268                     (sb_offset < (int)sbavail(sb))) {
17269                         rack_enter_persist(tp, rack, cts);
17270                 }
17271         } else if ((rsm == NULL) &&
17272                    (doing_tlp == 0) &&
17273                    (len < pace_max_seg)) {
17274                 /*
17275                  * We are not sending a maximum sized segment for
17276                  * some reason. Should we not send anything (think
17277                  * sws or persists)?
17278                  */
17279                 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17280                     (TCPS_HAVEESTABLISHED(tp->t_state)) &&
17281                     (len < minseg) &&
17282                     (len < (int)(sbavail(sb) - sb_offset))) {
17283                         /*
17284                          * Here the rwnd is less than
17285                          * the minimum pacing size, this is not a retransmit,
17286                          * we are established and
17287                          * the send is not the last in the socket buffer
17288                          * we send nothing, and we may enter persists
17289                          * if nothing is outstanding.
17290                          */
17291                         len = 0;
17292                         if (tp->snd_max == tp->snd_una) {
17293                                 /*
17294                                  * Nothing out we can
17295                                  * go into persists.
17296                                  */
17297                                 rack_enter_persist(tp, rack, cts);
17298                         }
17299                      } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
17300                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17301                            (len < (int)(sbavail(sb) - sb_offset)) &&
17302                            (len < minseg)) {
17303                         /*
17304                          * Here we are not retransmitting, and
17305                          * the cwnd is not so small that we could
17306                          * not send at least a min size (rxt timer
17307                          * not having gone off), We have 2 segments or
17308                          * more already in flight, its not the tail end
17309                          * of the socket buffer  and the cwnd is blocking
17310                          * us from sending out a minimum pacing segment size.
17311                          * Lets not send anything.
17312                          */
17313                         len = 0;
17314                 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
17315                             min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
17316                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
17317                            (len < (int)(sbavail(sb) - sb_offset)) &&
17318                            (TCPS_HAVEESTABLISHED(tp->t_state))) {
17319                         /*
17320                          * Here we have a send window but we have
17321                          * filled it up and we can't send another pacing segment.
17322                          * We also have in flight more than 2 segments
17323                          * and we are not completing the sb i.e. we allow
17324                          * the last bytes of the sb to go out even if
17325                          * its not a full pacing segment.
17326                          */
17327                         len = 0;
17328                 } else if ((rack->r_ctl.crte != NULL) &&
17329                            (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
17330                            (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
17331                            (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
17332                            (len < (int)(sbavail(sb) - sb_offset))) {
17333                         /*
17334                          * Here we are doing hardware pacing, this is not a TLP,
17335                          * we are not sending a pace max segment size, there is rwnd
17336                          * room to send at least N pace_max_seg, the cwnd is greater
17337                          * than or equal to a full pacing segments plus 4 mss and we have 2 or
17338                          * more segments in flight and its not the tail of the socket buffer.
17339                          *
17340                          * We don't want to send instead we need to get more ack's in to
17341                          * allow us to send a full pacing segment. Normally, if we are pacing
17342                          * about the right speed, we should have finished our pacing
17343                          * send as most of the acks have come back if we are at the
17344                          * right rate. This is a bit fuzzy since return path delay
17345                          * can delay the acks, which is why we want to make sure we
17346                          * have cwnd space to have a bit more than a max pace segments in flight.
17347                          *
17348                          * If we have not gotten our acks back we are pacing at too high a
17349                          * rate delaying will not hurt and will bring our GP estimate down by
17350                          * injecting the delay. If we don't do this we will send
17351                          * 2 MSS out in response to the acks being clocked in which
17352                          * defeats the point of hw-pacing (i.e. to help us get
17353                          * larger TSO's out).
17354                          */
17355                         len = 0;
17356
17357                 }
17358
17359         }
17360         /* len will be >= 0 after this point. */
17361         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
17362         rack_sndbuf_autoscale(rack);
17363         /*
17364          * Decide if we can use TCP Segmentation Offloading (if supported by
17365          * hardware).
17366          *
17367          * TSO may only be used if we are in a pure bulk sending state.  The
17368          * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
17369          * options prevent using TSO.  With TSO the TCP header is the same
17370          * (except for the sequence number) for all generated packets.  This
17371          * makes it impossible to transmit any options which vary per
17372          * generated segment or packet.
17373          *
17374          * IPv4 handling has a clear separation of ip options and ip header
17375          * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
17376          * the right thing below to provide length of just ip options and thus
17377          * checking for ipoptlen is enough to decide if ip options are present.
17378          */
17379         ipoptlen = 0;
17380 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17381         /*
17382          * Pre-calculate here as we save another lookup into the darknesses
17383          * of IPsec that way and can actually decide if TSO is ok.
17384          */
17385 #ifdef INET6
17386         if (isipv6 && IPSEC_ENABLED(ipv6))
17387                 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
17388 #ifdef INET
17389         else
17390 #endif
17391 #endif                          /* INET6 */
17392 #ifdef INET
17393                 if (IPSEC_ENABLED(ipv4))
17394                         ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
17395 #endif                          /* INET */
17396 #endif
17397
17398 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
17399         ipoptlen += ipsec_optlen;
17400 #endif
17401         if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
17402             (tp->t_port == 0) &&
17403             ((tp->t_flags & TF_SIGNATURE) == 0) &&
17404             tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
17405             ipoptlen == 0)
17406                 tso = 1;
17407         {
17408                 uint32_t outstanding;
17409
17410                 outstanding = tp->snd_max - tp->snd_una;
17411                 if (tp->t_flags & TF_SENTFIN) {
17412                         /*
17413                          * If we sent a fin, snd_max is 1 higher than
17414                          * snd_una
17415                          */
17416                         outstanding--;
17417                 }
17418                 if (sack_rxmit) {
17419                         if ((rsm->r_flags & RACK_HAS_FIN) == 0)
17420                                 flags &= ~TH_FIN;
17421                 } else {
17422                         if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
17423                                    sbused(sb)))
17424                                 flags &= ~TH_FIN;
17425                 }
17426         }
17427         recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
17428             (long)TCP_MAXWIN << tp->rcv_scale);
17429
17430         /*
17431          * Sender silly window avoidance.   We transmit under the following
17432          * conditions when len is non-zero:
17433          *
17434          * - We have a full segment (or more with TSO) - This is the last
17435          * buffer in a write()/send() and we are either idle or running
17436          * NODELAY - we've timed out (e.g. persist timer) - we have more
17437          * than 1/2 the maximum send window's worth of data (receiver may have
17438          * limited the window size) - we need to retransmit
17439          */
17440         if (len) {
17441                 if (len >= segsiz) {
17442                         goto send;
17443                 }
17444                 /*
17445                  * NOTE! on localhost connections an 'ack' from the remote
17446                  * end may occur synchronously with the output and cause us
17447                  * to flush a buffer queued with moretocome.  XXX
17448                  *
17449                  */
17450                 if (!(tp->t_flags & TF_MORETOCOME) &&   /* normal case */
17451                     (idle || (tp->t_flags & TF_NODELAY)) &&
17452                     ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17453                     (tp->t_flags & TF_NOPUSH) == 0) {
17454                         pass = 2;
17455                         goto send;
17456                 }
17457                 if ((tp->snd_una == tp->snd_max) && len) {      /* Nothing outstanding */
17458                         pass = 22;
17459                         goto send;
17460                 }
17461                 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
17462                         pass = 4;
17463                         goto send;
17464                 }
17465                 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
17466                         pass = 5;
17467                         goto send;
17468                 }
17469                 if (sack_rxmit) {
17470                         pass = 6;
17471                         goto send;
17472                 }
17473                 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
17474                     (ctf_outstanding(tp) < (segsiz * 2))) {
17475                         /*
17476                          * We have less than two MSS outstanding (delayed ack)
17477                          * and our rwnd will not let us send a full sized
17478                          * MSS. Lets go ahead and let this small segment
17479                          * out because we want to try to have at least two
17480                          * packets inflight to not be caught by delayed ack.
17481                          */
17482                         pass = 12;
17483                         goto send;
17484                 }
17485         }
17486         /*
17487          * Sending of standalone window updates.
17488          *
17489          * Window updates are important when we close our window due to a
17490          * full socket buffer and are opening it again after the application
17491          * reads data from it.  Once the window has opened again and the
17492          * remote end starts to send again the ACK clock takes over and
17493          * provides the most current window information.
17494          *
17495          * We must avoid the silly window syndrome whereas every read from
17496          * the receive buffer, no matter how small, causes a window update
17497          * to be sent.  We also should avoid sending a flurry of window
17498          * updates when the socket buffer had queued a lot of data and the
17499          * application is doing small reads.
17500          *
17501          * Prevent a flurry of pointless window updates by only sending an
17502          * update when we can increase the advertized window by more than
17503          * 1/4th of the socket buffer capacity.  When the buffer is getting
17504          * full or is very small be more aggressive and send an update
17505          * whenever we can increase by two mss sized segments. In all other
17506          * situations the ACK's to new incoming data will carry further
17507          * window increases.
17508          *
17509          * Don't send an independent window update if a delayed ACK is
17510          * pending (it will get piggy-backed on it) or the remote side
17511          * already has done a half-close and won't send more data.  Skip
17512          * this if the connection is in T/TCP half-open state.
17513          */
17514         if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
17515             !(tp->t_flags & TF_DELACK) &&
17516             !TCPS_HAVERCVDFIN(tp->t_state)) {
17517                 /*
17518                  * "adv" is the amount we could increase the window, taking
17519                  * into account that we are limited by TCP_MAXWIN <<
17520                  * tp->rcv_scale.
17521                  */
17522                 int32_t adv;
17523                 int oldwin;
17524
17525                 adv = recwin;
17526                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
17527                         oldwin = (tp->rcv_adv - tp->rcv_nxt);
17528                         if (adv > oldwin)
17529                             adv -= oldwin;
17530                         else {
17531                                 /* We can't increase the window */
17532                                 adv = 0;
17533                         }
17534                 } else
17535                         oldwin = 0;
17536
17537                 /*
17538                  * If the new window size ends up being the same as or less
17539                  * than the old size when it is scaled, then don't force
17540                  * a window update.
17541                  */
17542                 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
17543                         goto dontupdate;
17544
17545                 if (adv >= (int32_t)(2 * segsiz) &&
17546                     (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
17547                      recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
17548                      so->so_rcv.sb_hiwat <= 8 * segsiz)) {
17549                         pass = 7;
17550                         goto send;
17551                 }
17552                 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
17553                         pass = 23;
17554                         goto send;
17555                 }
17556         }
17557 dontupdate:
17558
17559         /*
17560          * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
17561          * is also a catch-all for the retransmit timer timeout case.
17562          */
17563         if (tp->t_flags & TF_ACKNOW) {
17564                 pass = 8;
17565                 goto send;
17566         }
17567         if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
17568                 pass = 9;
17569                 goto send;
17570         }
17571         /*
17572          * If our state indicates that FIN should be sent and we have not
17573          * yet done so, then we need to send.
17574          */
17575         if ((flags & TH_FIN) &&
17576             (tp->snd_nxt == tp->snd_una)) {
17577                 pass = 11;
17578                 goto send;
17579         }
17580         /*
17581          * No reason to send a segment, just return.
17582          */
17583 just_return:
17584         SOCKBUF_UNLOCK(sb);
17585 just_return_nolock:
17586         {
17587                 int app_limited = CTF_JR_SENT_DATA;
17588
17589                 if (tot_len_this_send > 0) {
17590                         /* Make sure snd_nxt is up to max */
17591                         rack->r_ctl.fsb.recwin = recwin;
17592                         slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
17593                         if ((error == 0) &&
17594                             rack_use_rfo &&
17595                             ((flags & (TH_SYN|TH_FIN)) == 0) &&
17596                             (ipoptlen == 0) &&
17597                             (tp->snd_nxt == tp->snd_max) &&
17598                             (tp->rcv_numsacks == 0) &&
17599                             rack->r_fsb_inited &&
17600                             TCPS_HAVEESTABLISHED(tp->t_state) &&
17601                             (rack->r_must_retran == 0) &&
17602                             ((tp->t_flags & TF_NEEDFIN) == 0) &&
17603                             (len > 0) && (orig_len > 0) &&
17604                             (orig_len > len) &&
17605                             ((orig_len - len) >= segsiz) &&
17606                             ((optlen == 0) ||
17607                              ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
17608                                 /* We can send at least one more MSS using our fsb */
17609
17610                                 rack->r_fast_output = 1;
17611                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
17612                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
17613                                 rack->r_ctl.fsb.tcp_flags = flags;
17614                                 rack->r_ctl.fsb.left_to_send = orig_len - len;
17615                                 if (hw_tls)
17616                                         rack->r_ctl.fsb.hw_tls = 1;
17617                                 else
17618                                         rack->r_ctl.fsb.hw_tls = 0;
17619                                 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
17620                                         ("rack:%p left_to_send:%u sbavail:%u out:%u",
17621                                         rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
17622                                          (tp->snd_max - tp->snd_una)));
17623                                 if (rack->r_ctl.fsb.left_to_send < segsiz)
17624                                         rack->r_fast_output = 0;
17625                                 else {
17626                                         if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
17627                                                 rack->r_ctl.fsb.rfo_apply_push = 1;
17628                                         else
17629                                                 rack->r_ctl.fsb.rfo_apply_push = 0;
17630                                 }
17631                         } else
17632                                 rack->r_fast_output = 0;
17633
17634
17635                         rack_log_fsb(rack, tp, so, flags,
17636                                      ipoptlen, orig_len, len, 0,
17637                                      1, optlen, __LINE__, 1);
17638                         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
17639                                 tp->snd_nxt = tp->snd_max;
17640                 } else {
17641                         int end_window = 0;
17642                         uint32_t seq = tp->gput_ack;
17643
17644                         rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17645                         if (rsm) {
17646                                 /*
17647                                  * Mark the last sent that we just-returned (hinting
17648                                  * that delayed ack may play a role in any rtt measurement).
17649                                  */
17650                                 rsm->r_just_ret = 1;
17651                         }
17652                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
17653                         rack->r_ctl.rc_agg_delayed = 0;
17654                         rack->r_early = 0;
17655                         rack->r_late = 0;
17656                         rack->r_ctl.rc_agg_early = 0;
17657                         if ((ctf_outstanding(tp) +
17658                              min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
17659                                  minseg)) >= tp->snd_wnd) {
17660                                 /* We are limited by the rwnd */
17661                                 app_limited = CTF_JR_RWND_LIMITED;
17662                                 if (IN_FASTRECOVERY(tp->t_flags))
17663                                     rack->r_ctl.rc_prr_sndcnt = 0;
17664                         } else if (ctf_outstanding(tp) >= sbavail(sb)) {
17665                                 /* We are limited by whats available -- app limited */
17666                                 app_limited = CTF_JR_APP_LIMITED;
17667                                 if (IN_FASTRECOVERY(tp->t_flags))
17668                                     rack->r_ctl.rc_prr_sndcnt = 0;
17669                         } else if ((idle == 0) &&
17670                                    ((tp->t_flags & TF_NODELAY) == 0) &&
17671                                    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
17672                                    (len < segsiz)) {
17673                                 /*
17674                                  * No delay is not on and the
17675                                  * user is sending less than 1MSS. This
17676                                  * brings out SWS avoidance so we
17677                                  * don't send. Another app-limited case.
17678                                  */
17679                                 app_limited = CTF_JR_APP_LIMITED;
17680                         } else if (tp->t_flags & TF_NOPUSH) {
17681                                 /*
17682                                  * The user has requested no push of
17683                                  * the last segment and we are
17684                                  * at the last segment. Another app
17685                                  * limited case.
17686                                  */
17687                                 app_limited = CTF_JR_APP_LIMITED;
17688                         } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
17689                                 /* Its the cwnd */
17690                                 app_limited = CTF_JR_CWND_LIMITED;
17691                         } else if (IN_FASTRECOVERY(tp->t_flags) &&
17692                                    (rack->rack_no_prr == 0) &&
17693                                    (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
17694                                 app_limited = CTF_JR_PRR;
17695                         } else {
17696                                 /* Now why here are we not sending? */
17697 #ifdef NOW
17698 #ifdef INVARIANTS
17699                                 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
17700 #endif
17701 #endif
17702                                 app_limited = CTF_JR_ASSESSING;
17703                         }
17704                         /*
17705                          * App limited in some fashion, for our pacing GP
17706                          * measurements we don't want any gap (even cwnd).
17707                          * Close  down the measurement window.
17708                          */
17709                         if (rack_cwnd_block_ends_measure &&
17710                             ((app_limited == CTF_JR_CWND_LIMITED) ||
17711                              (app_limited == CTF_JR_PRR))) {
17712                                 /*
17713                                  * The reason we are not sending is
17714                                  * the cwnd (or prr). We have been configured
17715                                  * to end the measurement window in
17716                                  * this case.
17717                                  */
17718                                 end_window = 1;
17719                         } else if (rack_rwnd_block_ends_measure &&
17720                                    (app_limited == CTF_JR_RWND_LIMITED)) {
17721                                 /*
17722                                  * We are rwnd limited and have been
17723                                  * configured to end the measurement
17724                                  * window in this case.
17725                                  */
17726                                 end_window = 1;
17727                         } else if (app_limited == CTF_JR_APP_LIMITED) {
17728                                 /*
17729                                  * A true application limited period, we have
17730                                  * run out of data.
17731                                  */
17732                                 end_window = 1;
17733                         } else if (app_limited == CTF_JR_ASSESSING) {
17734                                 /*
17735                                  * In the assessing case we hit the end of
17736                                  * the if/else and had no known reason
17737                                  * This will panic us under invariants..
17738                                  *
17739                                  * If we get this out in logs we need to
17740                                  * investigate which reason we missed.
17741                                  */
17742                                 end_window = 1;
17743                         }
17744                         if (end_window) {
17745                                 uint8_t log = 0;
17746
17747                                 /* Adjust the Gput measurement */
17748                                 if ((tp->t_flags & TF_GPUTINPROG) &&
17749                                     SEQ_GT(tp->gput_ack, tp->snd_max)) {
17750                                         tp->gput_ack = tp->snd_max;
17751                                         if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
17752                                                 /*
17753                                                  * There is not enough to measure.
17754                                                  */
17755                                                 tp->t_flags &= ~TF_GPUTINPROG;
17756                                                 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
17757                                                                            rack->r_ctl.rc_gp_srtt /*flex1*/,
17758                                                                            tp->gput_seq,
17759                                                                            0, 0, 18, __LINE__, NULL, 0);
17760                                         } else
17761                                                 log = 1;
17762                                 }
17763                                 /* Mark the last packet as app limited */
17764                                 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
17765                                 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
17766                                         if (rack->r_ctl.rc_app_limited_cnt == 0)
17767                                                 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
17768                                         else {
17769                                                 /*
17770                                                  * Go out to the end app limited and mark
17771                                                  * this new one as next and move the end_appl up
17772                                                  * to this guy.
17773                                                  */
17774                                                 if (rack->r_ctl.rc_end_appl)
17775                                                         rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
17776                                                 rack->r_ctl.rc_end_appl = rsm;
17777                                         }
17778                                         rsm->r_flags |= RACK_APP_LIMITED;
17779                                         rack->r_ctl.rc_app_limited_cnt++;
17780                                 }
17781                                 if (log)
17782                                         rack_log_pacing_delay_calc(rack,
17783                                                                    rack->r_ctl.rc_app_limited_cnt, seq,
17784                                                                    tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
17785                         }
17786                 }
17787                 /* Check if we need to go into persists or not */
17788                 if ((tp->snd_max == tp->snd_una) &&
17789                     TCPS_HAVEESTABLISHED(tp->t_state) &&
17790                     sbavail(sb) &&
17791                     (sbavail(sb) > tp->snd_wnd) &&
17792                     (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
17793                         /* Yes, let's make sure to move to persist before timer-start */
17794                         rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
17795                 }
17796                 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
17797                 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
17798         }
17799 #ifdef NETFLIX_SHARED_CWND
17800         if ((sbavail(sb) == 0) &&
17801             rack->r_ctl.rc_scw) {
17802                 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
17803                 rack->rack_scwnd_is_idle = 1;
17804         }
17805 #endif
17806 #ifdef TCP_ACCOUNTING
17807         if (tot_len_this_send > 0) {
17808                 crtsc = get_cyclecount();
17809                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17810                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
17811                 }
17812                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
17813                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17814                         tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
17815                 }
17816                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
17817                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17818                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
17819                 }
17820                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz));
17821         } else {
17822                 crtsc = get_cyclecount();
17823                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17824                         tp->tcp_cnt_counters[SND_LIMITED]++;
17825                 }
17826                 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1);
17827                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17828                         tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
17829                 }
17830                 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val));
17831         }
17832         sched_unpin();
17833 #endif
17834         return (0);
17835
17836 send:
17837         if (rsm || sack_rxmit)
17838                 counter_u64_add(rack_nfto_resend, 1);
17839         else
17840                 counter_u64_add(rack_non_fto_send, 1);
17841         if ((flags & TH_FIN) &&
17842             sbavail(sb)) {
17843                 /*
17844                  * We do not transmit a FIN
17845                  * with data outstanding. We
17846                  * need to make it so all data
17847                  * is acked first.
17848                  */
17849                 flags &= ~TH_FIN;
17850         }
17851         /* Enforce stack imposed max seg size if we have one */
17852         if (rack->r_ctl.rc_pace_max_segs &&
17853             (len > rack->r_ctl.rc_pace_max_segs)) {
17854                 mark = 1;
17855                 len = rack->r_ctl.rc_pace_max_segs;
17856         }
17857         SOCKBUF_LOCK_ASSERT(sb);
17858         if (len > 0) {
17859                 if (len >= segsiz)
17860                         tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
17861                 else
17862                         tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
17863         }
17864         /*
17865          * Before ESTABLISHED, force sending of initial options unless TCP
17866          * set not to do any options. NOTE: we assume that the IP/TCP header
17867          * plus TCP options always fit in a single mbuf, leaving room for a
17868          * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
17869          * + optlen <= MCLBYTES
17870          */
17871         optlen = 0;
17872 #ifdef INET6
17873         if (isipv6)
17874                 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
17875         else
17876 #endif
17877                 hdrlen = sizeof(struct tcpiphdr);
17878
17879         /*
17880          * Compute options for segment. We only have to care about SYN and
17881          * established connection segments.  Options for SYN-ACK segments
17882          * are handled in TCP syncache.
17883          */
17884         to.to_flags = 0;
17885         if ((tp->t_flags & TF_NOOPT) == 0) {
17886                 /* Maximum segment size. */
17887                 if (flags & TH_SYN) {
17888                         tp->snd_nxt = tp->iss;
17889                         to.to_mss = tcp_mssopt(&inp->inp_inc);
17890                         if (tp->t_port)
17891                                 to.to_mss -= V_tcp_udp_tunneling_overhead;
17892                         to.to_flags |= TOF_MSS;
17893
17894                         /*
17895                          * On SYN or SYN|ACK transmits on TFO connections,
17896                          * only include the TFO option if it is not a
17897                          * retransmit, as the presence of the TFO option may
17898                          * have caused the original SYN or SYN|ACK to have
17899                          * been dropped by a middlebox.
17900                          */
17901                         if (IS_FASTOPEN(tp->t_flags) &&
17902                             (tp->t_rxtshift == 0)) {
17903                                 if (tp->t_state == TCPS_SYN_RECEIVED) {
17904                                         to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
17905                                         to.to_tfo_cookie =
17906                                                 (u_int8_t *)&tp->t_tfo_cookie.server;
17907                                         to.to_flags |= TOF_FASTOPEN;
17908                                         wanted_cookie = 1;
17909                                 } else if (tp->t_state == TCPS_SYN_SENT) {
17910                                         to.to_tfo_len =
17911                                                 tp->t_tfo_client_cookie_len;
17912                                         to.to_tfo_cookie =
17913                                                 tp->t_tfo_cookie.client;
17914                                         to.to_flags |= TOF_FASTOPEN;
17915                                         wanted_cookie = 1;
17916                                         /*
17917                                          * If we wind up having more data to
17918                                          * send with the SYN than can fit in
17919                                          * one segment, don't send any more
17920                                          * until the SYN|ACK comes back from
17921                                          * the other end.
17922                                          */
17923                                         sendalot = 0;
17924                                 }
17925                         }
17926                 }
17927                 /* Window scaling. */
17928                 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
17929                         to.to_wscale = tp->request_r_scale;
17930                         to.to_flags |= TOF_SCALE;
17931                 }
17932                 /* Timestamps. */
17933                 if ((tp->t_flags & TF_RCVD_TSTMP) ||
17934                     ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
17935                         to.to_tsval = ms_cts + tp->ts_offset;
17936                         to.to_tsecr = tp->ts_recent;
17937                         to.to_flags |= TOF_TS;
17938                 }
17939                 /* Set receive buffer autosizing timestamp. */
17940                 if (tp->rfbuf_ts == 0 &&
17941                     (so->so_rcv.sb_flags & SB_AUTOSIZE))
17942                         tp->rfbuf_ts = tcp_ts_getticks();
17943                 /* Selective ACK's. */
17944                 if (tp->t_flags & TF_SACK_PERMIT) {
17945                         if (flags & TH_SYN)
17946                                 to.to_flags |= TOF_SACKPERM;
17947                         else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
17948                                  tp->rcv_numsacks > 0) {
17949                                 to.to_flags |= TOF_SACK;
17950                                 to.to_nsacks = tp->rcv_numsacks;
17951                                 to.to_sacks = (u_char *)tp->sackblks;
17952                         }
17953                 }
17954 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
17955                 /* TCP-MD5 (RFC2385). */
17956                 if (tp->t_flags & TF_SIGNATURE)
17957                         to.to_flags |= TOF_SIGNATURE;
17958 #endif                          /* TCP_SIGNATURE */
17959
17960                 /* Processing the options. */
17961                 hdrlen += optlen = tcp_addoptions(&to, opt);
17962                 /*
17963                  * If we wanted a TFO option to be added, but it was unable
17964                  * to fit, ensure no data is sent.
17965                  */
17966                 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
17967                     !(to.to_flags & TOF_FASTOPEN))
17968                         len = 0;
17969         }
17970         if (tp->t_port) {
17971                 if (V_tcp_udp_tunneling_port == 0) {
17972                         /* The port was removed?? */
17973                         SOCKBUF_UNLOCK(&so->so_snd);
17974 #ifdef TCP_ACCOUNTING
17975                         crtsc = get_cyclecount();
17976                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17977                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
17978                         }
17979                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
17980                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
17981                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
17982                         }
17983                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
17984                         sched_unpin();
17985 #endif
17986                         return (EHOSTUNREACH);
17987                 }
17988                 hdrlen += sizeof(struct udphdr);
17989         }
17990 #ifdef INET6
17991         if (isipv6)
17992                 ipoptlen = ip6_optlen(tp->t_inpcb);
17993         else
17994 #endif
17995                 if (tp->t_inpcb->inp_options)
17996                         ipoptlen = tp->t_inpcb->inp_options->m_len -
17997                                 offsetof(struct ipoption, ipopt_list);
17998                 else
17999                         ipoptlen = 0;
18000 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18001         ipoptlen += ipsec_optlen;
18002 #endif
18003
18004         /*
18005          * Adjust data length if insertion of options will bump the packet
18006          * length beyond the t_maxseg length. Clear the FIN bit because we
18007          * cut off the tail of the segment.
18008          */
18009         if (len + optlen + ipoptlen > tp->t_maxseg) {
18010                 if (tso) {
18011                         uint32_t if_hw_tsomax;
18012                         uint32_t moff;
18013                         int32_t max_len;
18014
18015                         /* extract TSO information */
18016                         if_hw_tsomax = tp->t_tsomax;
18017                         if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18018                         if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18019                         KASSERT(ipoptlen == 0,
18020                                 ("%s: TSO can't do IP options", __func__));
18021
18022                         /*
18023                          * Check if we should limit by maximum payload
18024                          * length:
18025                          */
18026                         if (if_hw_tsomax != 0) {
18027                                 /* compute maximum TSO length */
18028                                 max_len = (if_hw_tsomax - hdrlen -
18029                                            max_linkhdr);
18030                                 if (max_len <= 0) {
18031                                         len = 0;
18032                                 } else if (len > max_len) {
18033                                         sendalot = 1;
18034                                         len = max_len;
18035                                         mark = 2;
18036                                 }
18037                         }
18038                         /*
18039                          * Prevent the last segment from being fractional
18040                          * unless the send sockbuf can be emptied:
18041                          */
18042                         max_len = (tp->t_maxseg - optlen);
18043                         if ((sb_offset + len) < sbavail(sb)) {
18044                                 moff = len % (u_int)max_len;
18045                                 if (moff != 0) {
18046                                         mark = 3;
18047                                         len -= moff;
18048                                 }
18049                         }
18050                         /*
18051                          * In case there are too many small fragments don't
18052                          * use TSO:
18053                          */
18054                         if (len <= segsiz) {
18055                                 mark = 4;
18056                                 tso = 0;
18057                         }
18058                         /*
18059                          * Send the FIN in a separate segment after the bulk
18060                          * sending is done. We don't trust the TSO
18061                          * implementations to clear the FIN flag on all but
18062                          * the last segment.
18063                          */
18064                         if (tp->t_flags & TF_NEEDFIN) {
18065                                 sendalot = 4;
18066                         }
18067                 } else {
18068                         mark = 5;
18069                         if (optlen + ipoptlen >= tp->t_maxseg) {
18070                                 /*
18071                                  * Since we don't have enough space to put
18072                                  * the IP header chain and the TCP header in
18073                                  * one packet as required by RFC 7112, don't
18074                                  * send it. Also ensure that at least one
18075                                  * byte of the payload can be put into the
18076                                  * TCP segment.
18077                                  */
18078                                 SOCKBUF_UNLOCK(&so->so_snd);
18079                                 error = EMSGSIZE;
18080                                 sack_rxmit = 0;
18081                                 goto out;
18082                         }
18083                         len = tp->t_maxseg - optlen - ipoptlen;
18084                         sendalot = 5;
18085                 }
18086         } else {
18087                 tso = 0;
18088                 mark = 6;
18089         }
18090         KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
18091                 ("%s: len > IP_MAXPACKET", __func__));
18092 #ifdef DIAGNOSTIC
18093 #ifdef INET6
18094         if (max_linkhdr + hdrlen > MCLBYTES)
18095 #else
18096                 if (max_linkhdr + hdrlen > MHLEN)
18097 #endif
18098                         panic("tcphdr too big");
18099 #endif
18100
18101         /*
18102          * This KASSERT is here to catch edge cases at a well defined place.
18103          * Before, those had triggered (random) panic conditions further
18104          * down.
18105          */
18106         KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
18107         if ((len == 0) &&
18108             (flags & TH_FIN) &&
18109             (sbused(sb))) {
18110                 /*
18111                  * We have outstanding data, don't send a FIN by itself!
18112                  */
18113                 goto just_return;
18114         }
18115         /*
18116          * Grab a header mbuf, attaching a copy of data to be transmitted,
18117          * and initialize the header from the template for sends on this
18118          * connection.
18119          */
18120         hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
18121         if (len) {
18122                 uint32_t max_val;
18123                 uint32_t moff;
18124
18125                 if (rack->r_ctl.rc_pace_max_segs)
18126                         max_val = rack->r_ctl.rc_pace_max_segs;
18127                 else if (rack->rc_user_set_max_segs)
18128                         max_val = rack->rc_user_set_max_segs * segsiz;
18129                 else
18130                         max_val = len;
18131                 /*
18132                  * We allow a limit on sending with hptsi.
18133                  */
18134                 if (len > max_val) {
18135                         mark = 7;
18136                         len = max_val;
18137                 }
18138 #ifdef INET6
18139                 if (MHLEN < hdrlen + max_linkhdr)
18140                         m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18141                 else
18142 #endif
18143                         m = m_gethdr(M_NOWAIT, MT_DATA);
18144
18145                 if (m == NULL) {
18146                         SOCKBUF_UNLOCK(sb);
18147                         error = ENOBUFS;
18148                         sack_rxmit = 0;
18149                         goto out;
18150                 }
18151                 m->m_data += max_linkhdr;
18152                 m->m_len = hdrlen;
18153
18154                 /*
18155                  * Start the m_copy functions from the closest mbuf to the
18156                  * sb_offset in the socket buffer chain.
18157                  */
18158                 mb = sbsndptr_noadv(sb, sb_offset, &moff);
18159                 s_mb = mb;
18160                 s_moff = moff;
18161                 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
18162                         m_copydata(mb, moff, (int)len,
18163                                    mtod(m, caddr_t)+hdrlen);
18164                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18165                                 sbsndptr_adv(sb, mb, len);
18166                         m->m_len += len;
18167                 } else {
18168                         struct sockbuf *msb;
18169
18170                         if (SEQ_LT(tp->snd_nxt, tp->snd_max))
18171                                 msb = NULL;
18172                         else
18173                                 msb = sb;
18174                         m->m_next = tcp_m_copym(
18175                                 mb, moff, &len,
18176                                 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
18177                                 ((rsm == NULL) ? hw_tls : 0)
18178 #ifdef NETFLIX_COPY_ARGS
18179                                 , &filled_all
18180 #endif
18181                                 );
18182                         if (len <= (tp->t_maxseg - optlen)) {
18183                                 /*
18184                                  * Must have run out of mbufs for the copy;
18185                                  * shorten it so it no longer needs TSO. Let's
18186                                  * not set sendalot since we are low on
18187                                  * mbufs.
18188                                  */
18189                                 tso = 0;
18190                         }
18191                         if (m->m_next == NULL) {
18192                                 SOCKBUF_UNLOCK(sb);
18193                                 (void)m_free(m);
18194                                 error = ENOBUFS;
18195                                 sack_rxmit = 0;
18196                                 goto out;
18197                         }
18198                 }
18199                 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
18200                         if (rsm && (rsm->r_flags & RACK_TLP)) {
18201                                 /*
18202                                  * TLP should not count in retran count, but
18203                                  * in its own bin
18204                                  */
18205                                 counter_u64_add(rack_tlp_retran, 1);
18206                                 counter_u64_add(rack_tlp_retran_bytes, len);
18207                         } else {
18208                                 tp->t_sndrexmitpack++;
18209                                 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18210                                 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18211                         }
18212 #ifdef STATS
18213                         stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18214                                                  len);
18215 #endif
18216                 } else {
18217                         KMOD_TCPSTAT_INC(tcps_sndpack);
18218                         KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
18219 #ifdef STATS
18220                         stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
18221                                                  len);
18222 #endif
18223                 }
18224                 /*
18225                  * If we're sending everything we've got, set PUSH. (This
18226                  * will keep happy those implementations which only give
18227                  * data to the user when a buffer fills or a PUSH comes in.)
18228                  */
18229                 if (sb_offset + len == sbused(sb) &&
18230                     sbused(sb) &&
18231                     !(flags & TH_SYN)) {
18232                         flags |= TH_PUSH;
18233                         add_flag |= RACK_HAD_PUSH;
18234                 }
18235
18236                 SOCKBUF_UNLOCK(sb);
18237         } else {
18238                 SOCKBUF_UNLOCK(sb);
18239                 if (tp->t_flags & TF_ACKNOW)
18240                         KMOD_TCPSTAT_INC(tcps_sndacks);
18241                 else if (flags & (TH_SYN | TH_FIN | TH_RST))
18242                         KMOD_TCPSTAT_INC(tcps_sndctrl);
18243                 else
18244                         KMOD_TCPSTAT_INC(tcps_sndwinup);
18245
18246                 m = m_gethdr(M_NOWAIT, MT_DATA);
18247                 if (m == NULL) {
18248                         error = ENOBUFS;
18249                         sack_rxmit = 0;
18250                         goto out;
18251                 }
18252 #ifdef INET6
18253                 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
18254                     MHLEN >= hdrlen) {
18255                         M_ALIGN(m, hdrlen);
18256                 } else
18257 #endif
18258                         m->m_data += max_linkhdr;
18259                 m->m_len = hdrlen;
18260         }
18261         SOCKBUF_UNLOCK_ASSERT(sb);
18262         m->m_pkthdr.rcvif = (struct ifnet *)0;
18263 #ifdef MAC
18264         mac_inpcb_create_mbuf(inp, m);
18265 #endif
18266         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) &&  rack->r_fsb_inited) {
18267 #ifdef INET6
18268                 if (isipv6)
18269                         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18270                 else
18271 #endif                          /* INET6 */
18272                         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18273                 th = rack->r_ctl.fsb.th;
18274                 udp = rack->r_ctl.fsb.udp;
18275                 if (udp) {
18276 #ifdef INET6
18277                         if (isipv6)
18278                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18279                         else
18280 #endif                          /* INET6 */
18281                                 ulen = hdrlen + len - sizeof(struct ip);
18282                         udp->uh_ulen = htons(ulen);
18283                 }
18284         } else {
18285 #ifdef INET6
18286                 if (isipv6) {
18287                         ip6 = mtod(m, struct ip6_hdr *);
18288                         if (tp->t_port) {
18289                                 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
18290                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18291                                 udp->uh_dport = tp->t_port;
18292                                 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18293                                 udp->uh_ulen = htons(ulen);
18294                                 th = (struct tcphdr *)(udp + 1);
18295                         } else
18296                                 th = (struct tcphdr *)(ip6 + 1);
18297                         tcpip_fillheaders(inp, tp->t_port, ip6, th);
18298                 } else
18299 #endif                          /* INET6 */
18300                 {
18301                         ip = mtod(m, struct ip *);
18302                         if (tp->t_port) {
18303                                 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
18304                                 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
18305                                 udp->uh_dport = tp->t_port;
18306                                 ulen = hdrlen + len - sizeof(struct ip);
18307                                 udp->uh_ulen = htons(ulen);
18308                                 th = (struct tcphdr *)(udp + 1);
18309                         } else
18310                                 th = (struct tcphdr *)(ip + 1);
18311                         tcpip_fillheaders(inp, tp->t_port, ip, th);
18312                 }
18313         }
18314         /*
18315          * Fill in fields, remembering maximum advertised window for use in
18316          * delaying messages about window sizes. If resending a FIN, be sure
18317          * not to use a new sequence number.
18318          */
18319         if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
18320             tp->snd_nxt == tp->snd_max)
18321                 tp->snd_nxt--;
18322         /*
18323          * If we are starting a connection, send ECN setup SYN packet. If we
18324          * are on a retransmit, we may resend those bits a number of times
18325          * as per RFC 3168.
18326          */
18327         if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
18328                 flags |= tcp_ecn_output_syn_sent(tp);
18329         }
18330         /* Also handle parallel SYN for ECN */
18331         if (TCPS_HAVERCVDSYN(tp->t_state) &&
18332             (tp->t_flags2 & TF2_ECN_PERMIT)) {
18333                 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
18334                 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18335                     (tp->t_flags2 & TF2_ECN_SND_ECE))
18336                         tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18337 #ifdef INET6
18338                 if (isipv6) {
18339                         ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
18340                         ip6->ip6_flow |= htonl(ect << 20);
18341                 }
18342                 else
18343 #endif
18344                 {
18345                         ip->ip_tos &= ~IPTOS_ECN_MASK;
18346                         ip->ip_tos |= ect;
18347                 }
18348         }
18349         /*
18350          * If we are doing retransmissions, then snd_nxt will not reflect
18351          * the first unsent octet.  For ACK only packets, we do not want the
18352          * sequence number of the retransmitted packet, we want the sequence
18353          * number of the next unsent octet.  So, if there is no data (and no
18354          * SYN or FIN), use snd_max instead of snd_nxt when filling in
18355          * ti_seq.  But if we are in persist state, snd_max might reflect
18356          * one byte beyond the right edge of the window, so use snd_nxt in
18357          * that case, since we know we aren't doing a retransmission.
18358          * (retransmit and persist are mutually exclusive...)
18359          */
18360         if (sack_rxmit == 0) {
18361                 if (len || (flags & (TH_SYN | TH_FIN))) {
18362                         th->th_seq = htonl(tp->snd_nxt);
18363                         rack_seq = tp->snd_nxt;
18364                 } else {
18365                         th->th_seq = htonl(tp->snd_max);
18366                         rack_seq = tp->snd_max;
18367                 }
18368         } else {
18369                 th->th_seq = htonl(rsm->r_start);
18370                 rack_seq = rsm->r_start;
18371         }
18372         th->th_ack = htonl(tp->rcv_nxt);
18373         tcp_set_flags(th, flags);
18374         /*
18375          * Calculate receive window.  Don't shrink window, but avoid silly
18376          * window syndrome.
18377          * If a RST segment is sent, advertise a window of zero.
18378          */
18379         if (flags & TH_RST) {
18380                 recwin = 0;
18381         } else {
18382                 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
18383                     recwin < (long)segsiz) {
18384                         recwin = 0;
18385                 }
18386                 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
18387                     recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
18388                         recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
18389         }
18390
18391         /*
18392          * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
18393          * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
18394          * handled in syncache.
18395          */
18396         if (flags & TH_SYN)
18397                 th->th_win = htons((u_short)
18398                                    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
18399         else {
18400                 /* Avoid shrinking window with window scaling. */
18401                 recwin = roundup2(recwin, 1 << tp->rcv_scale);
18402                 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
18403         }
18404         /*
18405          * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
18406          * window.  This may cause the remote transmitter to stall.  This
18407          * flag tells soreceive() to disable delayed acknowledgements when
18408          * draining the buffer.  This can occur if the receiver is
18409          * attempting to read more data than can be buffered prior to
18410          * transmitting on the connection.
18411          */
18412         if (th->th_win == 0) {
18413                 tp->t_sndzerowin++;
18414                 tp->t_flags |= TF_RXWIN0SENT;
18415         } else
18416                 tp->t_flags &= ~TF_RXWIN0SENT;
18417         tp->snd_up = tp->snd_una;       /* drag it along, its deprecated */
18418         /* Are we using the fsb? If so, copy the template data to the mbuf. */
18419         if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
18420                 uint8_t *cpto;
18421
18422                 cpto = mtod(m, uint8_t *);
18423                 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18424                 /*
18425                  * We have just copied in:
18426                  * IP/IP6
18427                  * <optional udphdr>
18428                  * tcphdr (no options)
18429                  *
18430                  * We need to grab the correct pointers into the mbuf
18431                  * for both the tcp header, and possibly the udp header (if tunneling).
18432                  * We do this by using the offset in the copy buffer and adding it
18433                  * to the mbuf base pointer (cpto).
18434                  */
18435 #ifdef INET6
18436                 if (isipv6)
18437                         ip6 = mtod(m, struct ip6_hdr *);
18438                 else
18439 #endif                          /* INET6 */
18440                         ip = mtod(m, struct ip *);
18441                 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18442                 /* If we have a udp header lets set it into the mbuf as well */
18443                 if (udp)
18444                         udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
18445         }
18446 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18447         if (to.to_flags & TOF_SIGNATURE) {
18448                 /*
18449                  * Calculate MD5 signature and put it into the place
18450                  * determined before.
18451                  * NOTE: since TCP options buffer doesn't point into
18452                  * mbuf's data, calculate offset and use it.
18453                  */
18454                 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
18455                                                        (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
18456                         /*
18457                          * Do not send segment if the calculation of MD5
18458                          * digest has failed.
18459                          */
18460                         goto out;
18461                 }
18462         }
18463 #endif
18464         if (optlen) {
18465                 bcopy(opt, th + 1, optlen);
18466                 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18467         }
18468         /*
18469          * Put TCP length in extended header, and then checksum extended
18470          * header and data.
18471          */
18472         m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18473 #ifdef INET6
18474         if (isipv6) {
18475                 /*
18476                  * ip6_plen does not need to be filled now, and will be filled
18477                  * in ip6_output.
18478                  */
18479                 if (tp->t_port) {
18480                         m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18481                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18482                         udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18483                         th->th_sum = htons(0);
18484                         UDPSTAT_INC(udps_opackets);
18485                 } else {
18486                         m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18487                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18488                         th->th_sum = in6_cksum_pseudo(ip6,
18489                                                       sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18490                                                       0);
18491                 }
18492         }
18493 #endif
18494 #if defined(INET6) && defined(INET)
18495         else
18496 #endif
18497 #ifdef INET
18498         {
18499                 if (tp->t_port) {
18500                         m->m_pkthdr.csum_flags = CSUM_UDP;
18501                         m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18502                         udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18503                                                 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18504                         th->th_sum = htons(0);
18505                         UDPSTAT_INC(udps_opackets);
18506                 } else {
18507                         m->m_pkthdr.csum_flags = CSUM_TCP;
18508                         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18509                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
18510                                                ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18511                                                                         IPPROTO_TCP + len + optlen));
18512                 }
18513                 /* IP version must be set here for ipv4/ipv6 checking later */
18514                 KASSERT(ip->ip_v == IPVERSION,
18515                         ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18516         }
18517 #endif
18518         /*
18519          * Enable TSO and specify the size of the segments. The TCP pseudo
18520          * header checksum is always provided. XXX: Fixme: This is currently
18521          * not the case for IPv6.
18522          */
18523         if (tso) {
18524                 KASSERT(len > tp->t_maxseg - optlen,
18525                         ("%s: len <= tso_segsz", __func__));
18526                 m->m_pkthdr.csum_flags |= CSUM_TSO;
18527                 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
18528         }
18529         KASSERT(len + hdrlen == m_length(m, NULL),
18530                 ("%s: mbuf chain different than expected: %d + %u != %u",
18531                  __func__, len, hdrlen, m_length(m, NULL)));
18532
18533 #ifdef TCP_HHOOK
18534         /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
18535         hhook_run_tcp_est_out(tp, th, &to, len, tso);
18536 #endif
18537         /* We're getting ready to send; log now. */
18538         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
18539                 union tcp_log_stackspecific log;
18540
18541                 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
18542                 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
18543                 if (rack->rack_no_prr)
18544                         log.u_bbr.flex1 = 0;
18545                 else
18546                         log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18547                 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18548                 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18549                 log.u_bbr.flex4 = orig_len;
18550                 if (filled_all)
18551                         log.u_bbr.flex5 = 0x80000000;
18552                 else
18553                         log.u_bbr.flex5 = 0;
18554                 /* Save off the early/late values */
18555                 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18556                 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18557                 log.u_bbr.bw_inuse = rack_get_bw(rack);
18558                 if (rsm || sack_rxmit) {
18559                         if (doing_tlp)
18560                                 log.u_bbr.flex8 = 2;
18561                         else
18562                                 log.u_bbr.flex8 = 1;
18563                 } else {
18564                         if (doing_tlp)
18565                                 log.u_bbr.flex8 = 3;
18566                         else
18567                                 log.u_bbr.flex8 = 0;
18568                 }
18569                 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
18570                 log.u_bbr.flex7 = mark;
18571                 log.u_bbr.flex7 <<= 8;
18572                 log.u_bbr.flex7 |= pass;
18573                 log.u_bbr.pkts_out = tp->t_maxseg;
18574                 log.u_bbr.timeStamp = cts;
18575                 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18576                 log.u_bbr.lt_epoch = cwnd_to_use;
18577                 log.u_bbr.delivered = sendalot;
18578                 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
18579                                      len, &log, false, NULL, NULL, 0, &tv);
18580         } else
18581                 lgb = NULL;
18582
18583         /*
18584          * Fill in IP length and desired time to live and send to IP level.
18585          * There should be a better way to handle ttl and tos; we could keep
18586          * them in the template, but need a way to checksum without them.
18587          */
18588         /*
18589          * m->m_pkthdr.len should have been set before cksum calculation,
18590          * because in6_cksum() needs it.
18591          */
18592 #ifdef INET6
18593         if (isipv6) {
18594                 /*
18595                  * we separately set hoplimit for every segment, since the
18596                  * user might want to change the value via setsockopt. Also,
18597                  * desired default hop limit might be changed via Neighbor
18598                  * Discovery.
18599                  */
18600                 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
18601
18602                 /*
18603                  * Set the packet size here for the benefit of DTrace
18604                  * probes. ip6_output() will set it properly; it's supposed
18605                  * to include the option header lengths as well.
18606                  */
18607                 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18608
18609                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18610                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18611                 else
18612                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18613
18614                 if (tp->t_state == TCPS_SYN_SENT)
18615                         TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
18616
18617                 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
18618                 /* TODO: IPv6 IP6TOS_ECT bit on */
18619                 error = ip6_output(m,
18620 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18621                                    inp->in6p_outputopts,
18622 #else
18623                                    NULL,
18624 #endif
18625                                    &inp->inp_route6,
18626                                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
18627                                    NULL, NULL, inp);
18628
18629                 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
18630                         mtu = inp->inp_route6.ro_nh->nh_mtu;
18631         }
18632 #endif                          /* INET6 */
18633 #if defined(INET) && defined(INET6)
18634         else
18635 #endif
18636 #ifdef INET
18637         {
18638                 ip->ip_len = htons(m->m_pkthdr.len);
18639 #ifdef INET6
18640                 if (inp->inp_vflag & INP_IPV6PROTO)
18641                         ip->ip_ttl = in6_selecthlim(inp, NULL);
18642 #endif                          /* INET6 */
18643                 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
18644                 /*
18645                  * If we do path MTU discovery, then we set DF on every
18646                  * packet. This might not be the best thing to do according
18647                  * to RFC3390 Section 2. However the tcp hostcache mitigates
18648                  * the problem so it affects only the first tcp connection
18649                  * with a host.
18650                  *
18651                  * NB: Don't set DF on small MTU/MSS to have a safe
18652                  * fallback.
18653                  */
18654                 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18655                         tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18656                         if (tp->t_port == 0 || len < V_tcp_minmss) {
18657                                 ip->ip_off |= htons(IP_DF);
18658                         }
18659                 } else {
18660                         tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18661                 }
18662
18663                 if (tp->t_state == TCPS_SYN_SENT)
18664                         TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
18665
18666                 TCP_PROBE5(send, NULL, tp, ip, tp, th);
18667
18668                 error = ip_output(m,
18669 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18670                                   inp->inp_options,
18671 #else
18672                                   NULL,
18673 #endif
18674                                   &inp->inp_route,
18675                                   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
18676                                   inp);
18677                 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
18678                         mtu = inp->inp_route.ro_nh->nh_mtu;
18679         }
18680 #endif                          /* INET */
18681
18682 out:
18683         if (lgb) {
18684                 lgb->tlb_errno = error;
18685                 lgb = NULL;
18686         }
18687         /*
18688          * In transmit state, time the transmission and arrange for the
18689          * retransmit.  In persist state, just set snd_max.
18690          */
18691         if (error == 0) {
18692                 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
18693                 if (rsm && doing_tlp) {
18694                         rack->rc_last_sent_tlp_past_cumack = 0;
18695                         rack->rc_last_sent_tlp_seq_valid = 1;
18696                         rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18697                         rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18698                 }
18699                 rack->forced_ack = 0;   /* If we send something zap the FA flag */
18700                 if (rsm && (doing_tlp == 0)) {
18701                         /* Set we retransmitted */
18702                         rack->rc_gp_saw_rec = 1;
18703                 } else {
18704                         if (cwnd_to_use > tp->snd_ssthresh) {
18705                                 /* Set we sent in CA */
18706                                 rack->rc_gp_saw_ca = 1;
18707                         } else {
18708                                 /* Set we sent in SS */
18709                                 rack->rc_gp_saw_ss = 1;
18710                         }
18711                 }
18712                 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
18713                     (tp->t_flags & TF_SACK_PERMIT) &&
18714                     tp->rcv_numsacks > 0)
18715                         tcp_clean_dsack_blocks(tp);
18716                 tot_len_this_send += len;
18717                 if (len == 0)
18718                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
18719                 else if (len == 1) {
18720                         counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
18721                 } else if (len > 1) {
18722                         int idx;
18723
18724                         idx = (len / segsiz) + 3;
18725                         if (idx >= TCP_MSS_ACCT_ATIMER)
18726                                 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18727                         else
18728                                 counter_u64_add(rack_out_size[idx], 1);
18729                 }
18730         }
18731         if ((rack->rack_no_prr == 0) &&
18732             sub_from_prr &&
18733             (error == 0)) {
18734                 if (rack->r_ctl.rc_prr_sndcnt >= len)
18735                         rack->r_ctl.rc_prr_sndcnt -= len;
18736                 else
18737                         rack->r_ctl.rc_prr_sndcnt = 0;
18738         }
18739         sub_from_prr = 0;
18740         if (doing_tlp) {
18741                 /* Make sure the TLP is added */
18742                 add_flag |= RACK_TLP;
18743         } else if (rsm) {
18744                 /* If its a resend without TLP then it must not have the flag */
18745                 rsm->r_flags &= ~RACK_TLP;
18746         }
18747         rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
18748                         rack_to_usec_ts(&tv),
18749                         rsm, add_flag, s_mb, s_moff, hw_tls);
18750
18751
18752         if ((error == 0) &&
18753             (len > 0) &&
18754             (tp->snd_una == tp->snd_max))
18755                 rack->r_ctl.rc_tlp_rxt_last_time = cts;
18756         {
18757                 tcp_seq startseq = tp->snd_nxt;
18758
18759                 /* Track our lost count */
18760                 if (rsm && (doing_tlp == 0))
18761                         rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
18762                 /*
18763                  * Advance snd_nxt over sequence space of this segment.
18764                  */
18765                 if (error)
18766                         /* We don't log or do anything with errors */
18767                         goto nomore;
18768                 if (doing_tlp == 0) {
18769                         if (rsm == NULL) {
18770                                 /*
18771                                  * Not a retransmission of some
18772                                  * sort, new data is going out so
18773                                  * clear our TLP count and flag.
18774                                  */
18775                                 rack->rc_tlp_in_progress = 0;
18776                                 rack->r_ctl.rc_tlp_cnt_out = 0;
18777                         }
18778                 } else {
18779                         /*
18780                          * We have just sent a TLP, mark that it is true
18781                          * and make sure our in progress is set so we
18782                          * continue to check the count.
18783                          */
18784                         rack->rc_tlp_in_progress = 1;
18785                         rack->r_ctl.rc_tlp_cnt_out++;
18786                 }
18787                 if (flags & (TH_SYN | TH_FIN)) {
18788                         if (flags & TH_SYN)
18789                                 tp->snd_nxt++;
18790                         if (flags & TH_FIN) {
18791                                 tp->snd_nxt++;
18792                                 tp->t_flags |= TF_SENTFIN;
18793                         }
18794                 }
18795                 /* In the ENOBUFS case we do *not* update snd_max */
18796                 if (sack_rxmit)
18797                         goto nomore;
18798
18799                 tp->snd_nxt += len;
18800                 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
18801                         if (tp->snd_una == tp->snd_max) {
18802                                 /*
18803                                  * Update the time we just added data since
18804                                  * none was outstanding.
18805                                  */
18806                                 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
18807                                 tp->t_acktime = ticks;
18808                         }
18809                         tp->snd_max = tp->snd_nxt;
18810                         /*
18811                          * Time this transmission if not a retransmission and
18812                          * not currently timing anything.
18813                          * This is only relevant in case of switching back to
18814                          * the base stack.
18815                          */
18816                         if (tp->t_rtttime == 0) {
18817                                 tp->t_rtttime = ticks;
18818                                 tp->t_rtseq = startseq;
18819                                 KMOD_TCPSTAT_INC(tcps_segstimed);
18820                         }
18821                         if (len &&
18822                             ((tp->t_flags & TF_GPUTINPROG) == 0))
18823                                 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
18824                 }
18825                 /*
18826                  * If we are doing FO we need to update the mbuf position and subtract
18827                  * this happens when the peer sends us duplicate information and
18828                  * we thus want to send a DSACK.
18829                  *
18830                  * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO
18831                  * turned off? If not then we are going to echo multiple DSACK blocks
18832                  * out (with the TSO), which we should not be doing.
18833                  */
18834                 if (rack->r_fast_output && len) {
18835                         if (rack->r_ctl.fsb.left_to_send > len)
18836                                 rack->r_ctl.fsb.left_to_send -= len;
18837                         else
18838                                 rack->r_ctl.fsb.left_to_send = 0;
18839                         if (rack->r_ctl.fsb.left_to_send < segsiz)
18840                                 rack->r_fast_output = 0;
18841                         if (rack->r_fast_output) {
18842                                 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
18843                                 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
18844                         }
18845                 }
18846         }
18847 nomore:
18848         if (error) {
18849                 rack->r_ctl.rc_agg_delayed = 0;
18850                 rack->r_early = 0;
18851                 rack->r_late = 0;
18852                 rack->r_ctl.rc_agg_early = 0;
18853                 SOCKBUF_UNLOCK_ASSERT(sb);      /* Check gotos. */
18854                 /*
18855                  * Failures do not advance the seq counter above. For the
18856                  * case of ENOBUFS we will fall out and retry in 1ms with
18857                  * the hpts. Everything else will just have to retransmit
18858                  * with the timer.
18859                  *
18860                  * In any case, we do not want to loop around for another
18861                  * send without a good reason.
18862                  */
18863                 sendalot = 0;
18864                 switch (error) {
18865                 case EPERM:
18866                         tp->t_softerror = error;
18867 #ifdef TCP_ACCOUNTING
18868                         crtsc = get_cyclecount();
18869                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18870                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
18871                         }
18872                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
18873                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18874                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
18875                         }
18876                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
18877                         sched_unpin();
18878 #endif
18879                         return (error);
18880                 case ENOBUFS:
18881                         /*
18882                          * Pace us right away to retry in some
18883                          * time
18884                          */
18885                         if (rack->r_ctl.crte != NULL) {
18886                                 rack_trace_point(rack, RACK_TP_HWENOBUF);
18887                         } else
18888                                 rack_trace_point(rack, RACK_TP_ENOBUF);
18889                         slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
18890                         if (rack->rc_enobuf < 0x7f)
18891                                 rack->rc_enobuf++;
18892                         if (slot < (10 * HPTS_USEC_IN_MSEC))
18893                                 slot = 10 * HPTS_USEC_IN_MSEC;
18894                         if (rack->r_ctl.crte != NULL) {
18895                                 counter_u64_add(rack_saw_enobuf_hw, 1);
18896                                 tcp_rl_log_enobuf(rack->r_ctl.crte);
18897                         }
18898                         counter_u64_add(rack_saw_enobuf, 1);
18899                         goto enobufs;
18900                 case EMSGSIZE:
18901                         /*
18902                          * For some reason the interface we used initially
18903                          * to send segments changed to another or lowered
18904                          * its MTU. If TSO was active we either got an
18905                          * interface without TSO capabilities or TSO was
18906                          * turned off. If we obtained mtu from ip_output()
18907                          * then update it and try again.
18908                          */
18909                         if (tso)
18910                                 tp->t_flags &= ~TF_TSO;
18911                         if (mtu != 0) {
18912                                 tcp_mss_update(tp, -1, mtu, NULL, NULL);
18913                                 goto again;
18914                         }
18915                         slot = 10 * HPTS_USEC_IN_MSEC;
18916                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
18917 #ifdef TCP_ACCOUNTING
18918                         crtsc = get_cyclecount();
18919                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18920                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
18921                         }
18922                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
18923                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18924                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
18925                         }
18926                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
18927                         sched_unpin();
18928 #endif
18929                         return (error);
18930                 case ENETUNREACH:
18931                         counter_u64_add(rack_saw_enetunreach, 1);
18932                 case EHOSTDOWN:
18933                 case EHOSTUNREACH:
18934                 case ENETDOWN:
18935                         if (TCPS_HAVERCVDSYN(tp->t_state)) {
18936                                 tp->t_softerror = error;
18937                         }
18938                         /* FALLTHROUGH */
18939                 default:
18940                         slot = 10 * HPTS_USEC_IN_MSEC;
18941                         rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
18942 #ifdef TCP_ACCOUNTING
18943                         crtsc = get_cyclecount();
18944                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18945                                 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
18946                         }
18947                         counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
18948                         if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18949                                 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
18950                         }
18951                         counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
18952                         sched_unpin();
18953 #endif
18954                         return (error);
18955                 }
18956         } else {
18957                 rack->rc_enobuf = 0;
18958                 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
18959                         rack->r_ctl.retran_during_recovery += len;
18960         }
18961         KMOD_TCPSTAT_INC(tcps_sndtotal);
18962
18963         /*
18964          * Data sent (as far as we can tell). If this advertises a larger
18965          * window than any other segment, then remember the size of the
18966          * advertised window. Any pending ACK has now been sent.
18967          */
18968         if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
18969                 tp->rcv_adv = tp->rcv_nxt + recwin;
18970
18971         tp->last_ack_sent = tp->rcv_nxt;
18972         tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
18973 enobufs:
18974         if (sendalot) {
18975                 /* Do we need to turn off sendalot? */
18976                 if (rack->r_ctl.rc_pace_max_segs &&
18977                     (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
18978                         /* We hit our max. */
18979                         sendalot = 0;
18980                 } else if ((rack->rc_user_set_max_segs) &&
18981                            (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
18982                         /* We hit the user defined max */
18983                         sendalot = 0;
18984                 }
18985         }
18986         if ((error == 0) && (flags & TH_FIN))
18987                 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
18988         if (flags & TH_RST) {
18989                 /*
18990                  * We don't send again after sending a RST.
18991                  */
18992                 slot = 0;
18993                 sendalot = 0;
18994                 if (error == 0)
18995                         tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
18996         } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
18997                 /*
18998                  * Get our pacing rate, if an error
18999                  * occurred in sending (ENOBUF) we would
19000                  * hit the else if with slot preset. Other
19001                  * errors return.
19002                  */
19003                 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
19004         }
19005         if (rsm &&
19006             (rsm->r_flags & RACK_HAS_SYN) == 0 &&
19007             rack->use_rack_rr) {
19008                 /* Its a retransmit and we use the rack cheat? */
19009                 if ((slot == 0) ||
19010                     (rack->rc_always_pace == 0) ||
19011                     (rack->r_rr_config == 1)) {
19012                         /*
19013                          * We have no pacing set or we
19014                          * are using old-style rack or
19015                          * we are overridden to use the old 1ms pacing.
19016                          */
19017                         slot = rack->r_ctl.rc_min_to;
19018                 }
19019         }
19020         /* We have sent clear the flag */
19021         rack->r_ent_rec_ns = 0;
19022         if (rack->r_must_retran) {
19023                 if (rsm) {
19024                         rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
19025                         if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
19026                                 /*
19027                                  * We have retransmitted all.
19028                                  */
19029                                 rack->r_must_retran = 0;
19030                                 rack->r_ctl.rc_out_at_rto = 0;
19031                         }
19032                 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19033                         /*
19034                          * Sending new data will also kill
19035                          * the loop.
19036                          */
19037                         rack->r_must_retran = 0;
19038                         rack->r_ctl.rc_out_at_rto = 0;
19039                 }
19040         }
19041         rack->r_ctl.fsb.recwin = recwin;
19042         if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
19043             SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
19044                 /*
19045                  * We hit an RTO and have now passed snd_max at the RTO,
19046                  * clear all the WAS flags.
19047                  */
19048                 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
19049         }
19050         if (slot) {
19051                 /* set the rack tcb into the slot N */
19052                 if ((error == 0) &&
19053                     rack_use_rfo &&
19054                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19055                     (rsm == NULL) &&
19056                     (tp->snd_nxt == tp->snd_max) &&
19057                     (ipoptlen == 0) &&
19058                     (tp->rcv_numsacks == 0) &&
19059                     rack->r_fsb_inited &&
19060                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19061                     (rack->r_must_retran == 0) &&
19062                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19063                     (len > 0) && (orig_len > 0) &&
19064                     (orig_len > len) &&
19065                     ((orig_len - len) >= segsiz) &&
19066                     ((optlen == 0) ||
19067                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19068                         /* We can send at least one more MSS using our fsb */
19069
19070                         rack->r_fast_output = 1;
19071                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19072                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19073                         rack->r_ctl.fsb.tcp_flags = flags;
19074                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19075                         if (hw_tls)
19076                                 rack->r_ctl.fsb.hw_tls = 1;
19077                         else
19078                                 rack->r_ctl.fsb.hw_tls = 0;
19079                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19080                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19081                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19082                                  (tp->snd_max - tp->snd_una)));
19083                         if (rack->r_ctl.fsb.left_to_send < segsiz)
19084                                 rack->r_fast_output = 0;
19085                         else {
19086                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19087                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19088                                 else
19089                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19090                         }
19091                 } else
19092                         rack->r_fast_output = 0;
19093                 rack_log_fsb(rack, tp, so, flags,
19094                              ipoptlen, orig_len, len, error,
19095                              (rsm == NULL), optlen, __LINE__, 2);
19096         } else if (sendalot) {
19097                 int ret;
19098
19099                 sack_rxmit = 0;
19100                 if ((error == 0) &&
19101                     rack_use_rfo &&
19102                     ((flags & (TH_SYN|TH_FIN)) == 0) &&
19103                     (rsm == NULL) &&
19104                     (ipoptlen == 0) &&
19105                     (tp->rcv_numsacks == 0) &&
19106                     (tp->snd_nxt == tp->snd_max) &&
19107                     (rack->r_must_retran == 0) &&
19108                     rack->r_fsb_inited &&
19109                     TCPS_HAVEESTABLISHED(tp->t_state) &&
19110                     ((tp->t_flags & TF_NEEDFIN) == 0) &&
19111                     (len > 0) && (orig_len > 0) &&
19112                     (orig_len > len) &&
19113                     ((orig_len - len) >= segsiz) &&
19114                     ((optlen == 0) ||
19115                      ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
19116                         /* we can use fast_output for more */
19117
19118                         rack->r_fast_output = 1;
19119                         rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19120                         rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19121                         rack->r_ctl.fsb.tcp_flags = flags;
19122                         rack->r_ctl.fsb.left_to_send = orig_len - len;
19123                         if (hw_tls)
19124                                 rack->r_ctl.fsb.hw_tls = 1;
19125                         else
19126                                 rack->r_ctl.fsb.hw_tls = 0;
19127                         KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19128                                 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19129                                  rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19130                                  (tp->snd_max - tp->snd_una)));
19131                         if (rack->r_ctl.fsb.left_to_send < segsiz) {
19132                                 rack->r_fast_output = 0;
19133                         }
19134                         if (rack->r_fast_output) {
19135                                 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19136                                         rack->r_ctl.fsb.rfo_apply_push = 1;
19137                                 else
19138                                         rack->r_ctl.fsb.rfo_apply_push = 0;
19139                                 rack_log_fsb(rack, tp, so, flags,
19140                                              ipoptlen, orig_len, len, error,
19141                                              (rsm == NULL), optlen, __LINE__, 3);
19142                                 error = 0;
19143                                 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
19144                                 if (ret >= 0)
19145                                         return (ret);
19146                                 else if (error)
19147                                         goto nomore;
19148
19149                         }
19150                 }
19151                 goto again;
19152         }
19153         /* Assure when we leave that snd_nxt will point to top */
19154         if (SEQ_GT(tp->snd_max, tp->snd_nxt))
19155                 tp->snd_nxt = tp->snd_max;
19156         rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
19157 #ifdef TCP_ACCOUNTING
19158         crtsc = get_cyclecount() - ts_val;
19159         if (tot_len_this_send) {
19160                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19161                         tp->tcp_cnt_counters[SND_OUT_DATA]++;
19162                 }
19163                 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
19164                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19165                         tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
19166                 }
19167                 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc);
19168                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19169                         tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
19170                 }
19171                 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz));
19172         } else {
19173                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19174                         tp->tcp_cnt_counters[SND_OUT_ACK]++;
19175                 }
19176                 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1);
19177                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19178                         tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
19179                 }
19180                 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc);
19181         }
19182         sched_unpin();
19183 #endif
19184         if (error == ENOBUFS)
19185                 error = 0;
19186         return (error);
19187 }
19188
19189 static void
19190 rack_update_seg(struct tcp_rack *rack)
19191 {
19192         uint32_t orig_val;
19193
19194         orig_val = rack->r_ctl.rc_pace_max_segs;
19195         rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
19196         if (orig_val != rack->r_ctl.rc_pace_max_segs)
19197                 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
19198 }
19199
19200 static void
19201 rack_mtu_change(struct tcpcb *tp)
19202 {
19203         /*
19204          * The MSS may have changed
19205          */
19206         struct tcp_rack *rack;
19207         struct rack_sendmap *rsm;
19208
19209         rack = (struct tcp_rack *)tp->t_fb_ptr;
19210         if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
19211                 /*
19212                  * The MTU has changed we need to resend everything
19213                  * since all we have sent is lost. We first fix
19214                  * up the mtu though.
19215                  */
19216                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
19217                 /* We treat this like a full retransmit timeout without the cwnd adjustment */
19218                 rack_remxt_tmr(tp);
19219                 rack->r_fast_output = 0;
19220                 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
19221                                                 rack->r_ctl.rc_sacked);
19222                 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
19223                 rack->r_must_retran = 1;
19224                 /* Mark all inflight to needing to be rxt'd */
19225                 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
19226                         rsm->r_flags |= RACK_MUST_RXT;
19227                 }
19228         }
19229         sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
19230         /* We don't use snd_nxt to retransmit */
19231         tp->snd_nxt = tp->snd_max;
19232 }
19233
19234 static int
19235 rack_set_profile(struct tcp_rack *rack, int prof)
19236 {
19237         int err = EINVAL;
19238         if (prof == 1) {
19239                 /* pace_always=1 */
19240                 if (rack->rc_always_pace == 0) {
19241                         if (tcp_can_enable_pacing() == 0)
19242                                 return (EBUSY);
19243                 }
19244                 rack->rc_always_pace = 1;
19245                 if (rack->use_fixed_rate || rack->gp_ready)
19246                         rack_set_cc_pacing(rack);
19247                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19248                 rack->rack_attempt_hdwr_pace = 0;
19249                 /* cmpack=1 */
19250                 if (rack_use_cmp_acks)
19251                         rack->r_use_cmp_ack = 1;
19252                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19253                     rack->r_use_cmp_ack)
19254                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19255                 /* scwnd=1 */
19256                 rack->rack_enable_scwnd = 1;
19257                 /* dynamic=100 */
19258                 rack->rc_gp_dyn_mul = 1;
19259                 /* gp_inc_ca */
19260                 rack->r_ctl.rack_per_of_gp_ca = 100;
19261                 /* rrr_conf=3 */
19262                 rack->r_rr_config = 3;
19263                 /* npush=2 */
19264                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19265                 /* fillcw=1 */
19266                 rack->rc_pace_to_cwnd = 1;
19267                 rack->rc_pace_fill_if_rttin_range = 0;
19268                 rack->rtt_limit_mul = 0;
19269                 /* noprr=1 */
19270                 rack->rack_no_prr = 1;
19271                 /* lscwnd=1 */
19272                 rack->r_limit_scw = 1;
19273                 /* gp_inc_rec */
19274                 rack->r_ctl.rack_per_of_gp_rec = 90;
19275                 err = 0;
19276
19277         } else if (prof == 3) {
19278                 /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */
19279                 /* pace_always=1 */
19280                 if (rack->rc_always_pace == 0) {
19281                         if (tcp_can_enable_pacing() == 0)
19282                                 return (EBUSY);
19283                 }
19284                 rack->rc_always_pace = 1;
19285                 if (rack->use_fixed_rate || rack->gp_ready)
19286                         rack_set_cc_pacing(rack);
19287                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19288                 rack->rack_attempt_hdwr_pace = 0;
19289                 /* cmpack=1 */
19290                 if (rack_use_cmp_acks)
19291                         rack->r_use_cmp_ack = 1;
19292                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
19293                     rack->r_use_cmp_ack)
19294                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19295                 /* scwnd=1 */
19296                 rack->rack_enable_scwnd = 1;
19297                 /* dynamic=100 */
19298                 rack->rc_gp_dyn_mul = 1;
19299                 /* gp_inc_ca */
19300                 rack->r_ctl.rack_per_of_gp_ca = 100;
19301                 /* rrr_conf=3 */
19302                 rack->r_rr_config = 3;
19303                 /* npush=2 */
19304                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19305                 /* fillcw=2 */
19306                 rack->rc_pace_to_cwnd = 1;
19307                 rack->r_fill_less_agg = 1;
19308                 rack->rc_pace_fill_if_rttin_range = 0;
19309                 rack->rtt_limit_mul = 0;
19310                 /* noprr=1 */
19311                 rack->rack_no_prr = 1;
19312                 /* lscwnd=1 */
19313                 rack->r_limit_scw = 1;
19314                 /* gp_inc_rec */
19315                 rack->r_ctl.rack_per_of_gp_rec = 90;
19316                 err = 0;
19317
19318
19319         } else if (prof == 2) {
19320                 /* cmpack=1 */
19321                 if (rack->rc_always_pace == 0) {
19322                         if (tcp_can_enable_pacing() == 0)
19323                                 return (EBUSY);
19324                 }
19325                 rack->rc_always_pace = 1;
19326                 if (rack->use_fixed_rate || rack->gp_ready)
19327                         rack_set_cc_pacing(rack);
19328                 rack->r_use_cmp_ack = 1;
19329                 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19330                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19331                 /* pace_always=1 */
19332                 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19333                 /* scwnd=1 */
19334                 rack->rack_enable_scwnd = 1;
19335                 /* dynamic=100 */
19336                 rack->rc_gp_dyn_mul = 1;
19337                 rack->r_ctl.rack_per_of_gp_ca = 100;
19338                 /* rrr_conf=3 */
19339                 rack->r_rr_config = 3;
19340                 /* npush=2 */
19341                 rack->r_ctl.rc_no_push_at_mrtt = 2;
19342                 /* fillcw=1 */
19343                 rack->rc_pace_to_cwnd = 1;
19344                 rack->rc_pace_fill_if_rttin_range = 0;
19345                 rack->rtt_limit_mul = 0;
19346                 /* noprr=1 */
19347                 rack->rack_no_prr = 1;
19348                 /* lscwnd=0 */
19349                 rack->r_limit_scw = 0;
19350                 err = 0;
19351         } else if (prof == 0) {
19352                 /* This changes things back to the default settings */
19353                 err = 0;
19354                 if (rack->rc_always_pace) {
19355                         tcp_decrement_paced_conn();
19356                         rack_undo_cc_pacing(rack);
19357                         rack->rc_always_pace = 0;
19358                 }
19359                 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
19360                         rack->rc_always_pace = 1;
19361                         if (rack->use_fixed_rate || rack->gp_ready)
19362                                 rack_set_cc_pacing(rack);
19363                 } else
19364                         rack->rc_always_pace = 0;
19365                 if (rack_dsack_std_based & 0x1) {
19366                         /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
19367                         rack->rc_rack_tmr_std_based = 1;
19368                 }
19369                 if (rack_dsack_std_based & 0x2) {
19370                         /* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
19371                         rack->rc_rack_use_dsack = 1;
19372                 }
19373                 if (rack_use_cmp_acks)
19374                         rack->r_use_cmp_ack = 1;
19375                 else
19376                         rack->r_use_cmp_ack = 0;
19377                 if (rack_disable_prr)
19378                         rack->rack_no_prr = 1;
19379                 else
19380                         rack->rack_no_prr = 0;
19381                 if (rack_gp_no_rec_chg)
19382                         rack->rc_gp_no_rec_chg = 1;
19383                 else
19384                         rack->rc_gp_no_rec_chg = 0;
19385                 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
19386                         rack->r_mbuf_queue = 1;
19387                         if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
19388                                 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19389                         rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19390                 } else {
19391                         rack->r_mbuf_queue = 0;
19392                         rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19393                 }
19394                 if (rack_enable_shared_cwnd)
19395                         rack->rack_enable_scwnd = 1;
19396                 else
19397                         rack->rack_enable_scwnd = 0;
19398                 if (rack_do_dyn_mul) {
19399                         /* When dynamic adjustment is on CA needs to start at 100% */
19400                         rack->rc_gp_dyn_mul = 1;
19401                         if (rack_do_dyn_mul >= 100)
19402                                 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
19403                 } else {
19404                         rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
19405                         rack->rc_gp_dyn_mul = 0;
19406                 }
19407                 rack->r_rr_config = 0;
19408                 rack->r_ctl.rc_no_push_at_mrtt = 0;
19409                 rack->rc_pace_to_cwnd = 0;
19410                 rack->rc_pace_fill_if_rttin_range = 0;
19411                 rack->rtt_limit_mul = 0;
19412
19413                 if (rack_enable_hw_pacing)
19414                         rack->rack_hdw_pace_ena = 1;
19415                 else
19416                         rack->rack_hdw_pace_ena = 0;
19417                 if (rack_disable_prr)
19418                         rack->rack_no_prr = 1;
19419                 else
19420                         rack->rack_no_prr = 0;
19421                 if (rack_limits_scwnd)
19422                         rack->r_limit_scw  = 1;
19423                 else
19424                         rack->r_limit_scw  = 0;
19425                 err = 0;
19426         }
19427         return (err);
19428 }
19429
19430 static int
19431 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
19432 {
19433         struct deferred_opt_list *dol;
19434
19435         dol = malloc(sizeof(struct deferred_opt_list),
19436                      M_TCPFSB, M_NOWAIT|M_ZERO);
19437         if (dol == NULL) {
19438                 /*
19439                  * No space yikes -- fail out..
19440                  */
19441                 return (0);
19442         }
19443         dol->optname = sopt_name;
19444         dol->optval = loptval;
19445         TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
19446         return (1);
19447 }
19448
19449 static int
19450 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
19451                     uint32_t optval, uint64_t loptval)
19452 {
19453         struct epoch_tracker et;
19454         struct sockopt sopt;
19455         struct cc_newreno_opts opt;
19456         uint64_t val;
19457         int error = 0;
19458         uint16_t ca, ss;
19459
19460         switch (sopt_name) {
19461
19462         case TCP_RACK_DSACK_OPT:
19463                 RACK_OPTS_INC(tcp_rack_dsack_opt);
19464                 if (optval & 0x1) {
19465                         rack->rc_rack_tmr_std_based = 1;
19466                 } else {
19467                         rack->rc_rack_tmr_std_based = 0;
19468                 }
19469                 if (optval & 0x2) {
19470                         rack->rc_rack_use_dsack = 1;
19471                 } else {
19472                         rack->rc_rack_use_dsack = 0;
19473                 }
19474                 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
19475                 break;
19476         case TCP_RACK_PACING_BETA:
19477                 RACK_OPTS_INC(tcp_rack_beta);
19478                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19479                         /* This only works for newreno. */
19480                         error = EINVAL;
19481                         break;
19482                 }
19483                 if (rack->rc_pacing_cc_set) {
19484                         /*
19485                          * Set them into the real CC module
19486                          * whats in the rack pcb is the old values
19487                          * to be used on restoral/
19488                          */
19489                         sopt.sopt_dir = SOPT_SET;
19490                         opt.name = CC_NEWRENO_BETA;
19491                         opt.val = optval;
19492                         if (CC_ALGO(tp)->ctl_output != NULL)
19493                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19494                         else {
19495                                 error = ENOENT;
19496                                 break;
19497                         }
19498                 } else {
19499                         /*
19500                          * Not pacing yet so set it into our local
19501                          * rack pcb storage.
19502                          */
19503                         rack->r_ctl.rc_saved_beta.beta = optval;
19504                 }
19505                 break;
19506         case TCP_RACK_TIMER_SLOP:
19507                 RACK_OPTS_INC(tcp_rack_timer_slop);
19508                 rack->r_ctl.timer_slop = optval;
19509                 if (rack->rc_tp->t_srtt) {
19510                         /*
19511                          * If we have an SRTT lets update t_rxtcur
19512                          * to have the new slop.
19513                          */
19514                         RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
19515                                            rack_rto_min, rack_rto_max,
19516                                            rack->r_ctl.timer_slop);
19517                 }
19518                 break;
19519         case TCP_RACK_PACING_BETA_ECN:
19520                 RACK_OPTS_INC(tcp_rack_beta_ecn);
19521                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
19522                         /* This only works for newreno. */
19523                         error = EINVAL;
19524                         break;
19525                 }
19526                 if (rack->rc_pacing_cc_set) {
19527                         /*
19528                          * Set them into the real CC module
19529                          * whats in the rack pcb is the old values
19530                          * to be used on restoral/
19531                          */
19532                         sopt.sopt_dir = SOPT_SET;
19533                         opt.name = CC_NEWRENO_BETA_ECN;
19534                         opt.val = optval;
19535                         if (CC_ALGO(tp)->ctl_output != NULL)
19536                                 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
19537                         else
19538                                 error = ENOENT;
19539                 } else {
19540                         /*
19541                          * Not pacing yet so set it into our local
19542                          * rack pcb storage.
19543                          */
19544                         rack->r_ctl.rc_saved_beta.beta_ecn = optval;
19545                         rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED;
19546                 }
19547                 break;
19548         case TCP_DEFER_OPTIONS:
19549                 RACK_OPTS_INC(tcp_defer_opt);
19550                 if (optval) {
19551                         if (rack->gp_ready) {
19552                                 /* Too late */
19553                                 error = EINVAL;
19554                                 break;
19555                         }
19556                         rack->defer_options = 1;
19557                 } else
19558                         rack->defer_options = 0;
19559                 break;
19560         case TCP_RACK_MEASURE_CNT:
19561                 RACK_OPTS_INC(tcp_rack_measure_cnt);
19562                 if (optval && (optval <= 0xff)) {
19563                         rack->r_ctl.req_measurements = optval;
19564                 } else
19565                         error = EINVAL;
19566                 break;
19567         case TCP_REC_ABC_VAL:
19568                 RACK_OPTS_INC(tcp_rec_abc_val);
19569                 if (optval > 0)
19570                         rack->r_use_labc_for_rec = 1;
19571                 else
19572                         rack->r_use_labc_for_rec = 0;
19573                 break;
19574         case TCP_RACK_ABC_VAL:
19575                 RACK_OPTS_INC(tcp_rack_abc_val);
19576                 if ((optval > 0) && (optval < 255))
19577                         rack->rc_labc = optval;
19578                 else
19579                         error = EINVAL;
19580                 break;
19581         case TCP_HDWR_UP_ONLY:
19582                 RACK_OPTS_INC(tcp_pacing_up_only);
19583                 if (optval)
19584                         rack->r_up_only = 1;
19585                 else
19586                         rack->r_up_only = 0;
19587                 break;
19588         case TCP_PACING_RATE_CAP:
19589                 RACK_OPTS_INC(tcp_pacing_rate_cap);
19590                 rack->r_ctl.bw_rate_cap = loptval;
19591                 break;
19592         case TCP_RACK_PROFILE:
19593                 RACK_OPTS_INC(tcp_profile);
19594                 error = rack_set_profile(rack, optval);
19595                 break;
19596         case TCP_USE_CMP_ACKS:
19597                 RACK_OPTS_INC(tcp_use_cmp_acks);
19598                 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
19599                         /* You can't turn it off once its on! */
19600                         error = EINVAL;
19601                 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
19602                         rack->r_use_cmp_ack = 1;
19603                         rack->r_mbuf_queue = 1;
19604                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19605                 }
19606                 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
19607                         rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
19608                 break;
19609         case TCP_SHARED_CWND_TIME_LIMIT:
19610                 RACK_OPTS_INC(tcp_lscwnd);
19611                 if (optval)
19612                         rack->r_limit_scw = 1;
19613                 else
19614                         rack->r_limit_scw = 0;
19615                 break;
19616         case TCP_RACK_PACE_TO_FILL:
19617                 RACK_OPTS_INC(tcp_fillcw);
19618                 if (optval == 0)
19619                         rack->rc_pace_to_cwnd = 0;
19620                 else {
19621                         rack->rc_pace_to_cwnd = 1;
19622                         if (optval > 1)
19623                                 rack->r_fill_less_agg = 1;
19624                 }
19625                 if ((optval >= rack_gp_rtt_maxmul) &&
19626                     rack_gp_rtt_maxmul &&
19627                     (optval < 0xf)) {
19628                         rack->rc_pace_fill_if_rttin_range = 1;
19629                         rack->rtt_limit_mul = optval;
19630                 } else {
19631                         rack->rc_pace_fill_if_rttin_range = 0;
19632                         rack->rtt_limit_mul = 0;
19633                 }
19634                 break;
19635         case TCP_RACK_NO_PUSH_AT_MAX:
19636                 RACK_OPTS_INC(tcp_npush);
19637                 if (optval == 0)
19638                         rack->r_ctl.rc_no_push_at_mrtt = 0;
19639                 else if (optval < 0xff)
19640                         rack->r_ctl.rc_no_push_at_mrtt = optval;
19641                 else
19642                         error = EINVAL;
19643                 break;
19644         case TCP_SHARED_CWND_ENABLE:
19645                 RACK_OPTS_INC(tcp_rack_scwnd);
19646                 if (optval == 0)
19647                         rack->rack_enable_scwnd = 0;
19648                 else
19649                         rack->rack_enable_scwnd = 1;
19650                 break;
19651         case TCP_RACK_MBUF_QUEUE:
19652                 /* Now do we use the LRO mbuf-queue feature */
19653                 RACK_OPTS_INC(tcp_rack_mbufq);
19654                 if (optval || rack->r_use_cmp_ack)
19655                         rack->r_mbuf_queue = 1;
19656                 else
19657                         rack->r_mbuf_queue = 0;
19658                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19659                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19660                 else
19661                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19662                 break;
19663         case TCP_RACK_NONRXT_CFG_RATE:
19664                 RACK_OPTS_INC(tcp_rack_cfg_rate);
19665                 if (optval == 0)
19666                         rack->rack_rec_nonrxt_use_cr = 0;
19667                 else
19668                         rack->rack_rec_nonrxt_use_cr = 1;
19669                 break;
19670         case TCP_NO_PRR:
19671                 RACK_OPTS_INC(tcp_rack_noprr);
19672                 if (optval == 0)
19673                         rack->rack_no_prr = 0;
19674                 else if (optval == 1)
19675                         rack->rack_no_prr = 1;
19676                 else if (optval == 2)
19677                         rack->no_prr_addback = 1;
19678                 else
19679                         error = EINVAL;
19680                 break;
19681         case TCP_TIMELY_DYN_ADJ:
19682                 RACK_OPTS_INC(tcp_timely_dyn);
19683                 if (optval == 0)
19684                         rack->rc_gp_dyn_mul = 0;
19685                 else {
19686                         rack->rc_gp_dyn_mul = 1;
19687                         if (optval >= 100) {
19688                                 /*
19689                                  * If the user sets something 100 or more
19690                                  * its the gp_ca value.
19691                                  */
19692                                 rack->r_ctl.rack_per_of_gp_ca  = optval;
19693                         }
19694                 }
19695                 break;
19696         case TCP_RACK_DO_DETECTION:
19697                 RACK_OPTS_INC(tcp_rack_do_detection);
19698                 if (optval == 0)
19699                         rack->do_detection = 0;
19700                 else
19701                         rack->do_detection = 1;
19702                 break;
19703         case TCP_RACK_TLP_USE:
19704                 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
19705                         error = EINVAL;
19706                         break;
19707                 }
19708                 RACK_OPTS_INC(tcp_tlp_use);
19709                 rack->rack_tlp_threshold_use = optval;
19710                 break;
19711         case TCP_RACK_TLP_REDUCE:
19712                 /* RACK TLP cwnd reduction (bool) */
19713                 RACK_OPTS_INC(tcp_rack_tlp_reduce);
19714                 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
19715                 break;
19716         /*  Pacing related ones */
19717         case TCP_RACK_PACE_ALWAYS:
19718                 /*
19719                  * zero is old rack method, 1 is new
19720                  * method using a pacing rate.
19721                  */
19722                 RACK_OPTS_INC(tcp_rack_pace_always);
19723                 if (optval > 0) {
19724                         if (rack->rc_always_pace) {
19725                                 error = EALREADY;
19726                                 break;
19727                         } else if (tcp_can_enable_pacing()) {
19728                                 rack->rc_always_pace = 1;
19729                                 if (rack->use_fixed_rate || rack->gp_ready)
19730                                         rack_set_cc_pacing(rack);
19731                         }
19732                         else {
19733                                 error = ENOSPC;
19734                                 break;
19735                         }
19736                 } else {
19737                         if (rack->rc_always_pace) {
19738                                 tcp_decrement_paced_conn();
19739                                 rack->rc_always_pace = 0;
19740                                 rack_undo_cc_pacing(rack);
19741                         }
19742                 }
19743                 if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
19744                         tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
19745                 else
19746                         tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
19747                 /* A rate may be set irate or other, if so set seg size */
19748                 rack_update_seg(rack);
19749                 break;
19750         case TCP_BBR_RACK_INIT_RATE:
19751                 RACK_OPTS_INC(tcp_initial_rate);
19752                 val = optval;
19753                 /* Change from kbits per second to bytes per second */
19754                 val *= 1000;
19755                 val /= 8;
19756                 rack->r_ctl.init_rate = val;
19757                 if (rack->rc_init_win != rack_default_init_window) {
19758                         uint32_t win, snt;
19759
19760                         /*
19761                          * Options don't always get applied
19762                          * in the order you think. So in order
19763                          * to assure we update a cwnd we need
19764                          * to check and see if we are still
19765                          * where we should raise the cwnd.
19766                          */
19767                         win = rc_init_window(rack);
19768                         if (SEQ_GT(tp->snd_max, tp->iss))
19769                                 snt = tp->snd_max - tp->iss;
19770                         else
19771                                 snt = 0;
19772                         if ((snt < win) &&
19773                             (tp->snd_cwnd < win))
19774                                 tp->snd_cwnd = win;
19775                 }
19776                 if (rack->rc_always_pace)
19777                         rack_update_seg(rack);
19778                 break;
19779         case TCP_BBR_IWINTSO:
19780                 RACK_OPTS_INC(tcp_initial_win);
19781                 if (optval && (optval <= 0xff)) {
19782                         uint32_t win, snt;
19783
19784                         rack->rc_init_win = optval;
19785                         win = rc_init_window(rack);
19786                         if (SEQ_GT(tp->snd_max, tp->iss))
19787                                 snt = tp->snd_max - tp->iss;
19788                         else
19789                                 snt = 0;
19790                         if ((snt < win) &&
19791                             (tp->t_srtt |
19792 #ifdef NETFLIX_PEAKRATE
19793                              tp->t_maxpeakrate |
19794 #endif
19795                              rack->r_ctl.init_rate)) {
19796                                 /*
19797                                  * We are not past the initial window
19798                                  * and we have some bases for pacing,
19799                                  * so we need to possibly adjust up
19800                                  * the cwnd. Note even if we don't set
19801                                  * the cwnd, its still ok to raise the rc_init_win
19802                                  * which can be used coming out of idle when we
19803                                  * would have a rate.
19804                                  */
19805                                 if (tp->snd_cwnd < win)
19806                                         tp->snd_cwnd = win;
19807                         }
19808                         if (rack->rc_always_pace)
19809                                 rack_update_seg(rack);
19810                 } else
19811                         error = EINVAL;
19812                 break;
19813         case TCP_RACK_FORCE_MSEG:
19814                 RACK_OPTS_INC(tcp_rack_force_max_seg);
19815                 if (optval)
19816                         rack->rc_force_max_seg = 1;
19817                 else
19818                         rack->rc_force_max_seg = 0;
19819                 break;
19820         case TCP_RACK_PACE_MAX_SEG:
19821                 /* Max segments size in a pace in bytes */
19822                 RACK_OPTS_INC(tcp_rack_max_seg);
19823                 rack->rc_user_set_max_segs = optval;
19824                 rack_set_pace_segments(tp, rack, __LINE__, NULL);
19825                 break;
19826         case TCP_RACK_PACE_RATE_REC:
19827                 /* Set the fixed pacing rate in Bytes per second ca */
19828                 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
19829                 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
19830                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
19831                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
19832                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
19833                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
19834                 rack->use_fixed_rate = 1;
19835                 if (rack->rc_always_pace)
19836                         rack_set_cc_pacing(rack);
19837                 rack_log_pacing_delay_calc(rack,
19838                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
19839                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
19840                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
19841                                            __LINE__, NULL,0);
19842                 break;
19843
19844         case TCP_RACK_PACE_RATE_SS:
19845                 /* Set the fixed pacing rate in Bytes per second ca */
19846                 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
19847                 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
19848                 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
19849                         rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
19850                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
19851                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
19852                 rack->use_fixed_rate = 1;
19853                 if (rack->rc_always_pace)
19854                         rack_set_cc_pacing(rack);
19855                 rack_log_pacing_delay_calc(rack,
19856                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
19857                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
19858                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
19859                                            __LINE__, NULL, 0);
19860                 break;
19861
19862         case TCP_RACK_PACE_RATE_CA:
19863                 /* Set the fixed pacing rate in Bytes per second ca */
19864                 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
19865                 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
19866                 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
19867                         rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
19868                 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
19869                         rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
19870                 rack->use_fixed_rate = 1;
19871                 if (rack->rc_always_pace)
19872                         rack_set_cc_pacing(rack);
19873                 rack_log_pacing_delay_calc(rack,
19874                                            rack->r_ctl.rc_fixed_pacing_rate_ss,
19875                                            rack->r_ctl.rc_fixed_pacing_rate_ca,
19876                                            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
19877                                            __LINE__, NULL, 0);
19878                 break;
19879         case TCP_RACK_GP_INCREASE_REC:
19880                 RACK_OPTS_INC(tcp_gp_inc_rec);
19881                 rack->r_ctl.rack_per_of_gp_rec = optval;
19882                 rack_log_pacing_delay_calc(rack,
19883                                            rack->r_ctl.rack_per_of_gp_ss,
19884                                            rack->r_ctl.rack_per_of_gp_ca,
19885                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
19886                                            __LINE__, NULL, 0);
19887                 break;
19888         case TCP_RACK_GP_INCREASE_CA:
19889                 RACK_OPTS_INC(tcp_gp_inc_ca);
19890                 ca = optval;
19891                 if (ca < 100) {
19892                         /*
19893                          * We don't allow any reduction
19894                          * over the GP b/w.
19895                          */
19896                         error = EINVAL;
19897                         break;
19898                 }
19899                 rack->r_ctl.rack_per_of_gp_ca = ca;
19900                 rack_log_pacing_delay_calc(rack,
19901                                            rack->r_ctl.rack_per_of_gp_ss,
19902                                            rack->r_ctl.rack_per_of_gp_ca,
19903                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
19904                                            __LINE__, NULL, 0);
19905                 break;
19906         case TCP_RACK_GP_INCREASE_SS:
19907                 RACK_OPTS_INC(tcp_gp_inc_ss);
19908                 ss = optval;
19909                 if (ss < 100) {
19910                         /*
19911                          * We don't allow any reduction
19912                          * over the GP b/w.
19913                          */
19914                         error = EINVAL;
19915                         break;
19916                 }
19917                 rack->r_ctl.rack_per_of_gp_ss = ss;
19918                 rack_log_pacing_delay_calc(rack,
19919                                            rack->r_ctl.rack_per_of_gp_ss,
19920                                            rack->r_ctl.rack_per_of_gp_ca,
19921                                            rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
19922                                            __LINE__, NULL, 0);
19923                 break;
19924         case TCP_RACK_RR_CONF:
19925                 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
19926                 if (optval && optval <= 3)
19927                         rack->r_rr_config = optval;
19928                 else
19929                         rack->r_rr_config = 0;
19930                 break;
19931         case TCP_HDWR_RATE_CAP:
19932                 RACK_OPTS_INC(tcp_hdwr_rate_cap);
19933                 if (optval) {
19934                         if (rack->r_rack_hw_rate_caps == 0)
19935                                 rack->r_rack_hw_rate_caps = 1;
19936                         else
19937                                 error = EALREADY;
19938                 } else {
19939                         rack->r_rack_hw_rate_caps = 0;
19940                 }
19941                 break;
19942         case TCP_BBR_HDWR_PACE:
19943                 RACK_OPTS_INC(tcp_hdwr_pacing);
19944                 if (optval){
19945                         if (rack->rack_hdrw_pacing == 0) {
19946                                 rack->rack_hdw_pace_ena = 1;
19947                                 rack->rack_attempt_hdwr_pace = 0;
19948                         } else
19949                                 error = EALREADY;
19950                 } else {
19951                         rack->rack_hdw_pace_ena = 0;
19952 #ifdef RATELIMIT
19953                         if (rack->r_ctl.crte != NULL) {
19954                                 rack->rack_hdrw_pacing = 0;
19955                                 rack->rack_attempt_hdwr_pace = 0;
19956                                 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
19957                                 rack->r_ctl.crte = NULL;
19958                         }
19959 #endif
19960                 }
19961                 break;
19962         /*  End Pacing related ones */
19963         case TCP_RACK_PRR_SENDALOT:
19964                 /* Allow PRR to send more than one seg */
19965                 RACK_OPTS_INC(tcp_rack_prr_sendalot);
19966                 rack->r_ctl.rc_prr_sendalot = optval;
19967                 break;
19968         case TCP_RACK_MIN_TO:
19969                 /* Minimum time between rack t-o's in ms */
19970                 RACK_OPTS_INC(tcp_rack_min_to);
19971                 rack->r_ctl.rc_min_to = optval;
19972                 break;
19973         case TCP_RACK_EARLY_SEG:
19974                 /* If early recovery max segments */
19975                 RACK_OPTS_INC(tcp_rack_early_seg);
19976                 rack->r_ctl.rc_early_recovery_segs = optval;
19977                 break;
19978         case TCP_RACK_ENABLE_HYSTART:
19979         {
19980                 if (optval) {
19981                         tp->ccv->flags |= CCF_HYSTART_ALLOWED;
19982                         if (rack_do_hystart > RACK_HYSTART_ON)
19983                                 tp->ccv->flags |= CCF_HYSTART_CAN_SH_CWND;
19984                         if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
19985                                 tp->ccv->flags |= CCF_HYSTART_CONS_SSTH;
19986                 } else {
19987                         tp->ccv->flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
19988                 }
19989         }
19990         break;
19991         case TCP_RACK_REORD_THRESH:
19992                 /* RACK reorder threshold (shift amount) */
19993                 RACK_OPTS_INC(tcp_rack_reord_thresh);
19994                 if ((optval > 0) && (optval < 31))
19995                         rack->r_ctl.rc_reorder_shift = optval;
19996                 else
19997                         error = EINVAL;
19998                 break;
19999         case TCP_RACK_REORD_FADE:
20000                 /* Does reordering fade after ms time */
20001                 RACK_OPTS_INC(tcp_rack_reord_fade);
20002                 rack->r_ctl.rc_reorder_fade = optval;
20003                 break;
20004         case TCP_RACK_TLP_THRESH:
20005                 /* RACK TLP theshold i.e. srtt+(srtt/N) */
20006                 RACK_OPTS_INC(tcp_rack_tlp_thresh);
20007                 if (optval)
20008                         rack->r_ctl.rc_tlp_threshold = optval;
20009                 else
20010                         error = EINVAL;
20011                 break;
20012         case TCP_BBR_USE_RACK_RR:
20013                 RACK_OPTS_INC(tcp_rack_rr);
20014                 if (optval)
20015                         rack->use_rack_rr = 1;
20016                 else
20017                         rack->use_rack_rr = 0;
20018                 break;
20019         case TCP_FAST_RSM_HACK:
20020                 RACK_OPTS_INC(tcp_rack_fastrsm_hack);
20021                 if (optval)
20022                         rack->fast_rsm_hack = 1;
20023                 else
20024                         rack->fast_rsm_hack = 0;
20025                 break;
20026         case TCP_RACK_PKT_DELAY:
20027                 /* RACK added ms i.e. rack-rtt + reord + N */
20028                 RACK_OPTS_INC(tcp_rack_pkt_delay);
20029                 rack->r_ctl.rc_pkt_delay = optval;
20030                 break;
20031         case TCP_DELACK:
20032                 RACK_OPTS_INC(tcp_rack_delayed_ack);
20033                 if (optval == 0)
20034                         tp->t_delayed_ack = 0;
20035                 else
20036                         tp->t_delayed_ack = 1;
20037                 if (tp->t_flags & TF_DELACK) {
20038                         tp->t_flags &= ~TF_DELACK;
20039                         tp->t_flags |= TF_ACKNOW;
20040                         NET_EPOCH_ENTER(et);
20041                         rack_output(tp);
20042                         NET_EPOCH_EXIT(et);
20043                 }
20044                 break;
20045
20046         case TCP_BBR_RACK_RTT_USE:
20047                 RACK_OPTS_INC(tcp_rack_rtt_use);
20048                 if ((optval != USE_RTT_HIGH) &&
20049                     (optval != USE_RTT_LOW) &&
20050                     (optval != USE_RTT_AVG))
20051                         error = EINVAL;
20052                 else
20053                         rack->r_ctl.rc_rate_sample_method = optval;
20054                 break;
20055         case TCP_DATA_AFTER_CLOSE:
20056                 RACK_OPTS_INC(tcp_data_after_close);
20057                 if (optval)
20058                         rack->rc_allow_data_af_clo = 1;
20059                 else
20060                         rack->rc_allow_data_af_clo = 0;
20061                 break;
20062         default:
20063                 break;
20064         }
20065 #ifdef NETFLIX_STATS
20066         tcp_log_socket_option(tp, sopt_name, optval, error);
20067 #endif
20068         return (error);
20069 }
20070
20071
/*
 * Drain the deferred socket-option list kept in rack->r_ctl.opt_list.
 * Each queued entry is unlinked, handed to rack_process_option() for
 * rack->rc_tp, and then freed.  The option value is passed twice: once
 * truncated to 32 bits and once as stored in the list entry.  The return
 * value of rack_process_option() is deliberately discarded.
 */
20072 static void
20073 rack_apply_deferred_options(struct tcp_rack *rack)
20074 {
20075         struct deferred_opt_list *dol, *sdol;
20076         uint32_t s_optval;
20077
              /* The _SAFE variant is needed because each entry is removed and freed in the loop. */
20078         TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
20079                 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
20080                 /* Disadvantage of deferral is that you lose the error return */
20081                 s_optval = (uint32_t)dol->optval;
20082                 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
20083                 free(dol, M_TCPDO);
20084         }
20085 }
20086
/*
 * Propagate a hardware-TLS state change into the rack state of tp.
 * A non-zero chg sets, and a zero chg clears, the r_hw_tls flag on every
 * rack_sendmap entry in the rc_mtree tree as well as fsb.hw_tls.
 * Installed as .tfb_hwtls_change in the __tcp_rack function block.
 */
20087 static void
20088 rack_hw_tls_change(struct tcpcb *tp, int chg)
20089 {
20090         /*
20091          * HW tls state has changed.. fix all
20092          * rsm's in flight.
20093          */
20094         struct tcp_rack *rack;
20095         struct rack_sendmap *rsm;
20096
20097         rack = (struct tcp_rack *)tp->t_fb_ptr;
20098         RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
20099                 if (chg)
20100                         rsm->r_hw_tls = 1;
20101                 else
20102                         rsm->r_hw_tls = 0;
20103         }
              /* Keep the fsb copy of the flag in step with the per-rsm flags. */
20104         if (chg)
20105                 rack->r_ctl.fsb.hw_tls = 1;
20106         else
20107                 rack->r_ctl.fsb.hw_tls = 0;
20108 }
20109
/*
 * Check the PRUS_* request flags for this stack.  Returns EOPNOTSUPP when
 * PRUS_OOB is set in flags and 0 otherwise; tp is not examined.
 * Installed as .tfb_pru_options in the __tcp_rack function block.
 */
20110 static int
20111 rack_pru_options(struct tcpcb *tp, int flags)
20112 {
20113         if (flags & PRUS_OOB)
20114                 return (EOPNOTSUPP);
20115         return (0);
20116 }
20117
/*
 * Function block for this TCP stack: names the stack (STACKNAME) and wires
 * its entry points into a struct tcp_function_block.  The address of this
 * object is also what rack_set_sockopt() compares tp->t_fb against to
 * confirm that a connection is still using this stack.
 */
20118 static struct tcp_function_block __tcp_rack = {
20119         .tfb_tcp_block_name = __XSTRING(STACKNAME),
20120         .tfb_tcp_output = rack_output,
20121         .tfb_do_queued_segments = ctf_do_queued_segments,
20122         .tfb_do_segment_nounlock = rack_do_segment_nounlock,
20123         .tfb_tcp_do_segment = rack_do_segment,
20124         .tfb_tcp_ctloutput = rack_ctloutput,
20125         .tfb_tcp_fb_init = rack_init,
20126         .tfb_tcp_fb_fini = rack_fini,
20127         .tfb_tcp_timer_stop_all = rack_stopall,
20128         .tfb_tcp_timer_activate = rack_timer_activate,
20129         .tfb_tcp_timer_active = rack_timer_active,
20130         .tfb_tcp_timer_stop = rack_timer_stop,
20131         .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
20132         .tfb_tcp_handoff_ok = rack_handoff_ok,
20133         .tfb_tcp_mtu_chg = rack_mtu_change,
20134         .tfb_pru_options = rack_pru_options,
20135         .tfb_hwtls_change = rack_hw_tls_change,
20136         .tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
20137 };
20138
20139 /*
20140  * rack_ctloutput() must drop the inpcb lock before performing copyin on
20141  * socket option arguments.  When it re-acquires the lock after the copy, it
20142  * has to revalidate that the connection is still valid for the socket
20143  * option.
20144  */
/*
 * rack_set_sockopt: set-side socket option handler for this stack.
 *
 * Entered with the inpcb write-locked.  Every return path visible in this
 * function releases the lock first, except the default case of the option
 * switch, which hands the still-locked inp to tcp_default_ctloutput()
 * (presumably that routine does the unlock -- confirm).
 *
 * IPPROTO_IPV6 and IPPROTO_IP level options are handled up front: they
 * update the cached header at r_ctl.fsb.tcp_ip_hdr (or call
 * tcp6_use_min_mtu()) and return 0.  For the TCP options listed in the
 * switch, the value is copied in with the lock dropped, the connection is
 * revalidated after relocking, and the option is then either queued with
 * rack_add_deferred_option() or applied with rack_process_option().
 *
 * Returns 0 on success, otherwise an errno value: EINVAL when there is no
 * rack state, the sooptcopyin() error, ECONNRESET, ENOPROTOOPT, ENOMEM, or
 * whatever rack_process_option() / tcp_default_ctloutput() return.
 */
20145 static int
20146 rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
20147 {
20148 #ifdef INET6
20149         struct ip6_hdr *ip6;
20150 #endif
20151 #ifdef INET
20152         struct ip *ip;
20153 #endif
20154         struct tcpcb *tp;
20155         struct tcp_rack *rack;
20156         uint64_t loptval;
20157         int32_t error = 0, optval;
20158
20159         tp = intotcpcb(inp);
20160         rack = (struct tcp_rack *)tp->t_fb_ptr;
20161         if (rack == NULL) {
20162                 INP_WUNLOCK(inp);
20163                 return (EINVAL);
20164         }
              /* Both pointers alias the same cached header buffer in the fsb. */
20165 #ifdef INET6
20166         ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
20167 #endif
20168 #ifdef INET
20169         ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
20170 #endif
20171
              /* IP-level options: sync the cached header from the inpcb and return. */
20172         switch (sopt->sopt_level) {
20173 #ifdef INET6
20174         case IPPROTO_IPV6:
20175                 MPASS(inp->inp_vflag & INP_IPV6PROTO);
20176                 switch (sopt->sopt_name) {
20177                 case IPV6_USE_MIN_MTU:
20178                         tcp6_use_min_mtu(tp);
20179                         break;
20180                 case IPV6_TCLASS:
20181                         /*
20182                          * The DSCP codepoint has changed, update the fsb.
20183                          */
20184                         ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
20185                             (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
20186                         break;
20187                 }
20188                 INP_WUNLOCK(inp);
20189                 return (0);
20190 #endif
20191 #ifdef INET
20192         case IPPROTO_IP:
20193                 switch (sopt->sopt_name) {
20194                 case IP_TOS:
20195                         /*
20196                          * The DSCP codepoint has changed, update the fsb.
20197                          */
20198                         ip->ip_tos = rack->rc_inp->inp_ip_tos;
20199                         break;
20200                 case IP_TTL:
20201                         /*
20202                          * The TTL has changed, update the fsb.
20203                          */
20204                         ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
20205                         break;
20206                 }
20207                 INP_WUNLOCK(inp);
20208                 return (0);
20209 #endif
20210         }
20211
              /*
               * Whitelist of option names handled by this stack; anything
               * else is passed to tcp_default_ctloutput() in the default case.
               */
20212         switch (sopt->sopt_name) {
20213         case TCP_RACK_TLP_REDUCE:               /*  URL:tlp_reduce */
20214         /*  Pacing related ones */
20215         case TCP_RACK_PACE_ALWAYS:              /*  URL:pace_always */
20216         case TCP_BBR_RACK_INIT_RATE:            /*  URL:irate */
20217         case TCP_BBR_IWINTSO:                   /*  URL:tso_iwin */
20218         case TCP_RACK_PACE_MAX_SEG:             /*  URL:pace_max_seg */
20219         case TCP_RACK_FORCE_MSEG:               /*  URL:force_max_seg */
20220         case TCP_RACK_PACE_RATE_CA:             /*  URL:pr_ca */
20221         case TCP_RACK_PACE_RATE_SS:             /*  URL:pr_ss*/
20222         case TCP_RACK_PACE_RATE_REC:            /*  URL:pr_rec */
20223         case TCP_RACK_GP_INCREASE_CA:           /*  URL:gp_inc_ca */
20224         case TCP_RACK_GP_INCREASE_SS:           /*  URL:gp_inc_ss */
20225         case TCP_RACK_GP_INCREASE_REC:          /*  URL:gp_inc_rec */
20226         case TCP_RACK_RR_CONF:                  /*  URL:rrr_conf */
20227         case TCP_BBR_HDWR_PACE:                 /*  URL:hdwrpace */
20228         case TCP_HDWR_RATE_CAP:                 /*  URL:hdwrcap boolean */
20229         case TCP_PACING_RATE_CAP:               /*  URL:cap  -- used by side-channel */
20230         case TCP_HDWR_UP_ONLY:                  /*  URL:uponly -- hardware pacing  boolean */
20231        /* End pacing related */
20232         case TCP_FAST_RSM_HACK:                 /*  URL:frsm_hack */
20233         case TCP_DELACK:                        /*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
20234         case TCP_RACK_PRR_SENDALOT:             /*  URL:prr_sendalot */
20235         case TCP_RACK_MIN_TO:                   /*  URL:min_to */
20236         case TCP_RACK_EARLY_SEG:                /*  URL:early_seg */
20237         case TCP_RACK_REORD_THRESH:             /*  URL:reord_thresh */
20238         case TCP_RACK_REORD_FADE:               /*  URL:reord_fade */
20239         case TCP_RACK_TLP_THRESH:               /*  URL:tlp_thresh */
20240         case TCP_RACK_PKT_DELAY:                /*  URL:pkt_delay */
20241         case TCP_RACK_TLP_USE:                  /*  URL:tlp_use */
20242         case TCP_BBR_RACK_RTT_USE:              /*  URL:rttuse */
20243         case TCP_BBR_USE_RACK_RR:               /*  URL:rackrr */
20244         case TCP_RACK_DO_DETECTION:             /*  URL:detect */
20245         case TCP_NO_PRR:                        /*  URL:noprr */
20246         case TCP_TIMELY_DYN_ADJ:                /*  URL:dynamic */
20247         case TCP_DATA_AFTER_CLOSE:              /*  no URL */
20248         case TCP_RACK_NONRXT_CFG_RATE:          /*  URL:nonrxtcr */
20249         case TCP_SHARED_CWND_ENABLE:            /*  URL:scwnd */
20250         case TCP_RACK_MBUF_QUEUE:               /*  URL:mqueue */
20251         case TCP_RACK_NO_PUSH_AT_MAX:           /*  URL:npush */
20252         case TCP_RACK_PACE_TO_FILL:             /*  URL:fillcw */
20253         case TCP_SHARED_CWND_TIME_LIMIT:        /*  URL:lscwnd */
20254         case TCP_RACK_PROFILE:                  /*  URL:profile */
20255         case TCP_USE_CMP_ACKS:                  /*  URL:cmpack */
20256         case TCP_RACK_ABC_VAL:                  /*  URL:labc */
20257         case TCP_REC_ABC_VAL:                   /*  URL:reclabc */
20258         case TCP_RACK_MEASURE_CNT:              /*  URL:measurecnt */
20259         case TCP_DEFER_OPTIONS:                 /*  URL:defer */
20260         case TCP_RACK_DSACK_OPT:                /*  URL:dsack */
20261         case TCP_RACK_PACING_BETA:              /*  URL:pacing_beta */
20262         case TCP_RACK_PACING_BETA_ECN:          /*  URL:pacing_beta_ecn */
20263         case TCP_RACK_TIMER_SLOP:               /*  URL:timer_slop */
20264         case TCP_RACK_ENABLE_HYSTART:           /*  URL:hystart */
20265                 break;
20266         default:
20267                 /* Filter off all unknown options to the base stack */
20268                 return (tcp_default_ctloutput(inp, sopt));
20269                 break;
20270         }
              /* Drop the lock for the copyin; it is re-taken and the state rechecked below. */
20271         INP_WUNLOCK(inp);
20272         if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
                      /* This is the only option here that is copied in as a 64-bit value. */
20273                 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
20274                 /*
20275                  * We truncate it down to 32 bits for the socket-option trace this
20276                  * means rates > 34Gbps won't show right, but thats probably ok.
20277                  */
                      /*
                       * NOTE(review): loptval is read here before the copyin
                       * error is checked; on failure the value is discarded
                       * by the error return below.
                       */
20278                 optval = (uint32_t)loptval;
20279         } else {
20280                 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
20281                 /* Save it in 64 bit form too */
20282                 loptval = optval;
20283         }
20284         if (error)
20285                 return (error);
20286         INP_WLOCK(inp);
              /*
               * tp and rack were fetched before the lock was dropped, so
               * make sure the connection has not gone away or switched to
               * another stack in the meantime.
               */
20287         if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
20288                 INP_WUNLOCK(inp);
20289                 return (ECONNRESET);
20290         }
20291         if (tp->t_fb != &__tcp_rack) {
20292                 INP_WUNLOCK(inp);
20293                 return (ENOPROTOOPT);
20294         }
              /*
               * While deferral is on and gp_ready is still 0, queue the
               * option instead of applying it; the four options named
               * below are exempt and always applied immediately.
               */
20295         if (rack->defer_options && (rack->gp_ready == 0) &&
20296             (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
20297             (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
20298             (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
20299             (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
20300                 /* Options are being deferred */
20301                 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
20302                         INP_WUNLOCK(inp);
20303                         return (0);
20304                 } else {
20305                         /* No memory to defer, fail */
20306                         INP_WUNLOCK(inp);
20307                         return (ENOMEM);
20308                 }
20309         }
20310         error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
20311         INP_WUNLOCK(inp);
20312         return (error);
20313 }
20314
/*
 * Fill *ti with a snapshot of the connection state held in tp.
 * The inpcb write lock must be held (asserted below).  *ti is zeroed
 * first, so any field not assigned here is reported as 0.
 */
20315 static void
20316 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
20317 {
20318
20319         INP_WLOCK_ASSERT(tp->t_inpcb);
20320         bzero(ti, sizeof(*ti));
20321
20322         ti->tcpi_state = tp->t_state;
              /* Timestamps and window scaling are reported only when both requested and received. */
20323         if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
20324                 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
20325         if (tp->t_flags & TF_SACK_PERMIT)
20326                 ti->tcpi_options |= TCPI_OPT_SACK;
20327         if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
20328                 ti->tcpi_options |= TCPI_OPT_WSCALE;
20329                 ti->tcpi_snd_wscale = tp->snd_scale;
20330                 ti->tcpi_rcv_wscale = tp->rcv_scale;
20331         }
20332         if (tp->t_flags2 & TF2_ECN_PERMIT)
20333                 ti->tcpi_options |= TCPI_OPT_ECN;
20334         if (tp->t_flags & TF_FASTOPEN)
20335                 ti->tcpi_options |= TCPI_OPT_TFO;
20336         /* still kept in ticks is t_rcvtime */
20337         ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
20338         /* Since we hold everything in precise useconds this is easy */
20339         ti->tcpi_rtt = tp->t_srtt;
20340         ti->tcpi_rttvar = tp->t_rttvar;
20341         ti->tcpi_rto = tp->t_rxtcur;
20342         ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
20343         ti->tcpi_snd_cwnd = tp->snd_cwnd;
20344         /*
20345          * FreeBSD-specific extension fields for tcp_info.
20346          */
20347         ti->tcpi_rcv_space = tp->rcv_wnd;
20348         ti->tcpi_rcv_nxt = tp->rcv_nxt;
20349         ti->tcpi_snd_wnd = tp->snd_wnd;
20350         ti->tcpi_snd_bwnd = 0;          /* Unused, kept for compat. */
20351         ti->tcpi_snd_nxt = tp->snd_nxt;
              /* Both the send and receive MSS are reported from t_maxseg. */
20352         ti->tcpi_snd_mss = tp->t_maxseg;
20353         ti->tcpi_rcv_mss = tp->t_maxseg;
20354         ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
20355         ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
20356         ti->tcpi_snd_zerowin = tp->t_sndzerowin;
20357 #ifdef NETFLIX_STATS
20358         ti->tcpi_total_tlp = tp->t_sndtlppack;
20359         ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
20360         memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
20361 #endif
20362 #ifdef TCP_OFFLOAD
20363         if (tp->t_flags & TF_TOE) {
20364                 ti->tcpi_options |= TCPI_OPT_TOE;
20365                 tcp_offload_tcp_info(tp, ti);
20366         }
20367 #endif
20368 }
20369
20370 static int
20371 rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
20372 {
20373         struct tcpcb *tp;
20374         struct tcp_rack *rack;
20375         int32_t error, optval;
20376         uint64_t val, loptval;
20377         struct  tcp_info ti;
20378         /*
20379          * Because all our options are either boolean or an int, we can just
20380          * pull everything into optval and then unlock and copy. If we ever
20381          * add a option that is not a int, then this will have quite an
20382          * impact to this routine.
20383          */
20384         error = 0;
20385         tp = intotcpcb(inp);
20386         rack = (struct tcp_rack *)tp->t_fb_ptr;
20387         if (rack == NULL) {
20388                 INP_WUNLOCK(inp);
20389                 return (EINVAL);
20390         }
20391         switch (sopt->sopt_name) {
20392         case TCP_INFO:
20393                 /* First get the info filled */
20394                 rack_fill_info(tp, &ti);
20395                 /* Fix up the rtt related fields if needed */
20396                 INP_WUNLOCK(inp);
20397                 error = sooptcopyout(sopt, &ti, sizeof ti);
20398                 return (error);
20399         /*
20400          * Beta is the congestion control value for NewReno that influences how
20401          * much of a backoff happens when loss is detected. It is normally set
20402          * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
20403          * when you exit recovery.
20404          */
20405         case TCP_RACK_PACING_BETA:
20406                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20407                         error = EINVAL;
20408                 else if (rack->rc_pacing_cc_set == 0)
20409                         optval = rack->r_ctl.rc_saved_beta.beta;
20410                 else {
20411                         /*
20412                          * Reach out into the CC data and report back what
20413                          * I have previously set. Yeah it looks hackish but
20414                          * we don't want to report the saved values.
20415                          */
20416                         if (tp->ccv->cc_data)
20417                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta;
20418                         else
20419                                 error = EINVAL;
20420                 }
20421                 break;
20422                 /*
20423                  * Beta_ecn is the congestion control value for NewReno that influences how
20424                  * much of a backoff happens when a ECN mark is detected. It is normally set
20425                  * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
20426                  * you exit recovery. Note that classic ECN has a beta of 50, it is only
20427                  * ABE Ecn that uses this "less" value, but we do too with pacing :)
20428                  */
20429
20430         case TCP_RACK_PACING_BETA_ECN:
20431                 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
20432                         error = EINVAL;
20433                 else if (rack->rc_pacing_cc_set == 0)
20434                         optval = rack->r_ctl.rc_saved_beta.beta_ecn;
20435                 else {
20436                         /*
20437                          * Reach out into the CC data and report back what
20438                          * I have previously set. Yeah it looks hackish but
20439                          * we don't want to report the saved values.
20440                          */
20441                         if (tp->ccv->cc_data)
20442                                 optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
20443                         else
20444                                 error = EINVAL;
20445                 }
20446                 break;
20447         case TCP_RACK_DSACK_OPT:
20448                 optval = 0;
20449                 if (rack->rc_rack_tmr_std_based) {
20450                         optval |= 1;
20451                 }
20452                 if (rack->rc_rack_use_dsack) {
20453                         optval |= 2;
20454                 }
20455                 break;
20456         case TCP_RACK_ENABLE_HYSTART:
20457         {
20458                 if (tp->ccv->flags & CCF_HYSTART_ALLOWED) {
20459                         optval = RACK_HYSTART_ON;
20460                         if (tp->ccv->flags & CCF_HYSTART_CAN_SH_CWND)
20461                                 optval = RACK_HYSTART_ON_W_SC;
20462                         if (tp->ccv->flags & CCF_HYSTART_CONS_SSTH)
20463                                 optval = RACK_HYSTART_ON_W_SC_C;
20464                 } else {
20465                         optval = RACK_HYSTART_OFF;
20466                 }
20467         }
20468         break;
20469         case TCP_FAST_RSM_HACK:
20470                 optval = rack->fast_rsm_hack;
20471                 break;
20472         case TCP_DEFER_OPTIONS:
20473                 optval = rack->defer_options;
20474                 break;
20475         case TCP_RACK_MEASURE_CNT:
20476                 optval = rack->r_ctl.req_measurements;
20477                 break;
20478         case TCP_REC_ABC_VAL:
20479                 optval = rack->r_use_labc_for_rec;
20480                 break;
20481         case TCP_RACK_ABC_VAL:
20482                 optval = rack->rc_labc;
20483                 break;
20484         case TCP_HDWR_UP_ONLY:
20485                 optval= rack->r_up_only;
20486                 break;
20487         case TCP_PACING_RATE_CAP:
20488                 loptval = rack->r_ctl.bw_rate_cap;
20489                 break;
20490         case TCP_RACK_PROFILE:
20491                 /* You cannot retrieve a profile, its write only */
20492                 error = EINVAL;
20493                 break;
20494         case TCP_USE_CMP_ACKS:
20495                 optval = rack->r_use_cmp_ack;
20496                 break;
20497         case TCP_RACK_PACE_TO_FILL:
20498                 optval = rack->rc_pace_to_cwnd;
20499                 if (optval && rack->r_fill_less_agg)
20500                         optval++;
20501                 break;
20502         case TCP_RACK_NO_PUSH_AT_MAX:
20503                 optval = rack->r_ctl.rc_no_push_at_mrtt;
20504                 break;
20505         case TCP_SHARED_CWND_ENABLE:
20506                 optval = rack->rack_enable_scwnd;
20507                 break;
20508         case TCP_RACK_NONRXT_CFG_RATE:
20509                 optval = rack->rack_rec_nonrxt_use_cr;
20510                 break;
20511         case TCP_NO_PRR:
20512                 if (rack->rack_no_prr  == 1)
20513                         optval = 1;
20514                 else if (rack->no_prr_addback == 1)
20515                         optval = 2;
20516                 else
20517                         optval = 0;
20518                 break;
20519         case TCP_RACK_DO_DETECTION:
20520                 optval = rack->do_detection;
20521                 break;
20522         case TCP_RACK_MBUF_QUEUE:
20523                 /* Now do we use the LRO mbuf-queue feature */
20524                 optval = rack->r_mbuf_queue;
20525                 break;
20526         case TCP_TIMELY_DYN_ADJ:
20527                 optval = rack->rc_gp_dyn_mul;
20528                 break;
20529         case TCP_BBR_IWINTSO:
20530                 optval = rack->rc_init_win;
20531                 break;
20532         case TCP_RACK_TLP_REDUCE:
20533                 /* RACK TLP cwnd reduction (bool) */
20534                 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
20535                 break;
20536         case TCP_BBR_RACK_INIT_RATE:
20537                 val = rack->r_ctl.init_rate;
20538                 /* convert to kbits per sec */
20539                 val *= 8;
20540                 val /= 1000;
20541                 optval = (uint32_t)val;
20542                 break;
20543         case TCP_RACK_FORCE_MSEG:
20544                 optval = rack->rc_force_max_seg;
20545                 break;
20546         case TCP_RACK_PACE_MAX_SEG:
20547                 /* Max segments in a pace */
20548                 optval = rack->rc_user_set_max_segs;
20549                 break;
20550         case TCP_RACK_PACE_ALWAYS:
20551                 /* Use the always pace method */
20552                 optval = rack->rc_always_pace;
20553                 break;
20554         case TCP_RACK_PRR_SENDALOT:
20555                 /* Allow PRR to send more than one seg */
20556                 optval = rack->r_ctl.rc_prr_sendalot;
20557                 break;
20558         case TCP_RACK_MIN_TO:
20559                 /* Minimum time between rack t-o's in ms */
20560                 optval = rack->r_ctl.rc_min_to;
20561                 break;
20562         case TCP_RACK_EARLY_SEG:
20563                 /* If early recovery max segments */
20564                 optval = rack->r_ctl.rc_early_recovery_segs;
20565                 break;
20566         case TCP_RACK_REORD_THRESH:
20567                 /* RACK reorder threshold (shift amount) */
20568                 optval = rack->r_ctl.rc_reorder_shift;
20569                 break;
20570         case TCP_RACK_REORD_FADE:
20571                 /* Does reordering fade after ms time */
20572                 optval = rack->r_ctl.rc_reorder_fade;
20573                 break;
20574         case TCP_BBR_USE_RACK_RR:
20575                 /* Do we use the rack cheat for rxt */
20576                 optval = rack->use_rack_rr;
20577                 break;
20578         case TCP_RACK_RR_CONF:
20579                 optval = rack->r_rr_config;
20580                 break;
20581         case TCP_HDWR_RATE_CAP:
20582                 optval = rack->r_rack_hw_rate_caps;
20583                 break;
20584         case TCP_BBR_HDWR_PACE:
20585                 optval = rack->rack_hdw_pace_ena;
20586                 break;
20587         case TCP_RACK_TLP_THRESH:
20588                 /* RACK TLP theshold i.e. srtt+(srtt/N) */
20589                 optval = rack->r_ctl.rc_tlp_threshold;
20590                 break;
20591         case TCP_RACK_PKT_DELAY:
20592                 /* RACK added ms i.e. rack-rtt + reord + N */
20593                 optval = rack->r_ctl.rc_pkt_delay;
20594                 break;
20595         case TCP_RACK_TLP_USE:
20596                 optval = rack->rack_tlp_threshold_use;
20597                 break;
20598         case TCP_RACK_PACE_RATE_CA:
20599                 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
20600                 break;
20601         case TCP_RACK_PACE_RATE_SS:
20602                 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
20603                 break;
20604         case TCP_RACK_PACE_RATE_REC:
20605                 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
20606                 break;
20607         case TCP_RACK_GP_INCREASE_SS:
20608                 optval = rack->r_ctl.rack_per_of_gp_ca;
20609                 break;
20610         case TCP_RACK_GP_INCREASE_CA:
20611                 optval = rack->r_ctl.rack_per_of_gp_ss;
20612                 break;
20613         case TCP_BBR_RACK_RTT_USE:
20614                 optval = rack->r_ctl.rc_rate_sample_method;
20615                 break;
20616         case TCP_DELACK:
20617                 optval = tp->t_delayed_ack;
20618                 break;
20619         case TCP_DATA_AFTER_CLOSE:
20620                 optval = rack->rc_allow_data_af_clo;
20621                 break;
20622         case TCP_SHARED_CWND_TIME_LIMIT:
20623                 optval = rack->r_limit_scw;
20624                 break;
20625         case TCP_RACK_TIMER_SLOP:
20626                 optval = rack->r_ctl.timer_slop;
20627                 break;
20628         default:
20629                 return (tcp_default_ctloutput(inp, sopt));
20630                 break;
20631         }
20632         INP_WUNLOCK(inp);
20633         if (error == 0) {
20634                 if (TCP_PACING_RATE_CAP)
20635                         error = sooptcopyout(sopt, &loptval, sizeof loptval);
20636                 else
20637                         error = sooptcopyout(sopt, &optval, sizeof optval);
20638         }
20639         return (error);
20640 }
20641
20642 static int
20643 rack_ctloutput(struct inpcb *inp, struct sockopt *sopt)
20644 {
20645         if (sopt->sopt_dir == SOPT_SET) {
20646                 return (rack_set_sockopt(inp, sopt));
20647         } else if (sopt->sopt_dir == SOPT_GET) {
20648                 return (rack_get_sockopt(inp, sopt));
20649         } else {
20650                 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
20651         }
20652 }
20653
/*
 * Names handed to register_tcp_functions_as_names() by tcp_addrack().
 * STACKNAME is always listed; STACKALIAS is added after it when that
 * macro is defined.
 */
static const char *rack_stack_names[] = {
        __XSTRING(STACKNAME),
#ifdef STACKALIAS
        __XSTRING(STACKALIAS),
#endif
};
20660
/*
 * Constructor passed to uma_zcreate() for both RACK zones: clears the
 * first `size` bytes of the item at `mem`.
 *
 * mem   - item being constructed.
 * size  - number of bytes to clear.
 * arg   - unused.
 * how   - unused.
 *
 * Always returns 0.
 */
static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
        (void)memset(mem, 0, size);

        return (0);
}
20667
/*
 * Destructor passed to uma_zcreate() for the sendmap zone.  It is
 * deliberately empty: nothing is done to the item, and none of the
 * arguments are used.
 */
static void
rack_dtor(void *mem, int32_t size, void *arg)
{
        /* Nothing to tear down. */
}
20673
/*
 * Set to true at the end of a successful MOD_LOAD in tcp_addrack().
 * MOD_UNLOAD destroys the zones, sysctl context and counters only while
 * it is set, and then clears it.
 */
static bool rack_mod_inited = false;
20675
20676 static int
20677 tcp_addrack(module_t mod, int32_t type, void *data)
20678 {
20679         int32_t err = 0;
20680         int num_stacks;
20681
20682         switch (type) {
20683         case MOD_LOAD:
20684                 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
20685                     sizeof(struct rack_sendmap),
20686                     rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
20687
20688                 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
20689                     sizeof(struct tcp_rack),
20690                     rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
20691
20692                 sysctl_ctx_init(&rack_sysctl_ctx);
20693                 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
20694                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
20695                     OID_AUTO,
20696 #ifdef STACKALIAS
20697                     __XSTRING(STACKALIAS),
20698 #else
20699                     __XSTRING(STACKNAME),
20700 #endif
20701                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
20702                     "");
20703                 if (rack_sysctl_root == NULL) {
20704                         printf("Failed to add sysctl node\n");
20705                         err = EFAULT;
20706                         goto free_uma;
20707                 }
20708                 rack_init_sysctls();
20709                 num_stacks = nitems(rack_stack_names);
20710                 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
20711                     rack_stack_names, &num_stacks);
20712                 if (err) {
20713                         printf("Failed to register %s stack name for "
20714                             "%s module\n", rack_stack_names[num_stacks],
20715                             __XSTRING(MODNAME));
20716                         sysctl_ctx_free(&rack_sysctl_ctx);
20717 free_uma:
20718                         uma_zdestroy(rack_zone);
20719                         uma_zdestroy(rack_pcb_zone);
20720                         rack_counter_destroy();
20721                         printf("Failed to register rack module -- err:%d\n", err);
20722                         return (err);
20723                 }
20724                 tcp_lro_reg_mbufq();
20725                 rack_mod_inited = true;
20726                 break;
20727         case MOD_QUIESCE:
20728                 err = deregister_tcp_functions(&__tcp_rack, true, false);
20729                 break;
20730         case MOD_UNLOAD:
20731                 err = deregister_tcp_functions(&__tcp_rack, false, true);
20732                 if (err == EBUSY)
20733                         break;
20734                 if (rack_mod_inited) {
20735                         uma_zdestroy(rack_zone);
20736                         uma_zdestroy(rack_pcb_zone);
20737                         sysctl_ctx_free(&rack_sysctl_ctx);
20738                         rack_counter_destroy();
20739                         rack_mod_inited = false;
20740                 }
20741                 tcp_lro_dereg_mbufq();
20742                 err = 0;
20743                 break;
20744         default:
20745                 return (EOPNOTSUPP);
20746         }
20747         return (err);
20748 }
20749
/*
 * Module description referenced by DECLARE_MODULE(): named after MODNAME,
 * with tcp_addrack() as its event handler and no private data.
 */
static moduledata_t tcp_rack = {
        .name = __XSTRING(MODNAME),
        .evhand = tcp_addrack,
        .priv = 0
};
20755
/*
 * Module glue: declare version 1 of MODNAME, hook tcp_rack in at
 * SI_SUB_PROTO_DOMAIN / SI_ORDER_ANY, and record a dependency on
 * version 1 of tcphpts.
 */
MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);